<a href="https://colab.research.google.com/github/Yasaman-A/Assign1-1.0/blob/master/data_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --quiet --upgrade tensorflow_federated

In [None]:
import collections
import functools
import os
import time

import numpy as np
import tensorflow as tf
import tensorflow_federated as tff

tf.compat.v1.enable_v2_behavior()

# Seed NumPy *and* TensorFlow: the datasets built below are shuffled by
# tf.data, which the NumPy seed alone does not control.
np.random.seed(0)
tf.random.set_seed(0)

# Smoke test: the cell output should be b'Hello, World!' if TFF is working.
tff.federated_computation(lambda: 'Hello, World!')()

b'Hello, World!'

# Data Generation
Required knowledge: 
1. TF Data Set
2. TFF Simulation APIs
3. Seeing how simulation data is used in TFF examples
4. What IID and non-IID data are
5. ClientData class in TFF Github
6. Python Generators

Mount Google Drive first, then change the path below so that it points to the folder in your own Google Drive that contains your CSV file.

In [None]:
cd '/content/drive/My Drive/workspace/Federated/'

/content/drive/My Drive/workspace/Federated


In [None]:
import pandas as pd
import tensorflow as tf
import tensorflow_federated as tff

from sklearn.model_selection import train_test_split

# Toy dataset; the file lives in the author's Google Drive.
# File format reference: https://drive.google.com/file/d/11Y_5o-jPgmkauYYsts8GhXj__SLkaqJk/view?usp=sharing
csv_url = "data-simple.csv"
df = pd.read_csv(csv_url, na_values=("?",))  # "?" cells are parsed as missing values

client_id_colname = 'id'  # name of the column that holds the client ID
SHUFFLE_BUFFER = 1000
NUM_EPOCHS = 1

# Split the distinct client IDs (not the individual rows) into train and test
# clients. The IDs are wrapped in a single-column DataFrame before splitting.
client_ids = pd.DataFrame(data=df[client_id_colname].unique())
train, test = train_test_split(client_ids, test_size=0.1)

In [None]:
# Column 0 is the only column of the client-ID frame (it was built without
# column names); turn it into a plain Python list of training client IDs.
train_client_ids = train[0].values.tolist()
train_client_ids

[1, 2, 4, 5]

In [None]:
# Same extraction for the held-out clients: column 0 of the test split as a
# plain Python list.
test_client_ids = test[0].values.tolist()
test_client_ids

[3]

In [None]:
def create_tf_dataset_for_client_fn(client_id):
  """Build the tf.data.Dataset holding the rows of a single client.

  Rows of the module-level `df` whose `client_id_colname` column equals
  `client_id` are converted to a dict of column lists, sliced into one
  element per row, shuffled, batched with batch size 1 and repeated
  `NUM_EPOCHS` times.

  Args:
    client_id: value to match against the client-ID column.

  Returns:
    A tf.data.Dataset whose elements are dicts keyed by column name.
  """
  belongs_to_client = df[client_id_colname] == client_id
  columns = df[belongs_to_client].to_dict('list')
  per_client = tf.data.Dataset.from_tensor_slices(columns)
  per_client = per_client.shuffle(SHUFFLE_BUFFER)
  per_client = per_client.batch(1)
  return per_client.repeat(NUM_EPOCHS)

In [None]:
# Wrap each list of client IDs in a ClientData object; both splits share the
# same per-client dataset factory and are built in train, test order.
train_data, test_data = (
    tff.simulation.ClientData.from_clients_and_fn(
        client_ids=split_client_ids,
        create_tf_dataset_for_client_fn=create_tf_dataset_for_client_fn,
    )
    for split_client_ids in (train_client_ids, test_client_ids)
)

In [None]:
# Merge the datasets of all training clients into a single dataset.
dd = train_data.create_tf_dataset_from_all_clients()

# Bare expression: displays the dataset's repr (element shapes and dtypes).
dd

<FlatMapDataset shapes: {id: (None,), a: (None,), b: (None,)}, types: {id: tf.int32, a: tf.int32, b: tf.int32}>

In [None]:
# Dataset of the second training client (index 1 of client_ids).
example_dataset = train_data.create_tf_dataset_for_client(
        train_data.client_ids[1]
    )

# Keep the iterator in a variable so successive next() calls advance through
# the same pass over the dataset.
ed_iter = iter(example_dataset)

# Left disabled: counting this way would consume ed_iter before the calls below.
#len(list(ed_iter))

# Show the first two elements; each is a dict of batch-size-1 tensors.
example_element = next(ed_iter)
print(example_element)

example_element = next(ed_iter)
print(example_element)

{'id': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([2], dtype=int32)>, 'a': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([2222], dtype=int32)>, 'b': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>}
{'id': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([2], dtype=int32)>, 'a': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([222], dtype=int32)>, 'b': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>}


In [None]:
import pickle

# Persist each split to its own file. The ".h5" names are kept because the
# archive cell refers to them, but the contents are pickles, not HDF5.
with open('train_yas.h5', 'wb') as train_file:
    pickle.dump(train_data, train_file)
with open('test_yas.h5', 'wb') as test_file:
    # Fix: this file used to receive train_data as well, so the "test" file
    # silently contained the training clients.
    pickle.dump(test_data, test_file)

# Round-trip check of the training split. Only unpickle files you created
# yourself: pickle.load can execute arbitrary code from the file.
with open('train_yas.h5', 'rb') as train_file:
    another_data = pickle.load(train_file)

In [None]:
import tarfile

# Bundle both pickled splits into one gzip-compressed archive. The context
# manager closes the archive even if adding a member fails.
with tarfile.open("sample_yas.tar.gz", "w:gz") as tar:
    for name in ["train_yas.h5", "test_yas.h5"]:
        tar.add(name)

In [None]:
# Check that the unpickled ClientData is usable: build the dataset of its
# first client ...
example2_dataset = another_data.create_tf_dataset_for_client(
        another_data.client_ids[0])

ed_iter2 = iter(example2_dataset)

# ... and print that client's first element.
example2_element = next(ed_iter2)
print(example2_element)

{'id': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>, 'a': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>, 'b': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>}


Build and view IID data:

In [None]:
# The per-client training data is the pool the synthetic clients are drawn from.
i_data = train_data
# Iterator over synthetic clients; 6 is the per-client dataset size argument.
# NOTE(review): the function name is spelled "synthethic" here — confirm it
# matches the installed TFF release.
all_new_clients = iter(tff.simulation.datasets.build_synthethic_iid_datasets(i_data, 6))
# Background on Python generators: https://wiki.python.org/moin/Generators

In [None]:
# Draw the first two synthetic clients from the iterator.
client_0 = next(all_new_clients)
client_1 = next(all_new_clients)

In [None]:
# client_0 is indexed by feature name; zip its three per-feature datasets so
# that every element is an (id, a, b) tuple.
ds = tf.data.Dataset.zip((client_0['id'], client_0['a'], client_0['b']))

# Print only the 'b' component (position 2 of each tuple).
for c in ds:
    print(c[2])

tf.Tensor([0], shape=(1,), dtype=int32)
tf.Tensor([7], shape=(1,), dtype=int32)
tf.Tensor([7], shape=(1,), dtype=int32)
tf.Tensor([0], shape=(1,), dtype=int32)
tf.Tensor([7], shape=(1,), dtype=int32)
tf.Tensor([7], shape=(1,), dtype=int32)


In [None]:
# Iterate the 'a' feature dataset of client_0 on its own, showing each value
# and the Python type of the yielded object.
for c in client_0['a'] :
    print(c)
    print(type(c))

tf.Tensor([10], shape=(1,), dtype=int32)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor([2], shape=(1,), dtype=int32)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor([22], shape=(1,), dtype=int32)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor([222], shape=(1,), dtype=int32)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor([55], shape=(1,), dtype=int32)
<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor([22], shape=(1,), dtype=int32)
<class 'tensorflow.python.framework.ops.EagerTensor'>


In [None]:
from typing import List


class customClientData(tff.simulation.ClientData):
    """ClientData whose clients are synthetic IID resamples of another ClientData.

    Each client is one item drawn from
    `tff.simulation.datasets.build_synthethic_iid_datasets`. For the data used
    in this notebook such an item is a dict mapping feature name to a
    `tf.data.Dataset` (see what `__init__` prints).

    Args:
        non_iid_data: ClientData object to resample from.
        per_client_data_size: size argument forwarded to
            `build_synthethic_iid_datasets`.
        number_user: number of synthetic clients to draw; they receive the
            integer ids 0 .. number_user - 1.
    """

    def __init__(self, non_iid_data, per_client_data_size, number_user):
        # Maps integer client id -> the object drawn for that client.
        self.ds = dict()
        all_new_clients = iter(tff.simulation.datasets.build_synthethic_iid_datasets(non_iid_data, per_client_data_size))
        for user in range(number_user):
            client = next(all_new_clients)
            print(client)
            self.ds[user] = client

    @property
    def client_ids(self) -> List[int]:
        """Ids of the synthetic clients, in creation order."""
        return list(self.ds.keys())

    def create_tf_dataset_for_client(self, client_id: int):
        """Return the object stored for `client_id`, unchanged.

        Raises:
            KeyError: if `client_id` is not one of `client_ids`.
        """
        return self.ds[client_id]

    @property
    def element_type_structure(self):
        """Element spec(s) of client 0.

        When the stored client is a dict of per-feature datasets, the dict
        itself has no `element_spec`, so the spec of every feature dataset is
        returned under the same key. Otherwise the client's own `element_spec`
        is returned.

        Raises:
            KeyError: if no client was created (number_user == 0).
        """
        first_client = self.ds[0]
        if isinstance(first_client, dict):
            return {
                feature: feature_ds.element_spec
                for feature, feature_ds in first_client.items()
            }
        return first_client.element_spec


In [None]:
def convert(client):
    """Materialise a per-feature client into a list of per-example dicts.

    Args:
        client: mapping with the keys 'id', 'a' and 'b', each holding a
            tf.data.Dataset; the three datasets are zipped element-wise.

    Returns:
        A list with one dict per zipped element, keyed 'id', 'a' and 'b'.
    """
    feature_names = ('id', 'a', 'b')
    zipped_client = tf.data.Dataset.zip(
        tuple(client[name] for name in feature_names)
    )
    return [dict(zip(feature_names, row)) for row in zipped_client]

In [None]:
# Open question: how should create_tf_dataset_for_client be used on the
# synthetic IID datasets created above?
# Arguments: source data, per_client_data_size=3, number_user=2. The
# constructor prints every client it draws.
iid_all_Client_dataset = customClientData(train_data, 3, 2)
#iid_all_Client_dataset.client_ids

{'id': <_VariantDataset shapes: (None,), types: tf.int32>, 'a': <_VariantDataset shapes: (None,), types: tf.int32>, 'b': <_VariantDataset shapes: (None,), types: tf.int32>}
{'id': <_VariantDataset shapes: (None,), types: tf.int32>, 'a': <_VariantDataset shapes: (None,), types: tf.int32>, 'b': <_VariantDataset shapes: (None,), types: tf.int32>}


In [None]:
# element_type_structure is declared with @property, so it is read as an
# attribute; calling the value it returns would fail.
iid_all_Client_dataset.element_type_structure

In [None]:
# Fetch what is stored for the second synthetic client (index 1 of client_ids).
orig_iid_client_dataset = iid_all_Client_dataset.create_tf_dataset_for_client(
        iid_all_Client_dataset.client_ids[1]
)

# NOTE(review): the recorded output of this cell is the bare key `id`, which
# suggests the returned object is a dict of per-feature datasets (iterating a
# dict yields its keys) rather than a single dataset of examples.
example_element = next(iter(orig_iid_client_dataset))
print(example_element)

id


In [None]:
# Turn the per-feature client into a list of per-example dicts ...
iid_client_dataset = convert(orig_iid_client_dataset)

# ... and show which original client id each sampled example came from.
for data in iid_client_dataset:
    print(data['id'])

tf.Tensor([2], shape=(1,), dtype=int32)
tf.Tensor([1], shape=(1,), dtype=int32)
tf.Tensor([2], shape=(1,), dtype=int32)
tf.Tensor([1], shape=(1,), dtype=int32)
tf.Tensor([4], shape=(1,), dtype=int32)
tf.Tensor([2], shape=(1,), dtype=int32)


In [None]:
# ed_iter was created in an earlier cell and has already been advanced there.
# Its dataset is finite, so stop cleanly when it runs out instead of letting
# next() raise StopIteration part-way through the cell.
_exhausted = object()
for _attempt in range(3):
    _next_element = next(ed_iter, _exhausted)
    if _next_element is _exhausted:
        print('ed_iter has no more elements')
        break
    example_element = _next_element
    print(example_element)

## Parameterizing the data generation process
Possible parameters (from the leaf project)

* -s := 'iid' to sample in an i.i.d. manner, or 'niid' to sample in a non-i.i.d. manner. In the i.i.d. sampling scenario, each datapoint is equally likely to be sampled, so all users have the same underlying distribution of data. In the non-i.i.d. sampling scenario, the underlying distribution of data for each user is consistent with the raw data. Since we assume that data distributions vary between users in the raw data, we refer to this sampling process as non-i.i.d.

* --iu := number of users, if iid sampling; expressed as a fraction of the total number of users; default is 0.01

* --sf := fraction of data to sample, written as a decimal; default is 0.1

* -k := minimum number of samples per user

* -t := 'user' to partition users into train-test groups, or 'sample' to partition each user's samples into train-test groups

* --tf := fraction of data in training set, written as a decimal; default is 0.9

* --smplseed := seed to be used before random sampling of data

* --spltseed := seed to be used before random split of data