# Using Reprosyn as the generator

This notebook shows how to use TAPAS's `ReprosynGenerator` class to generate synthetic census data with `reprosyn`.

We assume that you have installed `reprosyn` into the Python environment you are working in, for example with `pip install git+https://github.com/alan-turing-institute/reprosyn`.

First we load the census dataset.

In [1]:
import sys
import json
import pandas as pd

In [2]:
# Download the small 2011 census microdata CSV from the reprosyn repository
# and drop its 'Person ID' column in a single chained expression.
census = pd.read_csv(
    'https://raw.githubusercontent.com/alan-turing-institute/reprosyn/main/src/reprosyn/datasets/2011-census-microdata/2011-census-microdata-small.csv'
).drop(columns=['Person ID'])

# Show the first rows as the cell output.
census.head()

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000001,H,2,1,2,6,2,2,1,2,1,2,5,8,2,-9,4
1,E12000001,H,5,1,1,4,1,2,1,1,1,2,1,8,6,4,3
2,E12000001,H,3,1,2,4,1,2,1,1,1,1,1,6,11,3,4
3,E12000001,H,3,1,2,2,1,2,1,2,1,2,1,7,7,3,2
4,E12000001,H,3,1,1,5,4,2,1,1,1,2,1,1,4,3,2


In [3]:
# Load the JSON description of the census columns (one entry per column with
# its name, type and representation). The encoding is passed explicitly so the
# result does not depend on the platform's default text encoding.
with open('data/2011 Census Microdata Teaching File.json', 'r', encoding='utf-8') as f:
    census_description = json.load(f)

# Display the parsed description as the cell output.
census_description

[{'name': 'Region',
  'type': 'finite',
  'representation': ['E12000001',
   'E12000002',
   'E12000003',
   'E12000004',
   'E12000005',
   'E12000009',
   'E12000006',
   'E12000008',
   'E12000007',
   'W92000004']},
 {'name': 'Residence Type', 'type': 'finite', 'representation': ['H', 'C']},
 {'name': 'Family Composition',
  'type': 'finite',
  'representation': ['-9', '1', '2', '3', '4', '5', '6']},
 {'name': 'Population Base',
  'type': 'finite',
  'representation': ['1', '2', '3']},
 {'name': 'Sex', 'type': 'finite', 'representation': ['1', '2']},
 {'name': 'Age',
  'type': 'finite',
  'representation': ['1', '2', '3', '4', '5', '6', '7', '8']},
 {'name': 'Marital Status',
  'type': 'finite',
  'representation': ['1', '2', '3', '4', '5']},
 {'name': 'Student', 'type': 'finite', 'representation': ['1', '2']},
 {'name': 'Country of Birth',
  'type': 'finite',
  'representation': ['-9', '1', '2']},
 {'name': 'Health',
  'type': 'finite',
  'representation': ['-9', '1', '2', '3', '4

Convert a random sample of 10,000 rows of this dataset to a `tapas.datasets.TabularDataset`, using the description loaded above.

In [4]:
from tapas.datasets import TabularDataset
from tapas.datasets.data_description import DataDescription

In [5]:
# Wrap a random 10,000-row sample of the census data, together with its column
# description, in a TabularDataset. A fixed `random_state` makes the sample
# itself repeatable across kernel restarts.
tab_census = TabularDataset(
    census.sample(10000, random_state=0),
    description=DataDescription(census_description),
)

Next, instantiate the generator.

In [6]:
from tapas.generators import ReprosynGenerator

In [7]:
from reprosyn.methods import MST

  from .autonotebook import tqdm as notebook_tqdm
2022-12-01 15:03:36.313326: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Instructions for updating:
non-resource variables are not supported in the long term


In [8]:
# Build a TAPAS generator around reprosyn's MST method, labelled "MST".
gen = ReprosynGenerator(
    MST,
    label="MST",
)

In [9]:
# Fit the generator on the sampled census TabularDataset built earlier.
gen.fit(tab_census)

In [10]:
# Draw 10 synthetic records from the fitted generator, then display the
# underlying table via the returned object's `.data` attribute.
tab_data = gen.generate(10)
tab_data.data

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000009,H,2,1,1,6,2,2,1,2,1,2,1,6,11,4,3
1,E12000005,H,2,1,2,3,2,2,1,1,1,2,3,7,4,-9,2
2,E12000009,H,2,1,1,2,1,2,1,3,1,9,8,-9,-9,-9,4
3,E12000002,H,2,1,1,8,2,2,1,3,1,2,5,8,2,-9,4
4,E12000003,H,1,1,1,5,4,2,1,2,1,2,1,8,4,3,4
5,E12000002,H,2,1,1,4,2,2,1,2,1,2,7,9,6,-9,4
6,E12000003,H,1,1,2,3,1,2,1,1,1,1,1,3,8,2,2
7,E12000004,H,2,1,1,5,2,1,1,1,1,9,9,8,12,-9,2
8,E12000005,H,2,1,2,1,1,1,2,1,3,6,-9,-9,-9,-9,-9
9,E12000001,H,5,1,1,1,1,1,1,1,1,1,-9,-9,-9,-9,-9


In [11]:
# Fit one MST generator per `epsilon` value on the full census data and keep
# `samples` synthetic records from each, so the outputs can be compared.
datasets = []
samples = 10
epsilon = [1, 10, 100]

# The training dataset does not depend on epsilon, so build it once instead
# of re-wrapping `census` on every iteration.
full_census = TabularDataset(census, description=DataDescription(census_description))

for e in epsilon:
    gen = ReprosynGenerator(MST, epsilon=e)
    gen.fit(full_census)
    tab_data = gen.generate(samples)
    # Each entry pairs the fitted generator with the table it produced.
    datasets.append([gen, tab_data.data])

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby(list(proj)).apply(foo)
To preserve the previous behavior, use

	>>> .groupby(..., group

In [12]:
# Display the collected [generator, synthetic data] pairs, one per epsilon.
datasets

[[<tapas.generators.generator.ReprosynGenerator at 0x146675f40>,
        Region Residence Type Family Composition Population Base Sex Age  \
  0  E12000005              H                  3               1   1   1   
  1  E12000005              H                  5               1   1   2   
  2  E12000002              H                  3               1   2   5   
  3  E12000003              H                  2               1   2   1   
  4  E12000001              H                  2               1   1   6   
  5  E12000009              H                  2               1   2   4   
  6  E12000003              H                  3               1   1   4   
  7  E12000009              H                  2               1   2   7   
  8  E12000003              H                  1               1   2   3   
  9  E12000009              H                  2               1   1   1   
  
    Marital Status Student Country of Birth Health Ethnic Group Religion  \
  0              1  