# Using Reprosyn as the generator

This notebook provides an example of using the generator class from `TAPAS` to generate synthetic census data with `reprosyn`.

We assume that you have installed reprosyn into the Python environment you are working in, using `pip install git+https://github.com/alan-turing-institute/reprosyn`.

First we load the census dataset.

In [1]:
import sys
import json
sys.path.append('../..')
import pandas as pd
from tapas.generators import ReprosynGenerator
from tapas.datasets import TabularDataset
from tapas.datasets.data_description import DataDescription

In [2]:
# Read the small 2011 census microdata sample straight from the reprosyn
# repository, discarding the 'Person ID' column in the same expression.
census = (
    pd.read_csv('https://raw.githubusercontent.com/alan-turing-institute/reprosyn/main/src/reprosyn/datasets/2011-census-microdata/2011-census-microdata-small.csv')
    .drop(columns=['Person ID'])
)
census.head()


Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000001,H,2,1,2,6,2,2,1,2,1,2,5,8,2,-9,4
1,E12000001,H,5,1,1,4,1,2,1,1,1,2,1,8,6,4,3
2,E12000001,H,3,1,2,4,1,2,1,1,1,1,1,6,11,3,4
3,E12000001,H,3,1,2,2,1,2,1,2,1,2,1,7,7,3,2
4,E12000001,H,3,1,1,5,4,2,1,1,1,2,1,1,4,3,2


In [3]:
# Load the JSON description of the census columns from the example datasets
# directory. The encoding is given explicitly so the file is decoded the same
# way regardless of the platform's default locale encoding.
with open('../datasets/examples/census.json', 'r', encoding='utf-8') as f:
    census_description = json.load(f)

# Display the parsed description (a list with one dict per column).
census_description

[{'name': 'Region',
  'type': 'finite',
  'representation': ['E12000001',
   'E12000002',
   'E12000003',
   'E12000004',
   'E12000005',
   'E12000009',
   'E12000006',
   'E12000008',
   'E12000007',
   'W92000004']},
 {'name': 'Residence Type', 'type': 'finite', 'representation': ['H', 'C']},
 {'name': 'Family Composition',
  'type': 'finite',
  'representation': ['2', '5', '3', '1', '6', '-9', '4']},
 {'name': 'Population Base',
  'type': 'finite',
  'representation': ['1', '2', '3']},
 {'name': 'Sex', 'type': 'finite', 'representation': ['2', '1']},
 {'name': 'Age',
  'type': 'finite',
  'representation': ['6', '4', '2', '5', '1', '7', '3', '8']},
 {'name': 'Marital Status',
  'type': 'finite',
  'representation': ['2', '1', '4', '3', '5']},
 {'name': 'Student', 'type': 'finite', 'representation': ['2', '1']},
 {'name': 'Country of Birth',
  'type': 'finite',
  'representation': ['1', '2', '-9']},
 {'name': 'Health',
  'type': 'finite',
  'representation': ['2', '1', '3', '4', '-9

In [4]:
# Pair the raw census frame with its parsed column description.
tab_census = TabularDataset(
    census,
    description=DataDescription(census_description),
)

In [5]:
# Instantiate the reprosyn-backed generator with no arguments and fetch its
# default configuration so it can be inspected (and later adjusted).
gen = ReprosynGenerator()
config = gen.get_default_config()

In [6]:
# Display the default configuration returned by the generator.
config

{'domain': None, 'epsilon': 1.0, 'delta': 1e-09, 'degree': 2}

In [8]:
# Fit the generator on the census dataset built above.
gen.fit(tab_census)


In [9]:
# Draw 10 synthetic records from the generator and display the table held in
# the returned object's `.data` attribute.
tab_data = gen.generate(10)
tab_data.data



Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,2,0,2,0,1,7,1,0,1,1,1,2,6,9,4,0,2
1,2,0,2,0,0,1,0,1,1,2,1,9,3,7,2,0,4
2,1,0,2,0,1,6,1,1,1,3,1,2,5,6,11,0,3
3,3,0,5,0,0,4,4,1,1,1,1,2,1,9,12,3,4
4,5,0,5,0,1,1,0,0,1,5,1,1,4,3,4,1,2
5,3,0,3,0,1,7,3,1,1,2,1,9,5,4,3,0,2
6,0,0,2,0,1,5,1,1,1,2,1,2,1,9,6,3,4
7,0,0,1,0,1,0,0,0,2,1,1,1,0,0,0,0,0
8,4,0,2,0,0,2,0,1,1,1,3,7,1,0,0,1,2
9,5,0,3,0,1,0,0,0,1,1,1,2,0,0,0,0,0


In [None]:
# Generate synthetic data at several values of the `epsilon` config entry,
# keeping each generator next to the sample it produced.
datasets = []
samples = 10
epsilon = [1, 10, 100]
for e in epsilon:
    # Build a fresh config for each run instead of mutating the shared `config`
    # dict in place: that would silently change the default config displayed
    # earlier, and (if the generator keeps a reference to the dict it is given)
    # would leave every stored generator pointing at the last epsilon.
    run_config = {**config, 'epsilon': e}
    gen = ReprosynGenerator(config=run_config)
    gen.fit(TabularDataset(census, description=DataDescription(census_description)))
    tab_data = gen.generate(samples)
    datasets.append([gen, tab_data.data])



In [None]:
# Display the collected [generator, synthetic data] pairs, one per epsilon value.
datasets