# Using Reprosyn as the generator

This notebook provides an example of using `prive`'s generator class to generate synthetic census data using `reprosyn`.

We assume that you have installed `reprosyn` into whichever Python environment you are working in, using `pip install git+https://github.com/alan-turing-institute/reprosyn`.

First we load the census dataset.

In [1]:
import sys
import json
sys.path.append('../..')
import pandas as pd
from prive.generators import ReprosynGenerator
from prive.datasets import TabularDataset
from prive.datasets.data_description import DataDescription

In [2]:
# Small 2011 census microdata sample hosted in the reprosyn repository.
CENSUS_URL = 'https://raw.githubusercontent.com/alan-turing-institute/reprosyn/main/src/reprosyn/datasets/2011-census-microdata/2011-census-microdata-small.csv'

census = pd.read_csv(CENSUS_URL)
census.head()

Unnamed: 0,Person ID,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,7394816,E12000001,H,2,1,2,6,2,2,1,2,1,2,5,8,2,-9,4
1,7394745,E12000001,H,5,1,1,4,1,2,1,1,1,2,1,8,6,4,3
2,7395066,E12000001,H,3,1,2,4,1,2,1,1,1,1,1,6,11,3,4
3,7395329,E12000001,H,3,1,2,2,1,2,1,2,1,2,1,7,7,3,2
4,7394712,E12000001,H,3,1,1,5,4,2,1,1,1,2,1,1,4,3,2


In [3]:
# Read the JSON list of column descriptions (name / type / representation)
# that accompanies the census data.
census_description_path = '../datasets/examples/census.json'
with open(census_description_path, 'r') as description_file:
    census_description = json.load(description_file)

census_description

[{'name': 'Person ID', 'type': 'countable', 'representation': 'integer'},
 {'name': 'Region',
  'type': 'finite',
  'representation': ['E12000001',
   'E12000002',
   'E12000003',
   'E12000004',
   'E12000005',
   'E12000009',
   'E12000006',
   'E12000008',
   'E12000007',
   'W92000004']},
 {'name': 'Residence Type', 'type': 'finite', 'representation': ['H', 'C']},
 {'name': 'Family Composition',
  'type': 'finite',
  'representation': ['2', '5', '3', '1', '6', '-9', '4']},
 {'name': 'Population Base',
  'type': 'finite',
  'representation': ['1', '2', '3']},
 {'name': 'Sex', 'type': 'finite', 'representation': ['2', '1']},
 {'name': 'Age',
  'type': 'finite',
  'representation': ['6', '4', '2', '5', '1', '7', '3', '8']},
 {'name': 'Marital Status',
  'type': 'finite',
  'representation': ['2', '1', '4', '3', '5']},
 {'name': 'Student', 'type': 'finite', 'representation': ['2', '1']},
 {'name': 'Country of Birth',
  'type': 'finite',
  'representation': ['1', '2', '-9']},
 {'name': 

In [4]:
# Pair the raw census frame with its column description in a TabularDataset.
census_schema = DataDescription(census_description)
tab_census = TabularDataset(census, description=census_schema)

In [5]:
# Instantiate the generator with no arguments and fetch its default
# configuration so it can be inspected and reused later in the notebook.
gen = ReprosynGenerator()
config = gen.get_default_config()

In [6]:
# Display the default configuration returned by the generator.
config

{'domain': None, 'epsilon': 1.0, 'delta': 1e-09, 'degree': 2}

In [7]:
# Fit the generator created above on the census dataset.
# NOTE(review): `gen` was built without an explicit config, so this presumably
# uses the default settings — confirm against ReprosynGenerator.
gen.fit(tab_census)


In [8]:
# Generate 10 synthetic records with the fitted generator and display them.
tab_data = gen.generate(10)
tab_data.data



Unnamed: 0,Region Residence Type ... Hours worked per week Approximated Social Grade
0,0 3 0 ... ...
1,1 4 0 ... ...
2,2 2 0 ... ...
3,3 4 0 ... ...
4,4 0 0 ... ...
5,5 0 0 ... ...
6,6 2 0 ... ...
7,7 5 0 ... ...
8,8 3 0 ... ...
9,9 5 0 ... ...


In [9]:
# Fit one generator per epsilon value and keep a small synthetic sample from
# each run, stored as [generator, synthetic data] pairs for comparison.
datasets = []
samples = 10
epsilon = [1, 10, 100]
for e in epsilon:
    # Build a fresh config for every run instead of mutating the shared
    # `config` dict. Mutating it in place leaves `config` different from the
    # value displayed earlier in the notebook, and if the generator keeps a
    # reference to the dict it is given (not visible here), every generator
    # stored in `datasets` would end up sharing the last epsilon.
    run_config = {**config, 'epsilon': e}
    gen = ReprosynGenerator(config=run_config)
    gen.fit(TabularDataset(census, description=DataDescription(census_description)))
    tab_data = gen.generate(samples)
    datasets.append([gen, tab_data.data])



In [10]:
# Show the [generator, synthetic data] pair collected for each epsilon value.
datasets

[[<prive.generators.generator.ReprosynGenerator at 0x7f87e8fbd520>,
        Region  Residence Type  ...  Hours worked per week  Approximated Social Grade
  0   0       0               0  ...                ...                              
  1   1       1               0  ...                ...                              
  2   2       2               0  ...                ...                              
  3   3       4               0  ...                ...                              
  4   4       5               0  ...                ...                              
  5   5       4               0  ...                ...                              
  6   6       2               0  ...                ...                              
  7   7       5               0  ...                ...                              
  8   8       4               0  ...                ...                              
  9   9       1               0  ...                ...                 