# Reprosyn within Python

Here we briefly demonstrate how to use the generators available in Reprosyn, using the 1% Census dataset as the input data.

In [1]:
import pandas as pd
from reprosyn.methods import MST, IPF, CTGAN, PRIVBAYES, DS_INDHIST, DS_BAYNET, DS_PRIVBAYES

# Shared settings referenced by the generator cells further down.
size = 10  # passed as `size` to the generators; the displayed outputs contain 10 rows
epsilon = 1  # passed as `epsilon` to DS_PRIVBAYES; presumably the privacy budget — TODO confirm

In [2]:
# Census extract hosted in the reprosyn repository (2011 census microdata, "small" file).
CENSUS_URL = (
    'https://raw.githubusercontent.com/alan-turing-institute/reprosyn/main/'
    'src/reprosyn/datasets/2011-census-microdata/2011-census-microdata-small.csv'
)

# Load the file and remove the `Person ID` column in one expression, so `census`
# is never mutated in place after it has been created.
census = pd.read_csv(CENSUS_URL).drop(columns=['Person ID'])

census.head()

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000001,H,2,1,2,6,2,2,1,2,1,2,5,8,2,-9,4
1,E12000001,H,5,1,1,4,1,2,1,1,1,2,1,8,6,4,3
2,E12000001,H,3,1,2,4,1,2,1,1,1,1,1,6,11,3,4
3,E12000001,H,3,1,2,2,1,2,1,2,1,2,1,7,7,3,2
4,E12000001,H,3,1,1,5,4,2,1,1,1,2,1,1,4,3,2


## DS_INDHIST

In [3]:
# Build the DS_INDHIST generator on its own copy of the census frame,
# requesting `size` synthetic rows.
indhist = DS_INDHIST(
    dataset=census.copy(),
    size=size,
)

In [4]:
# Run the generator; the result is read from `indhist.output` in the next cell.
indhist.run()

In [5]:
# Display the synthetic records held by the generator after the run.
indhist.output

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000003,H,5,1,2,1,5,1,2,2,5,-9,2,5,7,3,1
1,E12000002,H,1,1,1,3,4,1,-9,2,5,1,4,8,4,-9,4
2,E12000001,H,3,2,1,3,1,2,-9,3,-9,7,5,-9,8,1,2
3,E12000001,H,5,1,1,7,1,1,1,3,1,5,2,8,10,-9,-9
4,E12000003,H,5,1,1,5,2,2,2,2,4,8,5,9,1,-9,1
5,E12000009,H,4,2,1,4,4,2,-9,3,2,2,-9,4,-9,1,3
6,E12000004,H,3,2,2,7,5,2,-9,-9,2,-9,7,7,12,2,2
7,E12000003,H,2,1,2,4,5,2,2,1,4,-9,7,5,11,3,1
8,E12000004,H,2,3,2,2,2,1,1,-9,3,9,7,9,10,2,-9
9,E12000005,H,4,1,1,3,4,1,2,-9,2,3,-9,4,3,-9,3


## DS_BAYNET

In [6]:
# Build the DS_BAYNET generator on its own copy of the census frame,
# requesting `size` synthetic rows.
baynet = DS_BAYNET(
    dataset=census.copy(),
    size=size,
)

In [7]:
# Run the generator; the result is read from `baynet.output` in the next cell.
baynet.run()

In [8]:
# Display the synthetic records held by the generator after the run.
baynet.output

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000009,H,2,1,1,5,2,2,1,1,1,1,7,5,4,-9,3
1,E12000004,H,2,1,2,4,2,2,1,1,1,1,1,4,7,2,2
2,E12000004,H,2,1,2,4,2,2,1,1,1,2,1,6,11,3,3
3,E12000001,H,2,1,1,7,2,2,1,3,1,2,5,9,5,-9,4
4,E12000009,H,2,1,2,5,2,2,1,1,1,1,1,7,4,3,3
5,E12000002,H,-9,2,2,1,1,1,-9,1,-9,-9,-9,-9,-9,-9,-9
6,E12000005,H,2,1,2,6,2,2,1,1,1,2,1,6,10,3,3
7,E12000003,H,2,1,2,7,2,2,1,3,1,1,5,-9,-9,-9,2
8,E12000002,H,5,1,1,1,1,1,1,4,1,1,-9,-9,-9,-9,-9
9,E12000004,H,2,1,2,6,2,2,1,3,1,2,7,9,11,-9,4


## DS_PRIVBAYES

In [9]:
# Build the DS_PRIVBAYES generator on its own copy of the census frame.
# Unlike the two DS_* generators above, this one also takes `epsilon`.
privbayes = DS_PRIVBAYES(
    dataset=census.copy(),
    size=size,
    epsilon=epsilon,
)

In [10]:
# Run the generator; the result is read from `privbayes.output` in the next cell.
privbayes.run()

In [11]:
# Display the synthetic records held by the generator after the run.
privbayes.output

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000009,H,2,1,1,6,2,2,1,3,1,2,1,8,2,3,3
1,E12000009,H,2,1,2,6,2,2,1,2,1,2,1,7,11,3,2
2,E12000003,H,2,1,1,4,2,2,1,2,1,2,1,5,8,3,3
3,E12000009,H,2,1,2,5,2,2,1,1,1,1,2,6,10,3,3
4,E12000003,H,2,1,1,4,2,2,1,2,1,2,1,1,4,2,2
5,E12000003,H,2,1,1,5,2,2,1,1,1,2,8,2,11,2,2
6,E12000003,H,2,1,1,4,1,2,1,1,1,2,2,2,10,4,1
7,E12000001,H,2,1,2,4,2,2,1,3,1,1,1,4,7,4,2
8,E12000003,H,2,1,2,6,2,2,1,3,1,2,8,9,11,-9,4
9,E12000003,H,5,1,2,3,1,2,1,1,1,2,1,4,2,3,2


## PRIVBAYES

In [12]:
# Build the PRIVBAYES generator on its own copy of the census frame.
# Use the shared `size` / `epsilon` settings from the first cell rather than
# literals, so changing the settings affects every generator consistently.
pbayes = PRIVBAYES(dataset=census.copy(), size=size, epsilon=epsilon)

In [None]:
# Run the generator; the result is read from `pbayes.output` in the next cell.
pbayes.run()

In [None]:
# Display the synthetic records held by the generator after the run.
pbayes.output

## MST

In [None]:
# Build the MST generator on its own copy of the census frame.
# `size` and `epsilon` come from the settings cell at the top of the notebook;
# re-assigning `size` here would silently override that setting for every
# later cell (IPF, CTGAN), so it is intentionally not redefined.
mst_gen = MST(dataset=census.copy(), size=size, epsilon=epsilon)

In [None]:
# Inspect the parameters that can be set on this generator.
# To change one, assign it on the generator instance, e.g.
#     mst_gen.params['param'] = val
mst_gen.get_parameters()

In [None]:
# Run the generator; the result is read from `mst_gen.output` in the next cell.
mst_gen.run()

In [None]:
# Display the synthetic records held by the generator after the run.
mst_gen.output

## IPF

IPF only has one parameter, `marginals`. Additionally, IPF doesn't scale well and doesn't cope with all 17 features, so we drop a few of the larger features.

In [None]:
# Columns excluded from the IPF example.
dropped = [
    'Industry',
    'Economic Activity',
    'Occupation',
    'Approximated Social Grade',
]

# `drop` returns a new frame by default, so `census` itself is left untouched.
census_slim = census.drop(columns=dropped).copy()

In [None]:
# Build the IPF generator on a copy of the reduced census frame.
# NOTE(review): each tuple in `marginals` presumably lists column positions
# within `census_slim` — confirm against the IPF documentation.
ipf_marginals = [(0, 1), (2, 3), (1, 2, 4)]
ipf_gen = IPF(
    dataset=census_slim.copy(),
    size=size,
    marginals=ipf_marginals,
)

In [None]:
# Run the generator; the result is read from `ipf_gen.output` in the next cell.
ipf_gen.run()

In [None]:
# Display the synthetic records held by the generator after the run.
ipf_gen.output

## CTGAN



In [None]:
# Build the CTGAN generator on its own copy of the census frame.
# `epochs` is set to 10 here; presumably a small value to keep the example
# quick — TODO confirm.
ctgan = CTGAN(
    dataset=census.copy(),
    size=size,
    epochs=10,
)

In [None]:
# Run the generator; the result is read from `ctgan.output` in the next cell.
ctgan.run()

In [None]:
# Display the synthetic records held by the generator after the run.
ctgan.output