# Reprosyn within Python

This notebook gives brief examples of running each of the Reprosyn generators from Python, using the 1% Census microdata as the input dataset.

In [1]:
import pandas as pd
from reprosyn.methods import MST, IPF, CTGAN, PRIVBAYES, DS_INDHIST, DS_BAYNET, DS_PRIVBAYES, PATEGAN

# Shared settings reused by the generator cells below.
size: int = 10    # passed as `size=`; the saved outputs produced with size=10 contain 10 rows
epsilon: int = 1  # passed as `epsilon=` (e.g. to DS_PRIVBAYES)

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# Location of the small 2011 census microdata CSV used throughout this notebook.
CENSUS_URL = (
    'https://raw.githubusercontent.com/alan-turing-institute/reprosyn/main/'
    'src/reprosyn/datasets/2011-census-microdata/2011-census-microdata-small.csv'
)

# Read the CSV and drop the 'Person ID' column in one non-mutating chain
# (no `inplace=True`), so `census` is only ever bound to the final frame.
census = pd.read_csv(CENSUS_URL).drop(columns=['Person ID'])

# Preview the first rows.
census.head()

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000001,H,2,1,2,6,2,2,1,2,1,2,5,8,2,-9,4
1,E12000001,H,5,1,1,4,1,2,1,1,1,2,1,8,6,4,3
2,E12000001,H,3,1,2,4,1,2,1,1,1,1,1,6,11,3,4
3,E12000001,H,3,1,2,2,1,2,1,2,1,2,1,7,7,3,2
4,E12000001,H,3,1,1,5,4,2,1,1,1,2,1,1,4,3,2


## PATEGAN

In [3]:
# Build the PATEGAN generator on a copy of `census`, requesting `size` rows.
pategan = PATEGAN(
    dataset=census.copy(),
    size=size,
)

In [4]:
# Run the generator; the result is read from `pategan.output` in the next cell.
pategan.run()

In [5]:
# Display the table held in `pategan.output` after `run()`.
pategan.output

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000003,H,3,3,1,3,2,1,1,-9,2,5,2,4,11,-9,4
1,E12000009,H,4,1,2,4,4,2,-9,5,3,6,7,8,10,1,2
2,E12000002,C,1,1,2,2,2,1,2,2,4,-9,1,-9,8,1,-9
3,E12000007,H,2,2,2,4,1,2,-9,3,3,9,3,9,1,4,2
4,E12000006,H,3,2,2,6,5,1,-9,1,-9,4,1,2,10,4,3
5,E12000007,H,2,2,2,3,5,1,1,2,-9,2,9,9,6,-9,1
6,E12000004,C,6,2,1,4,5,2,2,2,1,7,1,9,2,1,3
7,W92000004,C,2,1,2,7,2,2,2,2,4,7,1,-9,4,3,3
8,E12000009,H,3,3,2,2,2,2,2,3,5,6,7,8,4,1,-9
9,W92000004,C,1,2,2,8,4,1,-9,3,5,7,5,-9,3,-9,1


## DS_INDHIST

In [6]:
# Build the DS_INDHIST generator on a copy of `census`, requesting `size` rows.
indhist = DS_INDHIST(
    dataset=census.copy(),
    size=size,
)

In [7]:
# Run the generator; the result is read from `indhist.output` in the next cell.
indhist.run()

In [8]:
# Display the table held in `indhist.output` after `run()`.
indhist.output

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000001,H,6,1,1,8,5,1,-9,-9,-9,9,1,8,10,1,1
1,E12000009,H,6,2,1,7,5,1,-9,1,5,6,1,3,10,4,2
2,E12000005,H,1,2,2,8,3,2,-9,-9,5,-9,8,7,10,3,4
3,E12000004,H,-9,1,2,7,3,1,1,2,1,7,1,8,9,1,2
4,E12000004,H,6,2,2,8,2,2,-9,-9,3,3,2,3,8,-9,3
5,E12000009,H,4,3,1,5,5,1,1,3,-9,2,9,5,1,-9,2
6,E12000009,H,1,1,2,4,5,2,1,4,2,6,6,5,3,2,3
7,E12000003,H,3,1,2,8,5,2,-9,1,1,5,4,7,6,-9,1
8,E12000004,H,3,3,2,8,5,1,-9,-9,2,3,5,4,-9,1,4
9,E12000002,H,2,1,1,6,5,2,-9,3,2,5,6,2,11,2,2


## DS_BAYNET

In [9]:
# Build the DS_BAYNET generator on a copy of `census`, requesting `size` rows.
baynet = DS_BAYNET(
    dataset=census.copy(),
    size=size,
)

In [10]:
# Run the generator; the result is read from `baynet.output` in the next cell.
baynet.run()

In [11]:
# Display the table held in `baynet.output` after `run()`.
baynet.output

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000003,H,2,1,2,5,2,2,1,1,1,1,1,7,4,2,2
1,E12000009,H,3,1,2,1,1,1,1,1,1,1,-9,-9,-9,-9,-9
2,E12000004,H,2,1,1,5,2,2,2,1,3,2,1,8,2,3,4
3,E12000001,H,1,1,2,6,4,2,1,2,1,2,1,9,2,2,4
4,E12000003,H,5,1,2,3,1,2,1,2,1,2,1,7,8,3,2
5,E12000003,H,2,1,1,7,2,2,1,2,1,1,5,9,2,-9,4
6,E12000003,H,2,1,1,1,1,1,1,1,1,2,-9,-9,-9,-9,-9
7,E12000009,H,3,1,1,1,1,1,1,2,2,2,-9,-9,-9,-9,-9
8,E12000001,H,1,1,2,5,4,2,1,2,1,9,1,5,8,3,3
9,E12000001,H,1,1,2,8,5,2,1,1,1,2,5,4,7,-9,1


## DS_PRIVBAYES

In [12]:
# Build the DS_PRIVBAYES generator on a copy of `census`, using the shared
# `size` and `epsilon` settings.
privbayes = DS_PRIVBAYES(
    dataset=census.copy(),
    size=size,
    epsilon=epsilon,
)

In [13]:
# Run the generator; the result is read from `privbayes.output` in the next cell.
privbayes.run()

In [14]:
# Display the table held in `privbayes.output` after `run()`.
privbayes.output

Unnamed: 0,Region,Residence Type,Family Composition,Population Base,Sex,Age,Marital Status,Student,Country of Birth,Health,Ethnic Group,Religion,Economic Activity,Occupation,Industry,Hours worked per week,Approximated Social Grade
0,E12000004,H,2,1,2,4,2,2,1,1,1,2,7,9,8,-9,2
1,E12000001,H,2,1,2,6,2,2,1,4,1,2,8,2,7,-9,1
2,E12000005,H,2,1,2,3,1,2,1,3,1,2,1,3,8,3,2
3,E12000001,H,2,1,1,1,1,1,1,1,1,5,-9,-9,-9,-9,-9
4,E12000001,H,3,1,2,2,1,2,2,2,1,2,3,1,8,-9,1
5,E12000003,H,2,1,1,5,2,2,1,2,1,2,1,9,9,3,4
6,E12000003,H,2,1,2,5,2,2,1,2,1,1,1,-9,-9,4,2
7,E12000002,H,2,1,1,1,1,1,1,1,3,9,-9,-9,-9,-9,-9
8,E12000005,H,2,1,1,1,1,1,1,1,1,1,-9,-9,-9,-9,-9
9,E12000004,H,3,1,2,2,1,2,2,2,5,5,1,9,11,3,4


## PRIVBAYES

In [15]:
# Build the PRIVBAYES generator on a copy of `census`, using the shared `size`
# and `epsilon` settings defined in the first code cell of the notebook.
pbayes = PRIVBAYES(dataset=census.copy(), size=size, epsilon=epsilon)

In [16]:
# NOTE(review): in the saved run this call raised `KeyError: 1` (see the output
# below) and none of the later cells were executed. The cause is not visible
# from this notebook — TODO investigate before publishing.
pbayes.run()

KeyError: 1

In [None]:
# Display the result held in `pbayes.output`; only meaningful once `pbayes.run()` succeeds.
pbayes.output

## MST

In [None]:
# Build the MST generator on a copy of `census`. `size` and `epsilon` are the
# shared settings from the first code cell of the notebook, so they are not
# redefined here.
mst_gen = MST(dataset=census.copy(), size=size, epsilon=epsilon)

In [None]:
# Inspect the settable parameters. Change one with mst_gen.params['param'] = val
mst_gen.get_parameters()

In [None]:
# Run the generator; the result is read from `mst_gen.output` in the next cell.
mst_gen.run()

In [None]:
# Display the result held in `mst_gen.output` after `run()`.
mst_gen.output

## IPF

IPF has only one parameter, `marginals`. IPF also doesn't scale well and cannot cope with all 17 features, so we first drop a few of the larger ones.

In [None]:
# Columns left out of the IPF example.
dropped = ['Industry', 'Economic Activity', 'Occupation', 'Approximated Social Grade']

# `DataFrame.drop` returns a new frame by default, so `census` is left unchanged
# and no explicit `inplace=False` or extra `.copy()` is needed.
census_slim = census.drop(columns=dropped)

In [None]:
# Build the IPF generator on a copy of the slimmed census.
# Each tuple in `marginals` is presumably a group of column positions in
# `census_slim` — TODO confirm against the IPF documentation.
ipf_gen = IPF(
    dataset=census_slim.copy(),
    size=size,
    marginals=[(0, 1), (2, 3), (1, 2, 4)],
)

In [None]:
# Run the generator; the result is read from `ipf_gen.output` in the next cell.
ipf_gen.run()

In [None]:
# Display the result held in `ipf_gen.output` after `run()`.
ipf_gen.output

## CTGAN



In [None]:
# Build the CTGAN generator on a copy of `census`, requesting `size` rows.
# `epochs=10` is presumably the number of training epochs — TODO confirm.
ctgan = CTGAN(
    dataset=census.copy(),
    size=size,
    epochs=10,
)

In [None]:
# Run the generator; the result is read from `ctgan.output` in the next cell.
ctgan.run()

In [None]:
# Display the result held in `ctgan.output` after `run()`.
ctgan.output