# Generate synthetic dataset

In [1]:
import json
import numpy as np
import pandas as pd
import pickle

from pathlib import Path
from scripts.preparation_fcns import *

In [2]:
# Reload imported modules automatically before each cell runs, so edits to
# the project's helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None

## Import raw data

In [3]:
# Raw training CSV; the path is relative to the notebook's working directory.
train_file = Path('data/raw/train.csv')

In [4]:
# Build the working frame from the raw file with the project helper
# (presumably provided by the star import from scripts.preparation_fcns — confirm).
df = process_dataset(train_file)

## Prepare table metadata

In [5]:
# Per-field metadata derived from the frame; it is stored under the 'fields'
# key of the table metadata in the next cell.
fields_dic = prepare_dict(df)

In [6]:
# Metadata entry for the single "titanic" table: the CSV file name, the
# primary-key column and the per-field descriptions in fields_dic.
# Each key appears exactly once; a repeated key in a dict literal is silently
# overridden by its last occurrence.
# NOTE(review): key names presumably follow the SDV metadata JSON schema —
# confirm against the installed SDV version.
table_dic = {
    "path": "titanic.csv",
    "headers": True,
    "name": "titanic",
    "primary_key": "PassengerId",
    "use": True,
    "fields": fields_dic,
}

In [7]:
# Top-level metadata document: the list of tables (only one here) followed by
# an empty base path. Key order is kept so the dumped JSON is unchanged.
meta_dic = dict(
    tables=[table_dic],
    path='',
)

In [8]:
# Serialise the metadata to a JSON string first, then write it out in one go.
with open('data/temp/meta_titanic.json', 'w') as f:
    f.write(json.dumps(meta_dic))

In [9]:
# Write the frame as CSV into the same directory as the metadata JSON; the
# index is emitted as a column named 'PassengerId', matching the primary key
# declared in the table metadata.
df.to_csv('data/temp/titanic.csv', index_label='PassengerId')

## Load data into the model and fit it

In [10]:
from sdv.data_navigator import CSVDataLoader
from sdv.data_navigator import Table
from sdv.modeler import Modeler
from sdv.sampler import Sampler

In [11]:
# Point the loader at the metadata JSON written earlier and load the data it
# describes into a navigator object.
data_loader = CSVDataLoader('data/temp/meta_titanic.json')
data_navigator = data_loader.load_data()

In [12]:
# Run the navigator's transform step before modelling.
# The trailing ';' suppresses display of the call's return value in the cell output.
# NOTE(review): presumably this maps non-numeric columns to numeric values
# for the model — confirm in the SDV documentation.
data_navigator.transform_data();

In [13]:
# Create the modeler on top of the loaded (and transformed) data.
modeler = Modeler(data_navigator)

# Model the dataset/database
modeler.model_database()
# The sampler is built from the same navigator and the fitted modeler; it is
# used later to draw the synthetic 'titanic' table.
sampler = Sampler(data_navigator, modeler)

In [14]:
# Show the text summary of the model fitted for the 'titanic' table
# (per-column distribution type, mean and standard deviation in the output below).
print(modeler.models['titanic'])


Age
Distribution Type: Gaussian
Variable name: None
Mean: 29.36158249158249
Standard deviation: 13.012388272793666

Fare
Distribution Type: Gaussian
Variable name: None
Mean: 32.204207968574636
Standard deviation: 49.6655344447741

Pclass
Distribution Type: Gaussian
Variable name: None
Mean: 0.49797120789098615
Standard deviation: 0.2711374362662088

Sex
Distribution Type: Gaussian
Variable name: None
Mean: 0.499105314976713
Standard deviation: 0.2532752962301981

SibSp
Distribution Type: Gaussian
Variable name: None
Mean: 0.5041073192221043
Standard deviation: 0.2551321895621187

Parch
Distribution Type: Gaussian
Variable name: None
Mean: 0.49217800298342235
Standard deviation: 0.24492259246390335

Embarked
Distribution Type: Gaussian
Variable name: None
Mean: 0.4995180379678339
Standard deviation: 0.24969301054921042

cabin_txt
Distribution Type: Gaussian
Variable name: None
Mean: 0.49698643136152015
Standard deviation: 0.239644650329943

cabin_count
Distribution Type: Gaussian
Vari

In [15]:
# Draw a synthetic 'titanic' table and index it by 'PassengerId'.
# set_index() returns a new frame, so the object handed back by the sampler
# is left untouched rather than mutated in place.
# NOTE(review): no random seed is set anywhere visible, so each run presumably
# yields a different sample — confirm and seed if reproducibility is required.
df_sampled = (
    sampler
    .sample_table('titanic', reset_primary_keys=True)
    .set_index('PassengerId')
)

In [16]:
# Persist the synthetic table as a pickle file. Pickles should only ever be
# loaded back from trusted locations.
df_sampled.to_pickle('data/synthetic/sampled.pkl')

## Profile tables

In [17]:
import pandas_profiling

In [19]:
# Profile report of the synthetic (sampled) table.
df_sampled.profile_report()



In [20]:
# Profile report of the frame built from the raw training file, for comparison
# with the synthetic table's report.
df.profile_report()

