# Generate synthetic dataset

In [1]:
import json
import numpy as np
import pandas as pd
import pickle

from pathlib import Path
from scripts.preparation_fcns import *

In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
pd.options.mode.chained_assignment = None

## Import raw data

In [3]:
train_file = Path('data/raw/train.csv')

In [4]:
df = process_dataset(train_file)

## Normalize the data to a Gaussian distribution

In [5]:
from sklearn.preprocessing import PowerTransformer

from sklearn_pandas import DataFrameMapper
from sklearn_pandas import gen_features

In [6]:
feature_def = gen_features(
     columns=[[k] for k in df.select_dtypes('number').columns],
    classes=[PowerTransformer]
 )

In [7]:
mapper = DataFrameMapper(feature_def, default=None, df_out=True)

In [8]:
df_t = mapper.fit_transform(df)[list(df)]

## Prepare table Metadata

In [9]:
fields_dic = prepare_dict(df_t)

In [10]:
table_dic = {"path": "",
        "headers": True,
        "name": "titanic",
        "path": "titanic.csv",
        "primary_key": "PassengerId",
        "use": True,
        'fields': fields_dic
}    

In [11]:
meta_dic = {'tables':[table_dic],
           'path':''}

In [12]:
with open('data/temp/meta_titanic.json', 'w') as f:
    json.dump(meta_dic, f)

In [13]:
df_t.to_csv('data/temp/titanic.csv', index_label='PassengerId')

## Load data in the model and fit it

In [14]:
from sdv.data_navigator import CSVDataLoader, Table, DataNavigator
from sdv.modeler import Modeler
from sdv.sampler import Sampler

In [15]:
data_loader = CSVDataLoader('data/temp/meta_titanic.json')
data_navigator = data_loader.load_data()
data_navigator.ht.missing=True

In [16]:
# Transform data
data_navigator.transform_data();

In [17]:
modeler = Modeler(data_navigator)

# Model the dataset/database
modeler.model_database()
sampler = Sampler(data_navigator, modeler)

In [None]:
print(modeler.models['titanic'])

In [18]:
df_sampled = sampler.sample_table('titanic', reset_primary_keys=True)
df_sampled.set_index('PassengerId', inplace=True)

In [19]:
df_inv = mapper.inverse_transform(df_sampled)
df_inv.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived,cabin_txt,cabin_num,cabin_count,ticket_txt,ticket_num,Age_na,cabin_num_na,ticket_num_na
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,3.0,male,28.793615,0.0,0.0,9.985045,C,False,na,41.207889,na,na,35741.29869,False,True,False
1,3.0,female,34.95908,0.0,0.0,32.703828,Q,True,na,48.581768,na,na,619110.59047,False,True,False
2,3.0,male,33.886608,0.0,4.0,31.213471,C,False,na,47.690618,na,na,979.114621,False,True,False
3,3.0,male,20.778475,1.0,2.0,9.113491,S,False,na,42.996767,na,na,16324.981862,False,True,False
4,3.0,female,38.972155,0.0,0.0,26.662378,S,False,na,35.88289,1,na,415142.587409,False,True,False


In [20]:
df_inv.to_pickle('data/synthetic/sampled.pkl')

## Profiling the sampled dataset

In [21]:
import pandas_profiling

In [22]:
df_inv.profile_report()



In [23]:
import cufflinks as cf
import plotly.offline
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [32]:
df_inv.iplot(kind='heatmap', data=)

TypeError: '<=' not supported between instances of 'float' and 'str'