In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from synthpop import Synthpop
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network
from sklearn.model_selection import train_test_split

# Load the complete housing dataset; every later split and missing-data variant is derived from this frame.
housing = pd.read_csv('housing.csv')

In [8]:
# SPLITTING DATA
# Here we merely create the datasets; nothing is synthesized or imputed yet.
# Each simulation writes one complete train/test split plus a MAR and an MNAR
# variant of that same split, with the missing entries stored as NaN.


def _inject_mar(frame):
    """Return a copy of ``frame`` with MAR-pattern missing values.

    Missingness is driven by columns that themselves stay observed:
    - guestroom == "yes" and area > 5000: 'bedrooms' and 'parking' are blanked.
    - hotwaterheating == "no" and airconditioning == "no": 'stories' is blanked.

    The input frame is not modified.
    """
    masked = frame.copy()
    hide_rooms = (masked['guestroom'] == "yes") & (masked['area'] > 5000)
    masked.loc[hide_rooms, ['bedrooms', 'parking']] = np.nan
    hide_stories = (masked['hotwaterheating'] == "no") & (masked['airconditioning'] == "no")
    masked.loc[hide_stories, 'stories'] = np.nan
    return masked


def _inject_mnar(frame):
    """Return a copy of ``frame`` with MNAR-pattern missing values.

    Missingness is driven by the very values that get blanked:
    - parking >= 2 and bedrooms > 2: 'bedrooms' and 'parking' are blanked.
    - stories > 2 and basement == "yes": 'stories' is blanked.

    The input frame is not modified.
    """
    masked = frame.copy()
    hide_rooms = (masked['parking'] >= 2) & (masked['bedrooms'] > 2)
    masked.loc[hide_rooms, ['bedrooms', 'parking']] = np.nan
    hide_stories = (masked['stories'] > 2) & (masked['basement'] == "yes")
    masked.loc[hide_stories, 'stories'] = np.nan
    return masked


for sim in range(10):
    # Original dataset. Seeding with the simulation index makes every split
    # reproducible after a kernel restart while still giving each simulation
    # its own partition.
    no_missing_train, no_missing_test = train_test_split(housing, test_size=0.2, random_state=sim)

    # MAR: the same rule is applied to the train and the test partition.
    mar_train = _inject_mar(no_missing_train)
    mar_test = _inject_mar(no_missing_test)

    # MNAR: the same rule is applied to the train and the test partition.
    mnar_train = _inject_mnar(no_missing_train)
    mnar_test = _inject_mnar(no_missing_test)

    # File names follow Datasets/housing_<variant>_<train|test>_<sim>.csv.
    partitions = {
        'original': (no_missing_train, no_missing_test),
        'mar': (mar_train, mar_test),
        'mnar': (mnar_train, mnar_test),
    }
    for label, (train_part, test_part) in partitions.items():
        train_part.to_csv(f'Datasets/housing_{label}_train_{sim}.csv', index=False)
        test_part.to_csv(f'Datasets/housing_{label}_test_{sim}.csv', index=False)

In [4]:
# Settings for DataSynthesizer's correlated-attribute mode.
threshold_value = 10  # handed to DataDescriber as category_threshold
# Explicit per-column categorical flags; columns not listed here are left for
# the describer to classify.
categorical_values = {
    'price': False,
    'mainroad': True,
    'guestroom': True,
    'basement': True,
    'hotwaterheating': True,
    'airconditioning': True,
    'prefarea': True,
    'furnishingstatus': True,
}
epsilon = 0  # NOTE(review): presumably switches off differential-privacy noise - confirm in the DataSynthesizer docs
versions = ['original', 'mar', 'mnar']
datasets = range(1)  # only simulation 0 is synthesized here

# For every (variant, simulation) pair: describe the training CSV, store the
# description as JSON, then sample a synthetic dataset with as many rows as
# the training set and write it next to the inputs.
for version in versions:
    for sim in datasets:
        train_csv = f'Datasets/housing_{version}_train_{sim}.csv'
        description_json_name = f'Datasets/housing_{version}_description_{sim}.json'
        # NOTE(review): no separator between "synthesizer" and the variant name;
        # kept as-is because other cells may read this exact path.
        new_file_name = f'Datasets/housing_synthesizer{version}_train_{sim}.csv'

        # The frame is loaded only to learn how many rows to generate.
        dataset = pd.read_csv(train_csv)
        rows = dataset.shape[0]

        describer = DataDescriber(category_threshold=threshold_value)
        describer.describe_dataset_in_correlated_attribute_mode(
            dataset_file=train_csv,
            epsilon=epsilon,
            attribute_to_is_categorical=categorical_values,
        )
        describer.save_dataset_description_to_file(description_json_name)

        generator = DataGenerator()
        generator.generate_dataset_in_correlated_attribute_mode(rows, description_json_name)
        generator.save_synthetic_data(new_file_name)


Adding ROOT guestroom
Adding attribute price
Adding attribute area
Adding attribute stories
Adding attribute bedrooms
Adding attribute furnishingstatus
Adding attribute parking
Adding attribute bathrooms
Adding attribute basement
Adding attribute airconditioning
Adding attribute prefarea
Adding attribute mainroad
Adding attribute hotwaterheating


KeyboardInterrupt: 

In [75]:
# Sample a synthetic dataset from a previously saved description file.
# NOTE(review): `rows` is only assigned inside the synthesis loop cell, and no
# cell visible here writes "housing_description.json" - this cell depends on
# earlier kernel state and on a file produced elsewhere; confirm before re-running.
generator = DataGenerator()
generator.generate_dataset_in_correlated_attribute_mode(
    n=rows,
    description_file="housing_description.json",
)

In [56]:
# Write the data held by the current `generator` to CSV. `generator` must already
# exist from an earlier cell; whichever cell assigned it last determines what is saved.
generator.save_synthetic_data('synthetic_housing.csv')

In [3]:
# Interactive signature check: builds a throwaway DataGenerator just to reach the bound method.
# The recorded output shows the parameters (n, description_file, seed=0).
help(DataGenerator().generate_dataset_in_correlated_attribute_mode)

Help on method generate_dataset_in_correlated_attribute_mode in module DataSynthesizer.DataGenerator:

generate_dataset_in_correlated_attribute_mode(n, description_file, seed=0) method of DataSynthesizer.DataGenerator.DataGenerator instance

