In [1]:
# Notebooks
import nbimporter
import os
import sys

# Functions from src
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# Defined Functions
from utils import *

# Pandas, matplotlib, pickle, seaborn
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from statistics import mean
from collections import Counter
from ctgan import CTGANSynthesizer

random_state = 13


In [2]:
# Experiment-wide settings shared by the evaluation cells below
fixed_depth = 10            # handed to evaluation_classification as rdf_depth
test_size_percentage = 0.2  # for CV within train split
num_trials = 30             # how many synthetic datasets are sampled and evaluated per dataset


# Load Datasets

## Occutherm

In [3]:
# Occutherm train/test splits (pre-pickled DataFrames)
df_tcs_train = pd.read_pickle("data/occutherm/df_feature1_train.pkl")
df_tcs_test = pd.read_pickle("data/occutherm/df_feature1_test.pkl")

# total count for instances per class: 818
# (i.e. the per-class size every label is topped up to when balancing)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only adds a stray "None" line.
df_tcs_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1508 entries, 0 to 1507
Data columns (total 10 columns):
Temperature (Fahrenheit)       1508 non-null float64
SkinTemperature                1508 non-null float64
ClothingInsulation             1508 non-null float64
Height(cm)                     1508 non-null float64
Shoulder Circumference(cm)     1508 non-null float64
Weight(lbs)                    1508 non-null float64
Gender                         1508 non-null int64
Temperature_outside            1508 non-null float64
Humidity_outside               1508 non-null float64
Discrete Thermal Comfort_TA    1508 non-null int64
dtypes: float64(8), int64(2)
memory usage: 117.9 KB
None


## Cresh

In [4]:
# Cresh train/test splits (pre-pickled DataFrames)
df_cresh_train = pd.read_pickle("data/cresh/cresh_train.pkl")
df_cresh_test = pd.read_pickle("data/cresh/cresh_test.pkl")

# total count for instances per class: 713
# (i.e. the per-class size every label is topped up to when balancing)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only adds a stray "None" line.
df_cresh_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159 entries, 0 to 1158
Data columns (total 11 columns):
heartRate_cozie        1159 non-null float64
humidity_sensing       1159 non-null float64
light_sensing          1159 non-null float64
noise_sensing          1159 non-null float64
temperature_sensing    1159 non-null float64
temperature_mbient     1159 non-null float64
hour_sin               1159 non-null float64
hour_cos               1159 non-null float64
day_of_week_sin        1159 non-null float64
day_of_week_cos        1159 non-null float64
thermal_cozie          1159 non-null int64
dtypes: float64(10), int64(1)
memory usage: 99.7 KB
None


## ASHRAE

In [5]:
# ASHRAE train/test splits (pre-pickled DataFrames)
df_ashrae_train = pd.read_pickle("data/ashrae/ashrae_train.pkl")
df_ashrae_test = pd.read_pickle("data/ashrae/ashrae_test.pkl")

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only adds a stray "None" line.
df_ashrae_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46477 entries, 0 to 46476
Data columns (total 7 columns):
SET                          46477 non-null float64
Clo                          46477 non-null float64
Met                          46477 non-null float64
Air temperature (¡C)         46477 non-null float64
Relative humidity (%)        46477 non-null float64
Air velocity (m/s)           46477 non-null float64
Thermal sensation rounded    46477 non-null float64
dtypes: float64(7)
memory usage: 2.5 MB
None


# Sampling synthetic datasets

https://sdv-dev.github.io/CTGAN/

https://github.com/sdv-dev/CTGAN

Models can be trained directly in this notebook, and a single synthetic dataset can then be generated by running the file `generate_ctgan_<dataset>.py`. However, since this notebook generates `num_trials` synthetic datasets per dataset, that script is adapted below.


## Occutherm
Required samples

| Model | -2 | -1 | 0 | 1 | 2 |
|---|---|---|---|---|---|
| SMOTE | 646 | 515 | 0 | 647 | 774 |
| ADASYN | 651 | 519 | 0 | 680 | 770 |

In [6]:
# Columns CTGAN is told to treat as discrete (categorical) rather than continuous
discrete_columns_occutherm = ['Gender', 'Discrete Thermal Comfort_TA']

# Train a fresh synthesizer on the Occutherm training split. No `epochs` value
# is passed, so the library default applies (presumably 300, per the original
# note -- confirm against the installed CTGAN version).
ctgan_occutherm = CTGANSynthesizer()
ctgan_occutherm.fit(df_tcs_train, discrete_columns_occutherm)

# Persist the fitted model; sample_CTGAN reloads it from this path.
save_pickle(ctgan_occutherm, "models/ctgan_occutherm.pkl")


Epoch 1, Loss G: 1.1149, Loss D: 0.0095
Epoch 2, Loss G: 1.0254, Loss D: 0.0247
Epoch 3, Loss G: 0.9356, Loss D: -0.0215
Epoch 4, Loss G: 0.9791, Loss D: -0.0518
Epoch 5, Loss G: 0.8646, Loss D: 0.0126
Epoch 6, Loss G: 0.7581, Loss D: -0.1192
Epoch 7, Loss G: 0.6541, Loss D: -0.1034
Epoch 8, Loss G: 0.6203, Loss D: -0.1120
Epoch 9, Loss G: 0.4839, Loss D: -0.0668
Epoch 10, Loss G: 0.3786, Loss D: 0.1177
Epoch 11, Loss G: 0.2656, Loss D: -0.0821
Epoch 12, Loss G: 0.2667, Loss D: 0.1129
Epoch 13, Loss G: 0.2395, Loss D: 0.0337
Epoch 14, Loss G: 0.3670, Loss D: -0.0791
Epoch 15, Loss G: 0.3724, Loss D: -0.0884
Epoch 16, Loss G: 0.3650, Loss D: -0.1650
Epoch 17, Loss G: 0.2684, Loss D: -0.2133
Epoch 18, Loss G: 0.2069, Loss D: -0.0918
Epoch 19, Loss G: 0.1664, Loss D: 0.0681
Epoch 20, Loss G: 0.1318, Loss D: 0.0726
Epoch 21, Loss G: 0.1020, Loss D: 0.0265
Epoch 22, Loss G: 0.1198, Loss D: -0.0350
Epoch 23, Loss G: 0.1418, Loss D: -0.0547
Epoch 24, Loss G: 0.2757, Loss D: 0.0611
Epoch 25, L

## Cresh
Required samples

| Model | 9 | 10 | 11 |
|---|---|---|---|
| SMOTE | 547 | 0 | 433 |
| ADASYN | 534 | 0 | 402 |

In [7]:
# Only the label column is declared discrete for Cresh.
# (Earlier variant kept for reference: discrete_columns_cresh = [1, 2, 12])
discrete_columns_cresh = ['thermal_cozie']

# Train a fresh synthesizer on the Cresh training split. No `epochs` value is
# passed, so the library default applies (presumably 300, per the original
# note -- confirm against the installed CTGAN version).
ctgan_cresh = CTGANSynthesizer()
ctgan_cresh.fit(df_cresh_train, discrete_columns_cresh)

# Persist the fitted model; sample_CTGAN reloads it from this path.
save_pickle(ctgan_cresh, "models/ctgan_cresh.pkl")


Epoch 1, Loss G: 1.1775, Loss D: -0.0260
Epoch 2, Loss G: 1.2265, Loss D: -0.0278
Epoch 3, Loss G: 1.1982, Loss D: -0.0596
Epoch 4, Loss G: 1.1549, Loss D: -0.0219
Epoch 5, Loss G: 1.0889, Loss D: -0.0722
Epoch 6, Loss G: 1.0656, Loss D: -0.0711
Epoch 7, Loss G: 1.0035, Loss D: -0.0128
Epoch 8, Loss G: 0.9336, Loss D: -0.1047
Epoch 9, Loss G: 0.8466, Loss D: -0.0165
Epoch 10, Loss G: 0.7886, Loss D: -0.0860
Epoch 11, Loss G: 0.6601, Loss D: -0.0061
Epoch 12, Loss G: 0.6222, Loss D: -0.0055
Epoch 13, Loss G: 0.5008, Loss D: 0.1091
Epoch 14, Loss G: 0.4767, Loss D: 0.1337
Epoch 15, Loss G: 0.2825, Loss D: -0.0081
Epoch 16, Loss G: 0.3230, Loss D: 0.1154
Epoch 17, Loss G: 0.3441, Loss D: 0.0629
Epoch 18, Loss G: 0.3011, Loss D: 0.1181
Epoch 19, Loss G: 0.2116, Loss D: 0.1240
Epoch 20, Loss G: 0.2060, Loss D: 0.0757
Epoch 21, Loss G: 0.1986, Loss D: 0.0912
Epoch 22, Loss G: 0.1506, Loss D: 0.0791
Epoch 23, Loss G: 0.2001, Loss D: 0.0882
Epoch 24, Loss G: 0.1865, Loss D: 0.0746
Epoch 25, Lo

## ASHRAE
Required samples

| Model  | -3 | -2 | -1 | 0 | 1 | 2 | 3 |
|--------|----|----|----|---|---|---|---|
| SMOTE  | 17894 | 16287 | 11695 | 0 | 10161 | 14096 | 16173 |
| ADASYN | 18159 | 16619 | 12065 | 0 | 10987 | 14974 | 16971 |


In [11]:
# Only the label column is declared discrete for ASHRAE.
discrete_columns_ashrae = ['Thermal sensation rounded']

# Train a fresh synthesizer on the ASHRAE training split. Unlike the other two
# datasets, this one is trained for an explicit 600 epochs.
ctgan_ashrae = CTGANSynthesizer()
ctgan_ashrae.fit(df_ashrae_train, discrete_columns_ashrae, epochs=600)

# Persist the fitted model; sample_CTGAN reloads it from this path.
save_pickle(ctgan_ashrae, "models/ctgan_ashrae.pkl")


Epoch 1, Loss G: 0.6192, Loss D: 0.3329
Epoch 2, Loss G: -0.1899, Loss D: 0.1287
Epoch 3, Loss G: -0.9032, Loss D: -0.0536
Epoch 4, Loss G: -1.1570, Loss D: 0.0847
Epoch 5, Loss G: -1.3507, Loss D: 0.0044
Epoch 6, Loss G: -1.5600, Loss D: -0.0220
Epoch 7, Loss G: -1.6640, Loss D: -0.0038
Epoch 8, Loss G: -1.6458, Loss D: -0.0449
Epoch 9, Loss G: -1.6058, Loss D: 0.0456
Epoch 10, Loss G: -1.8149, Loss D: 0.0502
Epoch 11, Loss G: -1.7369, Loss D: -0.0242
Epoch 12, Loss G: -1.8675, Loss D: -0.0535
Epoch 13, Loss G: -1.7964, Loss D: -0.0773
Epoch 14, Loss G: -1.8895, Loss D: -0.0081
Epoch 15, Loss G: -1.8853, Loss D: -0.0656
Epoch 16, Loss G: -1.9387, Loss D: 0.0852
Epoch 17, Loss G: -1.9157, Loss D: -0.0237
Epoch 18, Loss G: -1.9925, Loss D: 0.0045
Epoch 19, Loss G: -1.9595, Loss D: 0.0384
Epoch 20, Loss G: -1.9388, Loss D: 0.0879
Epoch 21, Loss G: -2.0491, Loss D: -0.0159
Epoch 22, Loss G: -2.0937, Loss D: -0.0421
Epoch 23, Loss G: -2.1109, Loss D: -0.0616
Epoch 24, Loss G: -1.9906, Loss

# CTGAN

Adapted from `generate_ctgan_<dataset>.py`

In [12]:
# Per-dataset sampling plan: the label column, how many rows are drawn from the
# generator per batch, and how many synthetic rows each minority label needs
# (these are the SMOTE rows of the "Required samples" tables above; the
# majority label needs 0 and is therefore omitted).
_CTGAN_SAMPLING_PLAN = {
    "occutherm": {
        "label_column": "Discrete Thermal Comfort_TA",
        "batch_size": 1000,
        "required": {2: 774, 1: 647, -1: 515, -2: 646},
    },
    "cresh": {
        "label_column": "thermal_cozie",
        "batch_size": 1000,
        "required": {9: 547, 11: 433},
    },
    "ashrae": {
        "label_column": "Thermal sensation rounded",
        "batch_size": 10000,
        "required": {3: 16173, 2: 14096, 1: 10161, -1: 11695, -2: 16287, -3: 17894},
    },
}


def sample_CTGAN(df, dataset="occutherm", logs=False):
    """Draw a class-balancing synthetic dataset from a previously fitted CTGAN model.

    The model pickled at ``models/ctgan_<dataset>.pkl`` is sampled in batches.
    From every batch, rows are kept (in generation order) for each label that
    still needs samples, until every label has reached its required count.

    Parameters
    ----------
    df : pandas.DataFrame
        Real training data for ``dataset``. Used for the column layout when no
        synthetic row is needed and for the ``logs`` summary; its last column
        is treated as the label column when logging.
    dataset : str
        One of ``"occutherm"``, ``"cresh"`` or ``"ashrae"``.
    logs : bool
        When true, print the label distribution of ``df`` and of the result.

    Returns
    -------
    pandas.DataFrame
        The accepted synthetic rows. Row labels are the ones produced by each
        sampled batch, so they repeat across batches. The last column (and
        ``Gender`` for Occutherm) is coerced to a numeric dtype.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.

    Notes
    -----
    The loop only ends once every required label has been generated often
    enough; a model that never emits one of the required labels would make
    this function run indefinitely.
    """
    if dataset not in _CTGAN_SAMPLING_PLAN:
        raise ValueError(
            "Unknown dataset {!r}; expected one of {}".format(
                dataset, sorted(_CTGAN_SAMPLING_PLAN)
            )
        )

    plan = _CTGAN_SAMPLING_PLAN[dataset]
    label_col = plan["label_column"]
    batch_size = plan["batch_size"]
    # Work on a copy so the module-level plan is not consumed between calls.
    remaining = dict(plan["required"])

    # load model
    # NOTE(review): pickle can execute arbitrary code on load -- only point this
    # at model files produced by this notebook.
    model_path = "models/ctgan_" + dataset + ".pkl"
    with open(model_path, "rb") as model_file:
        ctgan = pickle.load(model_file)

    accepted = []  # one filtered DataFrame per batch, concatenated once at the end
    while any(need > 0 for need in remaining.values()):
        batch = ctgan.sample(batch_size)
        labels = batch[label_col]

        keep = np.zeros(len(batch), dtype=bool)
        for label, need in remaining.items():
            if need <= 0:
                continue
            # Positions of the first `need` rows carrying this label.
            hit_positions = np.flatnonzero((labels == label).to_numpy())[:need]
            keep[hit_positions] = True
            remaining[label] = need - len(hit_positions)

        if keep.any():
            # Boolean-mask selection keeps the rows in generation order.
            accepted.append(batch[keep])

    if accepted:
        df_synth = pd.concat(accepted)
    else:
        df_synth = pd.DataFrame(columns=df.columns.values)

    if logs:
        print('Original {} train dataset shape {}'.format(dataset, Counter(np.array(df.iloc[:, -1]))))
        print('Resampled (CTGAN) synth dataset shape %s' % Counter(np.array(df_synth.iloc[:, -1])))

    if dataset == "occutherm":
        df_synth['Gender'] = pd.to_numeric(df_synth['Gender'], errors='coerce')
    # Make sure the last (label) column is numeric rather than string/object.
    # Assigning by column name replaces the column, so the dtype really changes.
    last_col = df_synth.columns[-1]
    df_synth[last_col] = pd.to_numeric(df_synth[last_col], errors='coerce')

    return df_synth
    

In [13]:
def evaluate_on_dataset(df_train, df_test, dataset='occutherm'):
    """Run `num_trials` CTGAN augmentation trials on one dataset and pickle the results.

    Each trial samples a synthetic dataset with `sample_CTGAN`, scores its
    variability and its diversity against `df_train`, then evaluates
    classifiers trained on synthetic + real rows against `df_test`.

    The trial-averaged variability, diversity and the four test accuracies are
    written under ``metrics/``; the per-trial RDF class reports are written
    under ``label-metrics/``. Nothing is returned.

    Relies on the notebook-level settings `num_trials`, `fixed_depth` and
    `test_size_percentage`.
    """
    model = 'ctgan'

    def _result_path(folder, metric):
        # folder/<dataset>_<metric>_<model>_trials.pkl
        return folder + dataset + "_" + metric + "_" + model + "_trials.pkl"

    # per-trial accumulators
    variability_scores = []
    diversity_scores = []
    test_acc_per_trial = []   # one 4-tuple of test accuracies per trial
    rdf_reports = []

    for trial in range(num_trials):
        # synthetic rows for this trial, plus the real + synthetic training set
        synthetic = sample_CTGAN(df_train, dataset)
        augmented = pd.concat([synthetic, df_train])

        # variability of the generated samples
        variability_scores.append(evaluation_variability(synthetic))

        # class diversity with respect to the training set
        diversity_scores.append(evaluation_diversity(synthetic, df_train, baseline=False))

        # quality on the final classification (best models NB, KNN, SVM, RDF)
        test_acc, _train_acc, _models, rdf_report = evaluation_classification(
            augmented,
            df_test,
            rdf_depth=fixed_depth,
            depth_file_name='default',
            test_size_percentage=test_size_percentage,
        )
        test_acc_per_trial.append((test_acc[0], test_acc[1], test_acc[2], test_acc[3]))
        rdf_reports.append(rdf_report)

        print("End of {} trial".format(trial + 1))

    # average every metric over the trials
    variability = mean(variability_scores)
    diversity = mean(diversity_scores)
    class_acc_test = [
        mean([trial_acc[slot] for trial_acc in test_acc_per_trial])
        for slot in range(4)
    ]

    # persist results
    save_pickle(variability, _result_path("metrics/", "variability"))
    save_pickle(diversity, _result_path("metrics/", "diversity"))
    save_pickle(class_acc_test, _result_path("metrics/", "classification"))
    save_pickle(rdf_reports, _result_path("label-metrics/", "class_report"))

    banner = "################################################################################"
    print(banner)
    print("# Metrics and models for dataset {} saved!".format(dataset))
    print(banner)


## Occutherm

In [14]:
# Run all CTGAN trials on Occutherm; results are pickled under metrics/ and label-metrics/
evaluate_on_dataset(df_tcs_train, df_tcs_test, dataset='occutherm')


End of 1 trial
End of 2 trial
End of 3 trial
End of 4 trial
End of 5 trial
End of 6 trial
End of 7 trial
End of 8 trial
End of 9 trial
End of 10 trial
End of 11 trial
End of 12 trial
End of 13 trial
End of 14 trial
End of 15 trial
End of 16 trial
End of 17 trial
End of 18 trial
End of 19 trial
End of 20 trial
End of 21 trial
End of 22 trial
End of 23 trial
End of 24 trial
End of 25 trial
End of 26 trial
End of 27 trial
End of 28 trial
End of 29 trial
End of 30 trial
################################################################################
# Metrics and models for dataset occutherm saved!
################################################################################


## Cresh

In [15]:
# Run all CTGAN trials on Cresh; results are pickled under metrics/ and label-metrics/
evaluate_on_dataset(df_cresh_train, df_cresh_test, dataset='cresh')


End of 1 trial
End of 2 trial
End of 3 trial
End of 4 trial
End of 5 trial
End of 6 trial
End of 7 trial
End of 8 trial
End of 9 trial
End of 10 trial
End of 11 trial
End of 12 trial
End of 13 trial
End of 14 trial
End of 15 trial
End of 16 trial
End of 17 trial
End of 18 trial
End of 19 trial
End of 20 trial
End of 21 trial
End of 22 trial
End of 23 trial
End of 24 trial
End of 25 trial
End of 26 trial
End of 27 trial
End of 28 trial
End of 29 trial
End of 30 trial
################################################################################
# Metrics and models for dataset cresh saved!
################################################################################


## ASHRAE

In [16]:
# Run all CTGAN trials on ASHRAE; results are pickled under metrics/ and label-metrics/
evaluate_on_dataset(df_ashrae_train, df_ashrae_test, dataset='ashrae')


End of 1 trial
End of 2 trial
End of 3 trial
End of 4 trial
End of 5 trial
End of 6 trial
End of 7 trial
End of 8 trial
End of 9 trial
End of 10 trial
End of 11 trial
End of 12 trial
End of 13 trial
End of 14 trial
End of 15 trial
End of 16 trial
End of 17 trial
End of 18 trial
End of 19 trial
End of 20 trial
End of 21 trial
End of 22 trial
End of 23 trial
End of 24 trial
End of 25 trial
End of 26 trial
End of 27 trial
End of 28 trial
End of 29 trial
End of 30 trial
################################################################################
# Metrics and models for dataset ashrae saved!
################################################################################
