In [1]:
# Notebooks
import nbimporter
import os
import sys

# Functions from src
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# Defined Functions
from utils import *

# Pandas, matplotlib, pickle, seaborn
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from statistics import mean
from collections import Counter
from ctgan import CTGANSynthesizer

random_state = 13


In [2]:
# global variables/constants
# These module-level names are read directly (not passed as arguments) by
# `evaluate_on_dataset` further down, so this cell must run before it is called.
num_trials = 30  # number of sample-and-evaluate repetitions whose metrics are averaged
test_size_percentage = 0.2 # for CV within train split
fixed_depth = 10  # forwarded as `rdf_depth` to `evaluation_classification`


# Load Datasets

## Occutherm

In [3]:
# Occutherm train/test splits, stored as pickled DataFrames.
df_tcs_train = pd.read_pickle("data/occutherm/df_feature1_train_reduced.pkl")
df_tcs_test = pd.read_pickle("data/occutherm/df_feature1_test_reduced.pkl")

# total count for instances per class: 818
# NOTE(review): presumably the per-class size targeted when balancing — confirm.
# `DataFrame.info()` prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
df_tcs_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1508 entries, 0 to 1507
Data columns (total 10 columns):
Temperature (Fahrenheit)       1508 non-null float64
SkinTemperature                1508 non-null float64
ClothingInsulation             1508 non-null float64
Height(cm)                     1508 non-null float64
Shoulder Circumference(cm)     1508 non-null float64
Weight(lbs)                    1508 non-null float64
Gender                         1508 non-null int64
Temperature_outside            1508 non-null float64
Humidity_outside               1508 non-null float64
Discrete Thermal Comfort_TA    1508 non-null int64
dtypes: float64(8), int64(2)
memory usage: 117.9 KB
None


## Cresh

## ASHRAE

In [5]:
# ASHRAE train/test splits, stored as pickled DataFrames.
df_ashrae_train = pd.read_pickle("data/ashrae/ashrae_train_reduced.pkl")
df_ashrae_test = pd.read_pickle("data/ashrae/ashrae_test_reduced.pkl")

# `DataFrame.info()` prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
df_ashrae_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46477 entries, 0 to 46476
Data columns (total 7 columns):
SET                          46477 non-null float64
Clo                          46477 non-null float64
Met                          46477 non-null float64
Air temperature (¡C)         46477 non-null float64
Relative humidity (%)        46477 non-null float64
Air velocity (m/s)           46477 non-null float64
Thermal sensation rounded    46477 non-null float64
dtypes: float64(7)
memory usage: 2.5 MB
None


# Sampling synthetic datasets

https://sdv-dev.github.io/CTGAN/

https://github.com/sdv-dev/CTGAN


## Occutherm
Required samples

| Model | -1 | 0 | 1 |
|---|---|---|---|
| SMOTE | 342 | 0 | 603 |
| ADASYN | 247 | 0 | 621 |


In [6]:
# Columns handed to CTGAN as discrete (categorical) features; every other
# column of the training frame is left for the model to treat as continuous.
discrete_columns_occutherm = [
    'Gender',
    'Discrete Thermal Comfort_TA'
]

# Train a synthesizer on the Occutherm training split. No `epochs` argument is
# passed here, so the library default is used.
ctgan_occutherm = CTGANSynthesizer()
ctgan_occutherm.fit(df_tcs_train, discrete_columns_occutherm)
# Persist the fitted model; `sample_CTGAN` later reloads it from this exact path.
# `save_pickle` is not defined in this notebook — presumably it comes from
# `from utils import *`.
save_pickle(ctgan_occutherm, "models/ctgan_occutherm_reduced.pkl")


Epoch 1, Loss G: 1.0212, Loss D: -0.0214
Epoch 2, Loss G: 0.8884, Loss D: 0.0092
Epoch 3, Loss G: 0.8281, Loss D: -0.0023
Epoch 4, Loss G: 0.7858, Loss D: 0.0585
Epoch 5, Loss G: 0.7447, Loss D: 0.0485
Epoch 6, Loss G: 0.7176, Loss D: -0.0333
Epoch 7, Loss G: 0.6489, Loss D: -0.0041
Epoch 8, Loss G: 0.5759, Loss D: -0.0855
Epoch 9, Loss G: 0.4882, Loss D: -0.1023
Epoch 10, Loss G: 0.4135, Loss D: -0.0607
Epoch 11, Loss G: 0.2339, Loss D: -0.1579
Epoch 12, Loss G: 0.2064, Loss D: 0.0891
Epoch 13, Loss G: 0.2617, Loss D: -0.0360
Epoch 14, Loss G: 0.1553, Loss D: -0.0447
Epoch 15, Loss G: 0.2053, Loss D: -0.0410
Epoch 16, Loss G: 0.2413, Loss D: 0.0212
Epoch 17, Loss G: 0.3197, Loss D: -0.0441
Epoch 18, Loss G: 0.4104, Loss D: -0.1524
Epoch 19, Loss G: 0.4262, Loss D: -0.3367
Epoch 20, Loss G: 0.3004, Loss D: -0.2962
Epoch 21, Loss G: 0.2703, Loss D: -0.1711
Epoch 22, Loss G: 0.2682, Loss D: -0.1673
Epoch 23, Loss G: 0.2166, Loss D: 0.1099
Epoch 24, Loss G: 0.1745, Loss D: -0.0534
Epoch 2

## Cresh
Required samples

| Model | 9 | 10 | 11 |
|---|---|---|---|
| SMOTE | 547 | 0 | 433 |
| ADASYN | 534 | 0 | 402 |

## ASHRAE
Required samples

| Model  | -1 | 0 | 1 |
|--------|----|----|----|
| SMOTE  | 7686 | 0 | 3167 |
| ADASYN | 8141 | 0 | 3167 |

In [8]:
# Columns handed to CTGAN as discrete (categorical) features; here only the
# class label.
discrete_columns_ashrae = [
    'Thermal sensation rounded'
]

# Train a synthesizer on the ASHRAE training split for 600 epochs.
ctgan_ashrae = CTGANSynthesizer()
ctgan_ashrae.fit(df_ashrae_train, discrete_columns_ashrae, epochs=600)
# Persist the fitted model; `sample_CTGAN` later reloads it from this exact path.
# `save_pickle` is not defined in this notebook — presumably it comes from
# `from utils import *`.
save_pickle(ctgan_ashrae, "models/ctgan_ashrae_reduced.pkl")


Epoch 1, Loss G: 0.1217, Loss D: 0.0285
Epoch 2, Loss G: -0.7411, Loss D: 0.0209
Epoch 3, Loss G: -1.0486, Loss D: 0.0388
Epoch 4, Loss G: -1.1401, Loss D: -0.1162
Epoch 5, Loss G: -1.5141, Loss D: -0.0441
Epoch 6, Loss G: -1.5996, Loss D: 0.0939
Epoch 7, Loss G: -1.5514, Loss D: 0.1002
Epoch 8, Loss G: -1.7177, Loss D: 0.0008
Epoch 9, Loss G: -1.7831, Loss D: 0.0494
Epoch 10, Loss G: -1.6974, Loss D: 0.0592
Epoch 11, Loss G: -1.9259, Loss D: 0.0212
Epoch 12, Loss G: -1.8926, Loss D: 0.1224
Epoch 13, Loss G: -1.8374, Loss D: 0.0117
Epoch 14, Loss G: -1.9471, Loss D: 0.0417
Epoch 15, Loss G: -1.9125, Loss D: -0.0217
Epoch 16, Loss G: -2.0393, Loss D: 0.0414
Epoch 17, Loss G: -1.9643, Loss D: -0.0942
Epoch 18, Loss G: -1.9512, Loss D: -0.0174
Epoch 19, Loss G: -2.1509, Loss D: 0.0011
Epoch 20, Loss G: -1.9764, Loss D: 0.0960
Epoch 21, Loss G: -2.0597, Loss D: -0.0974
Epoch 22, Loss G: -2.0817, Loss D: -0.0156
Epoch 23, Loss G: -2.0674, Loss D: -0.0135
Epoch 24, Loss G: -2.0234, Loss D: 0

# CTGAN

Adapted from `generate_ctgan_<dataset>.py`

In [9]:
# Per-dataset sampling specification used by `sample_CTGAN`:
#   label_col  - column of the generated frame that holds the class label
#   required   - number of synthetic rows to keep for each label value
#   batch_size - rows requested from the model on every `sample()` call
# NOTE(review): the markdown tables in this notebook list the SMOTE counts as
# -1 -> 342 / 1 -> 603 (Occutherm) and -1 -> 7686 / 1 -> 3167 (ASHRAE), which
# is the opposite label assignment of what is coded here. The numbers below are
# kept exactly as the code had them; confirm which assignment is intended.
_CTGAN_SAMPLING_SPECS = {
    "occutherm": {
        "label_col": 'Discrete Thermal Comfort_TA',
        "required": {1: 342, -1: 603},
        "batch_size": 3000,
    },
    "cresh": {
        "label_col": 'thermal_cozie',
        "required": {9: 547, 11: 433},
        "batch_size": 3000,
    },
    "ashrae": {
        "label_col": 'Thermal sensation rounded',
        "required": {1: 7686, -1: 3167},
        "batch_size": 10000,
    },
}


def _take_first_needed(batch, label_col, remaining):
    """Return the rows of `batch` that are still needed, in their original order.

    For every label in `remaining`, the first `remaining[label]` rows of
    `batch` carrying that label are kept. `remaining` is updated in place with
    the number of rows still missing after this batch.
    """
    keep = np.zeros(len(batch), dtype=bool)
    for label, need in list(remaining.items()):
        if need <= 0:
            continue
        is_label = np.asarray(batch[label_col] == label, dtype=bool)
        # The running count marks the 1st, 2nd, ... occurrence of the label;
        # only occurrences up to `need` are kept.
        first_needed = is_label & (np.cumsum(is_label) <= need)
        keep |= first_needed
        remaining[label] = need - int(first_needed.sum())
    return batch[keep]


def sample_CTGAN(df, dataset="occutherm", logs=False, max_batches=1000):
    """Draw synthetic minority-class rows from the fitted CTGAN model of `dataset`.

    The model is loaded from ``models/ctgan_<dataset>_reduced.pkl`` and sampled
    in batches. From each batch, rows of the labels listed in
    `_CTGAN_SAMPLING_SPECS[dataset]["required"]` are kept, in generation order,
    until the required number of rows per label has been collected. Rows with
    any other label are discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Real training frame. Only its column order is used (and its last
        column, for the optional log output).
    dataset : str
        One of "occutherm", "cresh" or "ashrae".
    logs : bool
        When True, print the label counts of `df` and of the synthetic frame.
    max_batches : int or None
        Upper bound on the number of `sample()` calls. Guards against looping
        forever when the model never produces a required label. None disables
        the guard.

    Returns
    -------
    pandas.DataFrame
        The selected synthetic rows, with the columns of `df`. The row index is
        the one produced by the model, so it may repeat across batches.

    Raises
    ------
    ValueError
        If `dataset` is not a known dataset name.
    RuntimeError
        If the required rows were not collected within `max_batches` batches.
    """
    if dataset not in _CTGAN_SAMPLING_SPECS:
        raise ValueError(
            "Unknown dataset {!r}; expected one of {}".format(
                dataset, sorted(_CTGAN_SAMPLING_SPECS)))

    spec = _CTGAN_SAMPLING_SPECS[dataset]
    label_col = spec["label_col"]
    batch_size = spec["batch_size"]
    remaining = dict(spec["required"])  # copy: it is decremented while sampling

    # load model
    # The pickle is the model written by this notebook's training cells.
    # Unpickling executes arbitrary code, so never point this at untrusted files.
    model_path = "models/ctgan_" + dataset + "_reduced" + ".pkl"
    with open(model_path, "rb") as model_file:
        ctgan = pickle.load(model_file)

    kept_batches = []
    batches_drawn = 0
    while any(need > 0 for need in remaining.values()):
        if max_batches is not None and batches_drawn >= max_batches:
            raise RuntimeError(
                "CTGAN model for {!r} did not yield the required samples within "
                "{} batches; still missing per label: {}".format(
                    dataset, max_batches, remaining))

        batch = ctgan.sample(batch_size)
        batches_drawn += 1

        kept = _take_first_needed(batch, label_col, remaining)
        if len(kept):
            kept_batches.append(kept)

    # Build the result once instead of appending row by row.
    if kept_batches:
        df_synth = pd.concat(kept_batches).reindex(columns=df.columns)
    else:
        df_synth = pd.DataFrame(columns=df.columns.values)

    if logs:
        print('Original {} train dataset shape {}'.format(dataset, Counter(df.iloc[:, -1])))
        print('Resampled (CTGAN) synth dataset shape %s' % Counter(np.array(df_synth.iloc[:, -1])))

    if dataset == "occutherm":
        df_synth['Gender'] = pd.to_numeric(df_synth['Gender'], errors='coerce')
    # make sure the label (last column) is numeric rather than string
    label_name = df_synth.columns[-1]
    df_synth[label_name] = pd.to_numeric(df_synth[label_name], errors='coerce')

    return df_synth
    

In [10]:
def evaluate_on_dataset(df_train, df_test, dataset='occutherm'):
    """Run `num_trials` CTGAN augmentation trials for `dataset` and pickle the averaged metrics.

    Each trial samples a synthetic frame with `sample_CTGAN`, stacks it on top
    of `df_train`, and computes three things: the variability of the synthetic
    rows, their diversity with respect to `df_train`, and the classification
    accuracy on `df_test` of models trained on the combined frame.

    Variability, diversity and the four test accuracies are averaged over the
    trials and saved under ``metrics/``. The per-trial RDF class reports are
    saved, unaveraged, under ``label-metrics/``.

    Relies on the notebook-level globals `num_trials`, `fixed_depth` and
    `test_size_percentage`. Returns nothing.
    """
    model = 'ctgan'
    # evaluation_classification's test accuracies are read at positions 0..3
    # (presumably NB, KNN, SVM, RDF — confirm against utils).
    n_classifiers = 4

    # per-trial accumulators
    variability_runs = []
    diversity_runs = []
    rdf_reports = []
    accuracy_runs = [[] for _ in range(n_classifiers)]

    for trial in range(num_trials):
        # synthetic rows for this trial, then real + synthetic training frame
        df_synth = sample_CTGAN(df_train, dataset)
        df_real_synth = pd.concat([df_synth, df_train])

        # variability of the generated samples
        variability_runs.append(evaluation_variability(df_synth))

        # class diversity with respect to the training set
        diversity_runs.append(evaluation_diversity(df_synth, df_train, baseline=False))

        # quality on the final classification task
        class_acc_test, _acc_train, _models, class_report_rdf = evaluation_classification(
            df_real_synth,
            df_test,
            rdf_depth=fixed_depth,
            depth_file_name='default',
            test_size_percentage=test_size_percentage,
        )
        for position in range(n_classifiers):
            accuracy_runs[position].append(class_acc_test[position])
        rdf_reports.append(class_report_rdf)

        print("End of {} trial".format(trial + 1))

    # average over trials
    variability = mean(variability_runs)
    diversity = mean(diversity_runs)
    class_acc_test = [mean(runs) for runs in accuracy_runs]

    # Saving results
    # Format is folder/<dataset_string>-<experiment_name>_<metric or model>_<test or train>_<model>.pkl
    experiment = dataset + "-reduced"
    suffix = model + "_trials.pkl"
    save_pickle(variability, "metrics/" + experiment + "_variability_" + suffix)
    save_pickle(diversity, "metrics/" + experiment + "_diversity_" + suffix)
    save_pickle(class_acc_test, "metrics/" + experiment + "_classification_" + suffix)
    save_pickle(rdf_reports, "label-metrics/" + experiment + "_class_report_" + suffix)

    banner = "################################################################################"
    print(banner)
    print("# Metrics and models for dataset {} saved!".format(dataset))
    print(banner)


## Occutherm

In [11]:
# Run the CTGAN trials on the Occutherm split; the function pickles its results
# under metrics/ and label-metrics/ and prints one line per finished trial.
evaluate_on_dataset(df_tcs_train, df_tcs_test, dataset='occutherm')


End of 1 trial
End of 2 trial
End of 3 trial
End of 4 trial
End of 5 trial
End of 6 trial
End of 7 trial
End of 8 trial
End of 9 trial
End of 10 trial
End of 11 trial
End of 12 trial
End of 13 trial
End of 14 trial
End of 15 trial
End of 16 trial
End of 17 trial
End of 18 trial
End of 19 trial
End of 20 trial
End of 21 trial
End of 22 trial
End of 23 trial
End of 24 trial
End of 25 trial
End of 26 trial
End of 27 trial
End of 28 trial
End of 29 trial
End of 30 trial
################################################################################
# Metrics and models for dataset occutherm saved!
################################################################################


## Cresh

## ASHRAE

In [13]:
# Run the CTGAN trials on the ASHRAE split; the function pickles its results
# under metrics/ and label-metrics/ and prints one line per finished trial.
evaluate_on_dataset(df_ashrae_train, df_ashrae_test, dataset='ashrae')


End of 1 trial
End of 2 trial
End of 3 trial
End of 4 trial
End of 5 trial
End of 6 trial
End of 7 trial
End of 8 trial
End of 9 trial
End of 10 trial
End of 11 trial
End of 12 trial
End of 13 trial
End of 14 trial
End of 15 trial
End of 16 trial
End of 17 trial
End of 18 trial
End of 19 trial
End of 20 trial
End of 21 trial
End of 22 trial
End of 23 trial
End of 24 trial
End of 25 trial
End of 26 trial
End of 27 trial
End of 28 trial
End of 29 trial
End of 30 trial
################################################################################
# Metrics and models for dataset ashrae saved!
################################################################################
