In [1]:
# Notebooks
import nbimporter
import os
import sys

# Functions from src
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# Defined Functions
from utils import *

# Pandas, matplotlib, pickle, seaborn
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from statistics import mean
from collections import Counter
from imblearn.over_sampling import SMOTE, SMOTENC, ADASYN

# Optional fixed seed. Assigning it here changes nothing by itself: it only takes
# effect where it is passed explicitly as a `random_state=` argument.
random_state = 13 # if there is a need to fix it


In [2]:
# global variables/constants
num_trials = 30 # number of sampling + evaluation repetitions in evaluate_on_dataset
test_size_percentage = 0.2 # for CV within train split
fixed_depth = 10 # forwarded as `rdf_depth` to evaluation_classification


# Load Datasets - reduced labels

## Occutherm

In [3]:
# load TCS dataset
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
df_tcs_train = pd.read_pickle("data/occutherm/df_feature1_train_reduced.pkl")
df_tcs_test = pd.read_pickle("data/occutherm/df_feature1_test_reduced.pkl")

# The largest class (label 0) has 818 training instances; this is also the
# per-class count once SMOTE has balanced the classes.
# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
df_tcs_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1508 entries, 0 to 1507
Data columns (total 10 columns):
Temperature (Fahrenheit)       1508 non-null float64
SkinTemperature                1508 non-null float64
ClothingInsulation             1508 non-null float64
Height(cm)                     1508 non-null float64
Shoulder Circumference(cm)     1508 non-null float64
Weight(lbs)                    1508 non-null float64
Gender                         1508 non-null int64
Temperature_outside            1508 non-null float64
Humidity_outside               1508 non-null float64
Discrete Thermal Comfort_TA    1508 non-null int64
dtypes: float64(8), int64(2)
memory usage: 117.9 KB
None


## Cresh (already has 3 classes)

## ASHRAE

In [5]:
# load ASHRAE dataset
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
df_ashrae_train = pd.read_pickle("data/ashrae/ashrae_train_reduced.pkl")
df_ashrae_test = pd.read_pickle("data/ashrae/ashrae_test_reduced.pkl")

# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
df_ashrae_train.info()
# Number of training instances: 46477
# Number of testing instances: 19920


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46477 entries, 0 to 46476
Data columns (total 7 columns):
SET                          46477 non-null float64
Clo                          46477 non-null float64
Met                          46477 non-null float64
Air temperature (¡C)         46477 non-null float64
Relative humidity (%)        46477 non-null float64
Air velocity (m/s)           46477 non-null float64
Thermal sensation rounded    46477 non-null float64
dtypes: float64(7)
memory usage: 2.5 MB
None


# SMOTE - ADASYN

https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html

In [40]:
def _real_and_synth_frames(X_res, y_res, df, random_state=None):
    """Build the (real + synth, synth-only) frames from a resampler's output.

    Parameters
    ----------
    X_res, y_res : array-like
        Features and labels as returned by ``fit_resample`` (real samples
        together with the generated ones).
    df : pandas.DataFrame
        The original (real) data; supplies the column names and is used to
        filter the real rows out again.
    random_state : int or None
        Seed for the row shuffle, so a seeded call is fully reproducible.

    Returns
    -------
    (df_real_synth, df_synth) : tuple of pandas.DataFrame
    """
    df_real_synth = pd.concat([pd.DataFrame(X_res), pd.DataFrame(y_res)], axis=1)
    df_real_synth.columns = df.columns.values
    # shuffling
    df_real_synth = df_real_synth.sample(frac=1, random_state=random_state).reset_index(drop=True)
    # Concatenate the original real data to real+synth and drop every row that
    # occurs more than once: what is left is the synthetic data only.
    # Caveat: keep=False also removes synthetic rows that are exact duplicates
    # of another row, so this frame can hold slightly fewer rows than the
    # sampler actually generated.
    df_synth = pd.concat([df_real_synth, df]).drop_duplicates(keep=False)
    return df_real_synth, df_synth


def sample_smote_adasyn(df, dataset='occutherm', logs=True, random_state=None):
    """Oversample `df` with SMOTE and with ADASYN.

    The last column of `df` is taken as the class label and all preceding
    columns as features.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data, label in the last column.
    dataset : {'occutherm', 'cresh', 'ashrae'}
        Selects the sampler configuration. 'occutherm' uses SMOTENC with
        feature column 6 declared categorical; the other two use plain SMOTE.
    logs : bool
        When True, print the class counts before and after resampling.
    random_state : int or None
        Seed forwarded to the samplers and to the row shuffles.

    Returns
    -------
    tuple of four pandas.DataFrame
        ``(df_real_synth_smote, df_synth_smote,
        df_real_synth_adasyn, df_synth_adasyn)``.

    Raises
    ------
    ValueError
        If `dataset` is not one of the supported names.
    """
    # Fail early with a clear message instead of an unbound-variable error later on.
    if dataset not in ('occutherm', 'cresh', 'ashrae'):
        raise ValueError(
            "Unknown dataset {!r}; expected 'occutherm', 'cresh' or 'ashrae'".format(dataset)
        )

    X = df.iloc[:, :-1].to_numpy() # every column but the last one (the comfort label)
    y = df.iloc[:, -1].to_numpy()

    ########
    # SMOTE
    ########
    if dataset == 'occutherm':
        smote = SMOTENC(categorical_features=[6], random_state=random_state)
    elif dataset == 'cresh': # cresh and ashrae datasets don't have categorical features
        smote = SMOTE(random_state=random_state)
    else: # 'ashrae'
        smote = SMOTE(random_state=random_state, sampling_strategy='all')

    # the output of .fit_resample is the real dataset with the synthetic samples
    X_res, y_res = smote.fit_resample(X, y)
    df_real_synth_smote, df_synth_smote = _real_and_synth_frames(X_res, y_res, df, random_state)

    #########
    # ADASYN
    #########
    # NOTE(review): ADASYN is applied to all feature columns as-is, including
    # the column SMOTENC treats as categorical for 'occutherm' -- confirm this is intended.
    if dataset == 'ashrae':
#         Original ashrae train dataset {0.0: 19271, 1.0: 15922, -1.0: 11284}
#         Resampled (ADASYN) train (real + synth) dataset {-1.0: 19612, 0.0: 19271, 1.0: 15922}
        adasyn = ADASYN(random_state=random_state, sampling_strategy='minority')
    else:
        adasyn = ADASYN(random_state=random_state)

    X_res, y_res = adasyn.fit_resample(X, y)
    df_real_synth_adasyn, df_synth_adasyn = _real_and_synth_frames(X_res, y_res, df, random_state)

    if logs:
        print('Original {} train dataset shape {}'.format(dataset, Counter(y)))

        print('Resampled (SMOTE) real + synth dataset shape %s' % Counter(np.array(df_real_synth_smote.iloc[:, -1])))
        print('Resampled (SMOTE) synth dataset shape %s' % Counter(np.array(df_synth_smote.iloc[:, -1])))

        print('Resampled (ADASYN) train (real + synth) dataset shape %s' % Counter(np.array(df_real_synth_adasyn.iloc[:, -1])))
        print('Resampled (ADASYN) synth dataset shape %s' % Counter(np.array(df_synth_adasyn.iloc[:, -1])))

    return df_real_synth_smote, df_synth_smote, df_real_synth_adasyn, df_synth_adasyn


In [7]:
def evaluate_on_dataset(df_train, df_test, model='adasyn', dataset='occutherm', random_state=None):
    """Run `num_trials` oversampling + evaluation trials and pickle the averages.

    Each trial resamples `df_train` with `sample_smote_adasyn`, keeps the frames
    of the requested sampler and feeds them to `evaluation_variability`,
    `evaluation_diversity` and `evaluation_classification` (presumably provided
    by `utils` -- confirm). After the loop the per-trial values are averaged and
    written with `save_pickle` under "metrics/" and "label-metrics/".

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Train and test splits, label in the last column.
    model : {'adasyn', 'smote'}
        Which sampler's output is evaluated.
    dataset : str
        Dataset name, forwarded to `sample_smote_adasyn` and used in the
        output file names.
    random_state : int or None
        Base seed. When given, trial ``i`` samples with ``random_state + i`` so
        the trials differ from each other but the run is repeatable. With the
        default None the samplers are unseeded. The seed is only forwarded to
        `sample_smote_adasyn`, not to the evaluation helpers.

    Raises
    ------
    ValueError
        If `model` is neither 'adasyn' nor 'smote'.
    """
    # Fail early instead of hitting an unbound `df_real_synth` inside the loop.
    if model not in ('adasyn', 'smote'):
        raise ValueError("Unknown model {!r}; expected 'adasyn' or 'smote'".format(model))

    # per-trial values
    variability_list = []
    diversity_list = []
    # one list per classifier; the original comment names NB, KNN, SVM, RDF
    class_acc_test_lists = [[], [], [], []]
    class_report_rdf_list = []

    for i in range(num_trials):
        ###################################
        # Sample synthetic dataset for 'df'
        trial_seed = None if random_state is None else random_state + i
        smote_real_synth, smote_synth, adasyn_real_synth, adasyn_synth = sample_smote_adasyn(
            df_train, dataset, random_state=trial_seed
        )
        if model == 'adasyn':
            df_real_synth, df_synth = adasyn_real_synth, adasyn_synth
        else: # 'smote'
            df_real_synth, df_synth = smote_real_synth, smote_synth

        ###################################
        # Variability of generated samples
        variability_list.append(evaluation_variability(df_synth))

        #################################################
        # Class diversity with respect to the training set
        diversity_list.append(evaluation_diversity(df_synth, df_train, baseline=False))

        #####################################
        # Quality on the final classification
        # use best models NB, KNN, SVM, RDF
        class_acc_test, _, _, class_report_rdf = evaluation_classification(
            df_real_synth,
            df_test,
            rdf_depth=fixed_depth,
            depth_file_name='default',
            test_size_percentage=test_size_percentage,
        )
        for k in range(4):
            class_acc_test_lists[k].append(class_acc_test[k])
        class_report_rdf_list.append(class_report_rdf)

        ########################
        # end of for loop trials
        print("End of {} trial".format(i + 1))

    # get average of trials
    accgen_acc_test = [] # no longer computed; still saved (empty) so the file keeps being written
    variability = mean(variability_list)
    diversity = mean(diversity_list)
    class_acc_test = [mean(trial_values) for trial_values in class_acc_test_lists]

    #####################################
    # Saving results
    # Format is folder/<dataset_string>-<experiment_name>_<metric or model>_<test or train>_<model>.pkl
    name_prefix = dataset + "-reduced"
    save_pickle(accgen_acc_test, "metrics/" + name_prefix + "_accgen_test_" + model + "_trials.pkl")
    save_pickle(variability, "metrics/" + name_prefix + "_variability_" + model + "_trials.pkl")
    save_pickle(diversity, "metrics/" + name_prefix + "_diversity_" + model + "_trials.pkl")
    save_pickle(class_acc_test, "metrics/" + name_prefix + "_classification_" + model + "_trials.pkl")
    save_pickle(class_report_rdf_list, "label-metrics/" + name_prefix + "_class_report_" + model + "_trials.pkl")

    print("################################################################################")
    print("# Metrics and models for dataset {} saved!".format(dataset))
    print("################################################################################")


## Occutherm

In [8]:
# Occutherm, evaluating the SMOTE-resampled training data
evaluate_on_dataset(
    df_train=df_tcs_train,
    df_test=df_tcs_test,
    model='smote',
    dataset='occutherm',
)


Original occutherm train dataset shape Counter({0: 818, -1: 475, 1: 215})
Resampled (SMOTE) real + synth dataset shape Counter({-1: 818, 0: 818, 1: 818})
Resampled (SMOTE) synth dataset shape Counter({1: 603, -1: 342})
Resampled (ADASYN) train (real + synth) dataset shape Counter({1: 836, 0: 818, -1: 722})
Resampled (ADASYN) synth dataset shape Counter({1: 621, -1: 247})
End of 1 trial
Original occutherm train dataset shape Counter({0: 818, -1: 475, 1: 215})
Resampled (SMOTE) real + synth dataset shape Counter({1: 818, 0: 818, -1: 818})
Resampled (SMOTE) synth dataset shape Counter({1: 603, -1: 343})
Resampled (ADASYN) train (real + synth) dataset shape Counter({1: 836, 0: 818, -1: 722})
Resampled (ADASYN) synth dataset shape Counter({1: 621, -1: 247})
End of 2 trial
Original occutherm train dataset shape Counter({0: 818, -1: 475, 1: 215})
Resampled (SMOTE) real + synth dataset shape Counter({1: 818, -1: 818, 0: 818})
Resampled (SMOTE) synth dataset shape Counter({1: 603, -1: 340})
Res

In [9]:
# Occutherm, evaluating the ADASYN-resampled training data
evaluate_on_dataset(
    df_train=df_tcs_train,
    df_test=df_tcs_test,
    model='adasyn',
    dataset='occutherm',
)


Original occutherm train dataset shape Counter({0: 818, -1: 475, 1: 215})
Resampled (SMOTE) real + synth dataset shape Counter({-1: 818, 1: 818, 0: 818})
Resampled (SMOTE) synth dataset shape Counter({1: 603, -1: 342})
Resampled (ADASYN) train (real + synth) dataset shape Counter({1: 836, 0: 818, -1: 722})
Resampled (ADASYN) synth dataset shape Counter({1: 621, -1: 247})
End of 1 trial
Original occutherm train dataset shape Counter({0: 818, -1: 475, 1: 215})
Resampled (SMOTE) real + synth dataset shape Counter({0: 818, -1: 818, 1: 818})
Resampled (SMOTE) synth dataset shape Counter({1: 603, -1: 343})
Resampled (ADASYN) train (real + synth) dataset shape Counter({1: 836, 0: 818, -1: 722})
Resampled (ADASYN) synth dataset shape Counter({1: 621, -1: 247})
End of 2 trial
Original occutherm train dataset shape Counter({0: 818, -1: 475, 1: 215})
Resampled (SMOTE) real + synth dataset shape Counter({-1: 818, 0: 818, 1: 818})
Resampled (SMOTE) synth dataset shape Counter({1: 603, -1: 342})
Res

## Cresh (already has only 3 labels)

## ASHRAE

In [41]:
# ASHRAE, evaluating the SMOTE-resampled training data
evaluate_on_dataset(
    df_train=df_ashrae_train,
    df_test=df_ashrae_test,
    model='smote',
    dataset='ashrae',
)


Original ashrae train dataset shape Counter({0.0: 19271, 1.0: 15922, -1.0: 11284})
Resampled (SMOTE) real + synth dataset shape Counter({-1.0: 19271, 1.0: 19271, 0.0: 19271})
Resampled (SMOTE) synth dataset shape Counter({-1.0: 7717, 1.0: 3169})
Resampled (ADASYN) train (real + synth) dataset shape Counter({-1.0: 19612, 0.0: 19271, 1.0: 15922})
Resampled (ADASYN) synth dataset shape Counter({-1.0: 8117})
End of 1 trial
Original ashrae train dataset shape Counter({0.0: 19271, 1.0: 15922, -1.0: 11284})
Resampled (SMOTE) real + synth dataset shape Counter({1.0: 19271, -1.0: 19271, 0.0: 19271})
Resampled (SMOTE) synth dataset shape Counter({-1.0: 7702, 1.0: 3138})
Resampled (ADASYN) train (real + synth) dataset shape Counter({-1.0: 19612, 0.0: 19271, 1.0: 15922})
Resampled (ADASYN) synth dataset shape Counter({-1.0: 8126})
End of 2 trial
Original ashrae train dataset shape Counter({0.0: 19271, 1.0: 15922, -1.0: 11284})
Resampled (SMOTE) real + synth dataset shape Counter({-1.0: 19271, 0.0:

In [42]:
# ASHRAE, evaluating the ADASYN-resampled training data
evaluate_on_dataset(
    df_train=df_ashrae_train,
    df_test=df_ashrae_test,
    model='adasyn',
    dataset='ashrae',
)


Original ashrae train dataset shape Counter({0.0: 19271, 1.0: 15922, -1.0: 11284})
Resampled (SMOTE) real + synth dataset shape Counter({0.0: 19271, -1.0: 19271, 1.0: 19271})
Resampled (SMOTE) synth dataset shape Counter({-1.0: 7686, 1.0: 3167})
Resampled (ADASYN) train (real + synth) dataset shape Counter({-1.0: 19612, 0.0: 19271, 1.0: 15922})
Resampled (ADASYN) synth dataset shape Counter({-1.0: 8141})
End of 1 trial
Original ashrae train dataset shape Counter({0.0: 19271, 1.0: 15922, -1.0: 11284})
Resampled (SMOTE) real + synth dataset shape Counter({1.0: 19271, 0.0: 19271, -1.0: 19271})
Resampled (SMOTE) synth dataset shape Counter({-1.0: 7686, 1.0: 3144})
Resampled (ADASYN) train (real + synth) dataset shape Counter({-1.0: 19612, 0.0: 19271, 1.0: 15922})
Resampled (ADASYN) synth dataset shape Counter({-1.0: 8114})
End of 2 trial
Original ashrae train dataset shape Counter({0.0: 19271, 1.0: 15922, -1.0: 11284})
Resampled (SMOTE) real + synth dataset shape Counter({-1.0: 19271, 1.0: