In [2]:
# Notebooks
import nbimporter
import os
import sys

# Make the parent directory importable (presumably where the project's
# `utils` module imported below lives -- confirm against the repo layout).
# The membership guard keeps re-runs of this cell from appending duplicates.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)
    
# Defined Functions
from utils import *

# Pandas, matplotlib, pickle, seaborn
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from statistics import mean 
from collections import Counter
from imblearn.over_sampling import SMOTE, SMOTENC, ADASYN


In [3]:
# Experiment-wide settings shared by the cells below.

# Passed as `rdf_depth` to train_rdf / evaluation_classification.
fixed_depth = 10

# Passed as `test_size_percentage` to the same helpers
# (presumably the hold-out fraction -- confirm in utils).
test_size_percentage = 0.2

# Number of repetitions averaged in the variability / diversity baselines.
num_trials = 30


# Load TCS Dataset - reduced labels

In [5]:
# Tag used to build the file names of every artifact saved by this notebook.
dataset_string = "occutherm-reduced"

# Train and test frames are stored as separate pickles.
# NOTE(review): pickle files execute code on load -- only read trusted files.
df_tcs_test = pd.read_pickle("data/occutherm/df_feature1_test_reduced.pkl")
df_tcs_train = pd.read_pickle("data/occutherm/df_feature1_train_reduced.pkl")

# total count for instances per class: 818
# (author's note; this cell only prints column-wise summary statistics)
print(df_tcs_train.describe())


       Temperature (Fahrenheit)  SkinTemperature  ClothingInsulation  \
count               1508.000000      1508.000000         1508.000000   
mean                  71.453707        85.207610            0.558176   
std                    6.221285         5.362427            0.198067   
min                   60.070000        62.986781            0.000000   
25%                   65.599998        81.927500            0.410000   
50%                   70.199997        85.376000            0.490000   
75%                   77.634998        88.598001            0.680000   
max                   85.000000       110.235782            1.070000   

        Height(cm)  Shoulder Circumference(cm)  Weight(lbs)       Gender  \
count  1508.000000                 1508.000000  1508.000000  1508.000000   
mean    169.909218                  109.055637   152.835411     0.443634   
std       9.215815                   10.985466    30.818397     0.496978   
min     151.000000                   89.500000 

# Classification models on train data (imbalanced)

In [4]:
# Fit the RDF model on the training frame with the notebook-wide depth and
# test-size settings; the helper returns a metrics object and the model.
acc_rdf, rdf_real_model = train_rdf(
    df_tcs_train,
    rdf_depth=fixed_depth,
    test_size_percentage=test_size_percentage,
)


In [5]:
# Show the metrics returned by train_rdf for the RDF model.
# NOTE(review): the output saved with this cell shows "nb acc CV" / "knn acc CV"
# lines, which this statement cannot print -- the stored output looks stale;
# restart the kernel and re-run to refresh it.
print("rdf acc CV: {}".format(acc_rdf))


nb acc CV: (0.6423841059602649, {'-1': {'precision': 0.6271186440677966, 'recall': 0.7789473684210526, 'f1-score': 0.6948356807511737, 'support': 95}, '0': {'precision': 0.7322834645669292, 'recall': 0.5670731707317073, 'f1-score': 0.6391752577319588, 'support': 164}, '1': {'precision': 0.47368421052631576, 'recall': 0.627906976744186, 'f1-score': 0.5399999999999999, 'support': 43}, 'accuracy': 0.6423841059602649, 'macro avg': {'precision': 0.6110287730536805, 'recall': 0.6579758386323153, 'f1-score': 0.6246703128277109, 'support': 302}, 'weighted avg': {'precision': 0.6623813921458565, 'recall': 0.6423841059602649, 'f1-score': 0.6425633507927244, 'support': 302}})
knn acc CV: (0.7715231788079471, {'-1': {'precision': 0.7551020408163265, 'recall': 0.7789473684210526, 'f1-score': 0.7668393782383419, 'support': 95}, '0': {'precision': 0.7951807228915663, 'recall': 0.8048780487804879, 'f1-score': 0.8, 'support': 164}, '1': {'precision': 0.7105263157894737, 'recall': 0.627906976744186, 'f1

In [6]:
# Persist the fitted RDF model and its metrics under dataset-specific names.
# The "reall" spelling in the file names is kept on purpose: other code may
# load these exact paths.
save_pickle(
    rdf_real_model,
    "models/" + dataset_string + "_rdf_reall_full.pkl",
)
save_pickle(
    acc_rdf,
    "metrics/" + dataset_string + "_rdf_reall_full_acc.pkl",
)


# Variability baseline

In [7]:
# Run the variability evaluation on the training frame once per trial and
# collect every trial's score.
variability_baseline_list = [
    evaluation_variability(df_tcs_train) for _ in range(num_trials)
]

# Average over the trials, display the result, and store it with the metrics.
mean_var_baseline = mean(variability_baseline_list)
print(mean_var_baseline)
save_pickle(
    mean_var_baseline,
    "metrics/" + dataset_string + "_variability_baseline.pkl",
)


53.732345146094445


# Diversity baseline

In [8]:
# Run the diversity evaluation once per trial, comparing the training frame
# against itself with the helper's `baseline` flag switched on.
diversity_baseline_list = [
    evaluation_diversity(df_tcs_train, df_tcs_train, baseline=True)
    for _ in range(num_trials)
]

# Average over the trials, display the result, and store it with the metrics.
mean_diversity_baseline = mean(diversity_baseline_list)
print(mean_diversity_baseline)
save_pickle(
    mean_diversity_baseline,
    "metrics/" + dataset_string + "_diversity_baseline.pkl",
)


1.6886419994986452


# Quality of the final classification

In [9]:
# Evaluate the classifiers using the train and test frames with the
# notebook-wide depth and test-size settings. The helper returns test
# accuracies, train accuracies, the models, and an RDF class report.
(
    class_acc_test,
    class_acc_train,
    class_models,
    class_report_rdf,
) = evaluation_classification(
    df_tcs_train,
    df_tcs_test,
    rdf_depth=fixed_depth,
    depth_file_name='default',
    test_size_percentage=test_size_percentage,
)


In [10]:
# Show the test accuracies returned by evaluation_classification
# (a later cell reads index 3 of this sequence as the RDF score).
print(class_acc_test)


[0.5974955277280859, 0.5885509838998211, 0.055456171735241505, 0.6565295169946332]


In [7]:
# Pull each classifier's test accuracy out of `class_acc_test` before saving.
# Previously only the RDF value was assigned, so the nb/knn/svm saves below
# raised NameError on a fresh kernel and nothing was written.
# NOTE(review): the order [nb, knn, svm, rdf] is assumed from the save order in
# this cell and from rdf sitting at index 3 -- confirm against
# evaluation_classification in utils.
final_classification_nb = class_acc_test[0]
final_classification_knn = class_acc_test[1]
final_classification_svm = class_acc_test[2]
final_classification_rdf = class_acc_test[3]

# One metrics file per classifier, named after the dataset tag.
save_pickle(final_classification_nb, "metrics/" + dataset_string + "_nb_classification_baseline.pkl")
save_pickle(final_classification_knn, "metrics/" + dataset_string + "_knn_classification_baseline.pkl")
save_pickle(final_classification_svm, "metrics/" + dataset_string + "_svm_classification_baseline.pkl")
save_pickle(final_classification_rdf, "metrics/" + dataset_string + "_rdf_classification_baseline.pkl")

# The RDF class report goes to a separate directory.
save_pickle(class_report_rdf, "label-metrics/" + dataset_string + "_class_report_baseline_trials.pkl")
