In [1]:
# Notebooks
import nbimporter
import os
import sys

# Put the parent directory on the import path (once) so that project-local
# modules can be imported by the statements that follow.
# NOTE(review): presumably `utils` lives in that directory — confirm.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)
    
# Defined Functions
from utils import *

# Pandas, matplotlib, pickle, seaborn
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from statistics import mean 
from collections import Counter


In [2]:
# Notebook-wide configuration shared by the cells below.

# How many times each baseline metric is re-evaluated before averaging.
num_trials: int = 30

# Value forwarded as `test_size_percentage` to the training/evaluation helpers.
test_size_percentage: float = 0.2

# Value forwarded as `rdf_depth` to the training/evaluation helpers.
fixed_depth: int = 10


# Load ASHRAE Dataset - reduced labels

In [3]:
# Tag embedded in the file names of every model/metric pickle saved below.
dataset_string = "ashrae-reduced"

# Load the pre-split train and test frames (reduced label set).
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df_ashrae_train = pd.read_pickle("data/ashrae/ashrae_train_reduced.pkl")
df_ashrae_test = pd.read_pickle("data/ashrae/ashrae_test_reduced.pkl")

# Row counts, one per line: training first, then testing.
# Recorded run: 46477 training instances, 19920 testing instances.
print(len(df_ashrae_train), len(df_ashrae_test), sep="\n")

# Preview the first three training rows as the cell's rich output.
df_ashrae_train.head(3)


46477
19920


Unnamed: 0,SET,Clo,Met,Air temperature (¡C),Relative humidity (%),Air velocity (m/s),Thermal sensation rounded
0,19.96,0.49,1.0,23.2,41.9,0.38,0.0
1,27.0,1.04,1.01,28.3,14.1,0.18,0.0
2,19.08,0.5,1.0,20.6,45.8,0.08,-1.0


# Classification models on train data (imbalanced)

In [4]:
# Train on the (imbalanced) training frame with the project's `train_rdf` helper.
# It returns two values, unpacked here as a score and the fitted model.
# NOTE(review): "rdf" presumably means random forest, and the score an
# accuracy (it is printed as "rdf acc CV" in the next cell) — confirm in utils.
acc_rdf, rdf_real_model = train_rdf(
    df_ashrae_train,
    rdf_depth=fixed_depth,
    test_size_percentage=test_size_percentage,
)


In [5]:
# Report the score returned by `train_rdf`.
print(f"rdf acc CV: {acc_rdf}")


In [6]:
# Persist the fitted model and its score under dataset-tagged file names.
# The "reall" spelling is kept as-is: it is part of the on-disk file name.
save_pickle(rdf_real_model, f"models/{dataset_string}_rdf_reall_full.pkl")
save_pickle(acc_rdf, f"metrics/{dataset_string}_rdf_reall_full_acc.pkl")


# Variability baseline

In [7]:
# Evaluate the variability metric on the training frame `num_trials` times.
# NOTE(review): repeating suggests `evaluation_variability` is stochastic — confirm.
variability_baseline_list = [
    evaluation_variability(df_ashrae_train) for _ in range(num_trials)
]

# Average over the trials, show the value, and store it as the baseline metric.
mean_var_baseline = mean(variability_baseline_list)
print(mean_var_baseline)
save_pickle(mean_var_baseline, f"metrics/{dataset_string}_variability_baseline.pkl")


18.164089910819968


# Diversity baseline

In [8]:
# Evaluate the diversity metric `num_trials` times, passing the training frame
# as both frame arguments with `baseline=True`.
# NOTE(review): repeating suggests `evaluation_diversity` is stochastic — confirm.
diversity_baseline_list = [
    evaluation_diversity(df_ashrae_train, df_ashrae_train, baseline=True)
    for _ in range(num_trials)
]

# Average over the trials, show the value, and store it as the baseline metric.
mean_diversity_baseline = mean(diversity_baseline_list)
print(mean_diversity_baseline)
save_pickle(mean_diversity_baseline, f"metrics/{dataset_string}_diversity_baseline.pkl")


0.4822430597895802


# Quality of the final classification

In [9]:
# Run the project's classification evaluation on the train/test frames.
# The helper returns four values, unpacked in order below; in the recorded run
# `class_acc_test` is a list of four scores and `class_report_rdf` a nested dict
# of per-label precision/recall/f1-score/support.
(
    class_acc_test,
    class_acc_train,
    class_models,
    class_report_rdf,
) = evaluation_classification(
    df_ashrae_train,
    df_ashrae_test,
    rdf_depth=fixed_depth,
    depth_file_name='default',
    test_size_percentage=test_size_percentage,
)


In [10]:
# Show the test scores and then the RDF classification report, one per line.
print(class_acc_test, class_report_rdf, sep="\n")


[0.35727911646586347, 0.4592871485943775, 0.35672690763052206, 0.4944277108433735]
{'-1.0': {'precision': 0.39863852961198093, 'recall': 0.6037113402061856, 'f1-score': 0.4801968019680196, 'support': 4850}, '0.0': {'precision': 0.5317097090552468, 'recall': 0.39675570191486764, 'f1-score': 0.45442480966682963, 'support': 8199}, '1.0': {'precision': 0.5680656651695833, 'recall': 0.5338378693057779, 'f1-score': 0.5504201680672268, 'support': 6871}, 'accuracy': 0.4944277108433735, 'macro avg': {'precision': 0.49947130127893696, 'recall': 0.5114349704756104, 'f1-score': 0.4950139265673587, 'support': 19920}, 'weighted avg': {'precision': 0.5118506003284279, 'recall': 0.4944277108433735, 'f1-score': 0.49381126901572026, 'support': 19920}}


In [11]:
# Position 3 of `class_acc_test` is the RDF score; in the recorded run it equals
# the 'accuracy' entry of `class_report_rdf`.
final_classification_rdf = class_acc_test[3]

# Store the RDF test score and the per-label report as baseline artifacts.
save_pickle(final_classification_rdf, f"metrics/{dataset_string}_rdf_classification_baseline.pkl")
save_pickle(class_report_rdf, f"label-metrics/{dataset_string}_class_report_baseline_trials.pkl")
