In [1]:
# Notebooks
import nbimporter
import os
import sys

# Functions from src
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# Defined Functions
from utils import *

# Pandas, matplotlib, pickle, seaborn
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from statistics import mean 
from collections import Counter


In [2]:
# Notebook-wide settings; every experiment cell below reads these names.
fixed_depth = 10             # forwarded as rdf_depth to train_rdf / evaluation_classification
test_size_percentage = 0.2   # forwarded as test_size_percentage to the same helpers
num_trials = 30              # repetitions used by the variability / diversity baseline loops


# Load ASHRAE Dataset

In [3]:
# Prefix used to build the model / metric file names written by later cells.
dataset_string = "ashrae"

# Load the full dataset and the train / test splits.
# NOTE: read_pickle unpickles arbitrary objects; only use it on trusted files.
# Number of training instances: 46477
# Number of testing instances: 19920
df_ashrae = pd.read_pickle("data/ashrae/ashrae.pkl")
df_ashrae_train = pd.read_pickle("data/ashrae/ashrae_train.pkl")
df_ashrae_test = pd.read_pickle("data/ashrae/ashrae_test.pkl")

# Row counts of the two splits, one per line (train first, then test).
print(len(df_ashrae_train), len(df_ashrae_test), sep="\n")

# Last expression of the cell: preview the first three training rows.
df_ashrae_train.head(3)


46477
19920


Unnamed: 0,SET,Clo,Met,Air temperature (¡C),Relative humidity (%),Air velocity (m/s),Thermal sensation rounded
0,19.96,0.49,1.0,23.2,41.9,0.38,0.0
1,27.0,1.04,1.01,28.3,14.1,0.18,0.0
2,19.08,0.5,1.0,20.6,45.8,0.08,-1.0


# Classification models on train data (imbalanced)

In [4]:
# Fit the RDF model on the (imbalanced) training split using the shared
# depth and test-size settings. train_rdf comes from utils and hands back
# a score (reported as "rdf acc CV" in the next cell) plus the model
# object that is pickled afterwards.
acc_rdf, rdf_real_model = train_rdf(
    df_ashrae_train,
    rdf_depth=fixed_depth,
    test_size_percentage=test_size_percentage,
)


In [5]:
# Report the score returned by train_rdf.
rdf_acc_message = "rdf acc CV: {}".format(acc_rdf)
print(rdf_acc_message)


In [6]:
# Persist the trained RDF model under models/<dataset>_rdf_reall_full.pkl.
# NOTE(review): "reall" looks like a typo for "real"; the file name is kept
# unchanged because other code may load this exact path -- confirm before renaming.
save_pickle(
    rdf_real_model,
    "models/{}_rdf_reall_full.pkl".format(dataset_string),
)


# Variability baseline

In [4]:
# Run the variability evaluation num_trials times on the training split,
# collecting one score per trial.
variability_baseline_list = [
    evaluation_variability(df_ashrae_train) for _ in range(num_trials)
]

# Average over all trials, show it, and store it as the variability baseline.
mean_var_baseline = mean(variability_baseline_list)
print(mean_var_baseline)
save_pickle(
    mean_var_baseline,
    "metrics/{}_variability_baseline.pkl".format(dataset_string),
)


18.12202905181786


# Diversity baseline

In [6]:
# Run the diversity evaluation num_trials times. The training split is passed
# for both dataframe arguments together with baseline=True.
diversity_baseline_list = [
    evaluation_diversity(df_ashrae_train, df_ashrae_train, baseline=True)
    for _ in range(num_trials)
]

# Average over all trials, show it, and store it as the diversity baseline.
mean_diversity_baseline = mean(diversity_baseline_list)
print(mean_diversity_baseline)
save_pickle(
    mean_diversity_baseline,
    "metrics/{}_diversity_baseline.pkl".format(dataset_string),
)


0.554039133125188


# Quality of the final classification

In [7]:
# Evaluate the classifiers on the train and test splits with the shared
# depth / test-size settings. The helper (from utils) returns four values:
# test accuracies, train accuracies, the models, and the RDF class report.
(
    class_acc_test,
    class_acc_train,
    class_models,
    class_report_rdf,
) = evaluation_classification(
    df_ashrae_train,
    df_ashrae_test,
    rdf_depth=fixed_depth,
    depth_file_name='default',
    test_size_percentage=test_size_percentage,
)




In [8]:
# Show the test accuracies, then the RDF class report, one per line.
print(class_acc_test, class_report_rdf, sep="\n")


[0.045532128514056226, 0.33619477911646584, 0.41159638554216865, 0.2628012048192771]
{'-3.0': {'precision': 0.12686230248306998, 'recall': 0.5520628683693517, 'f1-score': 0.20631424375917767, 'support': 509}, '-2.0': {'precision': 0.10586011342155009, 'recall': 0.27450980392156865, 'f1-score': 0.15279672578444747, 'support': 1224}, '-1.0': {'precision': 0.25087243117487396, 'recall': 0.20757138273981393, 'f1-score': 0.22717696629213482, 'support': 3117}, '0.0': {'precision': 0.5506050605060506, 'recall': 0.24417611903890718, 'f1-score': 0.3383185466835657, 'support': 8199}, '1.0': {'precision': 0.2864248098303101, 'recall': 0.25395590142671853, 'f1-score': 0.2692149044410834, 'support': 3855}, '2.0': {'precision': 0.22419488026424442, 'recall': 0.2574679943100996, 'f1-score': 0.23968218936217173, 'support': 2109}, '3.0': {'precision': 0.18053311793214863, 'recall': 0.4928335170893054, 'f1-score': 0.2642624889151641, 'support': 907}, 'accuracy': 0.2628012048192771, 'macro avg': {'precis

In [9]:
# Position 3 of class_acc_test is the RDF entry (it matches the 'accuracy'
# value in the recorded class_report_rdf output).
final_classification_rdf = class_acc_test[3]

# Store the RDF test accuracy as the classification baseline metric.
save_pickle(
    final_classification_rdf,
    "metrics/{}_rdf_classification_baseline.pkl".format(dataset_string),
)

# Store the per-label report alongside it under label-metrics/.
save_pickle(
    class_report_rdf,
    "label-metrics/{}_class_report_baseline_trials.pkl".format(dataset_string),
)
