In [1]:
# Notebooks
import nbimporter
import os
import sys

# Put the parent directory on the import path (presumably so the project
# modules imported below can be found - confirm against the repo layout).
module_path = os.path.abspath("..")
if sys.path.count(module_path) == 0:
    # Only add it once, so re-running this cell does not grow sys.path.
    sys.path.append(module_path)
    
# Defined functions
from utils import *

# Pandas, matplotlib, pickle, seaborn
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from statistics import mean 
from collections import Counter


In [2]:
# Notebook-wide settings shared by the cells below.

# Passed as `rdf_depth` to the training / classification helpers.
fixed_depth = 10

# Passed as `test_size_percentage` to the training / classification helpers.
test_size_percentage = 0.2

# Number of repetitions used when averaging the baseline metrics.
num_trials = 30


# Load Cresh Dataset

In [3]:
# Tag used below to build dataset-specific output file names.
dataset_string = "cresh"

# Read the full dataset and its train / test pickles.
df_cresh_test = pd.read_pickle("data/cresh/cresh_test.pkl")
df_cresh_train = pd.read_pickle("data/cresh/cresh_train.pkl")
df_cresh = pd.read_pickle("data/cresh/cresh.pkl")

# Summary statistics of the training frame.
# NOTE(review): an earlier note here read "total count for instances per
# class: 713", while the recorded describe() output shows a count of 1159 per
# column - confirm which figure is current.
print(df_cresh_train.describe())


       heartRate_cozie  humidity_sensing  light_sensing  noise_sensing  \
count      1159.000000       1159.000000    1159.000000    1159.000000   
mean         80.327869         65.699624     184.154443      51.646247   
std          14.835174         10.741136     228.407204       7.400868   
min          51.000000         51.030800      11.000000      41.000000   
25%          69.500000         58.711700      60.000000      46.000000   
50%          79.000000         61.147500      99.000000      49.000000   
75%          90.000000         73.851950     224.000000      54.000000   
max         149.000000         98.682200    1294.000000      72.000000   

       temperature_sensing  temperature_mbient     hour_sin     hour_cos  \
count          1159.000000         1159.000000  1159.000000  1159.000000   
mean             26.680520           28.807916    -0.537530    -0.548435   
std               1.642619            1.337675     0.433082     0.472309   
min              21.658800   

# Classification models on train data (imbalanced)

In [6]:
# Train on the (imbalanced) training frame with the notebook-wide depth and
# split settings. The two returned values are printed and pickled below
# (presumably a CV accuracy and the fitted model - confirm in utils.train_rdf).
acc_rdf, rdf_real_model = train_rdf(
    df_cresh_train,
    rdf_depth=fixed_depth,
    test_size_percentage=test_size_percentage,
)


Features: ['heartRate_cozie' 'humidity_sensing' 'light_sensing' 'noise_sensing'
 'temperature_sensing' 'temperature_mbient' 'hour_sin' 'hour_cos'
 'day_of_week_sin' 'day_of_week_cos']
Expected accuracy (f1 micro) based on Cross-Validation:  0.5846189808321645
GaussianNB(priors=None, var_smoothing=1e-09)
Number of folds: 10
Best parameters set found on development set:
{'algorithm': 'brute', 'metric': 'seuclidean', 'n_neighbors': 13, 'weights': 'distance'}
Number of folds: 10
Best parameters set found on development set:
{'C': 1000, 'class_weight': 'balanced', 'gamma': 0.1, 'kernel': 'rbf'}
Number of folds: 10
Best parameters set found on development set:
{'class_weight': 'balanced', 'criterion': 'gini', 'min_samples_split': 2, 'n_estimators': 100}


In [7]:
# Report the accuracy value returned by train_rdf.
rdf_acc_message = "rdf acc CV: {}".format(acc_rdf)
print(rdf_acc_message)


nb acc CV: 0.5818965517241379
knn acc CV: 0.6810344827586207
svm acc CV: 0.6810344827586207
rdf acc CV: 0.5858470764617691


In [8]:
# Persist the trained model and its accuracy under dataset-specific names.
# NOTE(review): "reall" in the file names looks like a typo; it is left as-is
# because other code may load these exact paths.
rdf_model_path = "models/" + dataset_string + "_rdf_reall_full.pkl"
rdf_acc_path = "metrics/" + dataset_string + "_rdf_reall_full_acc.pkl"

save_pickle(rdf_real_model, rdf_model_path)
save_pickle(acc_rdf, rdf_acc_path)


# Variability baseline

In [9]:
# Run the variability evaluation on the training frame `num_trials` times and
# keep every result so the runs can be averaged.
variability_baseline_list = [
    evaluation_variability(df_cresh_train) for _ in range(num_trials)
]

# Average over all trials, show it, and store it with the other metrics.
mean_var_baseline = mean(variability_baseline_list)
print(mean_var_baseline)

variability_baseline_path = "metrics/" + dataset_string + "_variability_baseline.pkl"
save_pickle(mean_var_baseline, variability_baseline_path)


Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort: 9
Thermal Comfort: 11
Thermal Comfort: 10
Thermal Comfort:

# Diversity baseline

In [10]:
# Run the diversity evaluation `num_trials` times. The training frame is
# passed for both frame arguments together with baseline=True.
diversity_baseline_list = [
    evaluation_diversity(df_cresh_train, df_cresh_train, baseline=True)
    for _ in range(num_trials)
]

# Average over all trials, show it, and store it with the other metrics.
mean_diversity_baseline = mean(diversity_baseline_list)
print(mean_diversity_baseline)

diversity_baseline_path = "metrics/" + dataset_string + "_diversity_baseline.pkl"
save_pickle(mean_diversity_baseline, diversity_baseline_path)


5.6614872565810765


# Quality of the final classification

The paper that builds group models on this feature set uses only a random forest (RDF) classifier, so only the RDF result is kept as the classification baseline here. Reference: https://www.researchgate.net/publication/338527635_Indoor_Comfort_Personalities_Scalable_Occupant_Preference_Capture_Using_Micro_Ecological_Momentary_Assessments


In [4]:
# Evaluate classification using the train and test frames, with the same
# depth and split settings as the training cell above.
(
    class_acc_test,
    class_acc_train,
    class_models,
    class_report_rdf,
) = evaluation_classification(
    df_cresh_train,
    df_cresh_test,
    rdf_depth=fixed_depth,
    depth_file_name='default',
    test_size_percentage=test_size_percentage,
)




In [5]:
# Show the test accuracies and the RDF classification report, one per line.
print(class_acc_test, class_report_rdf, sep="\n")

[0.30158730158730157, 0.6095238095238096, 0.7111111111111111, 0.6444444444444445]
{'9.0': {'precision': 0.25, 'recall': 0.02857142857142857, 'f1-score': 0.05128205128205128, 'support': 35}, '10.0': {'precision': 0.75, 'recall': 0.7633928571428571, 'f1-score': 0.7566371681415928, 'support': 224}, '11.0': {'precision': 0.37349397590361444, 'recall': 0.5535714285714286, 'f1-score': 0.4460431654676259, 'support': 56}, 'accuracy': 0.6444444444444445, 'macro avg': {'precision': 0.4578313253012048, 'recall': 0.4485119047619048, 'f1-score': 0.41798746163042333, 'support': 315}, 'weighted avg': {'precision': 0.6275100401606425, 'recall': 0.6444444444444445, 'f1-score': 0.6230476657929385, 'support': 315}}


In [7]:
# Figure 6 from the paper above
# NOTE(review): index 3 is hard-coded. In the recorded output the fourth entry
# of class_acc_test equals the 'accuracy' field of class_report_rdf, so it is
# presumably the RDF test accuracy - confirm the ordering returned by
# evaluation_classification.
final_classification_rdf = class_acc_test[3]

# Store the selected accuracy and the full per-label report.
rdf_baseline_path = "metrics/" + dataset_string + "_rdf_classification_baseline.pkl"
class_report_path = "label-metrics/" + dataset_string + "_class_report_baseline_trials.pkl"

save_pickle(final_classification_rdf, rdf_baseline_path)
save_pickle(class_report_rdf, class_report_path)
