In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score 

from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

from sklearn.model_selection import GridSearchCV

from sklearn.naive_bayes import GaussianNB, CategoricalNB

# Name of the target column.
class_name = 'Occupancy'

# Columns dropped from both frames right after loading.
columns2remove = ['date', 'Unnamed: 0', 'cumulative_hour', 'cumulative_minute', 'day']

# '?' is read as a missing value, in addition to pandas' default NA markers.
df_training = pd.read_csv(
    'training.csv',
    skipinitialspace=True,
    na_values='?',
    keep_default_na=True,
).drop(columns=columns2remove)

df_test = pd.read_csv(
    'test.csv',
    skipinitialspace=True,
    na_values='?',
    keep_default_na=True,
).drop(columns=columns2remove)

df_training.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,hour,minute,day_minute,weekend
0,23.7,26.272,585.2,749.2,0.004764,1,14,19,859,0
1,23.718,26.29,578.4,760.4,0.004773,1,14,20,860,0
2,23.73,26.23,572.666667,769.666667,0.004765,1,14,21,861,0
3,23.7225,26.125,493.75,774.75,0.004744,1,14,22,862,0
4,23.754,26.2,488.6,779.0,0.004767,1,14,23,863,0


In [8]:
# Every column except the target is used as a predictor.
attributes = df_training.columns[df_training.columns != class_name].tolist()

# Features are passed to the model unscaled, as plain numpy arrays.
# Disabled scaling variant (no `scaler` is defined in the visible cells):
#   X_train = scaler.fit_transform(df_training[attributes].values)
#   X_test = scaler.fit_transform(df_test[attributes].values)
# NOTE(review): if scaling is re-enabled, the test set should presumably be
# transformed with the scaler fitted on the training set, not re-fitted.
X_train = df_training[attributes].to_numpy()
y_train = df_training[class_name]

X_test = df_test[attributes].to_numpy()
y_test = df_test[class_name]

In [9]:
# Baseline: fit a Gaussian Naive Bayes model on all the predictors.
# Further below, several attribute subsets are tried; some of them perform better.
clf = GaussianNB()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

# How to read the report (values refer to the recorded run of this cell).
# These notes are kept as comments on purpose: a bare string literal at the
# end of a cell is echoed by Jupyter as the cell's result.
#
# precision = TP / (TP + FP)
#   TP + FP = all records *predicted* as the positive class (class 0 here).
#   precision[0] = 1.00 -> no FP with respect to class 0
#                          (no 1 is classified as 0).
#
# recall = TP / (TP + FN)
#   TP + FN = all records whose *true* class is the positive one (class 0 here).
#   recall[0] = 0.87 -> there are FN with respect to class 0
#                       (some 0 are classified as 1): TP = 87%, FN = 13%.
#   recall[1] = 1.00 -> no FN with respect to class 1
#                       (no 1 is classified as 0).
#
# Taking class 0 as the positive class:
#   predicted 0, actual 0 ---> TP
#   predicted 1, actual 1 ---> TN
#   predicted 0, actual 1 ---> FP
#   predicted 1, actual 0 ---> FN
#
# Taking class 1 as the positive class, the four cases are mirrored.

Accuracy 0.8905642023346303
F1-score [0.92869969 0.76472639]
              precision    recall  f1-score   support

           0       1.00      0.87      0.93      5071
           1       0.62      1.00      0.76      1097

    accuracy                           0.89      6168
   macro avg       0.81      0.93      0.85      6168
weighted avg       0.93      0.89      0.90      6168



'\nprecision = TP / ( TP + FP ) TP + FP tutti i record con valore predetto positivo (=0 in questo caso)\nprecision[0] = 1 vuol dire che non ci sono FP rispetto a 0 (non ci sono 1 classificati come 0)\nrecall = TP / ( TP + FN ) TP + FN tutti i record con valore reale positivo (=0 in questo caso)\nrecall[0] = 0.87 vuol dire che ci sono FN rispetto a 0 (ci sono 0 che classifico 1)\nTP = 87%,    FN = 13%\n\nrecall = TP / ( TP + FN )\nrecall[1] = 1 vuol dire che non ci sono FN rispetto a 1 (non ci sono 1 classificati come 0)\n\n\n\npositivo = 0\nclassifico 0, reale 0 ---> TP\nclassifico 1, reale 1 ---> TN\nclassifico 0, reale 1 ---> FP\nclassifico 1, reale 0 ---> FN\n\npositivo = 1\n\n'

In [10]:
# Display the predictor columns only (the target column is excluded).
df_training.loc[:, attributes]

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,hour,minute,day_minute,weekend
0,23.7000,26.2720,585.200000,749.200000,0.004764,14,19,859,0
1,23.7180,26.2900,578.400000,760.400000,0.004773,14,20,860,0
2,23.7300,26.2300,572.666667,769.666667,0.004765,14,21,861,0
3,23.7225,26.1250,493.750000,774.750000,0.004744,14,22,862,0
4,23.7540,26.2000,488.600000,779.000000,0.004767,14,23,863,0
...,...,...,...,...,...,...,...,...,...
14387,19.7900,36.2000,0.000000,501.000000,0.005172,2,27,147,1
14388,19.7900,36.2000,0.000000,502.000000,0.005172,2,28,148,1
14389,19.7900,36.2675,0.000000,506.333333,0.005182,2,29,149,1
14390,19.7900,36.2900,0.000000,510.333333,0.005185,2,30,150,1


In [9]:
from itertools import combinations
import operator

from sklearn.model_selection import cross_validate

# Candidate predictors; every subset with at least two of them is evaluated.
tested_attributes = ['Temperature', 'Light', 'CO2', 'Humidity', 'HumidityRatio']

# One dict per evaluated subset, in evaluation order.
summary = []

y_train = df_training[class_name]
y_test = df_test[class_name]

# Subset sizes go from 2 up to the full attribute list, so the enumeration
# follows tested_attributes if that list is edited.
attribute_subsets = [
    attr
    for r in range(2, len(tested_attributes) + 1)
    for attr in combinations(tested_attributes, r)
]

for i, attr in enumerate(attribute_subsets, start=1):
    # 'attr' is the attribute combination evaluated in this iteration
    print("Attributes to test in iteration {}: {}".format(i, attr))
    X_train = df_training[list(attr)].values
    X_test = df_test[list(attr)].values

    # The model used on the test set is fitted once on the whole training set.
    clf = GaussianNB()
    clf.fit(X_train, y_train)

    # Cross-validated scores for every number of folds k in [3, 50).
    # Both metrics are computed on the same folds in a single pass instead of
    # running the whole cross-validation twice (once per metric).
    cv_performances_f1 = {}
    cv_performances_acc = {}
    for k in range(3, 50):
        cv_scores = cross_validate(clf, X_train, y_train, cv=k,
                                   scoring=('accuracy', 'f1_macro'), n_jobs=-1)
        cv_performances_f1[k] = cv_scores['test_f1_macro'].mean()
        cv_performances_acc[k] = cv_scores['test_accuracy'].mean()

    # Number of folds with the highest mean macro-F1 (first one on ties).
    # NOTE(review): picking k by its best score only affects the reported CV
    # figures (and makes them optimistic); it does not change `clf`.
    best_performances_f1 = max(cv_performances_f1.items(), key=operator.itemgetter(1))[0]
    print("Best performance with cv = {}, f1-score = {}".format(best_performances_f1, cv_performances_f1[best_performances_f1]))
    print("Best performance with cv = {}, accuracy = {}".format(best_performances_f1, cv_performances_acc[best_performances_f1]))

    # Evaluate the classifier fitted on the full training set on the test set.
    y_pred = clf.predict(X_test)

    # Compute each test metric once and reuse it for printing and for summary.
    test_accuracy = accuracy_score(y_test, y_pred)
    f1_per_class = f1_score(y_test, y_pred, average=None)
    report = classification_report(y_test, y_pred, output_dict=True)

    print()
    print('Accuracy %s' % test_accuracy)
    print('F1-score %s' % f1_per_class)
    print(classification_report(y_test, y_pred))

    summary.append({'iteration' : i,
                    'attributes': attr,
                    'classificator': clf,
                    'cv': best_performances_f1,
                    'cv_accuracy' : cv_performances_acc[best_performances_f1],
                    'cv_f1': cv_performances_f1[best_performances_f1],
                    'test_accuracy' : test_accuracy,
                    'f1-score [0]' : f1_per_class[0],
                    'f1-score [1]' : f1_per_class[1],
                    'precision [1]' : report['1']['precision'],
                    'recall [1]' : report['1']['recall']})
    print("---------------------------------------------------------\n\n\n")

Attributes to test in iteration 1: ('Temperature', 'Light')
Best performance with cv = 48, f1-score = 0.955658464426352
Best performance with cv = 48, accuracy = 0.9583472686733555

Accuracy 0.9785992217898832
F1-score [0.98681582 0.94320138]
              precision    recall  f1-score   support

           0       1.00      0.97      0.99      5071
           1       0.89      1.00      0.94      1097

    accuracy                           0.98      6168
   macro avg       0.95      0.99      0.97      6168
weighted avg       0.98      0.98      0.98      6168

---------------------------------------------------------



Attributes to test in iteration 2: ('Temperature', 'CO2')
Best performance with cv = 14, f1-score = 0.8291232492383797
Best performance with cv = 14, accuracy = 0.8729155086158976

Accuracy 0.8218223086900129
F1-score [0.87942951 0.65880161]
              precision    recall  f1-score   support

           0       0.99      0.79      0.88      5071
           1      

Best performance with cv = 6, f1-score = 0.8202797037795125
Best performance with cv = 6, accuracy = 0.8651249136218025

Accuracy 0.8114461738002594
F1-score [0.87139224 0.6468266 ]
              precision    recall  f1-score   support

           0       0.99      0.78      0.87      5071
           1       0.48      0.97      0.65      1097

    accuracy                           0.81      6168
   macro avg       0.74      0.87      0.76      6168
weighted avg       0.90      0.81      0.83      6168

---------------------------------------------------------



Attributes to test in iteration 15: ('Temperature', 'CO2', 'HumidityRatio')
Best performance with cv = 14, f1-score = 0.8289918657502184
Best performance with cv = 14, accuracy = 0.8728460255697609

Accuracy 0.8216601815823605
F1-score [0.87930656 0.65859714]
              precision    recall  f1-score   support

           0       0.99      0.79      0.88      5071
           1       0.50      0.97      0.66      1097

    ac

In [10]:
# Collect the class-1 precision of every evaluated model and pick the best one.
# precisions[j] describes summary[j]: both share the same indices.
precisions = [float(entry['precision [1]']) for entry in summary]

# Index of the classifier with the highest class-1 precision (first one on ties).
best_idx = max(range(len(precisions)), key=precisions.__getitem__)

# Description of the best classifier.
summary[best_idx]

{'iteration': 7,
 'attributes': ('Light', 'HumidityRatio'),
 'classificator': GaussianNB(priors=None, var_smoothing=1e-09),
 'cv': 41,
 'cv_accuracy': 0.9670672879514343,
 'cv_f1': 0.9635431324157956,
 'test_accuracy': 0.9810311284046692,
 'f1-score [0]': 0.988331504936671,
 'f1-score [1]': 0.9493287137288869,
 'precision [1]': 0.9042904290429042,
 'recall [1]': 0.9990884229717412}

In [11]:
# Attribute combinations sorted by precision[1] (ascending), used to choose
# which models go into the report table.
from collections import OrderedDict

# Read each combination and its class-1 precision straight from `summary`,
# so labels and values cannot get out of step with a re-enumeration of the
# combinations.
d = {entry['attributes']: float(entry['precision [1]']) for entry in summary}

sortedDict = OrderedDict(sorted(d.items(), key=lambda x: x[1]))
print('Attributes          :       precision[1]\n')
for key, value in sortedDict.items():
    print(key, ':', value)
    # Separator placed after this specific combination; the label says the
    # entries that follow have precision[1] > 88%. It is tied to the results
    # of the recorded run and must be revisited if the data or model change.
    if key == ('Light', 'CO2', 'HumidityRatio'):
        print('\n------------------ precision[1] > 88% ----------------------------------\n')

Attributes          :       precision[1]

('Humidity', 'HumidityRatio') : 0.18668885191347753
('CO2', 'HumidityRatio') : 0.4548856548856549
('CO2', 'Humidity') : 0.4585774058577406
('CO2', 'Humidity', 'HumidityRatio') : 0.4587693595646714
('Temperature', 'Humidity', 'HumidityRatio') : 0.48278285312719604
('Temperature', 'CO2', 'Humidity', 'HumidityRatio') : 0.48475193445607645
('Temperature', 'CO2', 'Humidity') : 0.4849726775956284
('Temperature', 'CO2', 'HumidityRatio') : 0.49929411764705883
('Temperature', 'CO2') : 0.4995291902071563
('Temperature', 'HumidityRatio') : 0.532312925170068
('Temperature', 'Humidity') : 0.5356662180349933
('Temperature', 'Light', 'CO2', 'Humidity') : 0.5632065775950668
('Temperature', 'Light', 'CO2', 'Humidity', 'HumidityRatio') : 0.5632065775950668
('Light', 'CO2', 'Humidity') : 0.564076170869789
('Light', 'CO2', 'Humidity', 'HumidityRatio') : 0.564076170869789
('Temperature', 'Light', 'CO2') : 0.5649484536082474
('Temperature', 'Light', 'CO2', 'Humidity

In [12]:
# Attribute combinations sorted by recall[1] (ascending), used to choose
# which models go into the report table.
recalls = [float(entry['recall [1]']) for entry in summary]

# Pair each combination with its class-1 recall directly from `summary`,
# instead of re-enumerating the combinations and relying on matching order.
d2 = {entry['attributes']: recall for entry, recall in zip(summary, recalls)}

sortedDict = OrderedDict(sorted(d2.items(), key=lambda x: x[1]))
print('Attributes          :       recall[1]\n')
for key, value in sortedDict.items():
    print(key, ':', value)
    # Both separators are placed after specific combinations and are tied to
    # the results of the recorded run; revisit them if the data or model change.
    if key == ('Temperature', 'Humidity', 'HumidityRatio'):
        print('\n------------------ recall[1] > 96% ------------------------------\n')
    if key == ('CO2', 'HumidityRatio'):
        print('\n--------------------------------------------------------------\n')

Attributes          :       recall[1]

('Temperature', 'Humidity') : 0.3628076572470374
('Humidity', 'HumidityRatio') : 0.5113947128532361
('Temperature', 'HumidityRatio') : 0.5706472196900638
('Temperature', 'Humidity', 'HumidityRatio') : 0.626253418413856

------------------ recall[1] > 96% ------------------------------

('Temperature', 'CO2') : 0.96718322698268
('Temperature', 'CO2', 'HumidityRatio') : 0.96718322698268
('Temperature', 'CO2', 'Humidity') : 0.9708295350957156
('Temperature', 'CO2', 'Humidity', 'HumidityRatio') : 0.9708295350957156
('CO2', 'HumidityRatio') : 0.9972652689152234

--------------------------------------------------------------

('Temperature', 'Light') : 0.9990884229717412
('Light', 'CO2') : 0.9990884229717412
('Light', 'Humidity') : 0.9990884229717412
('Light', 'HumidityRatio') : 0.9990884229717412
('CO2', 'Humidity') : 0.9990884229717412
('Temperature', 'Light', 'CO2') : 0.9990884229717412
('Temperature', 'Light', 'Humidity') : 0.9990884229717412
('Temp