In [4]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict

# Data Preparation

In [5]:
class_name = 'Occupancy'
df_training = pd.read_csv('training.csv', skipinitialspace=True, na_values='?', keep_default_na=True)
df_test = pd.read_csv('test.csv', skipinitialspace=True, na_values='?', keep_default_na=True)


columns2remove = ['date', 'Unnamed: 0', 'cumulative_hour', 'cumulative_minute', 'day', 'weekend', 'day_minute', 'minute', 'hour', 'Light']
df_training.drop(columns2remove, inplace=True, axis=1)
df_test.drop(columns2remove, inplace=True, axis=1)
df_training.head()

attributes = [col for col in df_training.columns if col != class_name]

X_train = df_training[attributes].values
y_train = df_training[class_name]

X_test = df_test[attributes].values
y_test = df_test[class_name]

#Unisco training e test perche' fra un po' Guidotti usa X e y, numpy array prima dello split
frames = [df_training, df_test]

result = pd.concat(frames)

attributes = [col for col in df_training.columns if col != class_name]
X = result[attributes].values
y = result[class_name]

feature_names = ['Temperature', 'Humidity', 'CO2', 'HumidityRatio']

# Data Partitioning

In [6]:
df_training.head()

Unnamed: 0,Temperature,Humidity,CO2,HumidityRatio,Occupancy
0,23.7,26.272,749.2,0.004764,1
1,23.718,26.29,760.4,0.004773,1
2,23.73,26.23,769.666667,0.004765,1
3,23.7225,26.125,774.75,0.004744,1
4,23.754,26.2,779.0,0.004767,1


In [7]:
from sklearn.model_selection import train_test_split, cross_val_score 

from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [8]:
X_train.shape

(14392, 4)

In [6]:
np.sqrt(5)

2.23606797749979

# Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
#import pickle
#pickle.dump(clf, 'filename.pickle')
#clf = pickle.load('filename.pickle')

In [9]:
#classification_report(y_test, y_pred, output_dict=True)

In [10]:
#import json
#json.dumps(classification_report(y_test, y_pred, output_dict=True))

In [11]:
#json.loads(json.dumps(classification_report(y_test, y_pred, output_dict=True)))

In [12]:
from sklearn.inspection import permutation_importance

In [13]:
import pydotplus
from sklearn import tree
from IPython.display import Image
import os

### Cross Validation

In [14]:
from sklearn.model_selection import cross_val_score

In [15]:
clf5 = RandomForestClassifier()
scores = cross_val_score(clf5, X, y, cv=5)

print('Accuracy %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

Accuracy 0.611 +/- 0.204


### Tuning the hyper-parameters

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [17]:
%%time

param_list = {'max_depth': [None] + list(np.arange(2, 20)),
              'min_samples_split': [10, 20, 30, 50, 100, 150, 200, 250],
              'min_samples_leaf': [1, 5, 10, 20, 30, 50, 100],
              'max_features' : ['auto', 'log2', 'None'] + list(np.arange(1, 4))
             }

grid_search = GridSearchCV(clf5, param_grid=param_list, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
clf6_grid = grid_search.best_estimator_

y_pred = clf6_grid.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))


Accuracy 0.881485084306096
F1-score [0.92487925 0.7193858 ]
              precision    recall  f1-score   support

           0       0.97      0.89      0.92      5071
           1       0.62      0.85      0.72      1097

    accuracy                           0.88      6168
   macro avg       0.79      0.87      0.82      6168
weighted avg       0.90      0.88      0.89      6168

Wall time: 3h 44min 50s


In [18]:
clf6_grid

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features=1,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=200,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [19]:
%%time
#MI SEMBRA CHE QUESTO SIA IL TOP
param_list = {'max_depth': [None] + list(np.arange(2, 20)),
              'min_samples_split': [10, 20, 30, 50, 100, 150, 200, 250],
              'min_samples_leaf': [1, 5, 10, 20, 30, 50, 100],
              'max_features' : ['auto', 'log2', 'None'] + list(np.arange(1, 4))
             }

random_search = RandomizedSearchCV(clf5, param_distributions=param_list, n_iter=100, cv=5, n_jobs=-1)
random_search.fit(X_train, y_train)
clf6_random = random_search.best_estimator_


y_pred = clf6_random.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))
report = classification_report(y_test, y_pred)
print("precision:", report['1']['precision'])

Accuracy 0.857976653696498
F1-score [0.90850219 0.68283852]
              precision    recall  f1-score   support

           0       0.97      0.86      0.91      5071
           1       0.57      0.86      0.68      1097

    accuracy                           0.86      6168
   macro avg       0.77      0.86      0.80      6168
weighted avg       0.89      0.86      0.87      6168



TypeError: string indices must be integers

In [20]:
clf6_random

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features=1,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=30, min_samples_split=30,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
#BEST MODEL EVER
clf6_random_best = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=4, max_features=3,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=20, min_samples_split=100,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

clf6_random_best.fit(X_train, y_train)
y_pred = clf6_random_best.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))
report = classification_report(y_test, y_pred, output_dict = True)
print("precision [1]:", report['1']['precision'])
print("recall [1]:", report['1']['recall'])

Accuracy 0.5168612191958496
F1-score [0.58553547 0.42090944]
              precision    recall  f1-score   support

           0       0.99      0.42      0.59      5071
           1       0.27      0.99      0.42      1097

    accuracy                           0.52      6168
   macro avg       0.63      0.70      0.50      6168
weighted avg       0.86      0.52      0.56      6168

precision [1]: 0.2674734502346258
recall [1]: 0.9872379216043756


In [22]:
#predict su training e stampo classification per vedere se fa overfitting

y_pred_train = clf6_random.predict(X_train)

print('Accuracy %s' % accuracy_score(y_train, y_pred_train))
print('F1-score %s' % f1_score(y_train, y_pred_train, average=None))
print(classification_report(y_train, y_pred_train))

Accuracy 0.8896609227348526
F1-score [0.92497401 0.79154634]
              precision    recall  f1-score   support

           0       0.94      0.91      0.92     10739
           1       0.76      0.83      0.79      3653

    accuracy                           0.89     14392
   macro avg       0.85      0.87      0.86     14392
weighted avg       0.89      0.89      0.89     14392



In [23]:
clf6_random

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features=1,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=30, min_samples_split=30,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [24]:
#Stampa importanza variabili nel trainig e test

# Bagging

In [12]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

If None, then the base estimator is a decision tree.

In [26]:
help(BaggingClassifier)

Help on class BaggingClassifier in module sklearn.ensemble._bagging:

class BaggingClassifier(sklearn.base.ClassifierMixin, BaseBagging)
 |  A Bagging classifier.
 |  
 |  A Bagging classifier is an ensemble meta-estimator that fits base
 |  classifiers each on random subsets of the original dataset and then
 |  aggregate their individual predictions (either by voting or by averaging)
 |  to form a final prediction. Such a meta-estimator can typically be used as
 |  a way to reduce the variance of a black-box estimator (e.g., a decision
 |  tree), by introducing randomization into its construction procedure and
 |  then making an ensemble out of it.
 |  
 |  This algorithm encompasses several works from the literature. When random
 |  subsets of the dataset are drawn as random subsets of the samples, then
 |  this algorithm is known as Pasting [1]_. If samples are drawn with
 |  replacement, then the method is known as Bagging [2]_. When random subsets
 |  of the dataset are drawn as r

In [17]:

%%time

clf13B = DecisionTreeClassifier(max_depth=1)
clf13B.fit(X_train, y_train)

y_pred = clf13B.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.45622568093385213
F1-score [0.50589275 0.39545782]
              precision    recall  f1-score   support

           0       1.00      0.34      0.51      5071
           1       0.25      1.00      0.40      1097

    accuracy                           0.46      6168
   macro avg       0.62      0.67      0.45      6168
weighted avg       0.87      0.46      0.49      6168

Wall time: 22.9 ms


In [25]:
%%time

#Decision Tree
clf7 = BaggingClassifier(base_estimator=None, n_estimators=18, random_state=0, n_jobs=-1)
clf7.fit(X_train, y_train)

y_pred = clf7.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.6003566796368353
F1-score [0.70276137 0.39030423]
              precision    recall  f1-score   support

           0       0.90      0.57      0.70      5071
           1       0.27      0.72      0.39      1097

    accuracy                           0.60      6168
   macro avg       0.59      0.65      0.55      6168
weighted avg       0.79      0.60      0.65      6168

Wall time: 177 ms


In [13]:
%%time

clf8B = SVC(C=1000)

clf8B.fit(X_train, y_train)

y_pred = clf8B.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.6796368352788587
F1-score [0.75802106 0.52613909]
              precision    recall  f1-score   support

           0       1.00      0.61      0.76      5071
           1       0.36      1.00      0.53      1097

    accuracy                           0.68      6168
   macro avg       0.68      0.81      0.64      6168
weighted avg       0.89      0.68      0.72      6168

Wall time: 6.2 s


In [15]:
%%time

clf8 = BaggingClassifier(base_estimator=SVC(C=1000), n_estimators=30, random_state=0, n_jobs=-1)

clf8.fit(X_train, y_train)

y_pred = clf8.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.7133592736705577
F1-score [0.78887031 0.55376073]
              precision    recall  f1-score   support

           0       1.00      0.65      0.79      5071
           1       0.38      1.00      0.55      1097

    accuracy                           0.71      6168
   macro avg       0.69      0.83      0.67      6168
weighted avg       0.89      0.71      0.75      6168

Wall time: 35.3 s


In [29]:
%%time

clf9 = BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=100), n_estimators=100, random_state=0, n_jobs=-1)
clf9.fit(X_train, y_train)

y_pred = clf9.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.4696822308690013
F1-score [0.56158692 0.32902564]
              precision    recall  f1-score   support

           0       0.88      0.41      0.56      5071
           1       0.21      0.73      0.33      1097

    accuracy                           0.47      6168
   macro avg       0.54      0.57      0.45      6168
weighted avg       0.76      0.47      0.52      6168

Wall time: 2min 5s


In [30]:
%%time

clf10 = BaggingClassifier(base_estimator=clf6_random, n_estimators=100, random_state=0, n_jobs=-1)
clf10.fit(X_train, y_train)

y_pred = clf10.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))
report = classification_report(y_test, y_pred, output_dict = True)
print("precision [1]:", report['1']['precision'])
print("recall [1]:", report['1']['recall'])

Accuracy 0.856355382619974
F1-score [0.90716681 0.68266476]
              precision    recall  f1-score   support

           0       0.97      0.85      0.91      5071
           1       0.56      0.87      0.68      1097

    accuracy                           0.86      6168
   macro avg       0.77      0.86      0.79      6168
weighted avg       0.90      0.86      0.87      6168

precision [1]: 0.5622418879056047
recall [1]: 0.8687329079307201
Wall time: 1min 47s


In [31]:
%%time

clf10_bis = BaggingClassifier(base_estimator=clf6_random_best, n_estimators=100, random_state=0, n_jobs=-1)
clf10_bis.fit(X_train, y_train)

y_pred = clf10_bis.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))
report = classification_report(y_test, y_pred, output_dict = True)
print("precision [1]:", report['1']['precision'])
print("recall [1]:", report['1']['recall'])

Accuracy 0.7417315175097277
F1-score [0.8143573  0.57576565]
              precision    recall  f1-score   support

           0       1.00      0.69      0.81      5071
           1       0.41      0.99      0.58      1097

    accuracy                           0.74      6168
   macro avg       0.70      0.84      0.70      6168
weighted avg       0.89      0.74      0.77      6168

precision [1]: 0.40669676448457487
recall [1]: 0.9854147675478578
Wall time: 2min 15s


In [32]:
%%time

clf11 = BaggingClassifier(base_estimator=clf6_grid, n_estimators=100, random_state=0, n_jobs=-1)
clf11.fit(X_train, y_train)

y_pred = clf11.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.8818093385214008
F1-score [0.92503856 0.72079663]
              precision    recall  f1-score   support

           0       0.97      0.89      0.93      5071
           1       0.62      0.86      0.72      1097

    accuracy                           0.88      6168
   macro avg       0.79      0.87      0.82      6168
weighted avg       0.91      0.88      0.89      6168

Wall time: 1min 31s


# Boosting

In [2]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


If None, then the base estimator is DecisionTreeClassifier(max_depth=1).

In [9]:
%%time

clf12 = AdaBoostClassifier(base_estimator=None, n_estimators=100, random_state=0)
clf12.fit(X_train, y_train)

y_pred = clf12.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.647697795071336
F1-score [0.74706088 0.41975968]
              precision    recall  f1-score   support

           0       0.91      0.63      0.75      5071
           1       0.30      0.72      0.42      1097

    accuracy                           0.65      6168
   macro avg       0.60      0.67      0.58      6168
weighted avg       0.80      0.65      0.69      6168

Wall time: 1.18 s


In [10]:

%%time

clf13B = DecisionTreeClassifier(max_depth=1)
clf13B.fit(X_train, y_train)

y_pred = clf13B.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.45622568093385213
F1-score [0.50589275 0.39545782]
              precision    recall  f1-score   support

           0       1.00      0.34      0.51      5071
           1       0.25      1.00      0.40      1097

    accuracy                           0.46      6168
   macro avg       0.62      0.67      0.45      6168
weighted avg       0.87      0.46      0.49      6168

Wall time: 23.9 ms


In [35]:
%%time

clf13 = AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=100), n_estimators=100, random_state=0)
clf13.fit(X_train, y_train)

y_pred = clf13.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.47649156939040205
F1-score [0.57541091 0.31748045]
              precision    recall  f1-score   support

           0       0.86      0.43      0.58      5071
           1       0.21      0.68      0.32      1097

    accuracy                           0.48      6168
   macro avg       0.54      0.56      0.45      6168
weighted avg       0.75      0.48      0.53      6168

Wall time: 4min 13s


# Le celle che seguono impiegano un bel po' di tempo a concludere

In [36]:
%%time

clf14 = AdaBoostClassifier(base_estimator=clf6_random, n_estimators=100, random_state=0)
clf14.fit(X_train, y_train)

y_pred = clf14.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.4803826199740597
F1-score [0.57215325 0.33849329]
              precision    recall  f1-score   support

           0       0.89      0.42      0.57      5071
           1       0.22      0.75      0.34      1097

    accuracy                           0.48      6168
   macro avg       0.55      0.59      0.46      6168
weighted avg       0.77      0.48      0.53      6168

Wall time: 2min 7s


In [37]:
%%time
# clf6_grid aveva Accuracy 0.9944876783398184

clf15 = AdaBoostClassifier(base_estimator=clf6_grid, n_estimators=100, random_state=0)
clf15.fit(X_train, y_train)

y_pred = clf15.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.5299935149156939
F1-score [0.62550058 0.36909684]
              precision    recall  f1-score   support

           0       0.91      0.48      0.63      5071
           1       0.24      0.77      0.37      1097

    accuracy                           0.53      6168
   macro avg       0.57      0.63      0.50      6168
weighted avg       0.79      0.53      0.58      6168

Wall time: 1min 44s


In [38]:
#provare a:
# - utilizzare un altro classificatore (Knn, naive, SVM) OK
# - ciclo per vedere performance al variare di n_estimators
# - predict sul training per vedere se fa overfitting (clf6_grif e clf15)

In [39]:
%%time
# utilizzo un altro classificatore come base classifier

from sklearn.neighbors import KNeighborsClassifier
#knn base estimator
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=45, weights='distance')

clf14_bis = AdaBoostClassifier(base_estimator=knn, n_estimators=100, random_state=0)
clf14_bis.fit(X_train, y_train)

y_pred = clf14_bis.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

ValueError: KNeighborsClassifier doesn't support sample_weight.

In [40]:
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None,
                           n_jobs=-1, n_neighbors=45, weights = 'uniform')
clf16_bis = BaggingClassifier(base_estimator=knn, n_estimators=10, random_state=0, n_jobs=-1)
clf16_bis.fit(X_train, y_train)

y_pred = clf16_bis.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.5976005188067445
F1-score [0.67597911 0.46920445]
              precision    recall  f1-score   support

           0       1.00      0.51      0.68      5071
           1       0.31      1.00      0.47      1097

    accuracy                           0.60      6168
   macro avg       0.65      0.76      0.57      6168
weighted avg       0.88      0.60      0.64      6168



In [41]:
help(KNeighborsClassifier)

Help on class KNeighborsClassifier in module sklearn.neighbors._classification:

class KNeighborsClassifier(sklearn.neighbors._base.NeighborsBase, sklearn.neighbors._base.KNeighborsMixin, sklearn.neighbors._base.SupervisedIntegerMixin, sklearn.base.ClassifierMixin)
 |  Classifier implementing the k-nearest neighbors vote.
 |  
 |  Read more in the :ref:`User Guide <classification>`.
 |  
 |  Parameters
 |  ----------
 |  n_neighbors : int, optional (default = 5)
 |      Number of neighbors to use by default for :meth:`kneighbors` queries.
 |  
 |  weights : str or callable, optional (default = 'uniform')
 |      weight function used in prediction.  Possible values:
 |  
 |      - 'uniform' : uniform weights.  All points in each neighborhood
 |        are weighted equally.
 |      - 'distance' : weight points by the inverse of their distance.
 |        in this case, closer neighbors of a query point will have a
 |        greater influence than neighbors which are further away.
 |      -

In [42]:
%%time
# utilizzo un altro classificatore come base classifier
from sklearn.naive_bayes import GaussianNB

#naive base estimator
naive = GaussianNB()

clf14_tris = AdaBoostClassifier(base_estimator=naive, n_estimators=100, random_state=0)
clf14_tris.fit(X_train, y_train)

y_pred = clf14_tris.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.8328469520103762
F1-score [0.90209857 0.42880886]
              precision    recall  f1-score   support

           0       0.87      0.94      0.90      5071
           1       0.55      0.35      0.43      1097

    accuracy                           0.83      6168
   macro avg       0.71      0.64      0.67      6168
weighted avg       0.81      0.83      0.82      6168

Wall time: 1.88 s


In [43]:
%%time
#BAGGING n_estimators = 1000
#Nota: Questi risultati sono uguali a quelli di clf6_random con n_estimators=100

clf16 = BaggingClassifier(base_estimator=clf6_random, n_estimators=1000, random_state=0, n_jobs=-1)
clf16.fit(X_train, y_train)

y_pred = clf16.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

# clf6_random aveva Accuracy 0.995136186770428

Accuracy 0.856355382619974
F1-score [0.90716681 0.68266476]
              precision    recall  f1-score   support

           0       0.97      0.85      0.91      5071
           1       0.56      0.87      0.68      1097

    accuracy                           0.86      6168
   macro avg       0.77      0.86      0.79      6168
weighted avg       0.90      0.86      0.87      6168

Wall time: 5min 46s


In [44]:
#check overfitting
y_pred_train = clf16.predict(X_train)

print('Accuracy %s' % accuracy_score(y_train, y_pred_train))
print('F1-score %s' % f1_score(y_train, y_pred_train, average=None))
print(classification_report(y_train, y_pred_train))

Accuracy 0.8937604224569206
F1-score [0.92809105 0.79670257]
              precision    recall  f1-score   support

           0       0.94      0.92      0.93     10739
           1       0.77      0.82      0.80      3653

    accuracy                           0.89     14392
   macro avg       0.86      0.87      0.86     14392
weighted avg       0.90      0.89      0.89     14392



In [45]:
%%time
#BAGGING n_estimators = 1000
#Nota: Questi risultati sono uguali a quelli di clf6_grid :(

clf17 = BaggingClassifier(base_estimator=clf6_grid, n_estimators=1000, random_state=0, n_jobs=-1)
clf17.fit(X_train, y_train)

y_pred = clf17.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

# clf6_grid aveva Accuracy 0.9944876783398184

Accuracy 0.8818093385214008
F1-score [0.92503856 0.72079663]
              precision    recall  f1-score   support

           0       0.97      0.89      0.93      5071
           1       0.62      0.86      0.72      1097

    accuracy                           0.88      6168
   macro avg       0.79      0.87      0.82      6168
weighted avg       0.91      0.88      0.89      6168

Wall time: 5min 31s


In [46]:
%%time
#BAGGING n_estimators = 1000, random_state = 1
#Non cambia nulla con la cella sopra :(

clf18 = BaggingClassifier(base_estimator=clf6_grid, n_estimators=1000, random_state=1, n_jobs=-1)
clf18.fit(X_train, y_train)

y_pred = clf18.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.8818093385214008
F1-score [0.92503856 0.72079663]
              precision    recall  f1-score   support

           0       0.97      0.89      0.93      5071
           1       0.62      0.86      0.72      1097

    accuracy                           0.88      6168
   macro avg       0.79      0.87      0.82      6168
weighted avg       0.91      0.88      0.89      6168

Wall time: 5min 52s


In [47]:
%%time
#BAGGING n_estimators = 10000, random_state = 10
#Da confrontare con la cella sopra!

#Voglio rispondermi alle seguenti domande che scrivo qua altrimenti non so perche' sto facendo le cose:
#Ci mette 10 volte il tempo che ci ha messo la cella sopra?
#Cambia Accuracy 0.9944876783398184?

#Non e' cambiato nulla e ci ha messo 2 ore
clf19 = BaggingClassifier(base_estimator=clf6_grid, n_estimators=10000, random_state=10, n_jobs=-1)
clf19.fit(X_train, y_train)

y_pred = clf19.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))



Accuracy 0.881485084306096
F1-score [0.92484836 0.71981602]
              precision    recall  f1-score   support

           0       0.97      0.89      0.92      5071
           1       0.62      0.86      0.72      1097

    accuracy                           0.88      6168
   macro avg       0.79      0.87      0.82      6168
weighted avg       0.90      0.88      0.89      6168

Wall time: 53min 5s
