# Gather Packages

In [139]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import joblib

# Gather Data

In [2]:
actual_calgary = pd.read_parquet('actual_511880780620.parquet.gzip')

# Data Preparation

In [126]:
variables = ['Temperature', 'Temperature_ctrl', 'HvacMode']
df = actual_calgary.copy()
df = df[variables]
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

df['HvacMode'] = df['HvacMode'].apply(lambda x: 1 if x == 'heat' else 0)

X = df.drop(columns='HvacMode')
y = df['HvacMode']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model Training

In [143]:
lr = LogisticRegression(C=0.1)
lr.fit(X_train, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [144]:
rf = RandomForestClassifier(max_depth=2, n_estimators=50, n_jobs=-1)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [146]:
gb = GradientBoostingClassifier(max_depth=2, n_estimators=50, learning_rate=0.1)
gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=2,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=50,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

# Helper Function

In [155]:
def PrintMetrics(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print('Clasification Report:\n')
    print(classification_report(y_test, y_pred))
    print('Confusion Matrix:\n')
    print(confusion_matrix(y_test, y_pred))
    return 

# Model Evaluation

In [156]:
PrintMetrics(lr, X_test, y_test)

Clasification Report:

              precision    recall  f1-score   support

           0       0.71      0.64      0.68      2850
           1       0.89      0.91      0.90      8624

    accuracy                           0.85     11474
   macro avg       0.80      0.78      0.79     11474
weighted avg       0.84      0.85      0.84     11474

Confusion Matrix:

[[1834 1016]
 [ 743 7881]]


In [157]:
PrintMetrics(rf, X_test, y_test)

Clasification Report:

              precision    recall  f1-score   support

           0       0.69      0.74      0.71      2850
           1       0.91      0.89      0.90      8624

    accuracy                           0.85     11474
   macro avg       0.80      0.81      0.81     11474
weighted avg       0.85      0.85      0.85     11474

Confusion Matrix:

[[2101  749]
 [ 963 7661]]


In [158]:
PrintMetrics(gb, X_test, y_test)

Clasification Report:

              precision    recall  f1-score   support

           0       0.69      0.81      0.75      2850
           1       0.93      0.88      0.91      8624

    accuracy                           0.86     11474
   macro avg       0.81      0.85      0.83     11474
weighted avg       0.87      0.86      0.87     11474

Confusion Matrix:

[[2322  528]
 [1048 7576]]


# Model Pickling

In [159]:
joblib.dump(gb, 'XGBoost.pkl')

['XGBoost.pkl']