## Import Modules

In [1]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

#sklearn
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef


plt.style.use('fivethirtyeight')
import seaborn as sns
# Use seaborn style defaults and set the default figure size
sns.set_theme(style='whitegrid', font='Arial', rc={'figure.figsize':(10,5),
            'font.size':14,
            'axes.titlesize':16,
            'axes.labelsize':15,
            'xtick.labelsize': 12,
            'ytick.labelsize': 12,
            'legend.fontsize': 13},color_codes=True)

# CSV
import csv

In [2]:
# room number
a = 'E07'
# number of lags as input (not relevant in this case)
#b = '5'
# number of last timestep to predict
c = 12

In [6]:
# d = number of timestep to predict
for d in range(0, c+1):
    
    if d==0:
        # Timestep as string
        e = 't'
    else:
        # Timestep as string
        e = 't+'+str(d)
        
    
    # Import data
    
    # Read in data and set index

    raw_data = pd.read_csv("\Pre-Processing\data_E07_input_5_output_144.txt", parse_dates=True)
    data = raw_data.copy()
    data['DateTime'] = pd.to_datetime(data['DateTime'])
    data = data.set_index('DateTime')
    
    # Drop columns for Year and Second
    data = data.drop('Second_0', axis = 1)
    
    # Saving data names for later use
    data_list = list(data.columns)
    
    # Drop columns of future timestamps that should not be used as input for this model
    if d==0:
        for i in range(1,145):
            v = 't+'+str(i)
            data = data.drop(v, axis = 1)
    else:
        for i in range(d+1,145):
            v = 't+'+str(i)
            data = data.drop(v, axis = 1)

        for i in range(1, d):
            v = 't+'+str(i)
            data = data.drop(v, axis = 1)
            
            
    # parameter that only should be used as input
    if d==0:
        l = ['E07CO2', 'E07Tair', 'E07ElL', 'E07SP', 'E07W', 'E07WT', 't-1', 't-2', 't-3', 't-4','t-5']
    else:
        l = ['E07CO2', 'E07Tair', 'E07ElL', 'E07SP', 'E07W', 'E07WT', 't', 't-1', 't-2', 't-3', 't-4','t-5']
        
    for f in l:
        
        
        # Create data set / dummy variables
        
        # keep only date parameter, one additional parameter and the timestep to predict
        to_keep = [i for i in data_list[6:86]]
        to_keep.append(f)
        to_keep.append(e)
        
        data_final=data[to_keep]
        data_final.columns.values
        
        # Create X and y
        X = data_final.loc[:, data_final.columns != e]
        y = data_final.loc[:, data_final.columns == e]
        
        # format y to a 1D array
        y = y.values.ravel()
        
        # format X when more than one parameter are used as input
        sc_x = StandardScaler()
        X = sc_x.fit_transform(X)
        X = pd.DataFrame(X)
        
        
        # Logistic Regression Model Fitting
        
        # Split data into training and testing sets (75 % training, 25 % testing)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False, random_state=42)
        
#         # Split testing set into testing and validation sets (15 % training, 15 % validation of the whole data set --> 50/50 % of 30 %)
#         X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
        
        # Print shapes of data sets
        print('Training Data Shape:', X_train.shape)
        print('Training Labels Shape:', y_train.shape)
        print('Testing Data Shape:', X_test.shape)
        print('Testing Labels Shape:', y_test.shape)
#         print('Validation Data Shape:', X_val.shape)
#         print('Validation Labels Shape:', y_val.shape)
        
        # Create and fit model
        logreg = LogisticRegression(max_iter=1000)
        logreg.fit(X_train, y_train)
        
        # accuracy on the training set
        accuracy_train = logreg.score(X_train, y_train)
        accuracy_train_percent = round(accuracy_train*100, 3)

        # Make predictions on the test set
        y_pred = logreg.predict(X_test)
        print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
        
        # accuracy on the test set
        accuracy_test = logreg.score(X_test, y_test)
        accuracy_test_percent = round(accuracy_test*100, 3)
        print(accuracy_test)
        
#         # Make predictions on the validation set
#         y_pred_val = logreg.predict(X_val)
#         print('Accuracy of logistic regression classifier on validation set: {:.2f}'.format(logreg.score(X_val, y_val)))
        
#         # accuracy on the validation set
#         accuracy_val = logreg.score(X_val, y_val)
#         accuracy_val_percent = round(accuracy_val*100, 3)
#         print(accuracy_val)
        
        
        # Analysis of results
        
        # Confusion Matrix
        conf_matrix = confusion_matrix(y_test, y_pred)
        print(conf_matrix)
        # check if not only 0s are predicted
        count_no_occ_pred = np.count_nonzero(y_pred == 0)
        print('Number of 0 predicted: ', count_no_occ_pred)
        count_occ_pred = np.count_nonzero(y_pred == 1)
        print('Number of 1 predicted: ', count_occ_pred)
        
        # Classification Report
        print(classification_report(y_test, y_pred))
        
        # Precision
        precision = precision_score(y_test, y_pred)
        
        # Recall
        recall = recall_score(y_test, y_pred)

        # F1 Score
        f1 = f1_score(y_test, y_pred)

        # Matthews Correlation Coefficient
        mcc = matthews_corrcoef(y_test, y_pred)
        
        # ROC
        logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))
        fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
        plt.figure()
        plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
        plt.plot([0,1], [0,1], 'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic - '+e)
        plt.legend(loc="lower right")
        plt.savefig('Logistic_Regression_ROC_'+a+'_one_parameter_and_date_'+f+'_forecast_'+e+'.pdf', bbox_inches='tight', dpi=100)
        plt.close()
        
        
        # Save results
        
        if d==0 and f=='E07CO2':
            # Creating csv file with results for model with all variables
            with open('Logistic_Regression_results_'+a+'_one_parameter_and_date.csv', 'w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(['Predicted Timestep','Input', 'Accuracy - Training Set', 'Accuracy - Test Set', 'Precision', 'Recall', 'F1-Score', 'Matthews Correlation Coefficient'])
                writer.writerow([e, f, accuracy_train, accuracy_test, precision, recall,  f1, mcc ])
                
        else:
            # Appending results to existing csv file
            with open('Logistic_Regression_results_'+a+'_one_parameter_and_date.csv', 'a', newline='') as file:
                writer = csv.writer(file)
                writer.writerow([e, f, accuracy_train, accuracy_test, precision, recall,  f1, mcc ])
               

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.88
0.8754716981132076
[[18606  1714]
 [ 1322  2738]]
Number of 0 predicted:  19928
Number of 1 predicted:  4452
              precision    recall  f1-score   support

         0.0       0.93      0.92      0.92     20320
         1.0       0.62      0.67      0.64      4060

    accuracy                           0.88     24380
   macro avg       0.77      0.80      0.78     24380
weighted avg       0.88      0.88      0.88     24380

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.90
0.8951189499589828
[[19024  1296]
 [ 1261  2799]]
Number of 0 predicted:  20285
Number of 1 predicted:  4095
              precision    recall  f1-score   support

         0.0       0.9

Accuracy of logistic regression classifier on test set: 0.88
0.8751845775225595
[[18448  1871]
 [ 1172  2889]]
Number of 0 predicted:  19620
Number of 1 predicted:  4760
              precision    recall  f1-score   support

         0.0       0.94      0.91      0.92     20319
         1.0       0.61      0.71      0.66      4061

    accuracy                           0.88     24380
   macro avg       0.77      0.81      0.79     24380
weighted avg       0.88      0.88      0.88     24380

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.88
0.8772764561115669
[[18603  1716]
 [ 1276  2785]]
Number of 0 predicted:  19879
Number of 1 predicted:  4501
              precision    recall  f1-score   support

         0.0       0.94      0.92      0.93     20319
         1.0       0.62      0.69      0.65      4061

    accuracy                           0.

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.90
0.9000820344544709
[[19074  1244]
 [ 1192  2870]]
Number of 0 predicted:  20266
Number of 1 predicted:  4114
              precision    recall  f1-score   support

         0.0       0.94      0.94      0.94     20318
         1.0       0.70      0.71      0.70      4062

    accuracy                           0.90     24380
   macro avg       0.82      0.82      0.82     24380
weighted avg       0.90      0.90      0.90     24380

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.88
0.8779327317473339
[[18532  1786]
 [ 1190  2872]]
Number of 0 predicted:  19722
Number of 1 predicted:  4658
              precision    recall  f1-score   support

         0.0       0.9

Accuracy of logistic regression classifier on test set: 0.88
0.8780557834290402
[[18584  1733]
 [ 1240  2823]]
Number of 0 predicted:  19824
Number of 1 predicted:  4556
              precision    recall  f1-score   support

         0.0       0.94      0.91      0.93     20317
         1.0       0.62      0.69      0.66      4063

    accuracy                           0.88     24380
   macro avg       0.78      0.80      0.79     24380
weighted avg       0.88      0.88      0.88     24380

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.93
0.9339212469237079
[[19526   791]
 [  820  3243]]
Number of 0 predicted:  20346
Number of 1 predicted:  4034
              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96     20317
         1.0       0.80      0.80      0.80      4063

    accuracy                           0.

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.92
0.9152173913043479
[[19304  1012]
 [ 1055  3009]]
Number of 0 predicted:  20359
Number of 1 predicted:  4021
              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95     20316
         1.0       0.75      0.74      0.74      4064

    accuracy                           0.92     24380
   macro avg       0.85      0.85      0.85     24380
weighted avg       0.91      0.92      0.92     24380

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.91
0.9110336341263331
[[19218  1098]
 [ 1071  2993]]
Number of 0 predicted:  20289
Number of 1 predicted:  4091
              precision    recall  f1-score   support

         0.0       0.9

Accuracy of logistic regression classifier on test set: 0.91
0.9068498769483183
[[19151  1164]
 [ 1107  2958]]
Number of 0 predicted:  20258
Number of 1 predicted:  4122
              precision    recall  f1-score   support

         0.0       0.95      0.94      0.94     20315
         1.0       0.72      0.73      0.72      4065

    accuracy                           0.91     24380
   macro avg       0.83      0.84      0.83     24380
weighted avg       0.91      0.91      0.91     24380

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.90
0.9048400328137818
[[19071  1244]
 [ 1076  2989]]
Number of 0 predicted:  20147
Number of 1 predicted:  4233
              precision    recall  f1-score   support

         0.0       0.95      0.94      0.94     20315
         1.0       0.71      0.74      0.72      4065

    accuracy                           0.

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.90
0.9009023789991797
[[18946  1368]
 [ 1048  3018]]
Number of 0 predicted:  19994
Number of 1 predicted:  4386
              precision    recall  f1-score   support

         0.0       0.95      0.93      0.94     20314
         1.0       0.69      0.74      0.71      4066

    accuracy                           0.90     24380
   macro avg       0.82      0.84      0.83     24380
weighted avg       0.90      0.90      0.90     24380

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.90
0.897949138638228
[[18923  1391]
 [ 1097  2969]]
Number of 0 predicted:  20020
Number of 1 predicted:  4360
              precision    recall  f1-score   support

         0.0       0.95

Accuracy of logistic regression classifier on test set: 0.90
0.8973338802296965
[[18893  1420]
 [ 1083  2984]]
Number of 0 predicted:  19976
Number of 1 predicted:  4404
              precision    recall  f1-score   support

         0.0       0.95      0.93      0.94     20313
         1.0       0.68      0.73      0.70      4067

    accuracy                           0.90     24380
   macro avg       0.81      0.83      0.82     24380
weighted avg       0.90      0.90      0.90     24380

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.88
0.8799835931091058
[[18614  1698]
 [ 1228  2840]]
Number of 0 predicted:  19842
Number of 1 predicted:  4538
              precision    recall  f1-score   support

         0.0       0.94      0.92      0.93     20312
         1.0       0.63      0.70      0.66      4068

    accuracy                           0.

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.88
0.883921246923708
[[18749  1562]
 [ 1268  2801]]
Number of 0 predicted:  20017
Number of 1 predicted:  4363
              precision    recall  f1-score   support

         0.0       0.94      0.92      0.93     20311
         1.0       0.64      0.69      0.66      4069

    accuracy                           0.88     24380
   macro avg       0.79      0.81      0.80     24380
weighted avg       0.89      0.88      0.89     24380

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.88
0.8780557834290402
[[18560  1751]
 [ 1222  2847]]
Number of 0 predicted:  19782
Number of 1 predicted:  4598
              precision    recall  f1-score   support

         0.0       0.94

Accuracy of logistic regression classifier on test set: 0.87
0.8749384741591468
[[18530  1780]
 [ 1269  2801]]
Number of 0 predicted:  19799
Number of 1 predicted:  4581
              precision    recall  f1-score   support

         0.0       0.94      0.91      0.92     20310
         1.0       0.61      0.69      0.65      4070

    accuracy                           0.87     24380
   macro avg       0.77      0.80      0.79     24380
weighted avg       0.88      0.87      0.88     24380

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.88
0.8782198523379819
[[18713  1597]
 [ 1372  2698]]
Number of 0 predicted:  20085
Number of 1 predicted:  4295
              precision    recall  f1-score   support

         0.0       0.93      0.92      0.93     20310
         1.0       0.63      0.66      0.65      4070

    accuracy                           0.

              precision    recall  f1-score   support

         0.0       0.93      0.92      0.92     20309
         1.0       0.62      0.66      0.64      4071

    accuracy                           0.88     24380
   macro avg       0.77      0.79      0.78     24380
weighted avg       0.88      0.88      0.88     24380

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.88
0.8826907301066448
[[18833  1476]
 [ 1384  2687]]
Number of 0 predicted:  20217
Number of 1 predicted:  4163
              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93     20309
         1.0       0.65      0.66      0.65      4071

    accuracy                           0.88     24380
   macro avg       0.79      0.79      0.79     24380
weighted avg       0.88      0.88      0.88     24380

Training Data Shape: (73137, 81)
Training Labels

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.87
0.8730106644790813
[[18498  1810]
 [ 1286  2786]]
Number of 0 predicted:  19784
Number of 1 predicted:  4596
              precision    recall  f1-score   support

         0.0       0.93      0.91      0.92     20308
         1.0       0.61      0.68      0.64      4072

    accuracy                           0.87     24380
   macro avg       0.77      0.80      0.78     24380
weighted avg       0.88      0.87      0.88     24380

Training Data Shape: (73137, 81)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 81)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.89
0.8947087776866284
[[18890  1418]
 [ 1149  2923]]
Number of 0 predicted:  20039
Number of 1 predicted:  4341
              precision    recall  f1-score   support

         0.0       0.9