## Import Modules

In [18]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
# Use seaborn style defaults and set the default figure size
sns.set_theme(style='whitegrid', font='Arial', rc={'figure.figsize':(10,5),
            'font.size':14,
            'axes.titlesize':16,
            'axes.labelsize':15,
            'xtick.labelsize': 12,
            'ytick.labelsize': 12,
            'legend.fontsize': 13},color_codes=True)

#sklearn
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef


import seaborn as sns

# CSV
import csv

In [19]:
# room number
a = 'E07'
# number of lags as input
b = '5'
# number of last timestep to predict
c = 36

In [20]:
# d = number of timestep to predict
for d in range(1,c+1):
    
    if d==0:
        # Timestep as string
        e = 't'
    else:
        # Timestep as string
        e = 't+'+str(d)
    
    
    # Import data
    
    # Read in data and set index
    raw_data = pd.read_csv("\Pre-Processing\data_E07_input_5_output_144.txt", parse_dates=True)
    data = raw_data.copy()
    data['DateTime'] = pd.to_datetime(data['DateTime'])
    data = data.set_index('DateTime')

    # Drop columns for Year and Second
    data = data.drop('Second_0', axis = 1)
    
    # Drop columns of future timestamps that should not be used as input for this model
    if d==0:
        for i in range(1,145):
            v = 't+'+str(i)
            data = data.drop(v, axis = 1)
    else:
        for i in range(d+1,145):
            v = 't+'+str(i)
            data = data.drop(v, axis = 1)

        for i in range(1, d):
            v = 't+'+str(i)
            data = data.drop(v, axis = 1)
        
    
    # Create Data Set / Dummy Variables
    
    # Saving data names for later use
    data_list = list(data.columns)
    
    # Create X and y
    X = data.loc[:, data.columns != e]
    y = data.loc[:, data.columns == e]
    
    # format y to a 1D array
    y = y.values.ravel()
      
    # format X when more than one parameter is used as input
    sc_x = StandardScaler()
    X = sc_x.fit_transform(X)
    X = pd.DataFrame(X)
    
    
    # Logistic Regression Model Fitting
    
    # Split data into training and testing sets using scikit-learn (75 % training, 25 % testing)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False, random_state=42)
    
    # Split testing set into testing and validation sets 
    # 15 % training, 15 % validation of the whole data set --> 50/50 % of 30 %
    #X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, shuffle=False, random_state=42)
    
    # Print shapes of data sets
    print('Training Data Shape:', X_train.shape)
    print('Training Labels Shape:', y_train.shape)
    print('Testing Data Shape:', X_test.shape)
    print('Testing Labels Shape:', y_test.shape)
#     print('Validation Data Shape:', X_val.shape)
#     print('Validation Labels Shape:', y_val.shape)
    
    # Create and fit model
    logreg = LogisticRegression(max_iter=1000)
    logreg.fit(X_train, y_train)
    
    # Accuracy on the training set
    accuracy_train = logreg.score(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = logreg.predict(X_test)
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
    
    # Accuracy on the test set
    accuracy_test = logreg.score(X_test, y_test)
    print(accuracy_test)
    
#     # Make predictions on the validation set
#     y_pred_val = logreg.predict(X_val)
#     print('Accuracy of logistic regression classifier on validation set: {:.2f}'.format(logreg.score(X_val, y_val)))
    
#     # Accuracy on the validation set
#     accuracy_val = logreg.score(X_val, y_val)
#     print(accuracy_val)
    
    
    # Analysis of results
    
    # Confusion Matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(conf_matrix)
    # check if not only 0s are predicted
    count_no_occ_pred = np.count_nonzero(y_pred == 0)
    print('Number of 0 predicted: ', count_no_occ_pred)
    count_occ_pred = np.count_nonzero(y_pred == 1)
    print('Number of 1 predicted: ', count_occ_pred)
    
    # Classification Report
    print(classification_report(y_test, y_pred))
    
    # Precision
    precision = precision_score(y_test, y_pred)

    # Recall
    recall = recall_score(y_test, y_pred)

    # F1 Score
    f1 = f1_score(y_test, y_pred)

    # Matthews Correlation Coefficient
    mcc = matthews_corrcoef(y_test, y_pred)
    
    #ROC
    logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))
    fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
    plt.figure()
    plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
    plt.plot([0,1], [0,1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic - '+e)
    plt.legend(loc="lower right")
    plt.savefig('Logistic_Regression_ROC_'+a+'_all_variables_input_'+b+'_forecast_'+e+'.pdf', bbox_inches='tight', dpi=100)
    plt.savefig('Logistic_Regression_ROC_'+a+'_all_variables_input_'+b+'_forecast_'+e+'.png', bbox_inches='tight', dpi=100)
    plt.close()
    
    
    # Save results
    
    if d==1:
        # Creating csv file with results for model with all variables
        with open('Logistic_Regression_results_'+a+'_input_'+b+'_all_variables.csv', 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Predicted Timestep', 'Accuracy - Training Set', 'Accuracy - Test Set', 'Precision', 'Recall', 'F1-Score', 'Matthews Correlation Coefficient'])
            writer.writerow([e, accuracy_train, accuracy_test, precision, recall, f1, mcc])
    else:
        # Appending results to existing csv file
        with open('Logistic_Regression_results_'+a+'_input_'+b+'_all_variables.csv', 'a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([e, accuracy_train, accuracy_test, precision,  recall, f1, mcc])

Training Data Shape: (73137, 92)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 92)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.97
0.9714520098441345
[[19982   337]
 [  359  3702]]
Number of 0 predicted:  20341
Number of 1 predicted:  4039
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98     20319
         1.0       0.92      0.91      0.91      4061

    accuracy                           0.97     24380
   macro avg       0.95      0.95      0.95     24380
weighted avg       0.97      0.97      0.97     24380

Training Data Shape: (73137, 92)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 92)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.95
0.9502050861361772
[[19715   603]
 [  611  3451]]
Number of 0 predicted:  20326
Number of 1 predicted:  4054
              precision    recall  f1-score   support

         0.0       0.9

Accuracy of logistic regression classifier on test set: 0.90
0.9023789991796555
[[19003  1303]
 [ 1077  2997]]
Number of 0 predicted:  20080
Number of 1 predicted:  4300
              precision    recall  f1-score   support

         0.0       0.95      0.94      0.94     20306
         1.0       0.70      0.74      0.72      4074

    accuracy                           0.90     24380
   macro avg       0.82      0.84      0.83     24380
weighted avg       0.90      0.90      0.90     24380

Training Data Shape: (73137, 92)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 92)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.90
0.8996718621821165
[[19025  1280]
 [ 1166  2909]]
Number of 0 predicted:  20191
Number of 1 predicted:  4189
              precision    recall  f1-score   support

         0.0       0.94      0.94      0.94     20305
         1.0       0.69      0.71      0.70      4075

    accuracy                           0.

Training Data Shape: (73137, 92)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 92)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.89
0.8906070549630845
[[18897  1400]
 [ 1267  2816]]
Number of 0 predicted:  20164
Number of 1 predicted:  4216
              precision    recall  f1-score   support

         0.0       0.94      0.93      0.93     20297
         1.0       0.67      0.69      0.68      4083

    accuracy                           0.89     24380
   macro avg       0.80      0.81      0.81     24380
weighted avg       0.89      0.89      0.89     24380

Training Data Shape: (73137, 92)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 92)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.89
0.8887612797374898
[[18842  1455]
 [ 1257  2826]]
Number of 0 predicted:  20099
Number of 1 predicted:  4281
              precision    recall  f1-score   support

         0.0       0.9