## Import Modules

In [10]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

#sklearn
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef

plt.style.use('fivethirtyeight')
import seaborn as sns
# Use seaborn style defaults and set the default figure size
sns.set_theme(style='whitegrid', font='Arial', rc={'figure.figsize':(10,5),
            'font.size':14,
            'axes.titlesize':16,
            'axes.labelsize':15,
            'xtick.labelsize': 12,
            'ytick.labelsize': 12,
            'legend.fontsize': 13},color_codes=True)

# CSV
import csv

In [11]:
# room number
a = 'E07'
# number of lags as input
b = '5'
# number of last timestep to predict
c = 12

In [12]:
# d = number of timestep to predict
for d in range(1, c+1):
    
    if d==0:
        # Timestep as string
        e = 't'
    else:
        # Timestep as string
        e = 't+'+str(d)
    
    
    # Import data
    
    # Read in data and set index

    raw_data = data = pd.read_csv("\Pre-Processing\data_E07_input_5_output_144.txt", parse_dates=True)
    data = raw_data.copy()
    data['DateTime'] = pd.to_datetime(data['DateTime'])
    data = data.set_index('DateTime')
    
    # Drop columns for Year and Second
    data = data.drop('Second_0', axis = 1)
    
    # Saving data names for later use
    data_list = list(data.columns)
    
    # Drop columns of future timestamps that should not be used as input for this model
    if d==0:
        for i in range(1,145):
            v = 't+'+str(i)
            data = data.drop(v, axis = 1)
    else:
        for i in range(d+1,145):
            v = 't+'+str(i)
            data = data.drop(v, axis = 1)

        for i in range(1, d):
            v = 't+'+str(i)
            data = data.drop(v, axis = 1)
            
            
    # parameter that only should be used as input
    for f in ['E07CO2', 'E07Tair', 'E07ElL', 'E07SP', 'E07W', 'E07WT']:
        
        
        # Create data set / dummy variables
        
        # keep only date parameter, lags, one additional parameter and the timestep to predict
        if d==0:
            to_keep = [i for i in data_list[6:91]]
            to_keep.append(f)
            to_keep.append(e)
        else:
            to_keep = [i for i in data_list[6:92]]
            to_keep.append(f)
            to_keep.append(e)
            
        data_final=data[to_keep]
        data_final.columns.values
        
        # Create X and y
        X = data_final.loc[:, data_final.columns != e]
        y = data_final.loc[:, data_final.columns == e]
        
        # format y to a 1D array
        y = y.values.ravel()
        
        # format X when more than one parameter are used as input
        sc_x = StandardScaler()
        X = sc_x.fit_transform(X)
        X = pd.DataFrame(X)
        
        
        # Logistic Regression Model Fitting
        
        # Split data into training and testing sets (75 % training, 25 % testing)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False, random_state=42)
        
#         # Split testing set into testing and validation sets (15 % training, 15 % validation of the whole data set --> 50/50 % of 30 %)
#         X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
        
        # Print shapes of data sets
        print('Training Data Shape:', X_train.shape)
        print('Training Labels Shape:', y_train.shape)
        print('Testing Data Shape:', X_test.shape)
        print('Testing Labels Shape:', y_test.shape)
#         print('Validation Data Shape:', X_val.shape)
#         print('Validation Labels Shape:', y_val.shape)
        
        # Create and fit model
        logreg = LogisticRegression(max_iter=1000)
        logreg.fit(X_train, y_train)
        
        # accuracy on the training set
        accuracy_train = logreg.score(X_train, y_train)
        accuracy_train_percent = round(accuracy_train*100, 3)

        # Make predictions on the test set
        y_pred = logreg.predict(X_test)
        print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
        
        # accuracy on the test set
        accuracy_test = logreg.score(X_test, y_test)
        accuracy_test_percent = round(accuracy_test*100, 3)
        print(accuracy_test)
        
#         # Make predictions on the validation set
#         y_pred_val = logreg.predict(X_val)
#         print('Accuracy of logistic regression classifier on validation set: {:.2f}'.format(logreg.score(X_val, y_val)))
        
#         # accuracy on the validation set
#         accuracy_val = logreg.score(X_val, y_val)
#         accuracy_val_percent = round(accuracy_val*100, 3)
#         print(accuracy_val)
        
        
        # Analysis of results
        
        # Confusion Matrix
        conf_matrix = confusion_matrix(y_test, y_pred)
        print(conf_matrix)
        # check if not only 0s are predicted
        count_no_occ_pred = np.count_nonzero(y_pred == 0)
        print('Number of 0 predicted: ', count_no_occ_pred)
        count_occ_pred = np.count_nonzero(y_pred == 1)
        print('Number of 1 predicted: ', count_occ_pred)
        
        # Classification Report
        print(classification_report(y_test, y_pred))
        
        # Precision
        precision = precision_score(y_test, y_pred)
        precision_percent = round((precision*100),3)
        
        # Recall
        recall = recall_score(y_test, y_pred)
        recall_percent = round((recall*100),3)

        # F1 Score
        f1 = f1_score(y_test, y_pred)
        f1_percent = round((f1*100),3)

        # Matthews Correlation Coefficient
        mcc = matthews_corrcoef(y_test, y_pred)
        
        # ROC
        logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))
        fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
        plt.figure()
        plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
        plt.plot([0,1], [0,1], 'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic - '+e)
        plt.legend(loc="lower right")
        plt.savefig('Logistic_Regression_ROC_'+a+'_one_parameter_and_date_and_lags_'+f+'_forecast_'+e+'.pdf', bbox_inches='tight', dpi=100)
        plt.close()
        
        
        # Save results
        
        if d==1 and f=='E07CO2':
            # Creating csv file with results for model with all variables
            with open('Logistic_Regression_results_'+a+'_one_parameter_and_date_and_lags.csv', 'w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(['Predicted Timestep','Input', 'Accuracy - Training Set', 'Rounded Percentage', 'Accuracy - Test Set', 'Rounded Percentage', 'Precision', 'Rounded Percentage', 'Recall', 'Rounded Percentage', 'F1-Score', 'Rounded Percentage', 'Matthews Correlation Coefficient'])
                writer.writerow([e, f, accuracy_train, accuracy_train_percent, accuracy_test, accuracy_test_percent,  precision, precision_percent, recall, recall_percent, f1, f1_percent, mcc])
                
        else:
            # Appending results to existing csv file
            with open('Logistic_Regression_results_'+a+'_one_parameter_and_date_and_lags.csv', 'a', newline='') as file:
                writer = csv.writer(file)
                writer.writerow([e, f, accuracy_train, accuracy_train_percent, accuracy_test, accuracy_test_percent, precision, precision_percent, recall, recall_percent, f1, f1_percent, mcc])

Training Data Shape: (73137, 87)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 87)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.97
0.9712059064807219
[[19974   345]
 [  357  3704]]
Number of 0 predicted:  20331
Number of 1 predicted:  4049
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98     20319
         1.0       0.91      0.91      0.91      4061

    accuracy                           0.97     24380
   macro avg       0.95      0.95      0.95     24380
weighted avg       0.97      0.97      0.97     24380

Training Data Shape: (73137, 87)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 87)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.97
0.9712059064807219
[[19974   345]
 [  357  3704]]
Number of 0 predicted:  20331
Number of 1 predicted:  4049
              precision    recall  f1-score   support

         0.0       0.9

Accuracy of logistic regression classifier on test set: 0.94
0.9353568498769483
[[19499   818]
 [  758  3305]]
Number of 0 predicted:  20257
Number of 1 predicted:  4123
              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96     20317
         1.0       0.80      0.81      0.81      4063

    accuracy                           0.94     24380
   macro avg       0.88      0.89      0.88     24380
weighted avg       0.94      0.94      0.94     24380

Training Data Shape: (73137, 87)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 87)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.93
0.9348646431501231
[[19506   811]
 [  777  3286]]
Number of 0 predicted:  20283
Number of 1 predicted:  4097
              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96     20317
         1.0       0.80      0.81      0.81      4063

    accuracy                           0.

Training Data Shape: (73137, 87)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 87)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.92
0.9155865463494668
[[19273  1042]
 [ 1016  3049]]
Number of 0 predicted:  20289
Number of 1 predicted:  4091
              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95     20315
         1.0       0.75      0.75      0.75      4065

    accuracy                           0.92     24380
   macro avg       0.85      0.85      0.85     24380
weighted avg       0.92      0.92      0.92     24380

Training Data Shape: (73137, 87)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 87)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.92
0.9164889253486465
[[19280  1035]
 [ 1001  3064]]
Number of 0 predicted:  20281
Number of 1 predicted:  4099
              precision    recall  f1-score   support

         0.0       0.9

Accuracy of logistic regression classifier on test set: 0.91
0.912346185397867
[[19147  1166]
 [  971  3096]]
Number of 0 predicted:  20118
Number of 1 predicted:  4262
              precision    recall  f1-score   support

         0.0       0.95      0.94      0.95     20313
         1.0       0.73      0.76      0.74      4067

    accuracy                           0.91     24380
   macro avg       0.84      0.85      0.85     24380
weighted avg       0.91      0.91      0.91     24380

Training Data Shape: (73137, 87)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 87)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.91
0.912428219852338
[[19054  1259]
 [  876  3191]]
Number of 0 predicted:  19930
Number of 1 predicted:  4450
              precision    recall  f1-score   support

         0.0       0.96      0.94      0.95     20313
         1.0       0.72      0.78      0.75      4067

    accuracy                           0.91

Training Data Shape: (73137, 87)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 87)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.91
0.9066447908121411
[[19052  1258]
 [ 1018  3052]]
Number of 0 predicted:  20070
Number of 1 predicted:  4310
              precision    recall  f1-score   support

         0.0       0.95      0.94      0.94     20310
         1.0       0.71      0.75      0.73      4070

    accuracy                           0.91     24380
   macro avg       0.83      0.84      0.84     24380
weighted avg       0.91      0.91      0.91     24380

Training Data Shape: (73137, 87)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 87)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.91
0.9075471698113208
[[19063  1247]
 [ 1007  3063]]
Number of 0 predicted:  20070
Number of 1 predicted:  4310
              precision    recall  f1-score   support

         0.0       0.9

Accuracy of logistic regression classifier on test set: 0.90
0.9020098441345366
[[18923  1385]
 [ 1004  3068]]
Number of 0 predicted:  19927
Number of 1 predicted:  4453
              precision    recall  f1-score   support

         0.0       0.95      0.93      0.94     20308
         1.0       0.69      0.75      0.72      4072

    accuracy                           0.90     24380
   macro avg       0.82      0.84      0.83     24380
weighted avg       0.91      0.90      0.90     24380

Training Data Shape: (73137, 87)
Training Labels Shape: (73137,)
Testing Data Shape: (24380, 87)
Testing Labels Shape: (24380,)
Accuracy of logistic regression classifier on test set: 0.90
0.9020918785890074
[[18951  1357]
 [ 1030  3042]]
Number of 0 predicted:  19981
Number of 1 predicted:  4399
              precision    recall  f1-score   support

         0.0       0.95      0.93      0.94     20308
         1.0       0.69      0.75      0.72      4072

    accuracy                           0.

In [13]:
to_keep

['Month_1',
 'Month_2',
 'Month_3',
 'Month_4',
 'Month_5',
 'Month_6',
 'Month_7',
 'Month_8',
 'Month_9',
 'Month_10',
 'Month_11',
 'Month_12',
 'Day_1',
 'Day_2',
 'Day_3',
 'Day_4',
 'Day_5',
 'Day_6',
 'Day_7',
 'Day_8',
 'Day_9',
 'Day_10',
 'Day_11',
 'Day_12',
 'Day_13',
 'Day_14',
 'Day_15',
 'Day_16',
 'Day_17',
 'Day_18',
 'Day_19',
 'Day_20',
 'Day_21',
 'Day_22',
 'Day_23',
 'Day_24',
 'Day_25',
 'Day_26',
 'Day_27',
 'Day_28',
 'Day_29',
 'Day_30',
 'Day_31',
 'Hour_0',
 'Hour_1',
 'Hour_2',
 'Hour_3',
 'Hour_4',
 'Hour_5',
 'Hour_6',
 'Hour_7',
 'Hour_8',
 'Hour_9',
 'Hour_10',
 'Hour_11',
 'Hour_12',
 'Hour_13',
 'Hour_14',
 'Hour_15',
 'Hour_16',
 'Hour_17',
 'Hour_18',
 'Hour_19',
 'Hour_20',
 'Hour_21',
 'Hour_22',
 'Hour_23',
 'Minute_0',
 'Minute_10',
 'Minute_20',
 'Minute_30',
 'Minute_40',
 'Minute_50',
 'Weekday Name_Friday',
 'Weekday Name_Monday',
 'Weekday Name_Saturday',
 'Weekday Name_Sunday',
 'Weekday Name_Thursday',
 'Weekday Name_Tuesday',
 'Weekday N