In [1]:
import numpy as np
import pandas as pd
import os
import copy
import matplotlib.pylab as plt
import seaborn as sbn
import pickle

from scipy.stats import ks_2samp
from sklearn.preprocessing import MinMaxScaler, StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression,LogisticRegression
from itertools import permutations, combinations

## Loading datafiles for classification model training and testing

In [2]:
# Resolve the directory one level above the current working directory
path_cwd = os.getcwd()
path_parent = os.path.abspath(os.path.join(path_cwd, os.pardir))

# All CSVs live in this folder underneath the parent directory
datafiles_folder_name = 'Data_files'
datafiles_dir = os.path.join(path_parent, datafiles_folder_name)

# File names for the prediction-error-based data (training set + three test sets)
datafile_training_predError = 'data_training_classification_pred_error_based.csv'
datafile_test1_predError = 'data_testing_classification_pred_error_based_BaseDemand.csv'
datafile_test2_predError = 'data_testing_classification_pred_error_based_DiffLow.csv'
datafile_test3_predError = 'data_testing_classification_pred_error_based_DiffHigh.csv'

# File names for the mean-based data (training set + three test sets)
datafile_training_mean = 'data_training_classification_mean_based.csv'
datafile_test1_mean = 'data_testing_classification_mean_based_BaseDemand.csv'
datafile_test2_mean = 'data_testing_classification_mean_based_DiffLow.csv'
datafile_test3_mean = 'data_testing_classification_mean_based_DiffHigh.csv'

# Full paths: prediction-error-based files
path_training_predError = os.path.join(datafiles_dir, datafile_training_predError)
path_test1_predError = os.path.join(datafiles_dir, datafile_test1_predError)
path_test2_predError = os.path.join(datafiles_dir, datafile_test2_predError)
path_test3_predError = os.path.join(datafiles_dir, datafile_test3_predError)

# Full paths: mean-based files
path_training_mean = os.path.join(datafiles_dir, datafile_training_mean)
path_test1_mean = os.path.join(datafiles_dir, datafile_test1_mean)
path_test2_mean = os.path.join(datafiles_dir, datafile_test2_mean)
path_test3_mean = os.path.join(datafiles_dir, datafile_test3_mean)

# Read the prediction-error-based CSVs (same order as the original cell)
(data_training_predError,
 data_test1_predError,
 data_test2_predError,
 data_test3_predError) = [
    pd.read_csv(csv_path)
    for csv_path in (
        path_training_predError,
        path_test1_predError,
        path_test2_predError,
        path_test3_predError,
    )
]

# Read the mean-based CSVs
(data_training_mean,
 data_test1_mean,
 data_test2_mean,
 data_test3_mean) = [
    pd.read_csv(csv_path)
    for csv_path in (
        path_training_mean,
        path_test1_mean,
        path_test2_mean,
        path_test3_mean,
    )
]

## Logistic Regression Model

In [8]:
def logistic_reg(output_type,train_df,test_df):
    """Fit a logistic-regression classifier on ``train_df`` and score it on ``test_df``.

    Both inputs are converted to arrays and split positionally: every column
    except the last two is a feature; the second-to-last column is the target
    when ``output_type == 'binary'`` and the last column is the target for
    any other ``output_type``.

    Parameters
    ----------
    output_type : str
        ``'binary'`` uses a default ``LogisticRegression``; any other value
        uses a multinomial, class-balanced model with ``max_iter=5000``.
    train_df, test_df : array-like
        Training / test tables (pandas DataFrames in this notebook) laid out
        as ``[features..., binary_label, multi_label]`` by position.

    Returns
    -------
    conf_mat : numpy.ndarray
        Confusion matrix of test targets vs. predictions.
    beta : numpy.ndarray
        The fitted model's ``coef_``.
    report_df : pandas.DataFrame
        ``classification_report`` (dict form) transposed so that each report
        entry is a row.

    Notes
    -----
    Prints the shape of the standardised training feature matrix.
    """
    is_binary = output_type == 'binary'
    # Target column by position: second-to-last for binary, last otherwise.
    target_col = -2 if is_binary else -1

    train_values = np.array(train_df)
    features_train = train_values[:, :-2]
    labels_train = train_values[:, target_col]

    test_values = np.array(test_df)
    features_test = test_values[:, :-2]
    labels_test = test_values[:, target_col]

    # Standardise using statistics learned from the training features only.
    scaler = StandardScaler()
    features_train_std = scaler.fit_transform(features_train)
    features_test_std = scaler.transform(features_test)
    print(features_train_std.shape)

    if is_binary:
        model = LogisticRegression()
    else:
        model = LogisticRegression(
            multi_class='multinomial',
            class_weight='balanced',
            max_iter=5000,
        )

    model.fit(features_train_std, labels_train)
    predictions = model.predict(features_test_std)

    report_df = pd.DataFrame(
        classification_report(labels_test, predictions, output_dict=True)
    ).transpose()
    conf_mat = confusion_matrix(labels_test, predictions)

    return conf_mat, model.coef_, report_df

## Results for Mean Based approach

* Test Set with normal/expected demand based 'no leak data' plus leak data

In [9]:
# Multi-class run: fit on the mean-based training set, evaluate on the BaseDemand test set.
cmat_mean_1, beta_mean_1, report_mean_1 = logistic_reg(
    output_type='multi',
    train_df=data_training_mean,
    test_df=data_test1_mean,
)

(180, 28)


In [9]:
cmat_mean_1

array([[30, 40, 13,  7],
       [ 9, 18,  1,  2],
       [ 6,  5, 18,  1],
       [ 5,  2,  2, 21]])

In [10]:
report_mean_1

Unnamed: 0,precision,recall,f1-score,support
0.0,0.6,0.333333,0.428571,90.0
14.0,0.276923,0.6,0.378947,30.0
24.0,0.529412,0.6,0.5625,30.0
31.0,0.677419,0.7,0.688525,30.0
accuracy,0.483333,0.483333,0.483333,0.483333
macro avg,0.520939,0.558333,0.514636,180.0
weighted avg,0.547292,0.483333,0.485948,180.0


* Test Set with moderately higher demand based 'no leak data' plus leak data

In [10]:
# Multi-class run: fit on the mean-based training set, evaluate on the DiffLow test set.
cmat_mean_2, beta_mean_2, report_mean_2 = logistic_reg(
    output_type='multi',
    train_df=data_training_mean,
    test_df=data_test2_mean,
)

(180, 28)


In [12]:
cmat_mean_2

array([[40, 13, 29,  8],
       [ 9, 18,  1,  2],
       [ 6,  5, 18,  1],
       [ 5,  2,  2, 21]])

In [13]:
report_mean_2

Unnamed: 0,precision,recall,f1-score,support
0.0,0.666667,0.444444,0.533333,90.0
14.0,0.473684,0.6,0.529412,30.0
24.0,0.36,0.6,0.45,30.0
31.0,0.65625,0.7,0.677419,30.0
accuracy,0.538889,0.538889,0.538889,0.538889
macro avg,0.53915,0.586111,0.547541,180.0
weighted avg,0.581656,0.538889,0.542805,180.0


* Test Set with higher demand based 'no leak data' plus leak data

In [14]:
# Multi-class run: fit on the mean-based training set, evaluate on the DiffHigh test set.
cmat_mean_3, beta_mean_3, report_mean_3 = logistic_reg(
    output_type='multi',
    train_df=data_training_mean,
    test_df=data_test3_mean,
)

In [15]:
cmat_mean_3

array([[11,  9, 67,  3],
       [ 9, 18,  1,  2],
       [ 6,  5, 18,  1],
       [ 5,  2,  2, 21]])

In [16]:
report_mean_3

Unnamed: 0,precision,recall,f1-score,support
0.0,0.354839,0.122222,0.181818,90.0
14.0,0.529412,0.6,0.5625,30.0
24.0,0.204545,0.6,0.305085,30.0
31.0,0.777778,0.7,0.736842,30.0
accuracy,0.377778,0.377778,0.377778,0.377778
macro avg,0.466643,0.505556,0.446561,180.0
weighted avg,0.429375,0.377778,0.358314,180.0


## Results for Prediction Error based approach

* Test Set with normal/expected demand based 'no leak data' plus leak data

In [11]:
# Multi-class run: fit on the prediction-error-based training set, evaluate on the BaseDemand test set.
cmat_predError_1, beta_predError_1, report_predError_1 = logistic_reg(
    output_type='multi',
    train_df=data_training_predError,
    test_df=data_test1_predError,
)

(180, 84)


In [12]:
cmat_predError_1

array([[25, 28, 23, 14],
       [ 7, 19,  3,  1],
       [ 6,  2, 18,  4],
       [ 4,  7,  1, 18]])

In [19]:
report_predError_1

Unnamed: 0,precision,recall,f1-score,support
0.0,0.595238,0.277778,0.378788,90.0
14.0,0.339286,0.633333,0.44186,30.0
24.0,0.4,0.6,0.48,30.0
31.0,0.486486,0.6,0.537313,30.0
accuracy,0.444444,0.444444,0.444444,0.444444
macro avg,0.455253,0.527778,0.45949,180.0
weighted avg,0.501914,0.444444,0.43259,180.0


* Test Set with moderately higher demand based 'no leak data' plus leak data

In [20]:
# Multi-class run: fit on the prediction-error-based training set, evaluate on the DiffLow test set.
cmat_predError_2, beta_predError_2, report_predError_2 = logistic_reg(
    output_type='multi',
    train_df=data_training_predError,
    test_df=data_test2_predError,
)

In [21]:
cmat_predError_2

array([[40, 18, 24,  8],
       [ 7, 19,  3,  1],
       [ 6,  2, 18,  4],
       [ 4,  7,  1, 18]])

In [22]:
report_predError_2

Unnamed: 0,precision,recall,f1-score,support
0.0,0.701754,0.444444,0.544218,90.0
14.0,0.413043,0.633333,0.5,30.0
24.0,0.391304,0.6,0.473684,30.0
31.0,0.580645,0.6,0.590164,30.0
accuracy,0.527778,0.527778,0.527778,0.527778
macro avg,0.521687,0.569444,0.527016,180.0
weighted avg,0.581709,0.527778,0.53275,180.0


* Test Set with higher demand based 'no leak data' plus leak data

In [23]:
# Multi-class run: fit on the prediction-error-based training set, evaluate on the DiffHigh test set.
cmat_predError_3, beta_predError_3, report_predError_3 = logistic_reg(
    output_type='multi',
    train_df=data_training_predError,
    test_df=data_test3_predError,
)

In [24]:
cmat_predError_3

array([[28, 11, 39, 12],
       [ 7, 19,  3,  1],
       [ 6,  2, 18,  4],
       [ 4,  7,  1, 18]])

In [25]:
report_predError_3

Unnamed: 0,precision,recall,f1-score,support
0.0,0.622222,0.311111,0.414815,90.0
14.0,0.487179,0.633333,0.550725,30.0
24.0,0.295082,0.6,0.395604,30.0
31.0,0.514286,0.6,0.553846,30.0
accuracy,0.461111,0.461111,0.461111,0.461111
macro avg,0.479692,0.536111,0.478748,180.0
weighted avg,0.527202,0.461111,0.457437,180.0
