In [50]:
import numpy as np
import pandas as pd
import os
import copy
import matplotlib.pylab as plt
import seaborn as sbn
import pickle

from scipy.stats import ks_2samp
from sklearn.preprocessing import MinMaxScaler, StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression,LogisticRegression
from itertools import permutations, combinations
from statsmodels.discrete.discrete_model import Logit

## Loading datafiles for classification model training and testing

In [51]:
# Getting path for the 'parent folder'
path_cwd = os.getcwd()
path_parent = os.path.abspath(os.path.join(path_cwd, os.pardir))

# Getting path for the data files
datafiles_folder_name = 'Data_files'

datafile_training_predError = 'data_training_classification_pred_error_based.csv'
datafile_test1_predError = 'data_testing_classification_pred_error_based_BaseDemand.csv'

path_training_predError = os.path.join(path_parent,datafiles_folder_name, datafile_training_predError)
path_test1_predError = os.path.join(path_parent, datafiles_folder_name, datafile_test1_predError)

# Loading files
data_training_predError = pd.read_csv(path_training_predError)
data_test1_predError = pd.read_csv(path_test1_predError)

In [52]:
# Selecting the featutes like means, ks_stat or ks_pval

def feature_selection(data_df,features,output_col):
    
    columns = []
    for feature in features:
        col_temp = [col for col in data_df.columns if feature in col]
        columns.extend(col_temp)
    columns.extend(output_col)
    data_df_select = data_df[columns]
    return data_df_select

## Logistic Regression Model

In [53]:
def logistic_reg(output_type,
                 train_df_full,
                 test_df_full,
                 features): # 'obs','prd','stat','pval'
    
    if output_type == 'binary':
        output_col = ['leak']
    else:
        output_col = ['leak_num']
        
    train_df = feature_selection(train_df_full,features,output_col)
    test_df = feature_selection(test_df_full,features,output_col)

    xdata=np.array(train_df)[:,:-2]
    if output_type=='binary':
        ydata=np.array(train_df)[:,-2]
    else:
        ydata=np.array(train_df)[:,-1]
    
    xdata_test=np.array(test_df)[:,:-2]
    if output_type=='binary':
        ydata_test=np.array(test_df)[:,-2]
    else:
        ydata_test=np.array(test_df)[:,-1]
    
    scaler = StandardScaler()
    xtrain_norm = scaler.fit_transform(xdata)
    xtest_norm = scaler.transform(xdata_test)
    
    if output_type=='binary':
        model=LogisticRegression()
    else:
        model=LogisticRegression(multi_class='multinomial',class_weight='balanced',max_iter=5000)
        
    model.fit(xtrain_norm,ydata)
    beta = model.coef_
    ypred = model.predict(xtest_norm)
    report = classification_report(ydata_test,ypred,output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    
    conf_mat = confusion_matrix(ydata_test,ypred)
    
    return conf_mat, beta,report_df

## Results for Prediction Error based approach

* Test Set with normal/expected demand based 'no leak data' plus leak data

In [54]:
cmat_predError_1_stat,beta_predError_1_stat,report_predError_1_stat = logistic_reg('multi',
                                                                    data_training_predError,
                                                                    data_test1_predError,
                                                                   ['stat'])

In [55]:
cmat_predError_1_mean,beta_predError_1_mean,report_predError_1_mean = logistic_reg('multi',
                                                                    data_training_predError,
                                                                    data_test1_predError,
                                                                   ['obs','prd'])

In [56]:
cmat_predError_1_all,beta_predError_1_all,report_predError_1_all = logistic_reg('multi',
                                                                    data_training_predError,
                                                                    data_test1_predError,
                                                                   ['obs','prd','stat'])

In [57]:
cmat_predError_1_stat

array([[900,   0,   0,   0],
       [  0, 300,   0,   0],
       [  2,   0, 298,   0],
       [  1,   0,   0, 299]])

In [58]:
cmat_predError_1_mean

array([[900,   0,   0,   0],
       [  0, 300,   0,   0],
       [  0,   0, 300,   0],
       [  0,   0,   0, 300]])

In [59]:
cmat_predError_1_all

array([[900,   0,   0,   0],
       [  0, 300,   0,   0],
       [  0,   0, 300,   0],
       [  0,   0,   0, 300]])