In [1]:
import numpy as np
import pandas as pd
import os
import copy
import matplotlib.pylab as plt
import seaborn as sbn
import pickle

from scipy.stats import ks_2samp
from sklearn.preprocessing import MinMaxScaler, StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression,LogisticRegression
from itertools import permutations, combinations
from statsmodels.discrete.discrete_model import Logit

## Loading datafiles for classification model training and testing

In [2]:
# Getting path for the 'parent folder'
path_cwd = os.getcwd()
path_parent = os.path.abspath(os.path.join(path_cwd, os.pardir))

# Getting path for the data files
datafiles_folder_name = 'Data_files'

datafile_training_low = 'data_training_classification_lowNoise.csv'
datafile_test_low = 'data_testing_classification_lowNoise.csv'

datafile_training_high = 'data_training_classification_highNoise.csv'
datafile_test_high = 'data_testing_classification_highNoise.csv'

path_training_low = os.path.join(path_parent,datafiles_folder_name, datafile_training_low)
path_test_low = os.path.join(path_parent, datafiles_folder_name, datafile_test_low)

path_training_high = os.path.join(path_parent,datafiles_folder_name, datafile_training_high)
path_test_high = os.path.join(path_parent, datafiles_folder_name, datafile_test_high)

# Loading files
data_training_lowNoise = pd.read_csv(path_training_low)
data_test_lowNoise = pd.read_csv(path_test_low)
data_training_highNoise = pd.read_csv(path_training_high)
data_test_highNoise = pd.read_csv(path_test_high)

In [3]:
# Selecting the featutes like means, ks_stat or ks_pval

def feature_selection(data_df,features,output_col):
    
    columns = []
    for feature in features:
        col_temp = [col for col in data_df.columns if feature in col]
        columns.extend(col_temp)
    columns.extend(output_col)
    data_df_select = data_df[columns]
    return data_df_select

## Logistic Regression Model

In [4]:
def logistic_reg(output_type,
                 train_df_full,
                 test_df_full,
                 features): # 'obs','prd','stat','pval'
    
    if output_type == 'binary':
        output_col = ['leak']
    else:
        output_col = ['leak_num']
        
    train_df = feature_selection(train_df_full,features,output_col)
    test_df = feature_selection(test_df_full,features,output_col)

    xdata=np.array(train_df)[:,:-2]
    if output_type=='binary':
        ydata=np.array(train_df)[:,-2]
    else:
        ydata=np.array(train_df)[:,-1]
    
    xdata_test=np.array(test_df)[:,:-2]
    if output_type=='binary':
        ydata_test=np.array(test_df)[:,-2]
    else:
        ydata_test=np.array(test_df)[:,-1]
    
    scaler = StandardScaler()
    xtrain_norm = scaler.fit_transform(xdata)
    xtest_norm = scaler.transform(xdata_test)
    
    if output_type=='binary':
        model=LogisticRegression()
    else:
        model=LogisticRegression(multi_class='multinomial',class_weight='balanced',max_iter=5000)
        
    model.fit(xtrain_norm,ydata)
    beta = model.coef_
    ypred = model.predict(xtest_norm)
    report = classification_report(ydata_test,ypred,output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    
    conf_mat = confusion_matrix(ydata_test,ypred)
    
    return conf_mat, beta,report_df

## Results Low Noise

In [5]:
cmat_low_stat,beta_low_stat,report_low_stat = logistic_reg('multi',
                                                                    data_training_lowNoise,
                                                                    data_test_lowNoise,
                                                                   ['stat'])

In [7]:
cmat_low_means,beta_low_means,report_low_means = logistic_reg('multi',
                                                                    data_training_lowNoise,
                                                                    data_test_lowNoise,
                                                                   ['obs','prd'])

In [8]:
cmat_low_all,beta_low_all,report_low_all = logistic_reg('multi',
                                                                    data_training_lowNoise,
                                                                    data_test_lowNoise,
                                                                   ['stat','obs','prd'])

In [9]:
cmat_low_stat

array([[900,   0,   0,   0],
       [ 77, 121,  47,  55],
       [ 77,  44,  98,  81],
       [ 87,  39,  86,  88]])

In [11]:
cmat_low_means

array([[895,   0,   5,   0],
       [  1, 299,   0,   0],
       [ 25,   0, 275,   0],
       [  1,   0,   0, 299]])

In [12]:
cmat_low_all

array([[900,   0,   0,   0],
       [  0, 300,   0,   0],
       [ 28,   0, 272,   0],
       [  1,   0,   0, 299]])

## Results high Noise

In [13]:
cmat_high_stat,beta_high_stat,report_high_stat = logistic_reg('multi',
                                                                    data_training_highNoise,
                                                                    data_test_highNoise,
                                                                   ['stat'])
cmat_high_means,beta_high_means,report_high_means = logistic_reg('multi',
                                                                    data_training_highNoise,
                                                                    data_test_highNoise,
                                                                   ['obs','prd'])
cmat_high_all,beta_high_all,report_high_all = logistic_reg('multi',
                                                                    data_training_highNoise,
                                                                    data_test_highNoise,
                                                                   ['stat','obs','prd'])

In [14]:
cmat_high_stat

array([[900,   0,   0,   0],
       [  0, 124,  97,  79],
       [  0, 114,  88,  98],
       [  0,  74, 102, 124]])

In [15]:
cmat_high_means

array([[900,   0,   0,   0],
       [  0, 300,   0,   0],
       [  0,   0, 300,   0],
       [  0,   0,   0, 300]])

In [16]:
cmat_high_all

array([[900,   0,   0,   0],
       [  0, 300,   0,   0],
       [  0,   0, 300,   0],
       [  0,   0,   0, 300]])