In [24]:
import numpy as np
import pandas as pd
import os
import copy
import matplotlib.pylab as plt
import seaborn as sbn
import pickle

from scipy.stats import ks_2samp
from sklearn.preprocessing import MinMaxScaler, StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, confusion_matrix
from sklearn.linear_model import LinearRegression,LogisticRegression
from itertools import permutations, combinations

## Loading datafiles to generate data for training of regression and classification model and testing of classification model

In [25]:
# Getting path for the 'parent folder' (one level above the current working directory)
path_cwd = os.getcwd()
path_parent = os.path.abspath(os.path.join(path_cwd, os.pardir))

# Getting path for the data files
datafiles_folder_name = 'Data_files'

# Leak-free ("base demand") data sets
file_train_1 = 'data_base_demand_train_1.csv'
file_train_2 = 'data_base_demand_train_2.csv'
file_train_3 = 'data_base_demand_train_3.csv'
file_test_base_demand = 'data_base_demand_test.csv'

# Leak data sets
file_leak14 = 'data_leak_in_14.csv'
file_leak24 = 'data_leak_in_24.csv'
file_leak31 = 'data_leak_in_31.csv'

path_train1 = os.path.join(path_parent,datafiles_folder_name,file_train_1)
path_train2 = os.path.join(path_parent,datafiles_folder_name,file_train_2)
# Fixed: this previously pointed at file_train_2, so the "recent" training
# frame was read from the same file as the "reference" training frame.
path_train3 = os.path.join(path_parent,datafiles_folder_name,file_train_3)
path_test_base_demand = os.path.join(path_parent,datafiles_folder_name,file_test_base_demand)

path_leak14 = os.path.join(path_parent,datafiles_folder_name,file_leak14)
path_leak24 = os.path.join(path_parent,datafiles_folder_name,file_leak24)
path_leak31 = os.path.join(path_parent,datafiles_folder_name,file_leak31)

In [26]:
# Proxy for historical observations: one frame for fitting the regression
# models and two frames (reference / recent) for building classifier training data
df_train_reg_model, df_train_class_model_ref, df_train_class_model_rec = (
    pd.read_csv(csv_path) for csv_path in (path_train1, path_train2, path_train3)
)

# Proxy for recent observations, those need to be tested for leak
df_test_class_model = pd.read_csv(path_test_base_demand)
leak_file_14, leak_file_24, leak_file_31 = (
    pd.read_csv(csv_path) for csv_path in (path_leak14, path_leak24, path_leak31)
)


## Dividing the leak datafiles into training and test sets

In [28]:
# Leak data to be used for classification model training:
# keep only the rows whose leak_area is one of the training sizes
leaksize_training = [0.0005,0.002,0.003,0.004]
leak14_training, leak24_training, leak31_training = (
    leak_df[leak_df['leak_area'].isin(leaksize_training)]
    for leak_df in (leak_file_14, leak_file_24, leak_file_31)
)

# Leak data to be used for classification model testing:
# leak sizes that never appear in the training selection above
leaksize_testing = [0.0001,0.001,0.005]
leak14_testing, leak24_testing, leak31_testing = (
    leak_df[leak_df['leak_area'].isin(leaksize_testing)]
    for leak_df in (leak_file_14, leak_file_24, leak_file_31)
)

## Function to extract X (flow 1, flow 2) and y (deltaH = head 1 - head 2) data for regression model

In [29]:
def training_data(df,link_names,head_names):
    """Extract regression inputs and target from a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the requested flow and head columns.
    link_names : list of str
        Flow column names; all of them are returned as input features.
    head_names : list of str
        Head column names; only the first two are used.

    Returns
    -------
    train_in : numpy.ndarray
        The flow columns multiplied by 1000 (conversion to litres per second
        according to the original author's note).
    train_out : numpy.ndarray
        deltaH, i.e. first head column minus second head column, per row.
    """
    flows_scaled = df[link_names].to_numpy() * 1000
    heads = df[head_names].to_numpy()

    delta_head = heads[:, 0] - heads[:, 1]

    return flows_scaled, delta_head

* Initialising Sensor list and sensor combination 

In [30]:
# Manually define a sensor list.
# Each entry is [head node number, flow link number]: the first value is used
# to build head column names and the second to build flow column names.
sensor_list = [
    [4, 4],
    [9, 8],
    [18, 17],
    [20, 20],
    [26, 27],
    [28, 30],
    [32, 33],
]

# Possible combinations of sensors: every unordered pair of 1-based sensor numbers
sen_nums = np.arange(len(sensor_list)) + 1
combs = [pair for pair in combinations(sen_nums, 2)]

In [31]:
# Check how `combs` can be used to iterate through all possible sensor pairs:
# each element is a (first sensor, second sensor) tuple of 1-based sensor numbers
i=1
first_sensor, second_sensor = combs[i][0], combs[i][1]
print(len(combs))
print(combs[i])
print('first sensor is ',first_sensor)
print('second sensor is ',second_sensor)

21
(1, 3)
first sensor is  1
second sensor is  3


## Function to get X and y data for regression model given sensor pair

In [32]:
# function to extract input (flow1, flow2) and output (deltaH) data from datasets prepared earlier

def data_in_out_noleak(sensor_pair,df):
    """Return (flows, deltaH) for a sensor pair from a leak-free data frame.

    `sensor_pair` holds two 1-based positions into the module-level
    `sensor_list`; each entry there is [head node number, flow link number].
    The columns read from `df` are 'Node_head<n>' and 'Link_flow<n>'.
    The result is whatever `training_data` returns for those columns.
    """
    sensor_a = sensor_list[sensor_pair[0]-1]
    sensor_b = sensor_list[sensor_pair[1]-1]

    node_name = ['Node_head'+str(sensor_a[0]),'Node_head'+str(sensor_b[0])]
    link_name = ['Link_flow'+str(sensor_a[1]),'Link_flow'+str(sensor_b[1])]

    return training_data(df,link_name,node_name)

In [33]:
# function to extract input and output data

def data_in_out_withleak(sensor_pair,df):
    """Return (flows, deltaH) for a sensor pair from a leak data frame.

    `sensor_pair` holds two 1-based positions into the module-level
    `sensor_list`; each entry there is [head node number, flow link number].
    The columns read from `df` are 'leak_head_<n>' and 'leak_flow_<n>'.
    The result is whatever `training_data` returns for those columns.
    """
    sensor_a = sensor_list[sensor_pair[0]-1]
    sensor_b = sensor_list[sensor_pair[1]-1]

    node_name_leak = ['leak_head_'+str(sensor_a[0]),'leak_head_'+str(sensor_b[0])]
    link_name_leak = ['leak_flow_'+str(sensor_a[1]),'leak_flow_'+str(sensor_b[1])]

    return training_data(df,link_name_leak,node_name_leak)

## Linear regression models for each sensor combination being trained and saved as .pkl files

In [34]:

# Fit one degree-2 polynomial regression (flow1, flow2 -> deltaH) per sensor
# pair and persist it in the working directory as 'linmodel<first><second>.pkl'
for comb in combs:
    xtrain, ytrain = data_in_out_noleak(comb, df_train_reg_model)

    # Expand the two flows into polynomial features before the linear fit
    poly = PolynomialFeatures(degree=2)
    x_train_poly = poly.fit_transform(xtrain)

    lin_model = LinearRegression().fit(x_train_poly, ytrain)

    pkl_filename = f'linmodel{comb[0]}{comb[1]}.pkl'
    with open(pkl_filename, 'wb') as file:
        pickle.dump(lin_model, file)

## Create column names to be used for the dataframe that will store the results of regression model

In [35]:
# generate column names
colnames = []
for comb in combs:
    n1='obs'+str(comb[0])+str(comb[1])
    n2='prd'+str(comb[0])+str(comb[1])
    n3='stat'+str(comb[0])+str(comb[1])
    n4='pval'+str(comb[0])+str(comb[1])
    colnames.extend([n1,n2,n3,n4])
    

## The function below creates the data used for training and testing the classification model. It does the following:
* For each generated case, and for each sensor pair within it:
* Randomly choose (once per case) a set of observations of the specified size from the 'historical observations' stored in `df_train_class_model_ref`
* Extract deltaH from this data, which is referred to as deltaH_observed
* For the same data, make predictions for deltaH (deltaH_predicted) using the linear regression models stored earlier
* Store the mean of error_historical, which is deltaH_observed - deltaH_predicted (column `obsXY` for sensor pair X, Y)
* Repeat the same steps for the 'recent observations', i.e. the data frame passed in as `df_recent` (for example `df_test_class_model` or one of the leak data sets)
* The error in the prediction of the recent observations is referred to as error_recent; its mean is stored as well (column `prdXY`)
* Next, the two-sample KS test is run on error_historical and error_recent
* The resulting statistic and p-value are stored (columns `statXY` and `pvalXY`)

In [36]:

def datafortrees(numcases,df_recent,sample_len,casetype):
    """Build one feature row per random draw for the leak classification model.

    For each of `numcases` cases a random sample of about `sample_len` rows is
    drawn from `df_recent` and from the module-level reference frame
    `df_train_class_model_ref`. For every sensor pair in `combs` the stored
    regression model predicts deltaH for both samples, and four values are
    recorded: mean reference error, mean recent error, and the statistic and
    p-value of the two-sample KS test between the two error samples.

    Parameters
    ----------
    numcases : int
        Number of rows (random draws) to generate.
    df_recent : pandas.DataFrame
        Data to test against the reference data.
    sample_len : int
        Number of rows drawn from each frame per case.
    casetype : str
        'noleak' reads the columns via `data_in_out_noleak`; any other value
        reads them via `data_in_out_withleak`.

    Returns
    -------
    pandas.DataFrame
        Frame with columns `colnames` and `numcases` rows.
    """
    fracsize_recent = sample_len/len(df_recent)
    fracsize_reference = sample_len/len(df_train_class_model_ref)

    # Built locally instead of relying on a `poly` object leaked from the
    # training cell. The degree must match the one used when the models were fit.
    poly_features = PolynomialFeatures(degree=2)

    # Load every regression model once instead of once per case and pair.
    # NOTE: pickle.load can execute arbitrary code; only load the files that
    # were written by the training cell of this notebook.
    lin_models = []
    for comb in combs:
        pkl_filename = 'linmodel'+str(comb[0])+str(comb[1])+'.pkl'
        with open(pkl_filename, 'rb') as file:
            lin_models.append(pickle.load(file))

    # The extractor for the recent sample depends only on casetype
    if casetype=='noleak':
        extract_recent = data_in_out_noleak
    else:
        extract_recent = data_in_out_withleak

    rows = []
    for _ in range(numcases):
        # Same draw order as before: recent sample first, then reference sample
        df_recent_sample = df_recent.sample(frac=fracsize_recent)
        df_reference_sample = df_train_class_model_ref.sample(frac=fracsize_reference)

        comb_data = []
        for comb, lin_model in zip(combs, lin_models):
            xtest_rec, ytest_rec = extract_recent(comb,df_recent_sample)
            xtest_ref, ytest_ref = data_in_out_noleak(comb,df_reference_sample)

            pred_ref = lin_model.predict(poly_features.fit_transform(xtest_ref)).reshape(-1)
            pred_rec = lin_model.predict(poly_features.fit_transform(xtest_rec)).reshape(-1)

            # Prediction errors (observed deltaH minus predicted deltaH)
            error_ref = ytest_ref-pred_ref
            error_rec = ytest_rec-pred_rec
            stat,pval = ks_2samp(error_ref,error_rec)

            comb_data.extend([np.mean(error_ref),np.mean(error_rec),stat,pval])

        rows.append(comb_data)

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(rows,columns=colnames)

## Generating dataset for training the classification model

In [37]:
## Defining size of training set. Prefer choosing a multiple of 6
num_samples = 500
num_training_sample_total = 3000
# One sixth of the cases for each of the three leak data sets, the rest leak-free
num_training_sample_leak = num_training_sample_total // 6
num_training_sample_noleak = num_training_sample_total - 3*num_training_sample_leak

# Each set is labelled right after it is generated:
# 'leak' is the binary target, 'leak_num' identifies the leak data set (0 = none)
class_train_noleak = datafortrees(
    num_training_sample_noleak, df_train_class_model_rec, num_samples, 'noleak'
).assign(leak=0, leak_num=0)

class_train_leak14 = datafortrees(
    num_training_sample_leak, leak14_training, num_samples, 'leak'
).assign(leak=1, leak_num=14)

class_train_leak24 = datafortrees(
    num_training_sample_leak, leak24_training, num_samples, 'leak'
).assign(leak=1, leak_num=24)

class_train_leak31 = datafortrees(
    num_training_sample_leak, leak31_training, num_samples, 'leak'
).assign(leak=1, leak_num=31)

data_train_classification = pd.concat(
    [
        class_train_noleak,
        class_train_leak14,
        class_train_leak24,
        class_train_leak31,
    ],
    axis=0,
)

## Generating dataset for testing the classification model, using the original and different demand distribution

In [38]:
# Size of the test set: one sixth of the cases for each of the three leak
# data sets, the remaining cases leak-free
num_samples = 500
num_test_sample_total = 1800
num_test_sample_leak = num_test_sample_total // 6
num_test_sample_noleak = num_test_sample_total - 3*num_test_sample_leak

# 'leak' is the binary target, 'leak_num' identifies the leak data set (0 = none)
class_test_noleak = datafortrees(
    num_test_sample_noleak, df_test_class_model, num_samples, 'noleak'
).assign(leak=0, leak_num=0)

class_test_leak14 = datafortrees(
    num_test_sample_leak, leak14_testing, num_samples, 'leak'
).assign(leak=1, leak_num=14)

class_test_leak24 = datafortrees(
    num_test_sample_leak, leak24_testing, num_samples, 'leak'
).assign(leak=1, leak_num=24)

class_test_leak31 = datafortrees(
    num_test_sample_leak, leak31_testing, num_samples, 'leak'
).assign(leak=1, leak_num=31)

## Generating test data sets that each contain a single leak size

In [17]:
# Size of each single-leak-size test set
num_samples = 500
num_test_sample_total = 600
num_test_sample_leak = num_test_sample_total // 6
num_test_sample_noleak = num_test_sample_total - 3*num_test_sample_leak

# Leak-free cases shared by the small / mid / large test sets
class_test_noleak_2 = datafortrees(
    num_test_sample_noleak, df_test_class_model, num_samples, 'noleak'
).assign(leak=0, leak_num=0)

# testing sizes[0.0001,0.001,0.005]
# Split every leak test frame by leak size: index 0 = small, 1 = mid, 2 = large
leak14_testing_small = leak14_testing[leak14_testing['leak_area'] == leaksize_testing[0]]
leak24_testing_small = leak24_testing[leak24_testing['leak_area'] == leaksize_testing[0]]
leak31_testing_small = leak31_testing[leak31_testing['leak_area'] == leaksize_testing[0]]

leak14_testing_mid = leak14_testing[leak14_testing['leak_area'] == leaksize_testing[1]]
leak24_testing_mid = leak24_testing[leak24_testing['leak_area'] == leaksize_testing[1]]
leak31_testing_mid = leak31_testing[leak31_testing['leak_area'] == leaksize_testing[1]]

leak14_testing_large = leak14_testing[leak14_testing['leak_area'] == leaksize_testing[2]]
leak24_testing_large = leak24_testing[leak24_testing['leak_area'] == leaksize_testing[2]]
leak31_testing_large = leak31_testing[leak31_testing['leak_area'] == leaksize_testing[2]]

# Generate and label the leak cases; 'leak_num' identifies the leak data set
class_test_leak14_small = datafortrees(
    num_test_sample_leak, leak14_testing_small, num_samples, 'leak'
).assign(leak=1, leak_num=14)
class_test_leak24_small = datafortrees(
    num_test_sample_leak, leak24_testing_small, num_samples, 'leak'
).assign(leak=1, leak_num=24)
class_test_leak31_small = datafortrees(
    num_test_sample_leak, leak31_testing_small, num_samples, 'leak'
).assign(leak=1, leak_num=31)

class_test_leak14_mid = datafortrees(
    num_test_sample_leak, leak14_testing_mid, num_samples, 'leak'
).assign(leak=1, leak_num=14)
class_test_leak24_mid = datafortrees(
    num_test_sample_leak, leak24_testing_mid, num_samples, 'leak'
).assign(leak=1, leak_num=24)
class_test_leak31_mid = datafortrees(
    num_test_sample_leak, leak31_testing_mid, num_samples, 'leak'
).assign(leak=1, leak_num=31)

class_test_leak14_large = datafortrees(
    num_test_sample_leak, leak14_testing_large, num_samples, 'leak'
).assign(leak=1, leak_num=14)
class_test_leak24_large = datafortrees(
    num_test_sample_leak, leak24_testing_large, num_samples, 'leak'
).assign(leak=1, leak_num=24)
class_test_leak31_large = datafortrees(
    num_test_sample_leak, leak31_testing_large, num_samples, 'leak'
).assign(leak=1, leak_num=31)

## Creating test sets by combining the above created sets

* Test set that includes 'usual demand no leak' and the leak cases

In [39]:
# Stack the leak-free cases and the three leak sets row-wise
data_test_classification_1 = pd.concat(
    [
        class_test_noleak,
        class_test_leak14,
        class_test_leak24,
        class_test_leak31,
    ],
    axis=0,
)

* Single leak sets

In [18]:
# Leak-free cases plus the smallest-leak cases of all three leak sets
data_test_classification_small_leak = pd.concat(
    [
        class_test_noleak_2,
        class_test_leak14_small,
        class_test_leak24_small,
        class_test_leak31_small,
    ],
    axis=0,
)

In [19]:
# Leak-free cases plus the mid-size-leak cases of all three leak sets
data_test_classification_mid_leak = pd.concat(
    [
        class_test_noleak_2,
        class_test_leak14_mid,
        class_test_leak24_mid,
        class_test_leak31_mid,
    ],
    axis=0,
)

In [20]:
# Leak-free cases plus the largest-leak cases of all three leak sets
data_test_classification_large_leak = pd.concat(
    [
        class_test_noleak_2,
        class_test_leak14_large,
        class_test_leak24_large,
        class_test_leak31_large,
    ],
    axis=0,
)

In [20]:
# Sanity check: display the first three rows of the combined test set
# (four columns per sensor pair followed by the 'leak' and 'leak_num' labels)
data_test_classification_1.head(3)

Unnamed: 0,obs12,prd12,stat12,pval12,obs13,prd13,stat13,pval13,obs14,prd14,...,obs57,prd57,stat57,pval57,obs67,prd67,stat67,pval67,leak,leak_num
0,0.019687,-0.02399,0.102,0.010969,0.201138,-0.001874,0.05,0.560022,-0.002109,0.000208,...,-0.032839,-0.053999,0.038,0.863677,-0.004158,0.018671,0.046,0.665922,0,0
1,0.012894,0.009576,0.056,0.413486,0.046078,-0.00222,0.036,0.902691,-0.000248,0.001087,...,0.113001,-0.063693,0.09,0.034795,0.064291,0.011749,0.062,0.291925,0,0
2,0.004322,0.017788,0.074,0.129396,0.135952,-0.115709,0.084,0.058689,0.000148,-0.00072,...,-0.030916,-0.051476,0.034,0.935112,-0.021604,0.091268,0.078,0.095465,0,0


## Saving the datasets created 

In [40]:
# Training Set: shuffle the rows before writing them out
data_train_classification = data_train_classification.sample(frac=1)

# Output folder and output file name
datafiles_folder_name = 'Data_files'
datafile_training = 'data_training_classification_pred_error_based.csv'

# Full output path. Note that 'path_parent' has been defined earlier
path_training = os.path.join(
    path_parent,
    datafiles_folder_name,
    datafile_training,
)

# Make sure the 'Data_files' folder exists before saving into it
os.makedirs(os.path.dirname(path_training), exist_ok=True)

# Save the training set as csv, without the row index
data_train_classification.to_csv(path_training, index=None)

In [41]:
# Test set: shuffle the rows before writing them out
data_test_classification_1 = data_test_classification_1.sample(frac=1)

# Output file name and full path. Note that 'path_parent' has been defined earlier
datafile_test1 = 'data_testing_classification_pred_error_based_BaseDemand.csv'
path_test1 = os.path.join(
    path_parent,
    datafiles_folder_name,
    datafile_test1,
)

# Save the test set as csv, without the row index
data_test_classification_1.to_csv(path_test1, index=None)

In [23]:
# Single Leak Test set

# Output folder name defined
datafiles_folder_name = 'Data_files'

# Shuffle the rows of each single-leak-size test set
data_test_classification_small_leak = data_test_classification_small_leak.sample(frac=1)
data_test_classification_mid_leak = data_test_classification_mid_leak.sample(frac=1)
data_test_classification_large_leak = data_test_classification_large_leak.sample(frac=1)

# Output file names
datafile_small = 'data_testing_classification_small_leak.csv'
datafile_mid = 'data_testing_classification_mid_leak.csv'
datafile_large = 'data_testing_classification_large_leak.csv'

# Creating file paths. Note that 'path_parent' has been defined earlier.
# NOTE: 'path_test1' is rebound here; it no longer refers to the base-demand test file.
path_test1, path_test2, path_test3 = (
    os.path.join(path_parent, datafiles_folder_name, datafile_small),
    os.path.join(path_parent, datafiles_folder_name, datafile_mid),
    os.path.join(path_parent, datafiles_folder_name, datafile_large),
)

# Saving the output datasets as csv files, without the row index
data_test_classification_small_leak.to_csv(path_test1, index=None)
data_test_classification_mid_leak.to_csv(path_test2, index=None)
data_test_classification_large_leak.to_csv(path_test3, index=None)

In [None]:
## Plots showing the prediction error profiles

In [23]:
# Select one sensor pair by its position in `combs` and show which sensors it holds
comb_num = 15 # zero-based index into combs: an integer between 0 and 20 (21 pairs in total)

print(combs[comb_num])
print('first sensor is ',combs[comb_num][0])
print('second sensor is ',combs[comb_num][1])

(4, 5)
first sensor is  4
second sensor is  5
