In [24]:
import numpy as np
import pandas as pd
import os
import wntr
import networkx as nx
import copy
import seaborn as sbn

## Loading datafiles required for training and testing the classification model

In [25]:
# Parent of the current working directory; the data folder is looked up inside it
path_cwd = os.getcwd()
path_parent = os.path.abspath(os.path.join(path_cwd, os.pardir))

# Folder (inside the parent folder) that holds every input CSV file
datafiles_folder_name = 'Data_files'
datafiles_dir = os.path.join(path_parent, datafiles_folder_name)

# Base-demand training file and the three test files (base, low and high demand)
file_train = 'data_base_demand_train_1.csv'
file_test_base_demand = 'data_base_demand_test.csv'
file_test_diff_demand_low = 'data_diff_demand_low.csv'
file_test_diff_demand_high = 'data_diff_demand_high.csv'

# Leak files, one per leak identifier (14, 24 and 31)
file_leak14 = 'data_leak_in_14.csv'
file_leak24 = 'data_leak_in_24.csv'
file_leak31 = 'data_leak_in_31.csv'

# Full path of every input file
path_train = os.path.join(datafiles_dir, file_train)
path_test_base_demand = os.path.join(datafiles_dir, file_test_base_demand)
path_test_diff_demand_low = os.path.join(datafiles_dir, file_test_diff_demand_low)
path_test_diff_demand_high = os.path.join(datafiles_dir, file_test_diff_demand_high)
path_leak14 = os.path.join(datafiles_dir, file_leak14)
path_leak24 = os.path.join(datafiles_dir, file_leak24)
path_leak31 = os.path.join(datafiles_dir, file_leak31)

In [26]:
# Proxy for historical observations (base-demand training file)
data_train_df = pd.read_csv(path_train)

# Proxy for recent observations that need to be tested for a leak:
# base-demand, low-demand and high-demand test files
data_test_df, data_test_df_diff_demand_low, data_test_df_diff_demand_high = (
    pd.read_csv(csv_path)
    for csv_path in (path_test_base_demand, path_test_diff_demand_low, path_test_diff_demand_high)
)

# Leak observations, one file per leak identifier (14, 24, 31)
leak_file_14, leak_file_24, leak_file_31 = (
    pd.read_csv(csv_path) for csv_path in (path_leak14, path_leak24, path_leak31)
)

## Creating feature names that will be used when creating the train and test data for the classification model

In [35]:
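# Sensor locations as [node id, link id] pairs (reference only; the same ids are repeated inline in the column-name lists below)
# sensor_list = [[4,4],[9,8],[18,17],[20,20],[26,27],[28,30],[32,33]]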

In [36]:
# Columns read from the no-leak files: head at 7 nodes followed by flow in 7 links
cols_noleak = (
    ['Node_head{}'.format(node_id) for node_id in (4, 9, 18, 20, 26, 28, 32)]
    + ['Link_flow{}'.format(link_id) for link_id in (4, 8, 17, 20, 27, 30, 33)]
)

In [37]:
# Columns read from the leak files: head at 7 nodes followed by flow in 7 links
cols_leak = (
    ['leak_head_{}'.format(node_id) for node_id in (4, 9, 18, 20, 26, 28, 32)]
    + ['leak_flow_{}'.format(link_id) for link_id in (4, 8, 17, 20, 27, 30, 33)]
)

In [38]:
# Output column names for the reference ('expected') means: 7 heads followed by 7 flows
cols_means_ref = (
    ['ref_mean_head{}'.format(node_id) for node_id in (4, 9, 18, 20, 26, 28, 32)]
    + ['ref_mean_flow{}'.format(link_id) for link_id in (4, 8, 17, 20, 27, 30, 33)]
)

In [39]:
# Output column names for the observed means: 7 heads followed by 7 flows
cols_means = (
    ['mean_head{}'.format(node_id) for node_id in (4, 9, 18, 20, 26, 28, 32)]
    + ['mean_flow{}'.format(link_id) for link_id in (4, 8, 17, 20, 27, 30, 33)]
)

## Dividing leak data into train and test parts

In [33]:
# Leak areas whose rows go to the training part vs. the test part
leak_areas_train = [0.0005, 0.002, 0.003, 0.004]
leak_areas_test = [0.0001, 0.001, 0.005]

# Training part: rows whose leak_area is one of the training areas
leak_data_14 = leak_file_14[leak_file_14['leak_area'].isin(leak_areas_train)]
leak_data_24 = leak_file_24[leak_file_24['leak_area'].isin(leak_areas_train)]
leak_data_31 = leak_file_31[leak_file_31['leak_area'].isin(leak_areas_train)]

# Test part: rows whose leak_area is one of the held-out areas
leak_data_14_test = leak_file_14[leak_file_14['leak_area'].isin(leak_areas_test)]
leak_data_24_test = leak_file_24[leak_file_24['leak_area'].isin(leak_areas_test)]
leak_data_31_test = leak_file_31[leak_file_31['leak_area'].isin(leak_areas_test)]

In [34]:
# check: (rows, columns) of the training and test parts of the leak-14 data
print(*(leak_part.shape for leak_part in (leak_data_14, leak_data_14_test)))

(204, 168) (153, 168)


## Defining the length of the random samples drawn from the above-mentioned datasets; the mean of each sample is stored


In [48]:
# Number of rows that one random sample is meant to contain
sample_length = 5

# Fraction of each dataset that corresponds to `sample_length` rows
trainfrac = sample_length / data_train_df.shape[0]
testfrac = sample_length / data_test_df.shape[0]
testfrac_diff_low = sample_length / data_test_df_diff_demand_low.shape[0]
testfrac_diff_high = sample_length / data_test_df_diff_demand_high.shape[0]
# NOTE(review): computed from leak_data_14 only; any other frame sampled with this
# fraction yields `sample_length` rows only if it has the same number of rows
leakfrac = sample_length / leak_data_14.shape[0]

In [49]:
# check: sampling fractions for the train, test (base/low/high demand) and leak data
sampling_fractions = (trainfrac, testfrac, testfrac_diff_low, testfrac_diff_high, leakfrac)
print(*sampling_fractions)

0.10416666666666667 0.09615384615384616 0.21739130434782608 0.23809523809523808 0.024509803921568627


In [50]:
# Size of the training set; preferably a multiple of 6 so that it splits evenly below
num_training_sample_total = 180
# One sixth of the examples for each of the three leak datasets ...
num_training_sample_leak = num_training_sample_total // 6
# ... and whatever remains for the 'no leak' case
num_training_sample_noleak = num_training_sample_total - 3 * num_training_sample_leak

In [51]:
# check: number of examples per leak dataset, then number of 'no leak' examples
training_sizes = (num_training_sample_leak, num_training_sample_noleak)
print(*training_sizes)

30 90


## Training data with and without leak
* We have assumed 7 sensor locations, each measuring flowrate and pressure
* The first 7 pressure-head and 7 flowrate columns represent the mean values of the 'expected' sensor observations

In [52]:
# Reference ('expected') means: one row per training example, each row being the
# column-wise mean of a random subset of the no-leak training data
mean_train_reference = [
    list(data_train_df[cols_noleak].sample(frac=trainfrac).mean())
    for _ in range(num_training_sample_total)
]
mean_train_df_ref = pd.DataFrame(np.array(mean_train_reference), columns=cols_means_ref)

* The next 7 pressure and 7 flowrate columns represent 'observed' means that are known to belong to a 'leak' or 'no leak' case
* Two sets of labels are added: the first is named 'leak' and is binary; the other is named 'num_leak' and takes four values, i.e. 0, 14, 24 and 31

In [53]:
# Observed means for the 'no leak' case: column-wise means of random subsets of
# the no-leak training data, labelled leak=0 / num_leak=0
mean_train = [
    list(data_train_df[cols_noleak].sample(frac=trainfrac).mean())
    for _ in range(num_training_sample_noleak)
]

mean_train_df = pd.DataFrame(np.array(mean_train), columns=cols_means).assign(leak=0, num_leak=0)

# Observed means for the leak cases. Every example is the column-wise mean of
# `sample_length` randomly chosen rows of one leak dataset.
# Sampling with n=sample_length (rather than frac=leakfrac, which is derived from
# len(leak_data_14) alone) guarantees the same sample size for all three datasets
# even if their row counts differ.
mean_leak = []
leaknum = []
for leak_label, leak_frame in ((14, leak_data_14), (24, leak_data_24), (31, leak_data_31)):
    for _ in range(num_training_sample_leak):
        mean_leak.append(list(leak_frame[cols_leak].sample(n=sample_length).mean()))
        # Remember which leak dataset this example came from
        leaknum.append(leak_label)

mean_leak_df = pd.DataFrame(columns=cols_means, data=np.array(mean_leak))
mean_leak_df['leak'] = 1           # binary label: leak present
mean_leak_df['num_leak'] = leaknum  # multi-class label: 14, 24 or 31

* Combining the leak and no leak data together to form 'training set' for classification model

In [54]:
# Stack the 'no leak' rows on top of the leak rows with a fresh 0..N-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df_temp1 = pd.concat([mean_train_df, mean_leak_df], ignore_index=True)
# Place the reference ('expected') means next to the observed means and labels
training_classification = pd.concat([mean_train_df_ref, df_temp1], axis=1)
# Shuffle the rows
training_classification = training_classification.sample(frac=1)

In [55]:
# check: first three rows of the shuffled training set
training_classification.iloc[:3]

Unnamed: 0,ref_mean_head4,ref_mean_head9,ref_mean_head18,ref_mean_head20,ref_mean_head26,ref_mean_head28,ref_mean_head32,ref_mean_flow4,ref_mean_flow8,ref_mean_flow17,...,mean_head32,mean_flow4,mean_flow8,mean_flow17,mean_flow20,mean_flow27,mean_flow30,mean_flow33,leak,num_leak
166,140.322642,120.588803,132.632025,131.8687,108.880792,108.608765,107.066691,2.401865,1.278237,-0.438791,...,147.474237,2.292302,1.220457,-0.428846,2.373272,-0.075435,0.16564,0.179139,1,31
29,157.148621,137.286484,150.284364,148.406133,123.677857,124.120527,121.212152,2.388643,1.295577,-0.429482,...,90.428589,2.419493,1.316568,-0.429192,2.503072,-0.06032,0.158231,0.147877,0,0
33,188.814297,167.048915,182.006465,179.037187,152.447853,154.376955,149.482185,2.520204,1.363075,-0.438384,...,137.22215,2.224304,1.175552,-0.391558,2.301164,-0.056522,0.134292,0.123306,0,0


## Test data without leak (same and diff demand) and with leak (same demand)

* First 7 pressure and 7 flowrate features representing 'expected' sensor readings

In [56]:
# Reference ('expected') means for the test sets: like the training reference,
# these are column-wise means of random subsets of the no-leak training data
mean_test_reference = [
    list(data_train_df[cols_noleak].sample(frac=trainfrac).mean())
    for _ in range(num_training_sample_total)
]

mean_test_df_ref = pd.DataFrame(np.array(mean_test_reference), columns=cols_means_ref)

* Next 7 plus 7 features representing the 'recently observed' or 'to be tested for leak' observations or sensor readings across 7 sensor locations

In [61]:
# 'No leak' test examples under the base demand: column-wise means of random
# subsets of the base-demand test data, labelled leak=0 / num_leak=0
mean_test_same_demand = [
    list(data_test_df[cols_noleak].sample(frac=testfrac).mean())
    for _ in range(num_training_sample_noleak)
]

mean_test_df_same_demand = (
    pd.DataFrame(np.array(mean_test_same_demand), columns=cols_means)
    .assign(leak=0, num_leak=0)
)


# 'No leak' test examples under the low-demand scenario: column-wise means of
# random subsets of the low-demand test data, labelled leak=0 / num_leak=0
mean_test_diff_demand_low = [
    list(data_test_df_diff_demand_low[cols_noleak].sample(frac=testfrac_diff_low).mean())
    for _ in range(num_training_sample_noleak)
]

mean_test_df_diff_demand_low = (
    pd.DataFrame(np.array(mean_test_diff_demand_low), columns=cols_means)
    .assign(leak=0, num_leak=0)
)

# 'No leak' test examples under the high-demand scenario: column-wise means of
# random subsets of the high-demand test data, labelled leak=0 / num_leak=0
mean_test_diff_demand_high = [
    list(data_test_df_diff_demand_high[cols_noleak].sample(frac=testfrac_diff_high).mean())
    for _ in range(num_training_sample_noleak)
]

mean_test_df_diff_demand_high = (
    pd.DataFrame(np.array(mean_test_diff_demand_high), columns=cols_means)
    .assign(leak=0, num_leak=0)
)


# Leak test examples (base demand). Every example is the column-wise mean of
# `sample_length` randomly chosen rows of one held-out leak dataset.
# n=sample_length replaces frac=leakfrac: leakfrac is sample_length / len(leak_data_14),
# and the *_test splits have a different row count (153 vs. 204 in the recorded run),
# so the fraction drew round(153 * 5 / 204) = 4 rows instead of the intended 5.
mean_leak_same_demand = []
leaknum_test_same = []
for leak_label, leak_frame in (
    (14, leak_data_14_test),
    (24, leak_data_24_test),
    (31, leak_data_31_test),
):
    for _ in range(num_training_sample_leak):
        mean_leak_same_demand.append(list(leak_frame[cols_leak].sample(n=sample_length).mean()))
        # Remember which leak dataset this example came from
        leaknum_test_same.append(leak_label)

mean_leak_df_same_demand = pd.DataFrame(columns=cols_means, data=np.array(mean_leak_same_demand))
mean_leak_df_same_demand['leak'] = 1                     # binary label: leak present
mean_leak_df_same_demand['num_leak'] = leaknum_test_same  # multi-class label: 14, 24 or 31

## Creating test sets

* Only original demand based test sets, with both 'no leak' and 'with leak' scenario

In [62]:
# Stack base-demand 'no leak' rows on top of the leak rows with a fresh index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df_temp1 = pd.concat([mean_test_df_same_demand, mean_leak_df_same_demand], ignore_index=True)
# Place the reference ('expected') means next to the observed means and labels
test_classification_1 = pd.concat([mean_test_df_ref, df_temp1], axis=1)

* Set combining the 'no leak' cases from different demand based scenarios AND leak cases based on 'original' demand

In [63]:
# pd.concat is used for the row-wise stacking because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.

# Low-demand 'no leak' rows + base-demand leak rows, next to the reference means
df_temp2 = pd.concat([mean_test_df_diff_demand_low, mean_leak_df_same_demand], ignore_index=True)
test_classification_2 = pd.concat([mean_test_df_ref, df_temp2], axis=1)

# High-demand 'no leak' rows + base-demand leak rows, next to the reference means
df_temp3 = pd.concat([mean_test_df_diff_demand_high, mean_leak_df_same_demand], ignore_index=True)
test_classification_3 = pd.concat([mean_test_df_ref, df_temp3], axis=1)

In [64]:
# Shuffle the rows of each test set (sample(frac=1) returns all rows in random order)
test_classification_1, test_classification_2, test_classification_3 = (
    test_set.sample(frac=1)
    for test_set in (test_classification_1, test_classification_2, test_classification_3)
)

In [67]:
# check: (rows, columns) of the first test set
test_set_1_shape = test_classification_1.shape
print(test_set_1_shape)

(180, 30)


## Saving the training and test sets

In [66]:
# Output folder (inside the parent folder)
datafiles_folder_name = 'Data_files'

# Output file names: one training set and three test sets
datafile_training = 'data_training_classification_mean_based.csv'
datafile_test1 = 'data_testing_classification_mean_based_BaseDemand.csv'
datafile_test2 = 'data_testing_classification_mean_based_DiffLow.csv'
datafile_test3 = 'data_testing_classification_mean_based_DiffHigh.csv'

# Full output paths ('path_parent' is defined in an earlier cell)
output_dir = os.path.join(path_parent, datafiles_folder_name)
path_training = os.path.join(output_dir, datafile_training)
path_test1 = os.path.join(output_dir, datafile_test1)
path_test2 = os.path.join(output_dir, datafile_test2)
path_test3 = os.path.join(output_dir, datafile_test3)

# Create the output folder if it does not exist yet
os.makedirs(output_dir, exist_ok=True)

# Write every dataset to its CSV file
for dataset, csv_path in (
    (training_classification, path_training),
    (test_classification_1, path_test1),
    (test_classification_2, path_test2),
    (test_classification_3, path_test3),
):
    dataset.to_csv(csv_path, index=None)