In [14]:
import numpy as np
import pandas as pd
import os
import copy
import seaborn as sbn
import matplotlib.pylab as plt

from sklearn.preprocessing import MinMaxScaler, StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from itertools import permutations, combinations

## Loading train and test datafiles

In [2]:
# Locate the directory one level above the current working directory
path_cwd = os.getcwd()
path_parent = os.path.abspath(os.path.join(path_cwd, os.pardir))

# Build the full paths of the train and test CSV files inside the data folder
datafiles_folder_name = 'Data_files'
trainfile, testfile = 'data_base_demand_train_1.csv', 'data_base_demand_test.csv'
path_trainfile, path_testfile = (
    os.path.join(path_parent, datafiles_folder_name, csv_name)
    for csv_name in (trainfile, testfile)
)

In [3]:
# Column layout of each data row: the first `num_demand` columns are read as
# demands and the next `num_nodes` columns as heads; every remaining column is
# read as a flow.
num_demand = num_nodes = 32
# NOTE(review): presumably the number of flow columns in the data — TODO confirm
num_links = 34

In [4]:
# Read the train and test CSV files into DataFrames with pandas' default
# parsing options.
data_train_df = pd.read_csv(filepath_or_buffer=path_trainfile)
data_test_df = pd.read_csv(filepath_or_buffer=path_testfile)

In [5]:
# Convert each DataFrame into a NumPy array. np.array copies the values, so
# later in-place operations on the arrays leave the DataFrames untouched.
data_train, data_test = (
    np.array(frame) for frame in (data_train_df, data_test_df)
)

In [6]:
# Shuffle the rows of both arrays in place. A dedicated, seeded generator makes
# the resulting row order identical on every "Restart & Run All" and leaves
# NumPy's global random state alone.
SHUFFLE_SEED = 42
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)
shuffle_rng.shuffle(data_train)
shuffle_rng.shuffle(data_test)

## In this experiment, a regression model is trained to learn deltaH (the head difference between two sensor locations) from the flows flow1 and flow2 measured at those locations. The function below extracts flow1 and flow2 as the input data and computes 'head1 - head2' as the output data for the model.

In [9]:
def train_test_deltaH(data, link_list, deltaH_list, n_demand=None, n_nodes=None):
    """Build regression inputs (flows) and target (head difference) from raw data.

    Each row of ``data`` is read as ``n_demand`` demand columns, followed by
    ``n_nodes`` head columns, followed by flow columns.

    Parameters
    ----------
    data : 2-D numpy.ndarray
        One sample per row, laid out as described above.
    link_list : sequence of int
        Link identifiers; identifier ``i`` selects flow column ``i - 1``.
    deltaH_list : sequence of int
        Node identifiers; identifier ``i`` selects head column ``i - 2``.
        Only the first two entries are used.
    n_demand : int, optional
        Number of leading demand columns. Defaults to the module-level
        ``num_demand``.
    n_nodes : int, optional
        Number of head columns. Defaults to the module-level ``num_nodes``.

    Returns
    -------
    train_in : numpy.ndarray
        Flow columns of the requested links multiplied by 1000, one column
        per entry of ``link_list``.
    train_out : numpy.ndarray
        Head of the first node minus head of the second node, one value per row.
    """
    # Fall back to the notebook-level layout constants when no override is given.
    if n_demand is None:
        n_demand = num_demand
    if n_nodes is None:
        n_nodes = num_nodes

    head_start = n_demand
    flow_start = n_demand + n_nodes

    data_head = data[:, head_start:flow_start]
    # NOTE(review): flows are scaled by 1000, presumably a unit conversion — confirm
    data_flow = data[:, flow_start:] * 1000

    link_cols = [i - 1 for i in link_list]
    # NOTE(review): node ids are shifted by 2 whereas link ids are shifted by 1;
    # presumably the head columns start at node id 2 — confirm against the data file
    node_cols = [i - 2 for i in deltaH_list]

    train_out = data_head[:, node_cols[0]] - data_head[:, node_cols[1]]
    train_in = data_flow[:, link_cols]

    return train_in, train_out

## The function below uses the function above to get the input and output data for a given sensor pair, fits a linear regression model on degree-2 polynomial features of the two flows, and returns the test values and the predictions.

In [10]:
# One entry per sensor location: [node id used for the head reading,
# link id used for the flow reading]. Sensors are addressed by their
# 1-based position in this list.
sensor_list = [
    [4, 4],
    [9, 8],
    [18, 17],
    [20, 20],
    [26, 27],
    [28, 30],
    [32, 33],
]

def sensor_pair_pred(sensor_pair, degree=2):
    """Fit a polynomial regression of head difference on flows for one sensor pair.

    The model is fitted on ``data_train`` and evaluated on ``data_test``.

    Parameters
    ----------
    sensor_pair : sequence of int
        Two 1-based positions into ``sensor_list``; the first two entries
        identify the sensors whose head difference is modelled.
    degree : int, optional
        Degree of the polynomial feature expansion applied to the two flows.
        Defaults to 2.

    Returns
    -------
    y_test : numpy.ndarray
        Head differences taken from the test data.
    pred : numpy.ndarray
        Model predictions for the test data, flattened to one dimension.
    """
    # Each sensor_list entry is [head node id, flow link id].
    h1, f1 = sensor_list[sensor_pair[0] - 1]
    h2, f2 = sensor_list[sensor_pair[1] - 1]
    knwn_link_list = [f1, f2]
    delta_nodes = [h1, h2]

    X_train, y_train = train_test_deltaH(data_train, knwn_link_list, delta_nodes)
    X_test, y_test = train_test_deltaH(data_test, knwn_link_list, delta_nodes)

    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)

    lin_model = LinearRegression()
    lin_model.fit(X_train_poly, y_train)

    # Reuse the transformer fitted on the training data; never refit on test data.
    X_test_poly = poly.transform(X_test)
    pred = lin_model.predict(X_test_poly).reshape(-1)

    return y_test, pred


## We now use the above functions for all sensor pairs

In [15]:
# Sensors are numbered 1..7; enumerate every unordered pair of distinct sensors
sensor_nums = list(range(1, 8))
pair_combs = [*combinations(sensor_nums, 2)]

In [16]:
# Sanity check: display the generated list of sensor pairs
pair_combs

[(1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (1, 6),
 (1, 7),
 (2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (2, 7),
 (3, 4),
 (3, 5),
 (3, 6),
 (3, 7),
 (4, 5),
 (4, 6),
 (4, 7),
 (5, 6),
 (5, 7),
 (6, 7)]

## Running the model across all sensor pairs and storing results

In [17]:
# Per-pair accumulators: one list per column of the summary table
paircomb = []
mean_test, mean_pred, mean_error = [], [], []
std_test, std_pred, std_error = [], [], []
r2 = []

for pair in pair_combs:
    # Fit and evaluate the model for this pair, then derive the residuals
    ytest, pred = sensor_pair_pred(list(pair))
    error = ytest - pred

    paircomb.append(pair)

    # Means of the test values, the predictions and the residuals
    mean_test.append(np.mean(ytest))
    mean_pred.append(np.mean(pred))
    mean_error.append(np.mean(error))

    # Standard deviations of the same three series
    std_test.append(np.std(ytest))
    std_pred.append(np.std(pred))
    std_error.append(np.std(error))

    r2.append(r2_score(ytest, pred))

# Assemble the summary table in one step, one row per sensor pair
df_mean_std = pd.DataFrame({
    'sensor_pair': paircomb,
    'mean_test': mean_test,
    'mean_pred': mean_pred,
    'mean_error': mean_error,
    'std_test': std_test,
    'std_pred': std_pred,
    'std_error': std_error,
    'R_Squared': r2,
})

In [18]:
# Display the per-pair summary table (means, standard deviations and R^2)
df_mean_std

Unnamed: 0,sensor_pair,mean_test,mean_pred,mean_error,std_test,std_pred,std_error,R_Squared
0,"(1, 2)",19.592008,19.707121,-0.115113,4.999658,5.059864,0.269352,0.996567
1,"(1, 3)",7.227478,7.299451,-0.071973,1.680638,1.42984,0.933784,0.68946
2,"(1, 4)",8.416239,8.439284,-0.023045,2.132318,2.15065,0.129554,0.996192
3,"(1, 5)",31.657801,31.829424,-0.171623,9.097251,9.278491,3.663023,0.837516
4,"(1, 6)",29.642807,31.1973,-1.554493,10.336416,9.680304,4.808505,0.760971
5,"(1, 7)",33.86641,34.639818,-0.773407,10.428226,12.066871,3.430812,0.886263
6,"(2, 3)",-12.364531,-12.337225,-0.027306,3.952541,3.507503,1.492438,0.857378
7,"(2, 4)",-11.175769,-11.137794,-0.037975,3.236123,2.72311,1.29609,0.839456
8,"(2, 5)",12.065793,12.281583,-0.215791,4.509026,4.271367,2.79252,0.614155
9,"(2, 6)",10.050799,11.802284,-1.751485,6.188719,5.544509,4.505445,0.389906
