### Library and File Imports

The following libraries are used to analyze the given dataset.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
from scipy.stats import norm

import chart_studio.plotly as py
import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
# Initialise Plotly's notebook mode and put cufflinks into offline mode;
# the `.iplot(...)` histogram cells further down rely on this setup.
init_notebook_mode(connected = True)
cf.go_offline()

In [None]:
# Read the dataset into a pandas DataFrame named swat.

In [None]:
# Load 'SWAT.csv' (path relative to the working directory) into the DataFrame `swat`.
swat = pd.read_csv('SWAT.csv')

### Basic Information on the Dataset




In [None]:
# Show the DataFrame as the cell output.
swat

In [None]:
# Preview the first 5 rows of the dataset.
swat.head(5)

In [None]:
# Descriptive statistics of the dataset: keep them in `t`, persist them to
# 'describe.csv', and show them as the cell output.
t = swat.describe()
t.to_csv(path_or_buf='describe.csv')
t

### Correlation of the DataSet

In [None]:
#we have computed the correlation between the different columns of the dataset in this section

In [None]:
# Pairwise correlation matrix of the columns. Note that `t` is rebound here:
# it previously held the describe() summary and now holds the correlations
# used by the heatmap and the sorted export below.
t = swat.corr()

In [None]:
# Draw the correlation matrix `t` as a heatmap on a 19x13 inch figure.
fig, ax = plt.subplots(figsize=(19, 13))
sns.heatmap(t, cmap="coolwarm", ax=ax)

In [None]:
# (number of rows, number of columns) of the dataset.
swat.shape

In [None]:
# We sort all pairwise correlations from highest to lowest and save them into a CSV file named topcorrelation.csv for future use.

In [None]:
# Flatten the correlation matrix into a Series indexed by (column, column)
# pairs, order it from the largest coefficient to the smallest, then discard
# the NaN entries.
st = (
    t.unstack()
    .sort_values(ascending=False, kind="quicksort")
    .dropna()
)


In [None]:
# Write the sorted correlation pairs in `st` to 'topcorrelation.csv'.
st.to_csv('topcorrelation.csv')

### Causality

In [None]:
#Granger-Causality Test
#The Granger causality test is used to determine whether one time series is useful for forecasting another time series.
#The test returns an F-test statistic with a corresponding p-value.
#If the p-value is less than a chosen significance level (e.g. α = 0.05), we can say that the first time series Granger-causes the second time series.

In [None]:
#Below is an example of a granger causality test and the results it returns

In [None]:
from statsmodels.tsa.stattools import grangercausalitytests

# Example run at lag 1 on the two-column frame [LIT101, FIT101]; the returned
# results are shown as the cell output.
lit101_fit101 = swat[['LIT101', 'FIT101']]
grangercausalitytests(lit101_fit101, maxlag=[1])


In [None]:
#From the above result we can infer that FIT101 can forecast LIT101 because the corresponding p-value is less than 0.05

In [None]:
#we will drop all the null values for better accuracy of the test and save it in a new dataset

In [None]:
# Copy of the dataset without rows that contain missing values; `swat` itself is not modified.
swatnew = swat.dropna()

In [None]:
# Print the column summary of the cleaned frame.
# Author's note: the dataset has no null values, so dropna() left it unchanged.
swatnew.info()

In [None]:
#Below we have automated the process of the Granger Causality test for all the columns 
#All the column pairs that pass the test will be saved in a dataframe
#Certain Columns as mentioned in the second line of the block of code have been dropped as they have a constant value throughout the dataset
#Please wait as the block of code will take a couple of minutes to finish executing

In [None]:
# Run a lag-1 Granger causality test on every ordered pair of DISTINCT columns
# and collect the pairs whose p-value is below the 0.05 significance level.
nm = []
# Columns the notebook author identified as constant throughout the dataset
# are removed before testing.
swatfinal = swatnew.drop(['P102','P201','P202','P204','P206','P401','P403','P404','P502','P601','P603'],axis = 1)
for y in swatfinal:
    for x in swatfinal:
        if x == y:
            # Testing a series against itself is a degenerate regression
            # (both regressors are identical) and carries no information.
            continue
        # grangercausalitytests checks whether the SECOND column (y)
        # Granger-causes the FIRST column (x).
        gc = grangercausalitytests(swatfinal[[x, y]], maxlag=[1],verbose = False)
        # gc[lag][0] is the dict of test results; the second element of the
        # 'ssr_ftest' entry is the p-value of the SSR-based F test.
        res = gc[1][0]['ssr_ftest'][1]
        if(res<0.05):
            nm.append((y,'Cause',x, res))

# NOTE(review): the label 'Column Name' is used for both the cause and the
# effect column; it is kept as-is so the saved CSV headers do not change.
causalityvalues = pd.DataFrame(nm,columns = ['Column Name','Relation', 'Column Name', 'P_Score'])
causalityvalues

In [None]:
# Save the causality pairs, in the order they were found, to 'CausalityUnsorted.csv'.
causalityvalues.to_csv('CausalityUnsorted.csv')

In [None]:
#Sorting the dataset according to the pairs which gave the best result and saving it

In [None]:
# Rebuild the result table from `nm`, order it by ascending p-value, renumber
# the rows, save it to 'CausalitySorted.csv' and show it.
causality_columns = ['Column Name','Relation', 'Column Name', 'P_Score']
causalityvalues = (
    pd.DataFrame(nm, columns=causality_columns)
    .sort_values(by='P_Score')
    .reset_index(drop=True)
)
causalityvalues.to_csv('CausalitySorted.csv')
causalityvalues
# Author's observation from the recorded run: around 53 pairs in the dataset have perfect causality.

In [None]:
#FROM THE FOLLOWING TESTS WE CAN SEE THAT FIT201 HAS MUCH BETTER CAUSALITY WITH AIT201, AIT202, AIT203 THAN FIT101, INDICATING DIFFERENT STAGES.
#FROM THE TESTS WE CAN IDENTIFY THAT FIT201 CAUSES LIT101 AND NOT VICE VERSA
#FROM THE TESTS WE CAN IDENTIFY THAT MV201 CAUSES MV101 AND NOT VICE VERSA
#FROM THE TESTS WE CAN IDENTIFY THAT AIT201 CAUSES MV101 AND NOT VICE VERSA


In [None]:
#COMPARING THE PROCESSES AT SEWAGE WATER TREATMENT PLANTS, WE CAN CONCLUDE FROM THE ABOVE TESTS THAT THE FIRST DIGIT OF THE LABEL
#OF THE COLUMN INDICATES THE STAGE OF THE PROCESS.
#EXAMPLE:P101 INDICATES FIRST STAGE SINCE THE DIGIT AFTER 'P' IS 1.



In [None]:
#OUR INFERENCE AFTER STUDYING THE PROCESS THAT HAPPENS AT WATER TREATMENT PLANTS:
#FIT HAS TO BE THE FLOW INDICATOR
#MV HAS TO BE THE VALVE(PROBABLY AS IT HAS DISCRETE VALUES)
#LIT HAS TO BE THE LEVEL INDICATOR
#P HAS TO BE THE PUMP AS MOST OF THE VALUES ARE BINARY
#AIT HAS TO BE THE QUALITY INDICATOR AS IT IS NOT PRESENT IN STAGE 1.
#UV MIGHT BE RELATED TO ULTRAVIOLET LIGHT AS IT HAS 2 DISCRETE VALUES.

### Curves that fit the Gaussian Distribution

In [None]:
#import plotly.io as pio
#pio.renderers.default = "colab"

In [None]:

# Cufflinks/Plotly histogram of all LIT101 readings (500 bins).
swat['LIT101'].iplot(kind= 'hist',bins = 500)

In [None]:

# Gaussian fit for LIT101, restricted to the readings inside [a, b].
a, b = 450, 550
readings = swat['LIT101']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=24, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)

# The title is built from the actual filter bounds (it used to state
# "400 and 600" while the data was filtered to 450..550).
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:

# Cufflinks/Plotly histogram of all FIT101 readings (200 bins).
swat['FIT101'].iplot(kind= 'hist',bins = 200)

In [None]:

# Gaussian fit for FIT101, restricted to the readings inside [a, b].
a, b = 2.415, 2.675
readings = swat['FIT101']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)

# The bounds have three decimals, so a fixed two-decimal format would
# misreport them; general formatting keeps the significant digits.
ax.set_title("Fit Values: {:.4g} and {:.4g} (values between {:g} and {:g})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all AIT201 readings (200 bins).
swat['AIT201'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for AIT201, restricted to the readings inside [a, b].
a, b = 251.6, 264.4
readings = swat['AIT201']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=20, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all AIT202 readings (200 bins).
swat['AIT202'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for AIT202, restricted to the readings inside [a, b].
a, b = 8.29, 8.45
readings = swat['AIT202']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all AIT203 readings (200 bins).
swat['AIT203'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for AIT203, restricted to the readings inside [a, b].
a, b = 300, 346
readings = swat['AIT203']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all FIT201 readings (200 bins).
swat['FIT201'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for FIT201, restricted to the readings inside [a, b].
a, b = 2.43, 2.47
readings = swat['FIT201']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=9, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all DPIT301 readings (200 bins).
swat['DPIT301'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for DPIT301, restricted to the readings inside [a, b].
a, b = 18.4, 20.8
readings = swat['DPIT301']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all FIT301 readings (200 bins).
swat['FIT301'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for FIT301, restricted to the readings inside [a, b].
a, b = 2.13, 2.29
readings = swat['FIT301']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all LIT301 readings (200 bins).
swat['LIT301'].iplot(kind= 'hist',bins = 200)

In [None]:
# Cufflinks/Plotly histogram of all AIT402 readings (200 bins).
swat['AIT402'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for AIT402, restricted to the readings inside [a, b].
a, b = 178.5, 184.5
readings = swat['AIT402']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all LIT401 readings (200 bins).
swat['LIT401'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for LIT401, restricted to the readings inside [a, b].
a, b = 770, 1000
readings = swat['LIT401']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all AIT501 readings (200 bins).
swat['AIT501'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for AIT501, restricted to the readings inside [a, b].
a, b = 7.81, 7.87
readings = swat['AIT501']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all AIT502 readings (200 bins).
swat['AIT502'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for AIT502, restricted to the readings inside [a, b].
a, b = 166.5, 178
readings = swat['AIT502']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all AIT503 readings (200 bins).
swat['AIT503'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for AIT503, restricted to the readings inside [a, b].
a, b = 264.2, 278.4
readings = swat['AIT503']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all AIT504 readings (200 bins).
swat['AIT504'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for AIT504, restricted to the readings inside [a, b].
a, b = 10, 14
readings = swat['AIT504']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all FIT501 readings (200 bins).
swat['FIT501'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for FIT501, restricted to the readings inside [a, b].
a, b = 1.68, 1.72
readings = swat['FIT501']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all FIT502 readings (200 bins).
swat['FIT502'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for FIT502, restricted to the readings inside [a, b].
a, b = 1.21, 1.31
readings = swat['FIT502']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all FIT503 readings (200 bins).
swat['FIT503'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for FIT503, restricted to the readings inside [a, b].
a, b = 0.735, 0.745
readings = swat['FIT503']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)

# The window is only 0.01 wide and its bounds have three decimals, so a fixed
# two-decimal format would misreport the bounds and flatten the standard
# deviation; general formatting keeps the significant digits.
ax.set_title("Fit Values: {:.4g} and {:.4g} (values between {:g} and {:g})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all PIT501 readings (200 bins).
swat['PIT501'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for PIT501, restricted to the readings inside [a, b].
a, b = 250, 258
readings = swat['PIT501']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all PIT502 readings (200 bins).
swat['PIT502'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for PIT502, restricted to the readings inside [a, b].
a, b = 0.83, 1.13
readings = swat['PIT502']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=7, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all PIT503 readings (200 bins).
swat['PIT503'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for PIT503, restricted to the readings inside [a, b].
a, b = 190, 197
readings = swat['PIT503']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)
ax.set_title("Fit Values: {:.2f} and {:.2f} (values between {:.2f} and {:.2f})".format(mu, std, a, b))

plt.show()

In [None]:
# Cufflinks/Plotly histogram of all FIT601 readings (200 bins).
swat['FIT601'].iplot(kind= 'hist',bins = 200)

In [None]:
# Gaussian fit for FIT601, restricted to the readings inside [a, b].
a, b = -0.005, 0.005
readings = swat['FIT601']
window = readings[readings.between(a, b)]

# Fit a normal distribution to the selected readings.
mu, std = norm.fit(window)

fig, ax = plt.subplots()
ax.hist(window, bins=10, density=True, alpha=0.6, color='b')

# Evaluate the fitted density on 100 points across the histogram's x-range.
grid = np.linspace(*ax.get_xlim(), 100)
ax.plot(grid, norm.pdf(grid, mu, std), 'k', linewidth=2)

# Every value in this window has magnitude <= 0.005, so a fixed two-decimal
# format would misreport the bounds and collapse mean and standard deviation
# to 0.00; general formatting keeps the significant digits.
ax.set_title("Fit Values: {:.4g} and {:.4g} (values between {:g} and {:g})".format(mu, std, a, b))

plt.show()

In [None]:
#The following columns are a good fit to the normal distribution:
#LIT101,LIT401
#FIT201,FIT301,FIT502
#AIT402,AIT502,AIT503,AIT504
#PIT501,PIT502,PIT503