# Presentation of gathered results


In [1]:
import plotly.express as px
import plotly.graph_objects as go

import math
import pandas as pandas
from pandas import concat
import numpy as np
from scipy.stats import pearsonr
from statistics import mean
from numpy.random import shuffle

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

data = pandas.read_json("../data/water_loss_data_set_1.json")
data = data[(data.timeStamp != "0000-00-00 00:00:00" ) & (data.timeStamp != "2000-01-01 00:00:00")]
data = data[(data.tot1 != 0 ) & (data.analog2 != 0)]
data["timeStamp"] = pandas.to_datetime(data["timeStamp"], format="%Y-%m-%d %H:%M")

data_with_minutes = data.copy(deep=True)

# Normalisation of pressure data
# data["analog2"] = ((data["analog2"] - data["analog2"].min()) / (data["analog2"].max() - data["analog2"].min()))

data_249_min = data[data['idflowmeter'] == "MAG8000_024905H318"]
data_248_min = data[data['idflowmeter'] == "MAG8000_024805H318"]
data_249_min

Unnamed: 0,timeStamp,idflowmeter,tot1,tot2,analog2
105,2018-11-23 08:55:00,MAG8000_024905H318,0.43,0.00,1.11
106,2018-11-23 08:56:00,MAG8000_024905H318,0.89,0.00,1.11
107,2018-11-23 08:57:00,MAG8000_024905H318,1.38,0.00,1.11
108,2018-11-23 08:58:00,MAG8000_024905H318,1.90,0.00,1.11
109,2018-11-23 08:59:00,MAG8000_024905H318,2.45,0.00,1.11
...,...,...,...,...,...
1395620,2020-04-01 08:35:00,MAG8000_024905H318,797188.56,-293.97,0.82
1395621,2020-04-01 08:36:00,MAG8000_024905H318,797192.50,-293.97,0.81
1395622,2020-04-01 08:37:00,MAG8000_024905H318,797195.06,-293.97,0.81
1395623,2020-04-01 08:38:00,MAG8000_024905H318,797197.62,-293.97,0.81


In [2]:
data

Unnamed: 0,timeStamp,idflowmeter,tot1,tot2,analog2
105,2018-11-23 08:55:00,MAG8000_024905H318,0.43,0.00,1.11
106,2018-11-23 08:56:00,MAG8000_024905H318,0.89,0.00,1.11
107,2018-11-23 08:57:00,MAG8000_024905H318,1.38,0.00,1.11
108,2018-11-23 08:58:00,MAG8000_024905H318,1.90,0.00,1.11
109,2018-11-23 08:59:00,MAG8000_024905H318,2.45,0.00,1.11
...,...,...,...,...,...
1395740,2020-04-01 14:05:00,MAG8000_024805H318,977428.25,-4713.21,1.54
1395741,2020-04-01 14:06:00,MAG8000_024805H318,977428.88,-4713.21,1.54
1395742,2020-04-01 14:07:00,MAG8000_024805H318,977429.50,-4713.21,1.54
1395743,2020-04-01 14:08:00,MAG8000_024805H318,977430.12,-4713.21,1.54


In [3]:
def data_sel_time(start, end, data_fun):
    return data_fun.loc[(data_fun["timeStamp"] >= start) & (data_fun["timeStamp"] <= end)]   

def derivate(x_data, y_data):
    x_arr = []
    y_arr = []
    derivate_y = np.diff(y_data, n=2) # second derivative
    idx_max_dy = np.argmax(derivate_y)

    """    # initialize N  
    N = 20  # Indices of N largest elements in list 
    res = sorted(range(len(derivate_y)), key = lambda sub: derivate_y[sub])[-N:][::-1] """
    # print(sorted(set(derivate_y)))
    
    for index, value in enumerate(derivate_y):
        if abs(value) >= 0.05:
            # print(np.datetime_as_string(x_data[index], unit='m'), " value:", y_data[index])
            x_arr.append(x_data[index])
            y_arr.append(y_data[index])
    
    return x_arr, y_arr

### Overview of all data with an option to zoom on specific part 

List of dates where an anomaly occurs:
- 24.1.2019 7.27
- 25.1.2019 6.25

In [4]:
# anomaly_rows_248 = data_248_min.loc[(data_248_min['analog2'] <= 0.72)]
data_249_processed = data_sel_time("2018-11-28 08:00:00", "2020-06-26 08:00:00", data_248_min)
anomaly_rows_249 = data_249_processed.loc[(data_249_processed['analog2'] <= 0.72)]
dates_249 = anomaly_rows_249["timeStamp"].map(pandas.Timestamp.date).unique()
dates_249 = [i.strftime("%d-%m-%Y") for i in dates_249]

with pandas.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(dates_249)

['15-12-2018', '19-12-2018', '20-12-2018', '22-12-2018', '24-12-2018', '26-12-2018', '27-12-2018', '28-12-2018', '30-12-2018', '01-01-2019', '02-01-2019', '03-01-2019', '04-01-2019', '05-01-2019', '09-01-2019', '10-01-2019', '11-01-2019', '12-01-2019', '13-01-2019', '14-01-2019', '15-01-2019', '16-01-2019', '18-01-2019', '19-01-2019', '20-01-2019', '22-01-2019', '24-01-2019', '25-01-2019', '26-01-2019', '27-01-2019', '29-01-2019', '30-01-2019', '31-01-2019', '01-02-2019', '02-02-2019', '04-02-2019', '05-02-2019', '06-02-2019', '08-02-2019', '09-02-2019', '10-02-2019', '11-02-2019', '12-02-2019', '13-02-2019', '15-02-2019', '20-02-2019', '23-02-2019', '24-02-2019', '28-02-2019', '01-03-2019', '02-03-2019', '03-03-2019', '06-03-2019', '07-03-2019', '09-03-2019', '12-03-2019', '13-03-2019', '16-03-2019', '17-03-2019', '19-03-2019', '20-03-2019', '21-03-2019', '22-03-2019', '23-03-2019', '24-03-2019', '25-03-2019', '26-03-2019', '27-03-2019', '28-03-2019', '29-03-2019', '30-03-2019', '31-0

Graph with visualization of all data

In [5]:
"""df = data_sel_time("2018-11-20 08:00:00", "2020-06-26 08:00:00", data_249_min)
x = df["timeStamp"].values
y = df["analog2"].values
# print(x)

fig = px.line(df, x="timeStamp", y="analog2", height=600)
fig.show()"""

'df = data_sel_time("2018-11-20 08:00:00", "2020-06-26 08:00:00", data_249_min)\nx = df["timeStamp"].values\ny = df["analog2"].values\n# print(x)\n\nfig = px.line(df, x="timeStamp", y="analog2", height=600)\nfig.show()'

Graph with criticical points they are only determined after an anomaly already happends

In [6]:
x_d, y_d = derivate(x, y)

"""
# With "critical points"
fig = go.Figure()
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y))
fig.add_trace(go.Scatter(x=x_d, y=y_d, mode='markers', name='markers'))
""";

NameError: name 'x' is not defined

Prasi kaj je blo za te datume:  
['09-01-2019', '10-01-2019', '13-01-2019', '24-01-2019', '25-01-2019', '26-01-2019', '12-03-2019', '31-03-2019', '04-04-2019', '05-04-2019', '09-04-2019', '14-04-2019', '15-04-2019', '17-04-2019', '27-04-2019', '04-05-2019', '06-05-2019', '12-05-2019', '13-05-2019', '14-05-2019', '15-05-2019', '01-06-2019', '05-06-2019', '08-06-2019', '10-06-2019', '12-06-2019', '13-06-2019', '15-06-2019', '17-06-2019', '18-06-2019', '19-06-2019', '22-06-2019', '24-06-2019', '03-08-2019', '28-08-2019', '30-09-2019', '03-10-2019', '31-10-2019', '27-11-2019', '18-12-2019', '30-12-2019', '08-01-2020', '05-02-2020', '27-02-2020']
  
   
Also kaj je blo med 24.1.2019 in 4.2.2019 in med aprilom in oktobrom v obeh primerih so vrhi nizji. Poglej si Isolation Forest

In [None]:
https://scikit-learn.org/stable/auto_examples/ensemble/plot_isolation_forest.html
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)

# Generate train data
X = 0.3 * rng.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate some regular novel observations
X = 0.3 * rng.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))

# fit the model
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

In [None]:
"""
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y,
                    mode='lines+markers',
                    name='lines+markers'))

fig.show()""";