In [35]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.io as pio
# Plotly setup: default figure template, notebook renderer, and the backend
# used by pandas' `.plot()`.
pio.templates.default = 'simple_white'
pio.renderers.default = 'colab'
pd.set_option('plotting.backend', 'plotly')


In [36]:
# Directory the CSV inputs are read from. Kept as a plain string because the
# loading cells build paths with `folder + '/<name>.csv'`.
folder = '../data'

# Define the plotting functions

In [37]:
def plot_anomaly(ts,anomaly_pred = None,anomaly_true=None,file_name = 'file'):
    """Draw a series as a line and overlay star markers at flagged index labels.

    Parameters
    ----------
    ts : pandas Series
        Series to draw; its index supplies the x axis and its name the legend
        label. Marker heights are looked up with ``ts.loc``.
    anomaly_pred : pandas Series, optional
        Its index labels are drawn as filled red stars; its name is the
        legend label.
    anomaly_true : pandas Series, optional
        Its index labels are drawn as open yellow stars; its name is the
        legend label.
    file_name : str
        Accepted but not used by this function.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # (flagged series, marker style) pairs, in the order they are layered.
    overlays = (
        (anomaly_pred, {'color':'red','size':10,'symbol':'star','line_width':0}),
        (anomaly_true, {'color':'yellow','size':10,'symbol':'star-open','line_width':2}),
    )

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=ts.index, y=ts, mode='lines', name=ts.name))

    for flagged, marker_style in overlays:
        if flagged is None:
            continue
        fig.add_trace(
            go.Scatter(
                x=flagged.index,
                y=ts.loc[flagged.index],
                mode='markers',
                name=flagged.name,
                marker=marker_style,
            )
        )

    fig.show()
    
def plot_anomaly_window(ts,anomaly_pred = None,file_name = 'file',window='1h'):
    """Draw a series as a line and shade a band ending at each flagged label.

    Parameters
    ----------
    ts : pandas Series
        Series to draw; its index supplies the x axis and its name the legend
        label.
    anomaly_pred : pandas Series, optional
        For every index label ``i`` a translucent red band is drawn over
        ``[i - window, i]``.
    file_name : str
        Accepted but not used by this function.
    window : str
        Band width, in any form ``pd.Timedelta`` accepts (e.g. ``'1h'``).

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=ts.index, y=ts, mode='lines', name=ts.name))

    if anomaly_pred is not None:
        # The band width is identical for every alarm, so parse it once
        # rather than on each loop iteration.
        span = pd.Timedelta(window)
        for alarm_time in anomaly_pred.index:
            fig.add_vrect(
                x0=alarm_time - span,
                x1=alarm_time,
                line_width=0,
                fillcolor="red",
                opacity=0.2,
            )

    fig.show()

# Find outliers by applying a threshold to the raw measurement

In [67]:
# Read the temperature CSV, parse its 'Time' column, and use it as the index.
df = (
    pd.read_csv(folder+'/temperature.csv')
    .assign(Time=lambda frame: pd.to_datetime(frame['Time']))
    .set_index('Time')
)
# The first remaining column is the signal the following cells analyse.
col = df.columns[0]
df.plot()

## Manually set the threshold

In [68]:
# Hand-picked bounds: a reading outside [min_t, max_t] raises the alarm.
min_t, max_t = 12, 30
alarm_col = col+'threshold_alarm'
df[alarm_col] = ~df[col].between(min_t, max_t)
plot_anomaly(
    df[col],
    anomaly_pred=df.loc[df[alarm_col], alarm_col],
    anomaly_true=None,
    file_name='file',
)

## Use statistics (quantiles) to set the threshold

In [69]:
# Bounds taken from the data: the 3rd and 97th percentiles of the signal.
min_t, max_t = df[col].quantile([0.03, 0.97])
alarm_col = col+'threshold_alarm'
df[alarm_col] = ~df[col].between(min_t, max_t)
plot_anomaly(
    df[col],
    anomaly_pred=df.loc[df[alarm_col], alarm_col],
    anomaly_true=None,
    file_name='file',
)

## Interquartile range (IQR)
Flag values outside $[Q_1 - c \times \mathrm{IQR},\; Q_3 + c \times \mathrm{IQR}]$, where $\mathrm{IQR} = Q_3 - Q_1$.

In [41]:
# IQR fence: values further than c * IQR outside the quartiles are flagged.
c = 2
Q1, Q3 = df[col].quantile([0.25, 0.75])
IQR = Q3 - Q1
min_t, max_t = Q1 - c*IQR, Q3 + c*IQR
alarm_col = col+'threshold_alarm'
df[alarm_col] = ~df[col].between(min_t, max_t)
plot_anomaly(
    df[col],
    anomaly_pred=df.loc[df[alarm_col], alarm_col],
    anomaly_true=None,
    file_name='file',
)

# Find outliers by applying a threshold to the change in the measurement

In [42]:
# Change of the signal over `window` samples; the leading rows, which have no
# predecessor to compare against, are filled with 0.
window = 1
diff_col = col+'_diff'
df[diff_col] = df[col].diff(periods=window).fillna(0)

# IQR fence applied to the change instead of the raw value.
c = 2
Q1, Q3 = df[diff_col].quantile([0.25, 0.75])
IQR = Q3 - Q1
min_t, max_t = Q1 - c*IQR, Q3 + c*IQR
alarm_col = col+'diff_alarm'
df[alarm_col] = ~df[diff_col].between(min_t, max_t)
plot_anomaly_window(
    df[col],
    anomaly_pred=df.loc[df[alarm_col], alarm_col],
    file_name='file',
    window=f'{window}h',
)

## What if there is some noise in the measurement (a more dynamic signal)?

In [43]:
# Read the CPU CSV, parse its 'Time' column, and use it as the index.
df = (
    pd.read_csv(folder+'/cpu.csv')
    .assign(Time=lambda frame: pd.to_datetime(frame['Time']))
    .set_index('Time')
)
# The first remaining column is the signal the following cells analyse.
col = df.columns[0]
df.plot()

In [53]:
# IQR fence on the raw signal; each alarm is shown as a shaded one-hour band.
c = 2
Q1, Q3 = df[col].quantile([0.25, 0.75])
IQR = Q3 - Q1
min_t, max_t = Q1 - c*IQR, Q3 + c*IQR
alarm_col = col+'threshold_alarm'
df[alarm_col] = ~df[col].between(min_t, max_t)
plot_anomaly_window(
    df[col],
    anomaly_pred=df.loc[df[alarm_col], alarm_col],
    window='1h',
    file_name='file',
)

# Can we detect the noise?

In [46]:
window = 5
ma_col = col+'ma'
# Rolling mean with closed='left', so the current sample is left out of its
# own average.
df[ma_col] = df[col].rolling(window=window,closed='left').mean()
# KPI: how far each sample sits from that moving average (0 where undefined).
kpi_col = ma_col+'diff'
df[kpi_col] = df[col].sub(df[ma_col]).fillna(0)

# IQR fence applied to the KPI.
c = 2
Q1, Q3 = df[kpi_col].quantile([0.25, 0.75])
IQR = Q3 - Q1
min_t, max_t = Q1 - c*IQR, Q3 + c*IQR
alarm_col = kpi_col+'threshold_alarm'
df[alarm_col] = ~df[kpi_col].between(min_t, max_t)
plot_anomaly_window(
    df[col],
    anomaly_pred=df.loc[df[alarm_col], alarm_col],
    file_name='file',
    window=f'{window}h',
)

## Can we detect a condition/level change?

In [51]:
window = 10
ma_col = col+'ma'
shifted_col = col+'ma_shift'
# Rolling median with closed='left', then the same median `window` rows
# earlier.
df[ma_col] = df[col].rolling(window=window,closed='left').median()
df[shifted_col] = df[ma_col].shift(periods=window)
# KPI: difference between the current and the lagged rolling median
# (0 where either side is undefined).
kpi_col = ma_col+'shift'+'diff'
df[kpi_col] = df[ma_col].sub(df[shifted_col]).fillna(0)

# IQR fence applied to the KPI.
c = 2
Q1, Q3 = df[kpi_col].quantile([0.25, 0.75])
IQR = Q3 - Q1
min_t, max_t = Q1 - c*IQR, Q3 + c*IQR
alarm_col = kpi_col+'threshold_alarm'
df[alarm_col] = ~df[kpi_col].between(min_t, max_t)
plot_anomaly_window(
    df[col],
    anomaly_pred=df.loc[df[alarm_col], alarm_col],
    file_name='file',
    window=f'{2*window}h',
)

## Feel free to use other aggregations, e.g. std

In [58]:
# Read the seismic CSV, parse its 'Time' column, and use it as the index.
df = (
    pd.read_csv(folder+'/seismic.csv')
    .assign(Time=lambda frame: pd.to_datetime(frame['Time']))
    .set_index('Time')
)
# The first remaining column is the signal the following cells analyse.
col = df.columns[0]
df.plot()

In [65]:
window = 5
spread_col = col+'ma'
# Despite the 'ma' suffix, this column holds a rolling standard deviation
# (closed='left', so the current sample is left out).
df[spread_col] = df[col].rolling(window=window,closed='left').std()
kpi_col = spread_col+'diff'
# NOTE(review): the KPI is the raw value minus the rolling std -- confirm this
# is intended rather than thresholding the rolling std itself.
df[kpi_col] = df[col].sub(df[spread_col]).fillna(0)

# IQR fence applied to the KPI.
c = 2
Q1, Q3 = df[kpi_col].quantile([0.25, 0.75])
IQR = Q3 - Q1
min_t, max_t = Q1 - c*IQR, Q3 + c*IQR
alarm_col = kpi_col+'threshold_alarm'
df[alarm_col] = ~df[kpi_col].between(min_t, max_t)
plot_anomaly_window(
    df[col],
    anomaly_pred=df.loc[df[alarm_col], alarm_col],
    file_name='file',
    window=f'{window/2}s',
)