In [2]:
import pandas as pd
import pickle
import datetime
import numpy as np
from adtk.detector import AutoregressionAD
from adtk.detector import PersistAD
from adtk.aggregator import AndAggregator
from adtk.detector import LevelShiftAD
from pyculiarity import detect_ts
#from rstl import STL
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
import scipy

In [3]:
def read_data(file_path, interval):
    df=pd.read_json(file_path)
    df["Date"] = df.Date.dt.strftime("%Y-%m-%d %H:%M:%S")
    df["Date"] = pd.to_datetime(df["Date"])
    # Fill the empty days with zero counts
    df.loc[:,"_area"] = "D"
    df_area = df.groupby('_area').apply(lambda x : x.set_index('Date').resample(interval).mean().fillna(0)).reset_index()
    df_area = df_area[["Date","Count"]]
    df_area.set_index('Date', inplace=True)
    return df_area

def mad(y, axis = None):
    m = np.median(y)
    abs_dev = np.abs(y-m)
    mad_est = np.median(abs_dev)
    return mad_est

def anomaly_tool4(df):
    data = pd.DataFrame(df.reset_index())
    res = []
    for x in range(0, len(data)):
        if x == 0:
            res.append(0)
        elif x == 1 :
            res.append(0)
        elif x == 2 :
            mad_ = 0.6745*(np.abs(data.Count[x]-np.median(data.Count[:2])))/(mad(data.Count[:2]))
            if mad_ > 10:
                res.append(1)
            else:
                res.append(0)
        elif x == 3 :
            mad_ = 0.6745*(np.abs(data.Count[x]-np.median(data.Count[:3])))/(mad(data.Count[:3]))
            if mad_ > 10:
                res.append(1)
            else:
                res.append(0)
        else:
            mad_ = 0.6745*(np.abs(data.Count[x]-np.median(data.Count[x-2:x])))/(mad(data.Count[x-2:x]))
            if mad_ > 10:
                res.append(1)
            else:
                res.append(0)
    return res


def anomaly_hour(df, week, c_ad = 1, c_ls = 40, n_steps = 2, window = 1):
    data_final = pd.DataFrame(df.reset_index())
    week_list = [g for n, g in data_final.groupby(pd.Grouper(key='Date',freq='W'))]
    data_final = week_list[week]
    
    scaler = StandardScaler()
    df_ = data_final.set_index("Date")
    df_["count_norm"] = scaler.fit_transform(df_[["Count"]])
    autoregression_ad = AutoregressionAD(n_steps=n_steps, step_size=1, c=c_ad)
    anomalies = autoregression_ad.fit_detect(df_.count_norm)
    anomalies = anomalies.fillna(0)
    data_final["AD_norm"] = list(anomalies)
    
    level_shift_ad = LevelShiftAD(c=c_ls, side='both', window=window)
    anomalies_ls = level_shift_ad.fit_detect(df_.count_norm)
    anomalies_ls = anomalies_ls.fillna(0)
    data_final["ls_norm"] = list(anomalies_ls)
    
    anomalies_agg = AndAggregator().aggregate(
    {"AutoRegression": anomalies,
     "level_shift": anomalies_ls})
    
    data_final["anomalies_agg"] = list(anomalies_agg*1)
    
    return data_final

def plot_func(df, week):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df.Date, y=df.Count, 
                             fill='tozeroy', 
                             mode='markers'
                            )
                 )
    fig.update_traces(marker=dict(size=5,
                                  line=dict(width=1,color='DarkSlateGrey'),
                                  color = df["anomalies_agg"]
                                 )
                     )

    fig.update_layout(
        title="Fabric_1h_week_%s" %week,
        yaxis_title="Count",
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="#7f7f7f"
        )
    )
    return fig


def anomaly_15min(df, week):
    df_fabric_test = df
    df_ = df_fabric_test.set_index("Date")
    df_["count_norm"] = scaler.fit_transform(df_[["Count"]])
    autoregression_ad = AutoregressionAD(n_steps=2, step_size=1, c=5)
    anomalies = autoregression_ad.fit_detect(df_.count_norm)
    anomalies = anomalies.fillna(0)
    df_fabric_test["AD_norm"] = list(anomalies)
    
    level_shift_ad = LevelShiftAD(c=20, side='both', window=1)
    anomalies_ls = level_shift_ad.fit_detect(df_.count_norm)
    anomalies_ls = anomalies_ls.fillna(0)
    df_fabric_test["ls_norm"] = list(anomalies_ls)


    anomalies_agg = AndAggregator().aggregate(
        {"AutoRegression": anomalies,
         "level_shift": anomalies_ls})
    
    df_fabric_test["anomalies_agg"] = list(anomalies_agg*1)

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df_fabric_test.Date, y=df_fabric_test.Count, 
                         fill='tozeroy', 
                         mode='markers'))
    fig.update_traces(marker=dict(size=5,
                             line=dict(width=1,color='DarkSlateGrey'),
                              color = df_fabric_test["anomalies_agg"]
                                 )
                     )
    fig.update_layout(
        title="Fabric_15min_week_%s" %(week),
        yaxis_title="Count",
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="#7f7f7f"
        )
    )


    return fig

In [115]:
logmind_1h = "C:/Users/aelri/PycharmProjects/logmind/Elasticsearch_output/logmind_1h_fabric"
df_fabric_hr = read_data(logmind_1h, "h")

data_ = pd.DataFrame(df_fabric_hr.reset_index())
interval_len = [g for n, g in data_.groupby(pd.Grouper(key='Date', freq='W'))]

data_train = interval_len[0]
data_final = interval_len[1]


df_train = data_train.set_index("Date")
data_final = data_final.set_index("Date")
AD_model = AutoregressionAD(n_steps=2, step_size=1, c=5)
AD_model.fit(df_train.Count)
anomalies = AD_model.predict(data_final.Count)
anomalies = anomalies.fillna(0)
data_final["AD_norm"] = list(anomalies)
ls_model = LevelShiftAD(c=10, side='both', window=1)
ls_model.fit(df_train.Count)
anomalies_ls = ls_model.predict(data_final.Count)
anomalies_ls = anomalies_ls.fillna(0)

anomalies_agg = AndAggregator().aggregate(
        {"AutoRegression": anomalies,
         "level_shift": anomalies_ls})
anomaly_temp_ = pd.Series(list(anomalies_agg * 1))
    
nonzero_no = (anomaly_temp_ != 0).sum(axis=0)
all_no = data_final.shape[0]
fraction = nonzero_no/all_no
print(fraction)
if fraction > 0.120 and fraction < 0.146:
    ls_model = LevelShiftAD(c=30, side='both', window=1)
    ls_model.fit(df_train.Count)
    anomalies_ls = ls_model.predict(data_final.Count)
    anomalies_ls = anomalies_ls.fillna(0)

    anomalies_agg = AndAggregator().aggregate(
            {"AutoRegression": anomalies,
             "level_shift": anomalies_ls})
    
    data_final["anomalies_agg"]  = list(anomalies_agg * 1)
    
elif fraction > 0.146 and fraction < 0.2:
    ls_model = LevelShiftAD(c=40, side='both', window=1)
    ls_model.fit(df_train.Count)
    anomalies_ls = ls_model.predict(data_final.Count)
    anomalies_ls = anomalies_ls.fillna(0)

    anomalies_agg = AndAggregator().aggregate(
            {"AutoRegression": anomalies,
             "level_shift": anomalies_ls})
    
    data_final["anomalies_agg"] = list(anomalies_agg * 1)
    
elif fraction > 0.2:
    ls_model = LevelShiftAD(c=60, side='both', window=1)
    ls_model.fit(df_train.Count)
    anomalies_ls = ls_model.predict(data_final.Count)
    anomalies_ls = anomalies_ls.fillna(0)

    anomalies_agg = AndAggregator().aggregate(
            {"AutoRegression": anomalies,
             "level_shift": anomalies_ls})
    
    data_final["anomalies_agg"] = list(anomalies_agg * 1)
        
else:
    ls_model = LevelShiftAD(c=10, side='both', window=1)
    ls_model.fit(df_train.Count)
    anomalies_ls = ls_model.predict(data_final.Count)
    anomalies_ls = anomalies_ls.fillna(0)

    anomalies_agg = AndAggregator().aggregate(
        {"AutoRegression": anomalies,
         "level_shift": anomalies_ls})

    data_final["anomalies_agg"] = list(anomalies_agg * 1)
    
#print(list(data_final["anomalies_agg"]))
data_final["anomalies_agg_f"] = data_final["anomalies_agg"]


data_final = data_final.reset_index()


data_final["test_det"] = 0
for i in range(0, len(data_final["Count"])):
    med_ = data_train.Count.median()
    std_ = data_train.Count.std()
    tot_p = med_ + 3*std_
    tot_n = med_ - 3*std_
    if data_final.iloc[i, data_final.columns.get_loc('Count')]  > tot_p or  data_final.iloc[i, data_final.columns.get_loc('Count')] < tot_n:
        data_final.iloc[i, data_final.columns.get_loc('test_det')] = 1
    else:
        data_final.iloc[i, data_final.columns.get_loc('test_det')] = 0
        

for i in range(0, len(data_final["anomalies_agg_f"])):
    if data_final.iloc[i, data_final.columns.get_loc('anomalies_agg')] == 1:
        if data_final.iloc[i, data_final.columns.get_loc('Count')] < 500 and data_final.iloc[i, data_final.columns.get_loc('Count')]>200 :
            data_final.iloc[i, data_final.columns.get_loc('anomalies_agg_f')] = 0
            

for i in range(0, len(data_final["anomalies_agg_f"])):
    if data_final.iloc[i, data_final.columns.get_loc('AD_norm')] == 1:
        med_ = data_train.Count.median()
        if data_final.iloc[i, data_final.columns.get_loc('Count')]  < 1000 and med_ > 500:
            data_final.iloc[i, data_final.columns.get_loc('anomalies_agg_f')] = 2

fig = go.Figure()
fig.add_trace(go.Scatter(x=data_final.Date, y=data_final.Count,
                             fill='tozeroy',
                             mode='markers'
                             )
                  )
fig.update_traces(marker=dict(size=5,
                                  line=dict(width=1, color='DarkSlateGrey'),
                                  color=data_final["anomalies_agg_f"]
                                  #colorscale=[[0, 'green'], [0.5, 'yellow'], [1.0, 'red']]
                                  )
                      )

fig.update_layout(
        title="Fabric_1h_week_%s" % 3,
        yaxis_title="Count",
        xaxis_title="Yellow is a mild anomaly (the hourly count is different but within the previous count range) <br> Red is a critic anomaly (the hourly count is outside the previous count range)",
        font=dict(
            family="Courier New, monospace",
            size=14,
            color="#7f7f7f"
        )
    )
fig.show()

fig = go.Figure()
fig.add_trace(go.Scatter(x=data_final.Date, y=data_final.Count,
                             fill='tozeroy',
                             mode='markers'
                             )
                  )
fig.update_traces(marker=dict(size=5,
                                  line=dict(width=1, color='DarkSlateGrey'),
                                  color=data_final["test_det"]
                                  )
                      )

fig.update_layout(
        title="Fabric_1h_week_%s" % 3,
        yaxis_title="Count",
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="#7f7f7f"
        )
    )
fig.show()

0.0


In [50]:
from fbprophet import Prophet

logmind_fabric = "C:/Users/aelri/PycharmProjects/logmind/Elasticsearch_output/logmind_fabric"
df_fabric_hr = read_data(logmind_fabric, "h")

data_final = pd.DataFrame(df_fabric_hr.reset_index())
week_list = [g for n, g in data_final.groupby(pd.Grouper(key='Date',freq='3D'))]


def fit_predict_model(dataframe, interval_width = 0.95, changepoint_range = 0.7):
    m = Prophet(daily_seasonality = True, yearly_seasonality = False, weekly_seasonality = True,
                seasonality_mode = 'multiplicative', 
                interval_width = interval_width,
                changepoint_range = changepoint_range)
    m = m.fit(dataframe)
    forecast = m.predict(dataframe)
    forecast['fact'] = dataframe['y'].reset_index(drop = True)
    return forecast

def detect_anomalies(forecast):
    forecasted = forecast[['ds','trend', 'yhat', 'yhat_lower', 'yhat_upper', 'fact']].copy()
    #forecast['fact'] = df['y']

    forecasted['anomaly'] = 0
    forecasted.loc[forecasted['fact'] > forecasted['yhat_upper'], 'anomaly'] = 1
    forecasted.loc[forecasted['fact'] < forecasted['yhat_lower'], 'anomaly'] = 1
    
    return forecasted



week_no = [0, 1, 3, 6, 11, 12, 15, 17]

for i in week_no:
    data_final = week_list[i]
    data_final = data_final.rename(columns={"Date": "ds", "Count": "y"})
    pred = fit_predict_model(data_final)
    pred = detect_anomalies(pred)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=pred.ds, y=pred.fact, 
                            fill='tozeroy', 
                            mode='markers'
                            )
                 )
    fig.update_traces(marker=dict(size=5,
                                  line=dict(width=1,color='DarkSlateGrey'),
                                  color = pred.anomaly
                                 )
                     )

    fig.update_layout(
        title="Fabric_1h_week_Prophet_%s" %(i),
        yaxis_title="Count",
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="#7f7f7f"
        )
    )


    fig.show()

In [87]:
import seaborn as sns
import matplotlib.pyplot as plt

logmind_fabric = "C:/Users/aelri/PycharmProjects/logmind/Elasticsearch_output/logmind_fabric"
df_fabric_hr = read_data(logmind_fabric, "h")

x_train = pd.DataFrame(df_fabric_hr.reset_index()).iloc[:2000,:]

x_test = pd.DataFrame(df_fabric_hr.reset_index()).iloc[2001:,:]

day_test = [g for n, g in x_test.groupby(pd.Grouper(key='Date',freq='h'))]

day_list_4 = [g for n, g in x_train.groupby(pd.Grouper(key='Date',freq='4D'))]


df_train = day_list_4[-1]
df_train = df_train.iloc[1:]
#data_final = data_final.append(day_test[0:1])
#data_final = data_final.reset_index(drop=True)

interval = 1
df_tr = df_train.set_index("Date")
df_ts = pd.DataFrame(day_test[interval-1:interval][0]).set_index("Date")
autoregression_ad = AutoregressionAD(n_steps=1, step_size=1, c=0.1)
autoregression_ad.fit(df_tr.Count)
anomalies = autoregression_ad.predict(df_ts.Count)
print(anomalies)
anomalies = anomalies.fillna(0)
print(anomalies)
#df_ts["Ad_norm"] = list(anomalies)
    
level_shift_ad = LevelShiftAD(c=7.5, side='both', window=1)
level_shift_ad.fit(df_tr)
anomalies_ls = level_shift_ad.predict(df_ts)
anomalies_ls = anomalies_ls.fillna(0)
#df_ts["ls_norm"] = list(anomalies_ls)

anomalies_agg = AndAggregator().aggregate(
    {"AutoRegression": anomalies,
     "level_shift": anomalies_ls})

df_train = df_train.append(day_test[interval-1:interval])
df_train["anomalies_agg"] = 0
df_train.iloc[-1, df_train.columns.get_loc('anomalies_agg')] = list(anomalies_agg*1)
#print(df_train) 
#fig.show()

def anomaly_animation_func(data, test_data, interval):
    if interval == 1:
        df_train = data.iloc[interval:]
    
        df_tr = df_train.set_index("Date")
        df_ts = pd.DataFrame(day_test[interval-1:interval][0]).set_index("Date")
        autoregression_ad = AutoregressionAD(n_steps=2, step_size=1, c=1)
        autoregression_ad.fit(df_tr.Count)
        anomalies = autoregression_ad.predict(df_ts.Count)
        anomalies = anomalies.fillna(0)
    
        level_shift_ad = LevelShiftAD(c=5, side='both', window=1)
        level_shift_ad.fit(df_tr.Count)
        anomalies_ls = level_shift_ad.predict(df_ts.Count)
        anomalies_ls = anomalies_ls.fillna(0)
    
        anomalies_agg = AndAggregator().aggregate(
        {"AutoRegression": anomalies, "level_shift": anomalies_ls})
    
        df_train = df_train.append(day_test[interval-1:interval])
        #print(df_train)
        df_train["anomalies_agg"] = 0
        df_train.iloc[-1, df_train.columns.get_loc('anomalies_agg')] = list(anomalies_agg*1)
        
        a_ = df_train.copy()
        pickle.dump(a_, open( "a.pkl", "wb" ) ) 
        
    else:
        a_ = pickle.load( open( "a.pkl", "rb" ) ) 
        df_train = a_.iloc[1:]
        df_tr = df_train.set_index("Date")
        df_ts = pd.DataFrame(day_test[interval-1:interval][0]).set_index("Date")
        autoregression_ad = AutoregressionAD(n_steps=1, step_size=1, c=1)
        autoregression_ad.fit(df_tr.Count)
        print(df_ts.Count)
        anomalies = autoregression_ad.predict(df_ts.Count)
        anomalies = anomalies.fillna(0)
        
        level_shift_ad = LevelShiftAD(c=1, side='both', window=1)
        level_shift_ad.fit(df_tr.Count)
        anomalies_ls = level_shift_ad.predict(df_ts.Count)
        anomalies_ls = anomalies_ls.fillna(0)
    
        anomalies_agg = AndAggregator().aggregate(
        {"AutoRegression": anomalies, "level_shift": anomalies_ls})
        
        df_train = df_train.append(day_test[interval-1:interval])
        df_train.iloc[-1, df_train.columns.get_loc('anomalies_agg')] = list(anomalies_agg*1)
        
        b_ = df_train
        pickle.dump(b_, open( "a.pkl", "wb" ) ) 
        

        fig = go.Figure()
    
        fig.add_trace(go.Scatter(x=df_train.Date, y=df_train.Count, 
                         fill='tozeroy', 
                         mode='markers'))
        fig.update_traces(marker=dict(size=5,
                             line=dict(width=1,color='DarkSlateGrey'),
                              color = df_train.anomalies_agg
                                 )
                     )
        fig.update_layout(
            title="Fabric_4D/hr_week_%s" %(interval),
            yaxis_title="Count",
            font=dict(
                family="Courier New, monospace",
                size=18,
                color="#7f7f7f"
            )
        )
        
        fig.show()

Date
2019-12-20 01:00:00    NaN
2019-12-20 02:00:00    1.0
2019-12-20 03:00:00    1.0
2019-12-20 04:00:00    0.0
2019-12-20 05:00:00    0.0
                      ... 
2019-12-23 03:00:00    0.0
2019-12-23 04:00:00    0.0
2019-12-23 05:00:00    0.0
2019-12-23 06:00:00    0.0
2019-12-23 07:00:00    1.0
Name: Count, Length: 79, dtype: float64
Date
2019-12-20 01:00:00    0.0
2019-12-20 02:00:00    1.0
2019-12-20 03:00:00    1.0
2019-12-20 04:00:00    0.0
2019-12-20 05:00:00    0.0
                      ... 
2019-12-23 03:00:00    0.0
2019-12-23 04:00:00    0.0
2019-12-23 05:00:00    0.0
2019-12-23 06:00:00    0.0
2019-12-23 07:00:00    1.0
Name: Count, Length: 79, dtype: float64


ValueError: Must have equal len keys and value when setting with an iterable

In [85]:
logmind_fabric = "C:/Users/aelri/PycharmProjects/logmind/Elasticsearch_output/logmind_fabric"

df_fabric_hr = read_data(logmind_fabric, "h")
x_train = pd.DataFrame(df_fabric_hr.reset_index()).iloc[:2000,:]
x_test = pd.DataFrame(df_fabric_hr.reset_index()).iloc[2001:,:]

day_test = [g for n, g in x_test.groupby(pd.Grouper(key='Date',freq='h'))]

day_list_4 = [g for n, g in x_train.groupby(pd.Grouper(key='Date',freq='4D'))]

data_one = day_list_4[-1]

for i in range(1, 20):
    anomaly_animation_func(data_one, day_test, i)

Date
2019-12-23 10:00:00    35495.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Date
2019-12-23 11:00:00    63228.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Date
2019-12-23 12:00:00    68949.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Date
2019-12-23 13:00:00    69349.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Date
2019-12-23 14:00:00    68988.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Date
2019-12-23 15:00:00    66250.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Date
2019-12-23 16:00:00    69128.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Date
2019-12-23 17:00:00    69270.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Date
2019-12-23 18:00:00    69244.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Date
2019-12-23 19:00:00    68894.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Date
2019-12-23 20:00:00    69041.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Date
2019-12-23 21:00:00    68858.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Date
2019-12-23 22:00:00    68967.0
Name: Count, dtype: float64


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.








Date
2019-12-23 23:00:00    69660.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Date
2019-12-24    69228.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Date
2019-12-24 01:00:00    68612.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Date
2019-12-24 02:00:00    69020.0
Name: Count, dtype: float64


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.








Date
2019-12-24 03:00:00    68894.0
Name: Count, dtype: float64



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.



