In [None]:
import datetime as dt
import datetime
import math
import os

import holoviews as hv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import param
import paramnb
import seaborn as sns
import sklearn.metrics as metrics
from influxdb import DataFrameClient
from matplotlib.colors import ListedColormap
from scipy import stats
from sklearn import metrics as metrics
from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.externals import joblib
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix, f1_score,
                             fbeta_score, make_scorer, precision_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import (KFold, StratifiedKFold, cross_val_score,
                                     train_test_split)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import ricercando as ric

# Connection to the MONROE InfluxDB instance used by every cell below.
database_ip = '46.101.250.119'
ric.set_connection_params(host=database_ip)

# Credentials should not live in the notebook: they are read from the
# environment when available. The previous literals are kept only as fallbacks
# so that an existing setup keeps working unchanged.
# NOTE(review): rotate these credentials and drop the fallbacks before sharing.
db_user = os.environ.get('MONROE_DB_USER', 'monroe')
db_password = os.environ.get('MONROE_DB_PASSWORD', 'secure')

cli = DataFrameClient(database_ip, 8086, db_user, db_password, 'monroe')
cli.switch_database('monroe')

In [None]:
# Page-Hinkley algorithm

class PageHinkley:
    """Incremental Page-Hinkley change detector.

    Values are fed one at a time through :meth:`set_input`. The detector keeps
    a running mean of the inputs and a decayed cumulative deviation from that
    mean; a change is reported once the magnitude of the cumulative deviation
    exceeds ``lambda_``, after which all statistics start again from zero.

    :param delta_: tolerance added to (negative mode) or subtracted from
        (positive mode) every deviation before it is accumulated
    :param lambda_: detection threshold on the absolute cumulative deviation
    :param alpha_: factor the previous cumulative deviation is multiplied by
        before the new deviation is added
    :param detect_negative: when True the cumulative deviation is clamped to be
        non-positive (tracks downward shifts); otherwise it is clamped to be
        non-negative (tracks upward shifts)
    """

    def __init__(self, delta_=0.005, lambda_=50, alpha_=1 - 0.0001, detect_negative=False):
        # configuration
        self.delta_ = delta_
        self.lambda_ = lambda_
        self.alpha_ = alpha_
        self.detect_negative = detect_negative

        # running state
        self.sum = 0       # decayed cumulative deviation
        self.x_mean = 0    # incrementally computed mean of the inputs
        self.num = 0       # number of inputs seen since the last reset
        self.change_detected = False

    def __reset_params(self):
        """Forget all collected statistics; called right after a detection."""
        self.num = self.x_mean = self.sum = 0

    def set_input(self, x):
        """
        Feed one new value to the detector.
        :param x: input data
        :return: True when this value triggered a change detection, else False
        """
        self.__detect_drift(x)
        return self.change_detected

    def __detect_drift(self, x):
        """
        Update the statistics with ``x`` and refresh ``change_detected``.

        Follows the formula from 'Knowledge Discovery from Data Streams'
        by João Gama (p. 76).
        :param x: input data
        """
        # running mean over the values seen since the last reset
        self.num += 1
        self.x_mean = (x + self.x_mean * (self.num - 1)) / self.num

        decayed_sum = self.sum * self.alpha_
        if self.detect_negative:
            candidate = decayed_sum + (x - self.x_mean + self.delta_)
            self.sum = min(0.0, candidate)
        else:
            candidate = decayed_sum + (x - self.x_mean - self.delta_)
            self.sum = max(0.0, candidate)

        # bool() keeps the flag a plain Python bool even for numpy inputs
        self.change_detected = bool(abs(self.sum) > self.lambda_)
        if self.change_detected:
            self.__reset_params()


In [None]:
# Nodes used for training: (node id, SIM ICCID, last day of the query window).
# Every window starts on the same day.
_TRAIN_WINDOW_START = '2018-01-01'
_TRAIN_NODE_TABLE = (
    ('601', '89390100001965067610', '2018-01-30'),
    ('608', '8946071512360089522', '2018-01-30'),
    ('609', '89460850007007786482', '2018-01-30'),
    ('610', '8939104160000392272', '2018-01-30'),
    ('612', '8939104160000392231', '2018-01-29'),
    ('613', '89390100001965068626', '2018-01-29'),
)

train_nodes = [
    {
        "node_id": node_id,
        "ICCID": iccid,
        "start_time": _TRAIN_WINDOW_START,
        "end_time": window_end,
    }
    for node_id, iccid, window_end in _TRAIN_NODE_TABLE
]

In [None]:
# define custom scoring

# combining intervals
def customise_score(y_testt, y_predd, offset = 5, mark_as=1):
    """Bridge short gaps between positive samples in both label sequences.

    For every truthy element that has another truthy element within the next
    ``offset`` positions, the falsy elements in between are overwritten with
    ``mark_as``. The inputs are left untouched; numpy copies are returned.

    :param y_testt: ground-truth labels (any sequence ``np.copy`` accepts)
    :param y_predd: predicted labels (any sequence ``np.copy`` accepts)
    :param offset: maximum distance to look ahead for the next truthy element
    :param mark_as: value written into the bridged positions
    :return: tuple ``(bridged_truth, bridged_prediction)`` of numpy arrays
    """

    def _bridge_gaps(labels):
        # Works in place on ``labels``; already-bridged positions count as
        # positives for the elements that follow them.
        for start in range(len(labels)):
            if not labels[start]:
                continue
            if not any(labels[start + 1:start + offset + 1]):
                continue
            # A truthy element is guaranteed inside the window, so the walk
            # below always stops before running past the end of the array.
            step = 1
            while step <= offset and not labels[start + step]:
                labels[start + step] = mark_as
                step += 1
        return labels

    bridged_truth = np.copy(y_testt)
    bridged_prediction = np.copy(y_predd)

    # Fill in gaps in the ground truth, then in the prediction
    _bridge_gaps(bridged_truth)
    _bridge_gaps(bridged_prediction)

    return bridged_truth, bridged_prediction

# counting intervals

def customise_score_for_readable(y_testt, y_predd, offset = 8, offset_pred = 8, mark_as = 1, mark_as_inverse = 0):
    """Close short gaps between positives, with separate reach for truth and prediction.

    Each truthy element that is followed by another truthy element within the
    look-ahead reach has the falsy elements in between replaced by ``mark_as``.
    The ground truth uses ``offset`` as reach, the prediction ``offset_pred``.
    Both inputs are copied with ``np.copy`` and never modified.

    :param y_testt: ground-truth labels
    :param y_predd: predicted labels
    :param offset: look-ahead reach applied to the ground truth
    :param offset_pred: look-ahead reach applied to the prediction
    :param mark_as: value written into the closed gaps
    :param mark_as_inverse: accepted but not used by this function
    :return: tuple ``(closed_truth, closed_prediction)`` of numpy arrays
    """

    def _close_gaps(values, reach):
        closed = np.copy(values)
        for pos in range(len(closed)):
            has_follow_up = closed[pos] and any(closed[pos + 1:pos + reach + 1])
            if not has_follow_up:
                continue
            # mark everything up to (not including) the next truthy element
            for step in range(1, reach + 1):
                if closed[pos + step]:
                    break
                closed[pos + step] = mark_as
        return closed

    # Fill in gaps between ground-truth positives
    closed_truth = _close_gaps(y_testt, offset)
    # Fill in gaps between predicted positives
    closed_prediction = _close_gaps(y_predd, offset_pred)

    return closed_truth, closed_prediction
            

def customise_score_readable(*args, **kwargs):
    """Convert point-wise labels into zone-level labels.

    The arguments are forwarded unchanged to ``customise_score_for_readable``,
    which closes short gaps in both sequences. The gap-closed sequences are
    then summarised as one label pair per zone (a zone is a maximal run of
    consecutive truthy elements):

    * every ground-truth zone contributes ``(1, 1)`` when at least one
      prediction falls inside it (true positive), otherwise ``(1, 0)``
      (false negative);
    * ``1 + number of true-positive zones`` pairs ``(0, 0)`` are appended as
      true negatives, so the negative class is always represented;
    * every predicted zone that overlaps no ground-truth positive contributes
      ``(0, 1)`` (false positive).

    :return: tuple ``(new_y_t, new_y_p)`` of equally long lists of 0/1
    :raises Exception: when the two gap-closed sequences differ in length
    """
    y_t, y_p = customise_score_for_readable(*args, **kwargs)

    if len(y_t) != len(y_p):
        raise Exception("Invalid length od y_p and y_t, should be same")

    length = len(y_t)
    new_y_t = []
    new_y_p = []

    # One true negative is always emitted; each detected zone adds another.
    num_TN = 1

    # --- true positives and false negatives: walk the ground-truth zones ---
    i = 0
    while i < length:
        if not y_t[i]:
            i += 1
            continue

        zone_end = i + 1
        while zone_end < length and y_t[zone_end]:
            zone_end += 1

        detected = any(y_p[i:zone_end])
        new_y_t.append(1)
        new_y_p.append(1 if detected else 0)
        if detected:
            num_TN += 1
        i = zone_end

    # --- true negatives: their exact count does not matter for the score ---
    for _ in range(num_TN):
        new_y_t.append(0)
        new_y_p.append(0)

    # --- false positives: walk the predicted zones ---
    # The scan must start at the beginning of the sequence again (the previous
    # implementation continued from the leftover index of the TN loop) and must
    # not read past the end when a predicted zone reaches the last element.
    i = 0
    while i < length:
        if not y_p[i]:
            i += 1
            continue

        zone_end = i + 1
        while zone_end < length and y_p[zone_end]:
            zone_end += 1

        if not any(y_t[i:zone_end]):
            new_y_t.append(0)
            new_y_p.append(1)
        i = zone_end

    return new_y_t, new_y_p



def my_custom_loss_func(y_true, y_pred):
    """Zone-level F1 score used as the model-selection metric.

    The point-wise labels are first converted into zone-level labels with
    ``customise_score_readable`` (gaps of up to 10 samples are closed in both
    sequences) and the F1 score is computed on those.

    The previous implementation computed the zone-level labels but then scored
    the raw ``y_true``/``y_pred``, so the customisation had no effect on the
    returned value.

    :param y_true: ground-truth labels
    :param y_pred: predicted labels
    :return: F1 score of the zone-level labels
    """
    y_test_custom, y_pred_custom = customise_score_readable(y_true, y_pred, offset = 10, offset_pred = 10)
    return f1_score(y_test_custom, y_pred_custom)


# Scorer object for GridSearchCV; a higher F1 is better.
my_custom_score = make_scorer(my_custom_loss_func, greater_is_better=True)

In [None]:
# Preprocess input
#
# For every training node: download the ping measurements and the class labels,
# run a grid of Page-Hinkley detectors over the RTT series and use the
# (time-widened) detection flags as features. The per-node feature matrices are
# stacked into X / y and split chronologically into train and test parts.

X_array = []
X_train_array = []
X_test_array = []

y_array = []
y_train_array = []
y_test_array = []

# Number of minutes a detection is propagated both backwards and forwards.
PAGE_DELAY_MINUTES = 3

columns_page = ["Delta","Lambda","Alpha"]

# Detector grid as (delta, lambda, alpha); feature column "Page_<n>" belongs to
# entry n-1 of this list.
page_param_grid = [
    (delta, lam, 1 - decay)
    for delta in [0.005]
    for lam in [80, 100, 150, 200, 300]
    for decay in [0.01, 0.05, 0.03, 0.08]
]

# Kept as a list of lists because the evaluation cell concatenates these rows
# onto its result rows. "Alpha" now holds the alpha actually passed to the
# detector (1 - decay); it used to hold the decay itself.
page_hinkley_params = []
for page_index, (delta, lam, alpha) in enumerate(page_param_grid):
    print("page_index {} delta: {} lambda: {} alpha: {}".format(page_index, delta, lam, alpha))
    page_hinkley_params.append([delta, lam, alpha])

for node in train_nodes:
    node_id = node["node_id"]
    ICCID = node["ICCID"]
    start_time = node["start_time"]
    end_time = node["end_time"]

    datasets = cli.query("select * from class_1m where NodeId='{}' and time >= '{}' and time <= '{}' ".format(node_id,start_time,end_time))
    df = ric.getdf(tables="ping", nodeid=node_id,  start_time= start_time, end_time=end_time, freq="1m")
    # .copy() so that the column assignment below works on an independent frame
    df = df[df['Iccid'] == ICCID].copy()

    # merge together class and df
    class_feature = datasets['class_1m'].copy()

    class_feature = class_feature.drop(columns=['NodeId'])
    class_feature.index = class_feature.index.tz_localize(None)
    class_feature['time'] = class_feature.index
    df['time'] = df.index
    df.index.name = None
    df = pd.merge(df, class_feature,  how='inner', left_on=['Iccid','time'], right_on = ['Iccid','time'])
    df.index = df['time']
    df = df.drop(columns=['time'])
    df.index.name = 'time'
    df_analise = df.copy()

    # The merged frame only contains rows of the selected ICCID already.
    df = df_analise.copy()

    # One upward and one downward detector per parameter combination; fresh
    # instances for every node so that no state leaks between nodes.
    print("calculating page for nodeID",node_id)
    seznam = {
        "index": [],
        "page": [
            PageHinkley(delta_=delta, lambda_=lam, alpha_=alpha)
            for delta, lam, alpha in page_param_grid
        ],
        "pageneg": [
            PageHinkley(delta_=delta, lambda_=lam, alpha_=alpha, detect_negative=True)
            for delta, lam, alpha in page_param_grid
        ],
    }
    seznam["pageRes"] = [[] for _ in seznam["page"]]
    seznam["pageResneg"] = [[] for _ in seznam["pageneg"]]

    print(len(df.index))
    for row_number, (index, rtt) in enumerate(df["RTT"].items(), start=1):
        if row_number % 1000 == 0:
            print(row_number)

        # rows without an RTT measurement are skipped by every detector
        if math.isnan(rtt):
            continue
        seznam["index"].append(index)

        # Each sample goes exactly once into the upward detector and once into
        # the matching downward detector (previously the upward detector
        # received it twice and the downward detector never saw any data).
        for i, (detector, detector_neg) in enumerate(zip(seznam["page"], seznam["pageneg"])):
            seznam["pageRes"][i].append(detector.set_input(rtt))
            seznam["pageResneg"][i].append(detector_neg.set_input(rtt))

    # A feature is set when either direction reported a change.
    for k, (hits_up, hits_down) in enumerate(zip(seznam["pageRes"], seznam["pageResneg"])):
        either_direction = [up or down for up, down in zip(hits_up, hits_down)]
        flags = pd.Series(either_direction, index=seznam["index"])
        df.insert(len(df.columns), "Page_" + str(k + 1), flags)

    pima = df.copy()

    # Rows without RTT have no detector output: treat them as "no change" and
    # make sure the flag columns end up with a real boolean dtype.
    for k in range(1, len(seznam["pageRes"]) + 1):
        page_name = "Page_{}".format(k)
        pima[page_name] = pima[page_name].fillna(False).astype(bool)

    # add delay
    print("adding delay")
    seznam_column = list(df.columns)
    page_columns = [column for column in seznam_column if "Page" in column]

    # Backward pass: a detection also marks the preceding minutes. The rows
    # yielded by iterrows() reflect the state before this pass started.
    for index, row in pima.iterrows():
        for column in page_columns:
            if row[column]:
                for minutes in range(1, PAGE_DELAY_MINUTES + 1):
                    earlier = index - datetime.timedelta(minutes=minutes)
                    if earlier in pima.index:
                        pima.at[earlier, column] = True

    # Forward pass, walking the index in reverse order and reading the live
    # values, so the flags widened above are extended forwards as well.
    for index in reversed(pima.index):
        for column in page_columns:
            if pima.loc[index, column]:
                for minutes in range(1, PAGE_DELAY_MINUTES + 1):
                    later = index + datetime.timedelta(minutes=minutes)
                    if later in pima.index:
                        pima.at[later, column] = True
    print("done, pima is final df")

    features_names = [column for column in seznam_column if "Page_" in column]
    y = pima['Class'].values
    y = y * 1
    X = pima[features_names].values

    X_array.append(X)
    y_array.append(y)


X = np.concatenate(X_array)
y = np.concatenate(y_array)

# Chronological split (no shuffling): the last 40 % of the stacked samples are
# held out for testing.
DATA_SPLIT_PCT = 0.4
X_train, X_test, y_train, y_test = train_test_split(np.array(X), np.array(y), test_size=DATA_SPLIT_PCT, shuffle=False)



In [None]:
res = []

# evaluate Page-Hinkley parameters
#
# Every feature column (one Page-Hinkley configuration) is scored on its own as
# if it were the prediction, against the class labels of the whole data set.

colummns = ["TN","FP","FN","TP","Precision","Recall","F1"]
colummns = colummns + columns_page

# labels as integers; identical for every column, so computed once
y_testt = y * 1

for i in range(X.shape[1]):
    y_predd = X[:, i] * 1

    pre = precision_score(y_testt, y_predd)
    recall = recall_score(y_testt, y_predd)
    f1 = f1_score(y_testt, y_predd)

    # labels=[0, 1] guarantees a 2x2 matrix (and therefore four values to
    # unpack) even when a detector never fires or only one class is present.
    tn, fp, fn, tp = confusion_matrix(y_testt, y_predd, labels=[0, 1]).ravel()

    row = [i, tn, fp, fn, tp, pre, recall, f1]
    row = row + page_hinkley_params[i]
    res.append(row)

res = np.array(res)
# first column is the configuration index, the remaining ones are the metrics
df_page = pd.DataFrame(data=res[:,1:],index=res[:,0],columns=colummns)
df_page.sort_values(by=["F1"],ascending=False)

In [None]:
from sklearn.metrics import classification_report
import sklearn.metrics as metrics
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split as tts
from sklearn.svm import LinearSVC

# hyper parameterization
#
# Grid search over several classifiers on the training split, selected with
# my_custom_score. Each best model is then evaluated on the held-out test split,
# once on the raw point-wise labels and once on gap-filled ("adjusted") labels.

# Seed for the stochastic estimators so that a re-run gives the same results.
RANDOM_STATE = 42

scores = ["my_custom_score"]
LABELS = ["False","True"]

names = ["Linear SVM", "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes"]

my_cv = StratifiedKFold(n_splits=4, shuffle=False)

# One parameter grid per classifier, in the same order as `names`/`classifiers`.
grid_params = [
    # Linear SVM
    {
        'C': [0.01,0.1,1,10]
    },
    # Decision Tree
    {
        'max_depth': [4,8,16,None],
        'min_samples_split': np.linspace(0.1, 0.6, 4, endpoint=True),
        'min_samples_leaf' : [1,2,4],
        'max_features': ['auto','log2',None]
    },
    # Random Forest
    {
        'n_estimators': [16,32,64,256],
        'criterion': ['gini'],
        "max_features" : ['auto','log2',None],
        "max_depth" : [None,2,4]
    },
    # AdaBoost
    {
        'n_estimators': [32,64,128,256],
        'learning_rate':[0.1,1]
    },
    # Naive Bayes: nothing to tune
    {
    },
]


classifiers = [
    LinearSVC(random_state=RANDOM_STATE),
    DecisionTreeClassifier(max_depth=100, random_state=RANDOM_STATE),
    RandomForestClassifier(max_depth=20, n_estimators=20, max_features="auto", random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
    ]


def _score_summary(y_true, y_pred, suffix=""):
    """Return F1, accuracy, precision and recall as a dict with suffixed keys."""
    return {
        "f1_score" + suffix: metrics.f1_score(y_true, y_pred),
        "accuracy" + suffix: metrics.accuracy_score(y_true, y_pred),
        "precision" + suffix: metrics.precision_score(y_true, y_pred),
        "recall" + suffix: metrics.recall_score(y_true, y_pred),
    }


def _confusion_counts(matrix, suffix=""):
    """Return the four cells of a 2x2 confusion matrix as a dict with suffixed keys."""
    tn, fp, fn, tp = matrix.ravel()
    return {"tn" + suffix: tn, "fp" + suffix: fp, "fn" + suffix: fn, "tp" + suffix: tp}


def _plot_confusion_matrix(matrix, title):
    """Draw an annotated heat map of a confusion matrix."""
    plt.figure(figsize=(6, 6))
    sns.heatmap(matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d")
    plt.title(title)
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    plt.show()


def _plot_roc(y_true, y_pred, title):
    """Plot the ROC curve of hard predictions and return the area under it."""
    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred)
    roc_auc = metrics.auc(fpr, tpr)

    plt.title(title)
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    return roc_auc


models = []
for score in scores:
    node_table = []
    # zip keeps name, estimator and grid aligned without a manual counter
    for name, classifier, tuned_parameters in zip(names, classifiers, grid_params):

        row = dict()
        row["name"] = name

        print("# Tuning hyper-parameters for %s" % score)
        print()
        print("Classifier: ", name)

        clf = GridSearchCV(classifier, tuned_parameters, cv=my_cv,
                           scoring=my_custom_score, n_jobs=1)
        models.append(clf)
        clf.fit(X_train, y_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']

        classifier_cv = []
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):

            row_param = params.copy()
            row_param["mean"] = mean
            row_param["std"] = std
            classifier_cv.append(row_param)

            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        row["best_params"] = clf.best_params_
        row["cv_params"] = classifier_cv

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = y_test, clf.predict(X_test)

        print(classification_report(y_true, y_pred))

        # --- raw, point-wise evaluation ---
        row.update(_score_summary(y_true, y_pred))

        conf_matrix = confusion_matrix(y_true, y_pred)
        row.update(_confusion_counts(conf_matrix))

        _plot_confusion_matrix(conf_matrix, "Confusion matrix")
        print()

        # ROC from the hard class predictions (no probability scores are used)
        row["auc"] = _plot_roc(y_true, y_pred, 'Receiver Operating Characteristic')

        # --- adjusted evaluation: gaps of up to 10 samples are filled ---
        y_custom_test, y_custom_pred = customise_score(y_test,y_pred,10)

        print("Accuracy for custom test: ",metrics.accuracy_score(y_custom_test, y_custom_pred))
        print(classification_report(y_custom_test, y_custom_pred))

        # The *_ad counts come from the adjusted labels (they used to be
        # recomputed from the raw labels and merely duplicated tn/fp/fn/tp).
        conf_matrix = confusion_matrix(y_custom_test, y_custom_pred)
        row.update(_confusion_counts(conf_matrix, "_ad"))

        row.update(_score_summary(y_custom_test, y_custom_pred, "_ad"))

        _plot_confusion_matrix(conf_matrix, "Confusion matrix for adjusted results")
        print()

        row["auc_ad"] = _plot_roc(y_custom_test, y_custom_pred,
                                  'Receiver Operating Characteristic for adjusted results')

        node_table.append(row)


In [None]:
# CV with best parameters
#
# 5-fold (unshuffled) cross-validation over the whole data set with the
# parameters picked by the grid search. For every fold three sets of metrics
# are collected:
#   (no suffix) raw point-wise labels
#   _1          gap-filled labels (customise_score, offset 10)
#   _2          zone-level labels (customise_score_readable)

# Seed for the stochastic estimators so that a re-run gives the same results.
RANDOM_STATE = 42

LABELS = ["False","True"]

names = [ "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes", "Linear SVM"]


classifiers = [

    DecisionTreeClassifier(max_depth=None,max_features='log2',min_samples_leaf=4,min_samples_split=0.1,
                           random_state=RANDOM_STATE),
    RandomForestClassifier(criterion='gini', max_depth=None, n_estimators=32, max_features="log2",
                           random_state=RANDOM_STATE),
    AdaBoostClassifier(learning_rate=1, n_estimators=32, random_state=RANDOM_STATE),
    GaussianNB(),
    LinearSVC(C=1, random_state=RANDOM_STATE),
    ]

# fit model
kfold = KFold(n_splits = 5, shuffle=False)


def _fold_metrics(y_true, y_pred, suffix=""):
    """Return F1, accuracy, precision and recall as a dict with suffixed keys."""
    return {
        "f1_score" + suffix: metrics.f1_score(y_true, y_pred),
        "accuracy" + suffix: metrics.accuracy_score(y_true, y_pred),
        "precision" + suffix: metrics.precision_score(y_true, y_pred),
        "recall" + suffix: metrics.recall_score(y_true, y_pred),
    }


for name, classifier in zip(names, classifiers):
    fold_rows = []
    print("Classifier: ", name)
    for train, test in kfold.split(X, y):
        classifier.fit(X[train], y[train])

        # Fold-local names: the hold-out split (y_test) created by the
        # preprocessing cell must not be overwritten here, otherwise re-running
        # the grid-search cell would silently evaluate on the last fold.
        fold_true = y[test]
        fold_pred = classifier.predict(X[test])

        row = _fold_metrics(fold_true, fold_pred)

        gap_true, gap_pred = customise_score(fold_true, fold_pred, 10)
        row.update(_fold_metrics(gap_true, gap_pred, "_1"))

        # NOTE(review): the zone-level labels are derived from the already
        # gap-filled labels, not from the raw ones - confirm this is intended.
        zone_true, zone_pred = customise_score_readable(gap_true, gap_pred, offset = 10, offset_pred = 10)
        row.update(_fold_metrics(zone_true, zone_pred, "_2"))

        fold_rows.append(row)

    # transpose: list of per-fold dicts -> dict of per-metric lists
    per_metric = {key: [fold[key] for fold in fold_rows] for key in fold_rows[0]}
    print(per_metric)

    for key, values in per_metric.items():
        print(f"{key}: {np.mean(values):0.2f} (+/- {np.std(values):0.2f})")
