In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from operator import itemgetter  

In [None]:
# Load the pre-computed correlation table and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv call — TODO confirm).
correlations = (
    pd.read_csv('../calculated_correlation/correlation_current.csv')
    .drop(columns=['Unnamed: 0'])
)

In [None]:
# Read the complete dataframe: the first CSV column becomes the index and is
# parsed as dates; every missing value is replaced by 1.
fpd = pd.read_csv(
    '../dataframe_creation/complete_dataframe.csv',
    index_col=0,
    parse_dates=True,
).fillna(1)

In [None]:
# Labels used as the keys of the feature dictionary built by feature_selection
# and as the filter in the score-collection and plotting cells.
sel = [
    'K302', 'K173', 'K414',
    'K158', 'K402', 'K305',
]

In [None]:
def plot_scores(auc,f1,prec,title,labels=None):
    """Draw a grouped bar chart of AUC, F1 and precision, one group per location.

    Parameters
    ----------
    auc, f1, prec : sequence of float
        Scores to plot. Entry ``i`` of each sequence is drawn in group ``i``,
        so the three sequences must have the same length.
    title : str
        Figure title.
    labels : sequence of str, optional
        Tick label of each group. When omitted, the notebook-level ``ints``
        variable is used if it exists (the previous behaviour); if it does
        not exist either, the groups are numbered ``0..n-1``.

    Returns
    -------
    None
    """
    if labels is None:
        # Backward compatibility: earlier versions always read the tick labels
        # from the notebook global `ints`.
        labels = globals().get('ints')
    if labels is None:
        labels = range(len(auc))
    labels = list(labels)

    # Width of a single bar; the three bars of a group sit side by side.
    bar_width = 0.25

    # Left-to-right x positions of the auc, f1 and precision bars.
    auc_pos = np.arange(len(auc))
    f1_pos = auc_pos + bar_width
    prec_pos = f1_pos + bar_width

    plt.figure(figsize=(15,10))
    plt.title(title)
    plt.bar(auc_pos, auc, color='#7f6d5f', width=bar_width, edgecolor='white', label='auc')
    plt.bar(f1_pos, f1, color='#557f2d', width=bar_width, edgecolor='white', label='f1')
    plt.bar(prec_pos, prec, color='#2d7f5e', width=bar_width, edgecolor='white', label='prec')

    # Put each tick under the middle (f1) bar of its group.
    plt.xlabel('Location', fontweight='bold')
    plt.xticks(f1_pos, labels)

    plt.legend()
    plt.show()
    return

In [None]:
def predictor(data,intersections,predictor,k,features):
    """Train and evaluate one binary classifier per selected intersection.

    For every label in ``intersections`` that has an entry in ``features``:

    1. the label column (without its first two rows) is turned into a binary
       target with ``make_binary``: 1 for values at or below the k-th lowest
       value, 0 otherwise;
    2. every column of the selected features that is also listed in
       ``intersections`` is replaced by its ``t-1`` and ``t-2`` lags;
    3. target and features are split by date with ``split_data``;
    4. a model is fitted on the first part and scored on the second part.

    Steps 2-4 only run for the six labels hard-coded in the local ``sel``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; must contain every column named in ``features[label]``
        and have an index that ``split_data`` can slice by date.
    intersections : iterable of str
        Candidate labels; also the set of columns that get lagged.
    predictor : str
        Model type. Only ``'rf'`` (``rf_grid_search``) is supported.
    k : int
        Number of lowest label values that define the positive class.
    features : mapping
        ``label -> list of column names`` (the label itself must be included).

    Returns
    -------
    list
        ``[predictors, performance, confusion_matrices, used_features,
        importances]`` - five dicts keyed by label. ``performance[label]`` is
        the tuple ``(auc, accuracy, f1, precision)``; ``auc`` is NaN when it
        cannot be computed.

    Raises
    ------
    ValueError
        If a model has to be built and ``predictor`` is not ``'rf'``.
    """
    predictors = {}
    performance = {}
    confusion_matrices = {}
    used_features = {}
    importances = {}
    sel = ['K302','K173','K414','K158','K402','K305']

    for label in intersections:
        try:
            feats = features[label]
        except KeyError:
            # No feature list for this label: skip it but keep going, so that
            # labels listed after it are still processed.
            continue
        print('\nLabel: ',label)
        # Selecting with a list already copies, so `data` is never modified.
        df = data[feats].copy()
        y = df[label].iloc[2:] #set label to be current intersection

        # Get K lowest LOF score and use as boundary.
        # .iloc makes the positional lookup explicit (the index holds dates).
        boundary = y.sort_values().head(k).iloc[-1]
        print('Boundary is set to: ',boundary)
        y = y.apply(make_binary,boundary=boundary)

        y,y_test = split_data(y)

        if label not in sel:
            continue

        # For each intersection among the features create t-1 & t-2 lags and
        # drop the original (time t) column.
        for feature in intersections:
            if feature in df.columns:
                df['t-1'+feature] = df[feature].shift(periods=1)
                df['t-2'+feature] = df[feature].shift(periods=2)
                df = df.drop(feature,axis=1)

        # The first two rows hold NaN lags, so they are dropped (as for y).
        X = df.iloc[2:]
        X,X_test = split_data(X)

        used_features[label] = X.columns

        if predictor == 'rf':
            model = rf_grid_search(X,y)
        else:
            # Previously `model` stayed unbound (or stale from the last label).
            raise ValueError("Unknown predictor %r; only 'rf' is supported" % (predictor,))

        # Test model performance and print results
        y_pred = model.predict(X_test)

        # NOTE(review): AUC is computed from hard class predictions, not from
        # predict_proba scores; kept as-is so results stay comparable.
        try:
            auc = roc_auc_score(y_test, y_pred)
        except ValueError as err:
            # Raised e.g. when y_test contains a single class.
            print('!!!Error here when runnig AUC!!!', err)
            auc = float('nan')
        else:
            print('\nPrediction results: ')
            print('Area under the ROC curve: ',auc)

        # Computed outside the AUC guard so a failed AUC can never leave these
        # names unbound or carrying the previous label's values.
        accuracy = metrics.accuracy_score(y_test, y_pred)
        f1 = metrics.f1_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred)
        print("Accuracy:",accuracy)
        print("f1:",f1)
        print("Precision:",precision)
        print("Recall:",metrics.recall_score(y_test, y_pred))

        # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
        tn,fp,fn,tp = cm
        print('tn',tn, 'fp',fp, 'fn',fn, 'tp',tp)

        predictors[label] = model
        performance[label] = auc,accuracy,f1,precision
        confusion_matrices[label] = list(cm)
        importances[label] = model.feature_importances_

    return [predictors,performance,confusion_matrices,used_features,importances]

In [None]:
#Split data into train/val and test based on date
def split_data(df):
    """Cut ``df`` at the 2018/2019 boundary.

    ``df`` must support label slicing with date strings (e.g. a Series or
    DataFrame with a DatetimeIndex). Both slice ends are inclusive.

    Returns ``(train_val, test)``: the rows up to and including
    '2018-12-31', and the rows from '2019-01-01' onwards.
    """
    train_val = df.loc[:'2018-12-31']
    test = df.loc['2019-01-01':]
    return train_val, test

#Make the label a binary of outlier or not.
def make_binary(row,boundary):
    """Return 1 when ``row`` is at or below ``boundary``, otherwise 0."""
    return 1 if row <= boundary else 0

def rf_grid_search(X,y):
    """Fit a random forest on ``X``/``y`` and return the fitted model.

    Despite the name, no grid search is run: the forest always has
    100 trees and ``random_state=1``. (An earlier draft sketched a
    GridSearchCV over n_estimators in [10, 100] with cv=5 and
    scoring="roc_auc"; it is not active.)
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=1)
    # fit() returns the estimator itself.
    return forest.fit(X, y)

In [None]:
# All labels handed to predictor(): it looks each one up in the feature
# dictionary and lags every feature column whose name appears in this list.
intersections = [
    # the six labels that are also in `sel`
    'K302', 'K173', 'K414', 'K158', 'K402', 'K305',
    # remaining K-labels
    'K071', 'K097', 'K124', 'K128', 'K159', 'K184', 'K189', 'K206',
    'K225', 'K270', 'K304', 'K405', 'K406', 'K424', 'K430', 'K703',
    'K704', 'K707', 'K711', 'K561', 'K504', 'K145', 'K250',
    # ...in / ...out columns (presumably road in/out flows — TODO confirm)
    'a12in', 'a12out',
    'n211in', 'n211out',
    'n141in', 'n141out',
    'n142in', 'n142out',
    'n143in', 'n143out',
]

In [None]:
def feature_selection(treshold,cor_data,labels=None):
    """Pick, per label, the columns whose correlation exceeds a threshold.

    Row ``i`` of ``cor_data`` (looked up with ``cor_data.loc[i]``) is taken
    to hold the correlations for ``labels[i]``.

    Parameters
    ----------
    treshold : float
        Columns are kept when ``float(value) > treshold`` (strictly greater).
    cor_data : pandas.DataFrame
        Correlation table with one row per label, indexed 0, 1, 2, ...
    labels : sequence of str, optional
        Label of each row. Defaults to the notebook-level ``sel`` list.

    Returns
    -------
    dict
        ``label -> list of selected column names`` in column order.
    """
    if labels is None:
        labels = sel

    features = {}
    for row_number, label in enumerate(labels):
        row = cor_data.loc[row_number]
        # Series.items() replaces iteritems(), which pandas 2.0 removed.
        features[label] = [
            column for column, value in row.items() if float(value) > treshold
        ]
    return features

In [None]:
#features = feature_selection(0.4,correlations)
#a14,a24,a34,a44,feat4 = predictor(fpd,intersections,'rf',50,features)

In [None]:
#features = feature_selection(0.3,correlations)
#a13,a23,a33,a43,feat3 = predictor(fpd,intersections,'rf',50,features)

In [None]:
#features = feature_selection(0.2,correlations)
#a12,a22,a32,a42,feat2 = predictor(fpd,intersections,'rf',50,features)

In [None]:
#features = feature_selection(0.1,correlations)
#a11,a21,a31,a41,feat1 = predictor(fpd,intersections,'rf',50,features)

In [None]:
features = feature_selection(0.05,correlations)
a50 = predictor(fpd,intersections,'rf',50,features)
# Later cells read the threshold-0.05 results as a105 / a205 / a405, following
# the a1N, a2N, a3N, a4N, featN naming of the other (commented-out) runs:
# models, performance, confusion matrices, used features, importances.
a105, a205, a305, a405, feat05 = a50

In [None]:
#features = feature_selection(0.0,correlations)
#a10,a20,a30,a40,feat0= predictor(fpd,intersections,'rf',50,features)

In [None]:
# Performance dict of the threshold-0.05 run (second element of predictor()'s
# result): label -> (auc, accuracy, f1, precision).
x = a50[1]

# Only labels in `sel` are plotted. plot_scores() reads `ints` for its tick
# labels, so it must list exactly the labels whose scores are collected.
ints = [i for i in x.keys() if i in sel]

auc = [x[i][0] for i in ints]
acc = [x[i][1] for i in ints]
f1 = [x[i][2] for i in ints]
prec = [x[i][3] for i in ints]

In [None]:
# Grouped bars (auc / f1 / precision) for the scores collected in the previous cell.
plot_scores(auc, f1, prec, title='Random Forest')

# Feature importance

In [None]:
# Print every feature with its importance, highest importance first, for each
# label in `sel`.
# NOTE(review): a42 / feat2 follow the naming of the threshold-0.2 run, whose
# cell is commented out above — confirm they exist before running this cell.
for intersection in sel:
    print('\n',intersection)
    columns = a42[intersection]
    importance = feat2[intersection]

    feature_importance = dict(zip(columns, importance))
    ranked = sorted(feature_importance.items(), key=lambda pair: pair[1], reverse=True)
    for key, value in ranked:
        print(key, value)

In [None]:
# Display the a43 entry for 'K302'.
# NOTE(review): a43 follows the naming of the threshold-0.3 run, whose cell is
# commented out above — confirm it is defined before running this cell.
a43['K302']

In [None]:
# Display the feat3 entry for 'K302'.
# NOTE(review): feat3 follows the naming of the threshold-0.3 run, whose cell
# is commented out above — confirm it is defined before running this cell.
feat3['K302']

## Iterations

In [None]:
# Show the list of selected labels.
sel

In [None]:
# Tick labels for plot_cors(): one per bar, in the order plot_cors() appends
# its values (a20, a205, a21, a22, a23, a24).
xlabs = ['0','0.05','0.1','0.2','0.3','0.4']

In [None]:
def plot_cors(xlabs,sel,performances=None):
    """Plot, for each label in ``sel``, one bar per run showing its first score.

    Parameters
    ----------
    xlabs : sequence of str
        Tick label of each bar; must have one entry per run.
    sel : iterable of str
        Labels to plot; one figure is produced per label.
    performances : sequence of dict, optional
        One ``label -> scores`` dict per run, in the same order as ``xlabs``.
        Element 0 of each scores tuple is plotted (the AUC in the tuples
        built by ``predictor``). Defaults to the notebook globals
        ``[a20, a205, a21, a22, a23, a24]``, which must then all exist.

    Returns
    -------
    None
    """
    for inters in sel:
        if performances is None:
            # Previous behaviour: read the six result dicts from the notebook
            # namespace (resolved lazily, so an empty `sel` still does nothing).
            runs = [a20, a205, a21, a22, a23, a24]
        else:
            runs = performances

        z = [run[inters][0] for run in runs]

        plt.figure(figsize=(15,10))
        plt.title(inters)
        plt.bar(np.arange(len(z)),z, color='#7f6d5f', edgecolor='white', label='auc')
        plt.xlabel('group', fontweight='bold')
        plt.xticks(list(range(len(z))), xlabs)
        plt.show()
    return

In [None]:
# One bar chart per label in `sel`.
# NOTE(review): called like this, plot_cors() reads a20, a205, a21, a22, a23
# and a24 from the notebook namespace — confirm all six are defined.
plot_cors(xlabs,sel)

# Visualize a decision tree

In [None]:
from sklearn.tree import export_graphviz

In [None]:
# Label whose fitted model and feature names are looked up in the next cells.
intkey = 'K414'

In [None]:
# a50[0] is the label -> fitted model dict returned by predictor() for the
# threshold-0.05 run; take the tree at index 5 of that label's forest.
estimator = a50[0][intkey].estimators_[5]

In [None]:
# a50[3] is the label -> training columns dict returned by predictor() for the
# threshold-0.05 run; a plain list is what export_graphviz expects.
feat_names = list(a50[3][intkey])

In [None]:
# Class names passed to export_graphviz below. The target built by
# make_binary() is 1 for values at or below the boundary and 0 otherwise.
labels = ['0','1']

In [None]:
# Export as dot file
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = feat_names,
                class_names = labels,
                rounded = True, proportion = False, 
                precision = 2, filled = True)

# Convert to png using system command (requires Graphviz).
# check_call raises CalledProcessError when `dot` exits with a non-zero status,
# so a failed conversion can no longer silently display a stale tree.png.
from subprocess import check_call
check_call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')