In [44]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np

%matplotlib inline

import os

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.pipeline import Pipeline

In [45]:
DATA_ROOT = 'Data'
DB_NAME = 'amiunique-1month-aug17.sqlite3.db'

PATH_TO_DB = os.path.join(DATA_ROOT, DB_NAME)

In [46]:
con = sqlite3.connect(PATH_TO_DB)
df = pd.read_sql_query('select * from fpData', con)

In [47]:
# find a substring 'bot' in the userAgentHttp column (assuming that only good bots openly identify themselves as bots)
df["GoodBot"] = df['userAgentHttp'].str.contains("bot") # creates a column of Boolean True or False values
df.GoodBot = df.GoodBot.astype(int) # converts True/False to 1/0

In [48]:
# multicolumn label encoder (borrowed from Stockoverflow) to encode categorical values

class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        '''
        Note that .fit returned self: this is standard behavior for .fit methods in scikit-learn.
        '''
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

LE = MultiColumnLabelEncoder(columns = None)

In [49]:
# As isoforest is an unsupervised learning algorithm, it will only use feature data.
# Return the anomaly score of each sample using the IsolationForest algorithm, Inliers are labeled 1, outliers are labeled -1.

IFT = IsolationForest(n_estimators=100, max_samples=10000)

In [50]:
pipeline = Pipeline([('LE', LE), ('IFT', IFT)])
model = pipeline.fit(df)
predicted = model.predict(df)

In [51]:
print('predicted:', model.predict(df))

predicted: [-1 -1  1 ...,  1  1  1]


In [52]:
# Put the predicted results back into the original dataframe
df['iForest_prediction'] = model.predict(df)

In [53]:
my=pd.crosstab(df.iForest_prediction, df.GoodBot, margins=True)
my

GoodBot,0,1,All
iForest_prediction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,2431,180,2611
1,23499,0,23499
All,25930,180,26110


In [54]:
# It seems that IsolationForest does predict at least the good bots about which we know that they are bots.
# Also, considering that bots typically don't have adblock, Isolation Forest seems to take that into consideration

In [55]:
pd.crosstab(df.iForest_prediction, [df.GoodBot, df.adBlock],  margins=True)

GoodBot,0,0,0,1,1,1,All
adBlock,no,no JS,yes,no,no JS,yes,Unnamed: 7_level_1
iForest_prediction,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
-1,1172,327,932,44,128,8,2611
1,15386,3218,4895,0,0,0,23499
All,16558,3545,5827,44,128,8,26110
