In [1]:
from sklearn.ensemble import BaggingClassifier, VotingClassifier

In [2]:
import re
from urllib.parse import urlparse
from datetime import datetime, date
import numpy as np
from scipy.sparse import csr_matrix, hstack
import pandas as pd
import pandas_profiling
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
# from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import accuracy_score as accuracy, precision_score as precision, recall_score as recall, roc_auc_score as auc, confusion_matrix as confm
from sklearn.pipeline import Pipeline

In [84]:
pd.set_option('display.max_rows', None)

In [3]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [3]:
engine = create_engine('postgresql://fhaqftqhkzrvnq:b76a214722d9525e3647b40f44a6051e915270788d40d531aecfbdf4d102c435@ec2-3-221-243-122.compute-1.amazonaws.com:5432/dee2cchssk4u3u')
df = pd.read_sql_table('tab_info', con=engine)

In [4]:
engine = create_engine('postgresql://fhaqftqhkzrvnq:b76a214722d9525e3647b40f44a6051e915270788d40d531aecfbdf4d102c435@ec2-3-221-243-122.compute-1.amazonaws.com:5432/dee2cchssk4u3u')
df = pd.read_sql_table('keyboard_timing', con=engine)

In [5]:
df = df.sort_values('id')
df

Unnamed: 0,id,user_id,keypress,keyup,timestamp
0,6,1,"[{'alt': False, 'code': 'KeyP', 'ctrl': False,...","[{'code': 'ShiftLeft', 'timestamp': 11315.9900...",2021-04-16 12:37:38.432
1,7,1,"[{'alt': False, 'code': 'Space', 'ctrl': False...","[{'code': 'Space', 'timestamp': 38889.98999999...",2021-04-16 12:38:21.263
2,8,1,"[{'alt': False, 'code': 'Digit0', 'ctrl': Fals...","[{'code': 'ArrowRight', 'timestamp': 25940.814...",2021-04-16 12:52:17.503
10,9,1,"[{'alt': False, 'code': 'Digit0', 'ctrl': Fals...","[{'code': 'ArrowRight', 'timestamp': 36934.019...",2021-04-16 13:01:09.166
3,10,1,"[{'alt': False, 'code': 'KeyS', 'ctrl': False,...","[{'code': 'KeyS', 'timestamp': 304650.77000000...",2021-04-16 15:00:17.685
...,...,...,...,...,...
371,379,1,"[{'alt': False, 'code': 'Digit4', 'ctrl': Fals...","[{'code': 'Backspace', 'timestamp': 20023727.5...",2021-04-27 18:13:32.005
374,380,1,"[{'alt': False, 'code': 'KeyH', 'ctrl': False,...","[{'code': 'KeyH', 'timestamp': 20119890.565}, ...",2021-04-27 18:14:20.852
376,381,1,"[{'alt': False, 'code': 'Minus', 'ctrl': False...","[{'code': 'Minus', 'timestamp': 20168747.565},...",2021-04-27 18:23:18.547
377,382,1,"[{'alt': False, 'code': 'BracketLeft', 'ctrl':...","[{'code': 'BracketLeft', 'timestamp': 20706426...",2021-04-27 18:36:33.573


In [86]:
def process_kb(keypress, keyup):
    hold = []
    interkey = []
    for kp, ku in zip(keypress, keyup):
        kpc = kp.copy()
        kuc1 = ku.copy()
        kuc2 = []
        for i in kpc:
            rem1 = []
            for j in range(len(kuc1)):
                if i['code'] == kuc1[j]['code']:
                    delay = kuc1[j]['timestamp'] - i['timestamp']
                    rem1.append(j)
                    if delay > 0:
                        hold.append([i['code'], delay, i['ctrl'], i['shift'], i['alt'], i['meta']])
                        if kuc1[j] not in kuc2:
                            kuc2.append(kuc1[j])
                        break
            kuc1 = np.delete(kuc1, rem1)
        for i in range(1, len(kpc)):
            rem1 = []
            for j in range(len(kuc2)):
                if kpc[i - 1]['code'] == kuc2[j]['code']:
                    rem1.append(j)
                    delay = kpc[i]['timestamp'] - kuc2[j]['timestamp']
                    interkey.append([kpc[i - 1]['code'] + kpc[i]['code'], delay])
                    break
            kuc2 = np.delete(kuc2, rem1)
    hold = pd.DataFrame(hold, columns=['key', 'delay', 'ctrl', 'shift', 'alt', 'meta'])
    interkey = pd.DataFrame(interkey, columns=['key', 'delay'])
#     interkey = interkey[interkey.delay < 5000]
    interkey = interkey[interkey.delay < interkey.delay.quantile(0.75)]
    return hold, interkey

In [81]:
def process_kb(keypress, keyup):
    hold = []
    interkey = []
    both = []
    for kp, ku in zip(keypress, keyup):
        kpc = kp.copy()
        kuc1 = ku.copy()
        kuc2 = []
        for i in kpc:
            rem1 = []
            for j in range(len(kuc1)):
                if i['code'] == kuc1[j]['code']:
                    delay = kuc1[j]['timestamp'] - i['timestamp']
                    rem1.append(j)
                    if delay > 0:
                        hold.append([i['code'], delay, i['ctrl'], i['shift'], i['alt'], i['meta']])
                        if kuc1[j] not in kuc2:
                            kuc2.append(kuc1[j])
                        break
            kuc1 = np.delete(kuc1, rem1)
        for i in range(1, len(kpc)):
            rem1 = []
            for j in range(len(kuc2)):
                if kpc[i - 1]['code'] == kuc2[j]['code']:
                    rem1.append(j)
                    delay1 = kuc2[j]['timestamp'] - kpc[i - 1]['timestamp']
                    delay2 = kpc[i]['timestamp'] - kuc2[j]['timestamp']
                    interkey.append([kpc[i - 1]['code'] + kpc[i]['code'], delay2])
                    both.append([kpc[i - 1]['code'], kpc[i]['code'], delay1, delay2, kpc[i - 1]['shift']])
                    break
            kuc2 = np.delete(kuc2, rem1)
    hold = pd.DataFrame(hold, columns=['key', 'delay', 'ctrl', 'shift', 'alt', 'meta'])
    interkey = pd.DataFrame(interkey, columns=['key', 'delay'])
#     interkey = interkey[interkey.delay < 5000]
    interkey = interkey[interkey.delay < interkey.delay.quantile(0.75)]
    both = pd.DataFrame(both, columns=['key1', 'key2', 'hold', 'interkey', 'shift'])
#     both = both[both.interkey < 5000]
    both = both[both.interkey < both.interkey.quantile(0.75)]
    return hold, interkey, both

In [7]:
df1 = df[df.user_id == 1]
df2 = df[df.user_id == 2]

In [197]:
hold1, interkey1, both1 = process_kb(df1.keypress, df1.keyup)
hold2, interkey2, both2 = process_kb(df2.keypress, df2.keyup)

In [31]:
hold = pd.concat([hold1, hold2])
hold = pd.get_dummies(hold, columns=['key'])
hold1 = hold.iloc[:13166]
hold2 = hold.iloc[13166:]
hold1 = hold1.drop(columns=['ctrl', 'shift', 'alt', 'meta', 'code', 'true'])
hold2 = hold2.drop(columns=['ctrl', 'shift', 'alt', 'meta', 'code', 'true'])

In [136]:
both = pd.concat([both1, both2])
both = pd.get_dummies(both, columns=['key1', 'key2'])
both1 = both.iloc[:9641]
both2 = both.iloc[9641:]
both1 = both1.drop(columns=['shift'])
both2 = both2.drop(columns=['shift'])

In [194]:
keys1 = set(both1.key1)
keys2 = set(both1.key2)
keys3 = set(both2.key1)
keys4 = set(both2.key2)
keys = keys1 | keys2 | keys3 | keys4
keys = dict(zip(keys, range(len(keys))))

In [195]:
both1.key1 = both1.key1.map(keys)
both1.key2 = both1.key2.map(keys)
both2.key1 = both2.key1.map(keys)
both2.key2 = both2.key2.map(keys)

In [62]:
both1['shift'] = both1['shift'].apply(int)
both2['shift'] = both2['shift'].apply(int)

In [198]:
both1 = pd.concat([hold1, interkey1])
both2 = pd.concat([hold2, interkey2])

In [12]:
hold1['code'] = pd.factorize(hold1.key)[0]
hold2['code'] = pd.factorize(hold2.key)[0]

In [83]:
interkey1['code'] = pd.factorize(interkey1['key'])[0]
interkey2['code'] = pd.factorize(interkey2['key'])[0]

In [199]:
both1['code'] = pd.factorize(both1['key'])[0]
both2['code'] = pd.factorize(both2['key'])[0]

In [14]:
hold1['shift'] = hold1['shift'].apply(int)
hold2['shift'] = hold2['shift'].apply(int)

In [77]:
hold1.describe(include='all')

Unnamed: 0,key,delay,ctrl,shift,alt,meta,code
count,11479,11479.0,11479,11479,11479,11479,11479.0
unique,50,,2,2,1,1,
top,Space,,False,False,False,False,
freq,844,,11474,10451,11479,11479,
mean,,109.45466,,,,,19.13425
std,,34.62076,,,,,13.50451
min,,11.0,,,,,0.0
25%,,88.0,,,,,7.0
50%,,108.0,,,,,14.0
75%,,132.0,,,,,32.0


In [78]:
hold2.describe(include='all')

Unnamed: 0,key,delay,ctrl,shift,alt,meta,code
count,2067,2067.0,2067,2067,2067,2067,2067.0
unique,42,,1,2,1,1,
top,Space,,False,False,False,False,
freq,330,,2067,1944,2067,2067,
mean,,94.44097,,,,,14.60232
std,,22.70347,,,,,9.72962
min,,47.735,,,,,0.0
25%,,79.805,,,,,6.0
50%,,88.865,,,,,12.0
75%,,99.605,,,,,24.0


In [93]:
interkey1.describe(include='all')

Unnamed: 0,key,delay,code
count,8397,8397.0,8397.0
unique,789,,
top,CommaSpace,,
freq,219,,
mean,,85.96642,234.5028
std,,142.53161,181.10775
min,,-253.0,0.0
25%,,-22.0,109.0
50%,,55.0,167.0
75%,,154.0,315.0


In [94]:
interkey2.describe(include='all')

Unnamed: 0,key,delay,code
count,1500,1500.0,1500.0
unique,317,,
top,SlashSpace,,
freq,39,,
mean,,133.31183,104.24267
std,,109.90925,77.10268
min,,-100.86,0.0
25%,,71.87,45.0
50%,,131.2475,88.0
75%,,204.53875,149.0


In [112]:
hold_train, hold_test = train_test_split(hold1.copy(), test_size=0.2, random_state=42)
hold_test['true'] = 1
hold2['true'] = -1
hold_test = pd.concat([hold_test, hold2])

lof = LocalOutlierFactor(novelty=True, contamination=0.3, n_neighbors=6)
iforest = IsolationForest(random_state=42, contamination=0.38)
svm = OneClassSVM(nu=0.2, gamma=0.4)

res = pd.DataFrame()
res['true'] = hold_test.true

X_train = hold_train[['delay']]
X_test = hold_test[['delay']]

for model in [(lof, 'lof'), (iforest, 'iforest'), (svm, 'svm')]:
    model[0].fit(X_train)
    res[model[1]] = model[0].predict(X_test)
    res[model[1] + '_df'] = model[0].decision_function(X_test)

metrics = pd.DataFrame(
    [
        [accuracy(res.true, res.lof), accuracy(res.true, res.iforest), accuracy(res.true, res.svm)],
        [precision(res.true, res.lof), precision(res.true, res.iforest), precision(res.true, res.svm)],
        [recall(res.true, res.lof), recall(res.true, res.iforest), recall(res.true, res.svm)],
        [auc(res.true, res.lof_df), auc(res.true, res.iforest_df), auc(res.true, res.svm_df)]
    ], ['accuracy', 'precision', 'recall', 'auc'], ['lof', 'iforest', 'svm'])
print(metrics)
print(confm(res.true, res.lof), '\n')
print(confm(res.true, res.iforest), '\n')
print(confm(res.true, res.svm), '\n')

              lof  iforest     svm
accuracy  0.93384  0.75282 0.70134
precision 0.99828  0.89655 0.71369
recall    0.88345  0.63174 0.77980
auc       0.99444  0.74681 0.59040
[[2063    4]
 [ 307 2327]] 

[[1875  192]
 [ 970 1664]] 

[[1243  824]
 [ 580 2054]] 



In [129]:
interkey_train, interkey_test = train_test_split(interkey1.copy(), test_size=0.2, random_state=42)
interkey_test['true'] = 1
interkey2['true'] = -1
interkey_test = pd.concat([interkey_test, interkey2])

lof = LocalOutlierFactor(novelty=True, contamination=0.2, n_neighbors=1)
iforest = IsolationForest(random_state=42, contamination=0.38)
svm = OneClassSVM(nu=0.2, gamma=0.4)

res = pd.DataFrame()
res['true'] = interkey_test.true

X_train = interkey_train[['delay']]
X_test = interkey_test[['delay']]

for model in [(lof, 'lof'), (iforest, 'iforest'), (svm, 'svm')]:
    model[0].fit(X_train)
    res[model[1]] = model[0].predict(X_test)
    res[model[1] + '_df'] = model[0].decision_function(X_test)

metrics = pd.DataFrame(
    [
        [accuracy(res.true, res.lof), accuracy(res.true, res.iforest), accuracy(res.true, res.svm)],
        [precision(res.true, res.lof), precision(res.true, res.iforest), precision(res.true, res.svm)],
        [recall(res.true, res.lof), recall(res.true, res.iforest), recall(res.true, res.svm)],
        [auc(res.true, res.lof_df), auc(res.true, res.iforest_df), auc(res.true, res.svm_df)]
    ], ['accuracy', 'precision', 'recall', 'auc'], ['lof', 'iforest', 'svm'])
print(metrics)
print(confm(res.true, res.lof), '\n')
print(confm(res.true, res.iforest), '\n')
print(confm(res.true, res.svm), '\n')

              lof  iforest     svm
accuracy  0.93147  0.62321 0.70400
precision 0.93977  0.68357 0.77597
recall    0.93831  0.61483 0.66615
auc       0.95505  0.66403 0.74025
[[1384  116]
 [ 119 1810]] 

[[ 951  549]
 [ 743 1186]] 

[[1129  371]
 [ 644 1285]] 



In [66]:
both1.describe()

Unnamed: 0,hold,interkey,key1_Backquote,key1_Backslash,key1_BracketLeft,key1_BracketRight,key1_Comma,key1_Digit0,key1_Digit1,key1_Digit2,...,key2_KeyW,key2_KeyX,key2_KeyY,key2_KeyZ,key2_Minus,key2_Period,key2_Quote,key2_Semicolon,key2_Slash,key2_Space
count,9641.0,9641.0,9641.0,9641.0,9641.0,9641.0,9641.0,9641.0,9641.0,9641.0,...,9641.0,9641.0,9641.0,9641.0,9641.0,9641.0,9641.0,9641.0,9641.0,9641.0
mean,111.02552,86.31138,0.0001,0.0028,0.01577,0.0027,0.02614,0.01234,0.00819,0.00477,...,0.00249,0.00425,0.01805,0.00197,0.01753,0.02323,0.00892,0.00311,0.00062,0.07748
std,36.02666,142.94229,0.01018,0.05285,0.12458,0.05186,0.15955,0.11042,0.09015,0.06891,...,0.04983,0.06508,0.13313,0.04435,0.13124,0.15065,0.09403,0.0557,0.02494,0.26737
min,11.0,-253.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,88.0,-22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,109.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,132.0,154.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,344.0,516.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [134]:
both1

Unnamed: 0,hold,interkey,true,key1_Backquote,key1_Backslash,key1_BracketLeft,key1_BracketRight,key1_Comma,key1_Digit0,key1_Digit1,...,key2_KeyW,key2_KeyX,key2_KeyY,key2_KeyZ,key2_Minus,key2_Period,key2_Quote,key2_Semicolon,key2_Slash,key2_Space
2,89.00000,198.00000,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,66.00000,29.00000,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,79.00000,122.00000,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,66.00000,263.00000,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,88.00000,11.00000,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12850,79.00000,64.00000,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12851,77.00000,163.00000,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12852,102.00000,120.00000,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12853,89.00000,241.00000,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [168]:
both_train, both_test = train_test_split(both1.copy(), test_size=0.2, random_state=42)
both_test['true'] = 1
both2['true'] = -1
both_test = pd.concat([both_test, both2])

lof = LocalOutlierFactor(novelty=True, contamination=0.2, n_neighbors=6)
iforest = IsolationForest(random_state=42, contamination=0.2, bootstrap=True)
svm = OneClassSVM(nu=0.1, gamma=0.04)

res = pd.DataFrame()
res['true'] = both_test['true']

X_train = both_train
X_test = both_test.drop(columns=['true'])
# X_train = both_train[['hold', 'interkey']]
# X_test = both_test[['hold', 'interkey']]

for model in [(lof, 'lof'), (iforest, 'iforest'), (svm, 'svm')]:
    model[0].fit(X_train)
    res[model[1]] = model[0].predict(X_test)
    res[model[1] + '_df'] = model[0].decision_function(X_test)

metrics = pd.DataFrame(
    [
        [accuracy(res.true, res.lof), accuracy(res.true, res.iforest), accuracy(res.true, res.svm)],
        [precision(res.true, res.lof), precision(res.true, res.iforest), precision(res.true, res.svm)],
        [recall(res.true, res.lof), recall(res.true, res.iforest), recall(res.true, res.svm)],
        [auc(res.true, res.lof_df), auc(res.true, res.iforest_df), auc(res.true, res.svm_df)]
    ], ['accuracy', 'precision', 'recall', 'auc'], ['lof', 'iforest', 'svm'])
print(metrics)
print(confm(res.true, res.lof), '\n')
print(confm(res.true, res.iforest), '\n')
print(confm(res.true, res.svm), '\n')

              lof  iforest     svm
accuracy  0.79003  0.56460 0.70954
precision 0.82343  0.58098 0.94556
recall    0.79782  0.81078 0.51322
auc       0.86044  0.53419 0.85261
[[1170  330]
 [ 390 1539]] 

[[ 372 1128]
 [ 365 1564]] 

[[1443   57]
 [ 939  990]] 



In [187]:
both_train, both_test = train_test_split(both1.copy(), test_size=0.2, random_state=42)
both_test['true'] = 1
both2['true'] = -1
both_test = pd.concat([both_test, both2])

lof = LocalOutlierFactor(novelty=True, contamination=0.3, n_neighbors=3)
iforest = IsolationForest(random_state=42, contamination=0.1, bootstrap=True)
svm = OneClassSVM(nu=0.1, gamma=0.05)

res = pd.DataFrame()
res['true'] = both_test.true

X_train = both_train[['hold', 'interkey']]
X_test = both_test[['hold', 'interkey']]

for model in [(lof, 'lof'), (iforest, 'iforest'), (svm, 'svm')]:
    model[0].fit(X_train)
    res[model[1]] = model[0].predict(X_test)
    res[model[1] + '_df'] = model[0].decision_function(X_test)

metrics = pd.DataFrame(
    [
        [accuracy(res.true, res.lof), accuracy(res.true, res.iforest), accuracy(res.true, res.svm)],
        [precision(res.true, res.lof), precision(res.true, res.iforest), precision(res.true, res.svm)],
        [recall(res.true, res.lof), recall(res.true, res.iforest), recall(res.true, res.svm)],
        [auc(res.true, res.lof_df), auc(res.true, res.iforest_df), auc(res.true, res.svm_df)]
    ], ['accuracy', 'precision', 'recall', 'auc'], ['lof', 'iforest', 'svm'])
print(metrics)
print(confm(res.true, res.lof), '\n')
print(confm(res.true, res.iforest), '\n')
print(confm(res.true, res.svm), '\n')

              lof  iforest     svm
accuracy  0.68037  0.51094 0.80140
precision 0.71261  0.53925 0.88806
recall    0.72369  0.89736 0.74028
auc       0.74540  0.53018 0.85390
[[ 937  563]
 [ 533 1396]] 

[[  21 1479]
 [ 198 1731]] 

[[1320  180]
 [ 501 1428]] 



In [203]:
both_train, both_test = train_test_split(both1.copy(), test_size=0.2, random_state=42)
both_test['true'] = 1
both2['true'] = -1
both_test = pd.concat([both_test, both2])

lof = LocalOutlierFactor(novelty=True, contamination=0.3, n_neighbors=2)
iforest = IsolationForest(random_state=42)
svm = OneClassSVM(nu=0.2, gamma=0.4)

res = pd.DataFrame()
res['true'] = both_test.true

X_train = both_train[['delay']]
X_test = both_test[['delay']]

for model in [(lof, 'lof'), (iforest, 'iforest'), (svm, 'svm')]:
    model[0].fit(X_train)
    res[model[1]] = model[0].predict(X_test)
    res[model[1] + '_df'] = model[0].decision_function(X_test)

metrics = pd.DataFrame(
    [
        [accuracy(res.true, res.lof), accuracy(res.true, res.iforest), accuracy(res.true, res.svm)],
        [precision(res.true, res.lof), precision(res.true, res.iforest), precision(res.true, res.svm)],
        [recall(res.true, res.lof), recall(res.true, res.iforest), recall(res.true, res.svm)],
        [auc(res.true, res.lof_df), auc(res.true, res.iforest_df), auc(res.true, res.svm_df)]
    ], ['accuracy', 'precision', 'recall', 'auc'], ['lof', 'iforest', 'svm'])
print(metrics)
print(confm(res.true, res.lof), '\n')
print(confm(res.true, res.iforest), '\n')
print(confm(res.true, res.svm), '\n')

              lof  iforest     svm
accuracy  0.93923  0.48456 0.69172
precision 0.96996  0.53186 0.72603
recall    0.92021  0.68062 0.72381
auc       0.97396  0.55363 0.63664
[[3437  130]
 [ 364 4198]] 

[[ 834 2733]
 [1457 3105]] 

[[2321 1246]
 [1260 3302]] 



In [4]:
df_copy = df.copy()

In [5]:
df = df_copy.copy()

In [6]:
df.describe(include='all', datetime_is_numeric=True)

Unnamed: 0,id,user_id,url,timestamp,tab_count,lang
count,10285.0,10285.0,10285,10285,10285.0,10285
unique,,,6530,,,6
top,,,https://www.youtube.com/,,,en
freq,,,358,,,5626
mean,5597.724259,1.309285,,2021-03-28 03:03:25.856050944,4.966845,
min,374.0,1.0,,2021-02-20 19:27:31.911000,1.0,
25%,2977.0,1.0,,2021-03-16 17:27:31.696000,3.0,
50%,5592.0,1.0,,2021-03-31 09:48:07.003000064,5.0,
75%,8247.0,2.0,,2021-04-11 10:56:43.508000,6.0,
max,10818.0,2.0,,2021-04-20 21:21:53.288000,21.0,


In [7]:
df['weekday'] = df.timestamp.apply(datetime.isoweekday)
df['hour'] = df['timestamp'].dt.hour + 1
df.lang = df.lang.replace(['uk', ''], ['ua', 'und'])
df.loc[:, 'lang'] = df.lang.map({'und': 1, 'ua': 2, 'ru': 3, 'en': 4})

df

df[['url', 'tab_count', 'lang', 'weekday', 'hour']].profile_report()

In [27]:
0.438266*(596+465)

465.000226

In [8]:
df1 = df[df.user_id == 1].copy()
df2 = df[df.user_id == 2].copy().sample(frac=0.35)

In [9]:
df1.shape, df2.shape

((7104, 8), (1113, 8))

nourl = df[['user_id', 'tab_count', 'lang', 'weekday', 'hour']].copy()
nourl.loc[:, 'lang'] = nourl.lang.map({'und': 1, 'ua': 2, 'ru': 3, 'en': 4})
nourl

nourl.profile_report()

nourl1 = nourl[nourl.user_id == 1]
nourl2 = nourl[nourl.user_id == 2].sample(frac=0.35)
nourl1 = nourl1.drop(columns=['user_id'])
nourl2 = nourl2.drop(columns=['user_id'])

nourl_train, nourl_test = train_test_split(nourl1.copy(), test_size=0.15, random_state=42)
nourl_test['true'] = 1
nourl2['true'] = -1

nourl_train.shape, nourl_test.shape, nourl2.shape

In [10]:
df

Unnamed: 0,id,user_id,url,timestamp,tab_count,lang,weekday,hour
0,10127,2,https://www.google.com/search?q=%D0%BF%D0%B5%D...,2021-04-19 06:59:33.862,4,3,1,7
1,10128,2,https://pethouse.ua/,2021-04-19 06:59:42.925,4,3,1,7
2,10131,2,https://pethouse.ua/shop/sobakam/igrushki/joys...,2021-04-19 07:03:01.192,4,3,1,8
3,10133,2,https://pethouse.ua/shop/koshkam/napolniteli-d...,2021-04-19 07:04:20.867,4,3,1,8
4,10134,2,https://pethouse.ua/shop/koshkam/napolniteli-d...,2021-04-19 07:04:25.009,4,3,1,8
...,...,...,...,...,...,...,...,...
10280,10116,2,https://thejigsawpuzzles.com/Food-and-Bakery/A...,2021-04-18 21:52:52.829,3,3,7,22
10281,10117,1,https://www.youtube.com/channel/UCgUlPeG3lQvla...,2021-04-18 22:05:53.800,5,4,7,23
10282,10122,1,https://www.youtube.com/watch?v=BHOkdt-UUeg,2021-04-18 22:17:17.026,6,4,7,23
10283,10123,1,https://www.youtube.com/watch?v=T-rhtd_WIaQ,2021-04-18 22:19:15.110,6,4,7,23


df = pd.read_csv('dataclips_vitlsnqrfxkmvlwbzhtqqxowespx (1).csv')
df = df.iloc[4:, :]
df = df[df.cpm < 6000]
# df['weekday'] = df.timestamp.apply(datetime.isoweekday)
# df['hour'] = df['timestamp'].dt.hour + 1

In [11]:
def train_models(train, test, columns=None, true=None, cv=False, tfidf=False,
          cv_analyzer='char', ngram=(5, 5), precision_recall=True):
    train_new = train.copy()
    test_new = test.copy()
    res = pd.DataFrame()
    if true is None:
        true = test_new['true'].values
    if columns is None:
        columns = list(test.columns)
    if cv:
        cv = CountVectorizer(analyzer=cv_analyzer)
        if cv_analyzer == 'char':
            cv.ngram_range = ngram
        train_new = cv.fit_transform(train_new[columns])
        test_new = cv.transform(test_new[columns])
        if tfidf:
            tf = TfidfTransformer()
            train_new = tf.fit_transform(train_new)
            test_new = tf.transform(test_new)
    lof = LocalOutlierFactor(novelty=True)
    iforest = IsolationForest(random_state=42)
    svm = OneClassSVM()
    for model in [(lof, 'lof'), (iforest, 'iforest'), (svm, 'svm')]:
        model[0].fit(train_new)
        res[model[1]] = model[0].predict(test_new) if cv else model[0].predict(test_new[columns])
        res[model[1] + '_df'] = model[0].decision_function(test_new) if cv else model[0].decision_function(test_new[columns])
    res['true'] = true
    print(f'LOF accuracy: \t\t{accuracy(res.true, res.lof)}\n'
          f'iForest accuracy: \t{accuracy(res.true, res.iforest)}\n'
          f'SVM accuracy: \t\t{accuracy(res.true, res.svm)}\n')
    if precision_recall:
        print(f'LOF precision: \t\t{precision(res.true, res.lof)}\n'
              f'iForest precision: \t{precision(res.true, res.iforest)}\n'
              f'SVM precision: \t\t{precision(res.true, res.svm)}\n\n'
              f'LOF recall: \t\t{recall(res.true, res.lof)}\n'
              f'iForest recall: \t{recall(res.true, res.iforest)}\n'
              f'SVM recall: \t\t{recall(res.true, res.svm)}\n\n'
              f'LOF auc: \t\t{auc(res.true, res.lof_df)}\n'
              f'iForest auc: \t\t{auc(res.true, res.iforest_df)}\n'
              f'SVM auc: \t\t{auc(res.true, res.svm_df)}\n')
    metrics = pd.DataFrame(
        [
            [accuracy(res.true, res.lof), accuracy(res.true, res.iforest), accuracy(res.true, res.svm)],
            [precision(res.true, res.lof), precision(res.true, res.iforest), precision(res.true, res.svm)],
            [recall(res.true, res.lof), recall(res.true, res.iforest), recall(res.true, res.svm)],
            [auc(res.true, res.lof_df), auc(res.true, res.iforest_df), auc(res.true, res.svm_df)]
        ], ['accuracy', 'precision', 'recall', 'auc'], ['lof', 'iforest', 'svm'])
    return metrics

train(x_train, x_test1, ['tab_count', 'lang', 'weekday', 'hour'])

train(x_train, nourl2, ['tab_count', 'lang', 'weekday', 'hour'])

nourl_res = train_models(nourl_train, pd.concat([nourl_test, nourl2]), ['tab_count', 'lang', 'weekday', 'hour'])
with open(f'nourl{date.today()}.html', 'w') as f:
    f.write(nourl_res.to_html())

In [12]:
STOPWORDS = ['http', 'https', 'www', 'com', 'org', 'net', 'int', 'edu', 'gov', 'mil', '']
DELIMITERS = ['&', '$', '+', ',', '/', ':', ';', '=', '?', '!', '@', '#', '-', '.', '_', '~', '%']
PATTERN = '|'.join(map(re.escape, DELIMITERS))

def process_url(url):
    url = urlparse(url.lower())
    url = url.netloc + url.path
    url = re.split(PATTERN, url)
    url = ''.join([word for word in url if word not in STOPWORDS])
    #ngrams
    n = 5
    url = [url[i : i + n] for i in range(len(url) - n + 1)]
    return url

pipeline1 = Pipeline([
    ('cv', CountVectorizer(max_features=500)),
    ('tfidf', TfidfTransformer()),
    ('clf', LocalOutlierFactor(novelty=True))
])
pipeline2 = Pipeline([
    ('cv', CountVectorizer(max_features=500)),
    ('tfidf', TfidfTransformer()),
    ('clf', IsolationForest(random_state=42))
])
pipeline3 = Pipeline([
    ('cv', CountVectorizer(max_features=500)),
    ('tfidf', TfidfTransformer()),
    ('clf', OneClassSVM())
])
pipeline4 = Pipeline([
    ('cv', CountVectorizer(max_features=500, analyzer=process_url)),
    ('tfidf', TfidfTransformer()),
    ('clf', LocalOutlierFactor(novelty=True))
])
pipeline5 = Pipeline([
    ('cv', CountVectorizer(max_features=500, analyzer=process_url)),
    ('tfidf', TfidfTransformer()),
    ('clf', IsolationForest(random_state=42))
])
pipeline6 = Pipeline([
    ('cv', CountVectorizer(max_features=500, analyzer=process_url)),
    ('tfidf', TfidfTransformer()),
    ('clf', OneClassSVM())
])
pipeline7 = Pipeline([
    ('cv', CountVectorizer(max_features=500, analyzer='char', ngram_range=(5, 5))),
    ('tfidf', TfidfTransformer()),
    ('clf', LocalOutlierFactor(novelty=True))
])
pipeline8 = Pipeline([
    ('cv', CountVectorizer(max_features=500, analyzer='char', ngram_range=(5, 5))),
    ('tfidf', TfidfTransformer()),
    ('clf', IsolationForest(random_state=42))
])
pipeline9 = Pipeline([
    ('cv', CountVectorizer(max_features=500, analyzer='char', ngram_range=(5, 5))),
    ('tfidf', TfidfTransformer()),
    ('clf', OneClassSVM())
])

pipeline11 = Pipeline([
    ('cv', CountVectorizer(max_features=500)),
    ('clf', LocalOutlierFactor(novelty=True))
])
pipeline21 = Pipeline([
    ('cv', CountVectorizer(max_features=500)),
    ('clf', IsolationForest(random_state=42))
])
pipeline31 = Pipeline([
    ('cv', CountVectorizer(max_features=500)),
    ('clf', OneClassSVM())
])
pipeline41 = Pipeline([
    ('cv', CountVectorizer(max_features=500, analyzer=process_url)),
    ('clf', LocalOutlierFactor(novelty=True))
])
pipeline51 = Pipeline([
    ('cv', CountVectorizer(max_features=500, analyzer=process_url)),
    ('clf', IsolationForest(random_state=42))
])
pipeline61 = Pipeline([
    ('cv', CountVectorizer(max_features=500, analyzer=process_url)),
    ('clf', OneClassSVM())
])
pipeline71 = Pipeline([
    ('cv', CountVectorizer(max_features=500, analyzer='char', ngram_range=(5, 5))),
    ('clf', LocalOutlierFactor(novelty=True))
])
pipeline81 = Pipeline([
    ('cv', CountVectorizer(max_features=500, analyzer='char', ngram_range=(5, 5))),
    ('clf', IsolationForest(random_state=42))
])
pipeline91 = Pipeline([
    ('cv', CountVectorizer(max_features=500, analyzer='char', ngram_range=(5, 5))),
    ('clf', OneClassSVM())
])

cv0 = CountVectorizer()
cv1 = CountVectorizer(analyzer=process_url)
cv2 = CountVectorizer(analyzer='char', ngram_range=(5, 5))
tfidf = TfidfTransformer()
lof = LocalOutlierFactor(novelty=True)
iforest = IsolationForest(random_state=42)
svm = OneClassSVM()

a1 = cv1.build_analyzer()
a2 = cv2.build_analyzer()

url = df1.iloc[450].url
url

a1(url)

url_train.shape, url_test.shape, df2.shape

test = pd.concat([url_test, df2])
res = pd.DataFrame()
res['true'] = test['true']
metrics = pd.DataFrame(index=range(1, 19), columns=['accuracy', 'precision', 'recall'])
for i, pipe in enumerate(
    [pipeline1, pipeline2, pipeline3, pipeline4, pipeline5, pipeline6, pipeline7, pipeline8, pipeline9,
    pipeline11, pipeline21, pipeline31, pipeline41, pipeline51, pipeline61, pipeline71, pipeline81, pipeline91]):
    pipe.fit(url_train.url)
    res['pred'] = pipe.predict(test.url)
    metrics.iloc[i, :] = [accuracy(res.true, res.pred), precision(res.true, res.pred), recall(res.true, res.pred)]

metrics

iforest = IsolationForest(random_state=42)
cv = CountVectorizer(analyzer=process_url, max_features=5000)
test = pd.concat([url_test, df2])
res = pd.DataFrame()
res['true'] = test['true']
train = cv.fit_transform(url_train.url)
test = cv.transform(test.url)
iforest.fit(train)
res['pred'] = iforest.predict(test)
print(accuracy(res.true, res.pred))
print(precision(res.true, res.pred))
print(recall(res.true, res.pred))
print(confm(res.true, res.pred))

url_train

len(cv.vocabulary_)

cv.stop_words_

url_char = train(url_train, url_test, 'url', cv=True, tfidf=True)

train(url_train, df2, 'url', cv=True, tfidf=True)

In [13]:
url_train, url_test = train_test_split(df1.copy(), test_size=0.15, random_state=42)
url_test['true'] = 1
df2['true'] = -1

In [14]:
url_char_tf = train_models(url_train, pd.concat([url_test, df2]), 'url', cv=True, tfidf=True)
with open(f'url_char_tf{date.today()}.html', 'w') as f:
    f.write(url_char_tf.to_html())

LOF accuracy: 		0.49105094079853145
iForest accuracy: 	0.48921523634694813
SVM accuracy: 		0.7333639284075264

LOF precision: 		0.4853242320819113
iForest precision: 	0.48921523634694813
SVM precision: 		0.8263795423956931

LOF recall: 		0.6669793621013134
iForest recall: 	1.0
SVM recall: 		0.575984990619137

LOF auc: 		0.620478348158974
iForest auc: 		0.28730515534473194
SVM auc: 		0.8138294823752716



In [15]:
url_char = train_models(url_train, pd.concat([url_test, df2]), 'url', cv=True)
with open(f'url_char{date.today()}.html', 'w') as f:
    f.write(url_char.to_html())

LOF accuracy: 		0.6888480954566315
iForest accuracy: 	0.48921523634694813
SVM accuracy: 		0.45250114731528224

LOF precision: 		0.7194570135746606
iForest precision: 	0.48921523634694813
SVM precision: 		0.455994455994456

LOF recall: 		0.5966228893058161
iForest recall: 	1.0
SVM recall: 		0.6172607879924953

LOF auc: 		0.6177382595928385
iForest auc: 		0.36715079674122475
SVM auc: 		0.36522110348617487



In [16]:
url_custom_tf = train_models(url_train, pd.concat([url_test, df2]), 'url', cv=True, tfidf=True, cv_analyzer=process_url)
with open(f'url_custom_ngram_tf{date.today()}.html', 'w') as f:
    f.write(url_custom_tf.to_html())

LOF accuracy: 		0.6200091785222579
iForest accuracy: 	0.48921523634694813
SVM accuracy: 		0.5557595227168426

LOF precision: 		0.5670800450958287
iForest precision: 	0.48921523634694813
SVM precision: 		0.7355769230769231

LOF recall: 		0.9437148217636022
iForest recall: 	1.0
SVM recall: 		0.14352720450281425

LOF auc: 		0.8474981836693756
iForest auc: 		0.20815654662870492
SVM auc: 		0.8061233520276319



In [17]:
url_custom = train_models(url_train, pd.concat([url_test, df2]), 'url', cv=True, cv_analyzer=process_url)
with open(f'url_custom_ngram{date.today()}.html', 'w') as f:
    f.write(url_custom.to_html())

LOF accuracy: 		0.7930243230839835
iForest accuracy: 	0.48921523634694813
SVM accuracy: 		0.31206975676916016

LOF precision: 		0.8482446206115515
iForest precision: 	0.48921523634694813
SVM precision: 		0.2920268972142171

LOF recall: 		0.702626641651032
iForest recall: 	1.0
SVM recall: 		0.2851782363977486

LOF auc: 		0.7837441359070443
iForest auc: 		0.21319760160073092
SVM auc: 		0.2759815349553039



df.loc[:, 'lang'] = df.lang.map({'und': 1, 'ua': 2, 'ru': 3, 'en': 4})
df1 = df[df.user_id == 1].copy()
df2 = df[df.user_id == 2].copy().sample(frac=0.35)
df1.shape, df2.shape

In [18]:
X_train, X_test = train_test_split(df1.copy(), test_size=0.15, random_state=42)
X_test['true'] = 1
df2['true'] = -1
X_test = pd.concat([X_test, df2])
# X_train.shape, X_test.shape, df2.shape

lof = LocalOutlierFactor(novelty=True, contamination=0.3)
iforest = IsolationForest(random_state=42)
svm = OneClassSVM(nu=0.2, gamma=0.4)
cv = CountVectorizer(analyzer=process_url, max_features=10)
tf = TfidfTransformer()

res = pd.DataFrame()
res['true'] = X_test.true

# urls_train = cv.fit_transform(X_train.url)
# urls_test = cv.transform(X_test.url)
# urls_train = tf.fit_transform(urls_train)
# urls_test = tf.transform(urls_test)
# X_train = hstack([urls_train, csr_matrix(X_train[['tab_count', 'lang', 'weekday', 'hour']])])
# X_test = hstack([urls_test, csr_matrix(X_test[['tab_count', 'lang', 'weekday', 'hour']])])

X_train = X_train[['tab_count', 'lang', 'weekday', 'hour']]
X_test = X_test[['tab_count', 'lang', 'weekday', 'hour']]

lof.fit(X_train)
iforest.fit(X_train)
svm.fit(X_train)

for model in [(lof, 'lof'), (iforest, 'iforest'), (svm, 'svm')]:
    model[0].fit(X_train)
    res[model[1]] = model[0].predict(X_test)
    res[model[1] + '_df'] = model[0].decision_function(X_test)

metrics = pd.DataFrame(
    [
        [accuracy(res.true, res.lof), accuracy(res.true, res.iforest), accuracy(res.true, res.svm)],
        [precision(res.true, res.lof), precision(res.true, res.iforest), precision(res.true, res.svm)],
        [recall(res.true, res.lof), recall(res.true, res.iforest), recall(res.true, res.svm)],
        [auc(res.true, res.lof_df), auc(res.true, res.iforest_df), auc(res.true, res.svm_df)]
    ], ['accuracy', 'precision', 'recall', 'auc'], ['lof', 'iforest', 'svm'])
print(metrics)
print(confm(res.true, res.lof), '\n')
print(confm(res.true, res.iforest), '\n')
print(confm(res.true, res.svm), '\n')

                lof   iforest       svm
accuracy   0.793942  0.759982  0.735659
precision  0.858304  0.867388  0.757895
recall     0.693246  0.601313  0.675422
auc        0.809878  0.766038  0.793931
[[991 122]
 [327 739]] 

[[1015   98]
 [ 425  641]] 

[[883 230]
 [346 720]] 



In [19]:
X_train, X_test = train_test_split(df1.copy(), test_size=0.15, random_state=42)
X_test['true'] = 1
df2['true'] = -1
X_test = pd.concat([X_test, df2]).copy()
# X_train.shape, X_test.shape, df2.shape

# lof = LocalOutlierFactor(novelty=True, contamination=0.5)
# iforest = IsolationForest(random_state=42)
# svm = OneClassSVM(nu=0.2, gamma=0.4)
# cv = CountVectorizer(analyzer=process_url, max_features=3)

lof = LocalOutlierFactor(novelty=True)
iforest = IsolationForest(random_state=42)
svm = OneClassSVM()
cv = CountVectorizer(analyzer=process_url)

tf = TfidfTransformer()

res = pd.DataFrame()
res['true'] = X_test.true

X_train = cv.fit_transform(X_train.url)
X_test = cv.transform(X_test.url)
# urls_train = tf.fit_transform(urls_train)
# urls_test = tf.transform(urls_test)
# X_train = hstack([urls_train, csr_matrix(X_train[['tab_count', 'lang', 'weekday', 'hour']])])
# X_test = hstack([urls_test, csr_matrix(X_test[['tab_count', 'lang', 'weekday', 'hour']])])

# X_train = X_train[['tab_count', 'lang', 'weekday', 'hour']]
# X_test = X_test[['tab_count', 'lang', 'weekday', 'hour']]

lof.fit(X_train)
iforest.fit(X_train)
svm.fit(X_train)

for model in [(lof, 'lof'), (iforest, 'iforest'), (svm, 'svm')]:
    model[0].fit(X_train)
    res[model[1]] = model[0].predict(X_test)
    res[model[1] + '_df'] = model[0].decision_function(X_test)

metrics = pd.DataFrame(
    [
        [accuracy(res.true, res.lof), accuracy(res.true, res.iforest), accuracy(res.true, res.svm)],
        [precision(res.true, res.lof), precision(res.true, res.iforest), precision(res.true, res.svm)],
        [recall(res.true, res.lof), recall(res.true, res.iforest), recall(res.true, res.svm)],
        [auc(res.true, res.lof_df), auc(res.true, res.iforest_df), auc(res.true, res.svm_df)]
    ], ['accuracy', 'precision', 'recall', 'auc'], ['lof', 'iforest', 'svm'])
print(metrics)
print(confm(res.true, res.lof), '\n')
print(confm(res.true, res.iforest), '\n')
print(confm(res.true, res.svm), '\n')

                lof   iforest       svm
accuracy   0.793024  0.489215  0.312070
precision  0.848245  0.489215  0.292027
recall     0.702627  1.000000  0.285178
auc        0.783744  0.213198  0.275982
[[979 134]
 [317 749]] 

[[   0 1113]
 [   0 1066]] 

[[376 737]
 [762 304]] 



In [20]:
X_train, X_test = train_test_split(df1.copy(), test_size=0.15, random_state=42)
X_test['true'] = 1
df2['true'] = -1
X_test = pd.concat([X_test, df2]).copy()
# X_train.shape, X_test.shape, df2.shape

# lof = LocalOutlierFactor(novelty=True, contamination=0.5)
# iforest = IsolationForest(random_state=42)
# svm = OneClassSVM(nu=0.2, gamma=0.4)
# cv = CountVectorizer(analyzer=process_url, max_features=3)

lof = LocalOutlierFactor(novelty=True)
iforest = IsolationForest(random_state=42)
svm = OneClassSVM()
cv = CountVectorizer(analyzer=process_url)

tf = TfidfTransformer()

res = pd.DataFrame()
res['true'] = X_test.true

urls_train = cv.fit_transform(X_train.url)
urls_test = cv.transform(X_test.url)
X_train = tf.fit_transform(urls_train)
X_test = tf.transform(urls_test)
# X_train = hstack([urls_train, csr_matrix(X_train[['tab_count', 'lang', 'weekday', 'hour']])])
# X_test = hstack([urls_test, csr_matrix(X_test[['tab_count', 'lang', 'weekday', 'hour']])])

# X_train = X_train[['tab_count', 'lang', 'weekday', 'hour']]
# X_test = X_test[['tab_count', 'lang', 'weekday', 'hour']]

lof.fit(X_train)
iforest.fit(X_train)
svm.fit(X_train)

for model in [(lof, 'lof'), (iforest, 'iforest'), (svm, 'svm')]:
    model[0].fit(X_train)
    res[model[1]] = model[0].predict(X_test)
    res[model[1] + '_df'] = model[0].decision_function(X_test)

metrics = pd.DataFrame(
    [
        [accuracy(res.true, res.lof), accuracy(res.true, res.iforest), accuracy(res.true, res.svm)],
        [precision(res.true, res.lof), precision(res.true, res.iforest), precision(res.true, res.svm)],
        [recall(res.true, res.lof), recall(res.true, res.iforest), recall(res.true, res.svm)],
        [auc(res.true, res.lof_df), auc(res.true, res.iforest_df), auc(res.true, res.svm_df)]
    ], ['accuracy', 'precision', 'recall', 'auc'], ['lof', 'iforest', 'svm'])
print(metrics)
print(confm(res.true, res.lof), '\n')
print(confm(res.true, res.iforest), '\n')
print(confm(res.true, res.svm), '\n')

                lof   iforest       svm
accuracy   0.609454  0.489215  0.555760
precision  0.561393  0.489215  0.735577
recall     0.922139  1.000000  0.143527
auc        0.836386  0.208157  0.806123
[[345 768]
 [ 83 983]] 

[[   0 1113]
 [   0 1066]] 

[[1058   55]
 [ 913  153]] 



In [21]:
X_train, X_test = train_test_split(df1.copy(), test_size=0.15, random_state=42)
X_test['true'] = 1
df2['true'] = -1
X_test = pd.concat([X_test, df2])
# X_train.shape, X_test.shape, df2.shape

lof = LocalOutlierFactor(novelty=True, contamination=0.5)
iforest = IsolationForest(random_state=42)
svm = OneClassSVM(nu=0.2, gamma=0.4)
cv = CountVectorizer(analyzer=process_url)
tf = TfidfTransformer()

res = pd.DataFrame()
res['true'] = X_test.true

urls_train = cv.fit_transform(X_train.url)
urls_test = cv.transform(X_test.url)
# urls_train = tf.fit_transform(urls_train)
# urls_test = tf.transform(urls_test)
X_train = hstack([urls_train, csr_matrix(X_train[['tab_count', 'lang', 'weekday', 'hour']])])
X_test = hstack([urls_test, csr_matrix(X_test[['tab_count', 'lang', 'weekday', 'hour']])])

# X_train = X_train[['tab_count', 'lang', 'weekday', 'hour']]
# X_test = X_test[['tab_count', 'lang', 'weekday', 'hour']]

lof.fit(X_train)
iforest.fit(X_train)
svm.fit(X_train)

for model in [(lof, 'lof'), (iforest, 'iforest'), (svm, 'svm')]:
    model[0].fit(X_train)
    res[model[1]] = model[0].predict(X_test)
    res[model[1] + '_df'] = model[0].decision_function(X_test)

metrics = pd.DataFrame(
    [
        [accuracy(res.true, res.lof), accuracy(res.true, res.iforest), accuracy(res.true, res.svm)],
        [precision(res.true, res.lof), precision(res.true, res.iforest), precision(res.true, res.svm)],
        [recall(res.true, res.lof), recall(res.true, res.iforest), recall(res.true, res.svm)],
        [auc(res.true, res.lof_df), auc(res.true, res.iforest_df), auc(res.true, res.svm_df)]
    ], ['accuracy', 'precision', 'recall', 'auc'], ['lof', 'iforest', 'svm'])
print(metrics)
print(confm(res.true, res.lof), '\n')
print(confm(res.true, res.iforest), '\n')
print(confm(res.true, res.svm), '\n')

                lof   iforest       svm
accuracy   0.704452  0.489215  0.704911
precision  0.862543  0.489215  0.993007
recall     0.470919  1.000000  0.399625
auc        0.633633  0.188016  0.828720
[[1033   80]
 [ 564  502]] 

[[   0 1113]
 [   0 1066]] 

[[1110    3]
 [ 640  426]] 



In [22]:
X_train, X_test = train_test_split(df1.copy(), test_size=0.15, random_state=42)
X_test['true'] = 1
df2['true'] = -1
X_test = pd.concat([X_test, df2])
# X_train.shape, X_test.shape, df2.shape

lof = LocalOutlierFactor(novelty=True, contamination=0.3)
iforest = IsolationForest(random_state=42)
svm = OneClassSVM(nu=0.2, gamma=0.3)
cv = CountVectorizer(analyzer=process_url)
tf = TfidfTransformer()

res = pd.DataFrame()
res['true'] = X_test.true

urls_train = cv.fit_transform(X_train.url)
urls_test = cv.transform(X_test.url)
urls_train = tf.fit_transform(urls_train)
urls_test = tf.transform(urls_test)
X_train = hstack([urls_train, csr_matrix(X_train[['tab_count', 'lang', 'weekday', 'hour']])])
X_test = hstack([urls_test, csr_matrix(X_test[['tab_count', 'lang', 'weekday', 'hour']])])

# X_train = X_train[['tab_count', 'lang', 'weekday', 'hour']]
# X_test = X_test[['tab_count', 'lang', 'weekday', 'hour']]

lof.fit(X_train)
iforest.fit(X_train)
svm.fit(X_train)

for model in [(lof, 'lof'), (iforest, 'iforest'), (svm, 'svm')]:
    model[0].fit(X_train)
    res[model[1]] = model[0].predict(X_test)
    res[model[1] + '_df'] = model[0].decision_function(X_test)

metrics = pd.DataFrame(
    [
        [accuracy(res.true, res.lof), accuracy(res.true, res.iforest), accuracy(res.true, res.svm)],
        [precision(res.true, res.lof), precision(res.true, res.iforest), precision(res.true, res.svm)],
        [recall(res.true, res.lof), recall(res.true, res.iforest), recall(res.true, res.svm)],
        [auc(res.true, res.lof_df), auc(res.true, res.iforest_df), auc(res.true, res.svm_df)]
    ], ['accuracy', 'precision', 'recall', 'auc'], ['lof', 'iforest', 'svm'])
print(metrics)
print(confm(res.true, res.lof), '\n')
print(confm(res.true, res.iforest), '\n')
print(confm(res.true, res.svm), '\n')

                lof   iforest       svm
accuracy   0.766866  0.489215  0.715925
precision  0.817768  0.489215  0.747508
recall     0.673546  1.000000  0.633208
auc        0.810817  0.167430  0.777174
[[953 160]
 [348 718]] 

[[   0 1113]
 [   0 1066]] 

[[885 228]
 [391 675]] 



def fit_multiple_estimators(classifiers, X_list):

    # Fit all estimators with their respective feature arrays
    estimators_ = [clf.fit(X) for clf, X in zip([clf for _, clf in classifiers], X_list)]

    return estimators_


def predict_from_multiple_estimator(estimators, X_list):

    # Predict 'soft' voting with probabilities
    pred1 = np.asarray([clf.predict_proba(X) for clf, X in zip(estimators, X_list)])
    pred2 = np.average(pred1, axis=0)
    pred = np.argmax(pred2, axis=1)

    return pred

lof = LocalOutlierFactor(novelty=True)
bag = BaggingClassifier(base_estimator=lof, random_state=42)
X_train, X_test = train_test_split(df1.copy(), test_size=0.15, random_state=42)
X_test['true'] = 1
X_test = pd.concat([X_test, df2]).copy()

res = pd.DataFrame()
res['true'] = X_test.true

cv = CountVectorizer(analyzer=process_url)
X_train = cv.fit_transform(X_train.url)
X_test = cv.transform(X_test.url)
# X_train2 = X_train[['tab_count', 'lang', 'weekday', 'hour']]
# X_test2 = X_test[['tab_count', 'lang', 'weekday', 'hour']]
bag.fit(X_train, np.zeros(X_train.shape[0]))

bag.predict(csr_matrix(X_test))

res.lof = bag.predict(X_test)
metrics = pd.DataFrame(
    [
        [accuracy(res.true, res.lof)],
        [precision(res.true, res.lof)],
        [recall(res.true, res.lof)],
        [auc(res.true, res.lof_df)]
    ], ['accuracy', 'precision', 'recall', 'auc'], ['lof'])
print(metrics)
print('\n', confm(res.true, res.lof))

clf = [
    ('lof1', LocalOutlierFactor(novelty=True)),
    ('lof2', LocalOutlierFactor(novelty=True, contamination=0.3))
]

X_train, X_test = train_test_split(df1.copy(), test_size=0.15, random_state=42)
X_test['true'] = 1
# df2['true'] = -1
X_test = pd.concat([X_test, df2]).copy()

res = pd.DataFrame()
res['true'] = X_test.true

cv = CountVectorizer(analyzer=process_url)

X_train1 = cv.fit_transform(X_train.url)
X_test1 = cv.transform(X_test.url)
X_train2 = X_train[['tab_count', 'lang', 'weekday', 'hour']]
X_test2 = X_test[['tab_count', 'lang', 'weekday', 'hour']]

fitted = fit_multiple_estimators(clf, [X_train1, X_train2])
pred = predict_from_multiple_estimator(fitted, [X_test1, X_test2])