# Set-up

In [136]:
import json
import matplotlib.pyplot as plt
import os
import pandas as pd
import pickle
import re
import sklearn
import time
import warnings

from collections import Counter
from helpers import BloomFilter
from random import shuffle
from sseclient import SSEClient as EventSource

warnings.filterwarnings("ignore")

URL ='https://stream.wikimedia.org/v2/stream/recentchange'

In [137]:
def feature_extraction(datafile):
    import pandas as pd
    if datafile.endswith('.gz'):
        df = pd.read_csv(datafile, sep='\t', compression='gzip',)
    else:
        df = pd.read_csv(datafile, sep='\t',)
    short_df = df[['timestamp', 'user', 'bot', 'type', 'comment']].copy()
    short_df.dropna(inplace=True)
    short_df['timestamp'] = pd.to_datetime(short_df['timestamp'].loc[:], unit='s')
    short_df = short_df.assign(requests=1)
    user_df = short_df.set_index('timestamp').groupby('user')[['bot','requests',]].resample("1S", label='right').sum()
    user_df['bot'] = [1 if x > 0.5 else 0 for x in user_df['bot']]
    user_df = user_df.reset_index()
    user_df = user_df[user_df['requests'] != 0]
    user_df = user_df.groupby('user', as_index=False)[['bot','requests']].mean()
    user_df['bot'] = user_df['bot'].astype(int)
    def count_digits(string): return sum(c.isdigit() for c in string)
    user_df['n_digits_name'] = user_df['user'].apply(count_digits)
    find_lead_digits = lambda name: len(re.findall('^\d+', name)[0]) if name[0].isdigit() else 0
    user_df['lead_digits_name'] = user_df['user'].apply(find_lead_digits)
    def unique_ratio(string): return (len(set(string)) / len(string))
    user_df['uniq_char_ratio_name'] = user_df['user'].apply(unique_ratio)
    user_df['uniq_char_ratio_name'] = user_df['uniq_char_ratio_name'].round(3)
    user_df['bot_in_name'] = (user_df['user'].str.lower().str.contains('bot')).astype(int)
    dummies_df = short_df.join(short_df['type'].str.get_dummies())
    dummies_df = dummies_df.groupby('user').sum()
    dummies_df = dummies_df.drop(['bot', 'requests'], axis=1).reset_index()
    try:
        dummies_df = dummies_df.drop('142',axis=1)
    except KeyError:
        pass
    comment_df = short_df[['user', 'comment']]
    comment_df['len_comment'] = comment_df['comment'].str.len()
    def find_alnum_num(name):
        for el in name:
            if type(el) != str:
                return 0
            else:
                return sum(el.isalnum() for el in name)
    comment_df['alnum_ratio_comment'] = comment_df['comment'].astype("str").apply(find_alnum_num)/comment_df['comment'].str.len()
    comment_df['bot_in_comment'] = (comment_df['comment'].str.lower().str.contains('bot'))
    comment_df = comment_df.drop('bot_in_comment', axis=1)
    mean_df = comment_df.groupby('user').mean().rename(columns={'len_comment':'len_comment_avg', 'alnum_ratio_comment':'alnum_ratio_comment_avg'})
    min_df = comment_df.groupby('user').min().rename(columns={'len_comment':'len_comment_min', 'alnum_ratio_comment':'alnum_ratio_comment_mix'})
    max_df = comment_df.groupby('user').max().rename(columns={'len_comment':'len_comment_max', 'alnum_ratio_comment':'alnum_ratio_comment_max'})
    user_df = user_df.merge(dummies_df, on='user', how='outer')
    user_df = user_df.merge(min_df, on='user', how='outer')
    user_df = user_df.merge(mean_df, on='user', how='outer')
    user_df = user_df.merge(max_df, on='user', how='outer')
    user_df.drop(['comment_x', 'comment_y'], axis=1, inplace=True)
    return user_df

# Loading model

In [131]:
model = pickle.load(open('../model/best_model.sav', 'rb'))
model

RandomForestClassifier(max_depth=10, random_state=8)

# Stream data

In [None]:
dataset = []
time_threshold = 20.
t_0  = time.time()

for event in EventSource(URL): # start streaming
    if event.event == 'message':
        try:
            change = json.loads(event.data)
        except ValueError:
            continue
            
        dataset.append(change)
            
        if (time.time() - t_0)//60 > time_threshold:
            break

In [83]:
df = pd.DataFrame(dataset)
filename = f'../data/stream_test_data_{int(time_threshold)}mins.csv'
df.to_csv(filename, sep='\t')

# Classify bots

In [104]:
user_df = feature_extraction(filename)
user_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comment_df['len_comment'] = comment_df['comment'].str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comment_df['alnum_ratio_comment'] = comment_df['comment'].astype("str").apply(find_alnum_num)/comment_df['comment'].str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comment_df['bot_in_

Unnamed: 0,user,bot,requests,n_digits_name,lead_digits_name,uniq_char_ratio_name,bot_in_name,categorize,edit,log,new,len_comment_min,alnum_ratio_comment_mix,len_comment_avg,alnum_ratio_comment_avg,len_comment_max,alnum_ratio_comment_max
0,1.145.133.93,0,1.0,9,1,0.5,0,0,1,0,0,4,1.0,4.0,1.0,4,1.0
1,1.75.230.98,0,3.0,8,1,0.818,0,2,0,0,1,16,0.545455,54.666667,0.640152,132,0.6875
2,102.23.96.7,0,1.0,8,3,0.727,0,0,1,0,0,15,0.6,15.0,0.6,15,0.6
3,103.132.155.233,0,1.0,12,3,0.4,0,0,6,0,0,14,0.5,23.166667,0.642433,31,0.72
4,103.63.92.29,0,1.0,9,3,0.583,0,0,1,0,0,25,0.8,25.0,0.8,25,0.8


In [105]:
X_val = user_df.drop(['bot', 'user'], axis=1)
y_val = user_df[['bot']]
X_val.shape, y_val.shape

((1770, 15), (1770, 1))

In [106]:
y_pred = model.predict(X_val)



In [107]:
Counter(y_pred)

Counter({0.0: 1658, 1.0: 112})

In [122]:
black_list = [user for pred, user in zip(y_pred, user_df['user']) if pred]
black_list[:10]

['AkBot',
 'AlbeROBOT',
 'Andrebot',
 'AnomieBOT',
 'BattyBot',
 'BoivieBot',
 'BookwormBot',
 'Bot Bozze',
 'Bot1058',
 'BotCancellazioni']

# Initialize Bloom filter

In [109]:
n = len(black_list) # no of items to add
p = 0.1  # false positive probability
 
bloom_filter = BloomFilter(n, p)

for item in black_list:
    bloom_filter.add(item)

bloom_filter.size, bloom_filter.hash_count

(536, 3)

# Apply Bloom filter on stream data

In [110]:
gd_dataset = []
bloom_filter_users = []
time_threshold = 3.0
t_0  = time.time()

for event in EventSource(URL): # start streaming
    if event.event == 'message':
        try:
            change = json.loads(event.data)
        except ValueError:
            continue
            
        gd_dataset.append(change)
        
        user_name = change['user']
        
        if bloom_filter.check(user_name):
            bloom_filter_users.append(user_name)
            
        if (time.time() - t_0)//60 > time_threshold:
            break

In [111]:
df = pd.DataFrame(gd_dataset)
df.to_csv(f'../data/stream_validation_data_{int(time_threshold)}mins.csv', sep='\t')

# Evaluate Bloom filter

In [112]:
gd_black_set = set(df[df.bot].user.unique())
bloom_filter_set = set(bloom_filter_users)

In [114]:
len(gd_black_set), len(bloom_filter_set)

(49, 95)

In [120]:
len(bloom_filter_set.intersection(gd_black_set))/len(gd_black_set)

0.7346938775510204

In [121]:
len(bloom_filter_set.intersection(gd_black_set))/len(bloom_filter_set)

0.37894736842105264

# Unified pipeline

In [133]:
model = pickle.load(open('model/best_model.sav', 'rb'))

In [144]:
%%time

results = []

params_grid = [
    {'classifier_time_limit': 20., 'bloom_filter_time_limit': 20., },
    {'classifier_time_limit': 20., 'bloom_filter_time_limit': 15., },
    {'classifier_time_limit': 20., 'bloom_filter_time_limit': 10., },
    {'classifier_time_limit': 20., 'bloom_filter_time_limit': 5., },
    {'classifier_time_limit': 20., 'bloom_filter_time_limit': 3., },
    {'classifier_time_limit': 10., 'bloom_filter_time_limit': 3., },
    {'classifier_time_limit': 5., 'bloom_filter_time_limit': 3., },
    {'classifier_time_limit': 3., 'bloom_filter_time_limit': 1., },
]

for params in params_grid:
    result = params.copy()
    
    t_0  = time.time()
    dataset = []
    
    time_threshold = params['classifier_time_limit']
    print(f'Running stream for {time_threshold} mins...\n')

    # stream to generate data for black list
    for event in EventSource(URL): # start streaming
        if event.event == 'message':
            try:
                change = json.loads(event.data)
            except ValueError:
                continue

            dataset.append(change)

            if (time.time() - t_0)//60 > time_threshold:
                break

    filename = f'data/stream_test_data_{int(time_threshold)}mins_{t_0}.csv'
    df = pd.DataFrame(dataset)
    df.to_csv(filename, sep='\t')
    print(f'Generated file {filename}\n')
    
    t1 = time.time()
    
    user_df = feature_extraction(filename)
    X_val = user_df.drop(['bot', 'user'], axis=1)
    y_val = user_df[['bot']]
    
    y_pred = model.predict(X_val)
    
    black_list = [user for pred, user in zip(y_pred, user_df['user']) if pred]
    print(f'Black list length is {len(black_list)}\n')
    
    # generate bloom filter
    bloom_filter = BloomFilter(len(black_list), 0.1)
    for item in black_list:
        bloom_filter.add(item)
    
    t2 = time.time()
    print(f'Generated Bloom filter with {bloom_filter.size, bloom_filter.hash_count}\n')
    print(f'Took {t2 - t1} seconds to preprocess, predict and create bloom\n')
    
    gd_dataset = []
    bloom_filter_users = []
    t_0  = time.time()
    
    time_threshold = params['bloom_filter_time_limit']
    print(f'Running stream for {time_threshold} mins...\n')

    # stream to generate data for evaluation
    for event in EventSource(URL): # start streaming
        if event.event == 'message':
            try:
                change = json.loads(event.data)
            except ValueError:
                continue

            gd_dataset.append(change)

            user_name = change['user']
            if bloom_filter.check(user_name):
                bloom_filter_users.append(user_name)

            if (time.time() - t_0)//60 > time_threshold:
                break
        
    filename = f'data/stream_validation_data_{int(time_threshold)}mins_{t_0}.csv'
    df = pd.DataFrame(gd_dataset)
    df.to_csv(filename, sep='\t')
    print(f'Generated ground truth file {filename}\n')
    
    gd_black_set = set(df[df.bot].user.unique())
    bloom_filter_set = set(bloom_filter_users)
    
    result.update({
        'real_bots_count': len(gd_black_set),
        'bloom_bots_count': len(bloom_filter_set),
        'acc_rel_real': len(bloom_filter_set.intersection(gd_black_set))/len(gd_black_set),
        'acc_rel_bloom': len(bloom_filter_set.intersection(gd_black_set))/len(bloom_filter_set),
        'intersec_count': len(bloom_filter_set.intersection(gd_black_set)),
    })
    
    print(f'Result - {result}\n')
    results.append(result)

print(results)

with open('results.json', 'w') as output_file:
    json.dump(results, output_file)

Running stream for 20.0 mins...


Generated file data/stream_test_data_20mins_1633734628.8294969.csv

Black list length is 84

Generated Bloom filter with (402, 3)

Took 2.6186375617980957 seconds to preprocess, predict and create bloom

Running stream for 20.0 mins...


Generated ground truth file data/stream_validation_data_20mins_1633735921.1749954.csv

Result - {'classifier_time_limit': 20.0, 'bloom_filter_time_limit': 20.0, 'real_bots_count': 78, 'bloom_bots_count': 214, 'acc_rel_real': 0.6410256410256411, 'acc_rel_bloom': 0.2336448598130841, 'intersec_count': 50}

Running stream for 20.0 mins...


Generated file data/stream_test_data_20mins_1633737182.1160476.csv

Black list length is 117

Generated Bloom filter with (560, 3)

Took 1.8932712078094482 seconds to preprocess, predict and create bloom

Running stream for 15.0 mins...


Generated ground truth file data/stream_validation_data_15mins_1633738444.4594977.csv

Result - {'classifier_time_limit': 20.0, 'bloom_filter_time_lim

In [146]:
df_results = pd.DataFrame(results)
df_results.head()

Unnamed: 0,classifier_time_limit,bloom_filter_time_limit,real_bots_count,bloom_bots_count,acc_rel_real,acc_rel_bloom,intersec_count
0,20.0,20.0,78,214,0.641026,0.233645,50
1,20.0,15.0,87,170,0.586207,0.3,51
2,20.0,10.0,72,161,0.638889,0.285714,46
3,20.0,5.0,36,107,0.805556,0.271028,29
4,20.0,3.0,37,76,0.72973,0.355263,27


In [147]:
df_results.sort_values('acc_rel_real', ascending=False)

Unnamed: 0,classifier_time_limit,bloom_filter_time_limit,real_bots_count,bloom_bots_count,acc_rel_real,acc_rel_bloom,intersec_count
5,10.0,3.0,38,96,0.842105,0.333333,32
3,20.0,5.0,36,107,0.805556,0.271028,29
4,20.0,3.0,37,76,0.72973,0.355263,27
6,5.0,3.0,34,70,0.676471,0.328571,23
7,3.0,1.0,34,48,0.676471,0.479167,23
0,20.0,20.0,78,214,0.641026,0.233645,50
2,20.0,10.0,72,161,0.638889,0.285714,46
1,20.0,15.0,87,170,0.586207,0.3,51
