# Bot classifier

## Set-up

In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import time

from random import shuffle

## Dataset import

In [2]:
# reading prepared dataset
df = pd.read_csv('https://raw.githubusercontent.com/madmalewolf/stat-open-/main/all_data.tsv.gz', sep='\t', compression='gzip',)
print(f'df.shape = {df.shape}')
df.head(2)

df.shape = (120467, 26)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,$schema,meta,id,type,namespace,title,comment,timestamp,user,bot,minor,patrolled,length,revision,server_url,server_name,server_script_path,wiki,parsedcomment,log_id,log_type,log_action,log_params,log_action_comment
0,0,0,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q108828...,1556609000.0,edit,0,Q108828195,/* wbsetreference-add:2| */ [[Property:P570]]:...,1633634790,Quesotiotyo,False,False,True,"{'old': 28649, 'new': 29712}","{'old': 1509178938, 'new': 1509178965}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Ad...",,,,,
1,1,1,/mediawiki/recentchange/1.0.0,{'uri': 'https://www.wikidata.org/wiki/Q175855...,1556609000.0,edit,0,Q17585531,/* wbsetreference-set:2| */ [[Property:P141]]:...,1633634789,SuccuBot,True,False,True,"{'old': 16134, 'new': 16134}","{'old': 1458793714, 'new': 1509178961}",https://www.wikidata.org,www.wikidata.org,/w,wikidatawiki,"‎<span dir=""auto""><span class=""autocomment"">Fu...",,,,,


## Data preprocessing

In [3]:
# Narrow to relevant data
short_df = df[['timestamp', 'user', 'bot', 'type', 'comment']].copy()
short_df.dropna(inplace=True)
print(f'short_df.shape = {short_df.shape}')
short_df.head(2)

short_df.shape = (110072, 5)


Unnamed: 0,timestamp,user,bot,type,comment
0,1633634790,Quesotiotyo,False,edit,/* wbsetreference-add:2| */ [[Property:P570]]:...
1,1633634789,SuccuBot,True,edit,/* wbsetreference-set:2| */ [[Property:P141]]:...


### Feature engineering

In [5]:
# Convert time column to datetime type
short_df['timestamp'] = pd.to_datetime(short_df['timestamp'].loc[:], unit='s')

# Add column for number of requests
short_df = short_df.assign(requests=1)

# Group by User and Merge Number of Requests every 1Sec
user_df = short_df.set_index('timestamp').groupby('user')[['bot','requests',]].resample("1S", label='right').sum()

# Convert bot column to Boolean
user_df['bot'] = [1 if x > 0.5 else 0 for x in user_df['bot']]

# Reset index
user_df = user_df.reset_index()

# Remove useless rows (without requests)
user_df = user_df[user_df['requests'] != 0]

# Group by user and get the average number of requests
user_df = user_df.groupby('user', as_index=False)[['bot','requests']].mean()

# Make bot column boolean
user_df['bot'] = user_df['bot'].astype(int)

In [6]:
# Count amount of digits in name
def count_digits(string):
    return sum(c.isdigit() for c in string)

user_df['n_digits_name'] = user_df['user'].apply(count_digits)

# Find Lead digits
find_lead_digits = lambda name: len(re.findall('^\d+', name)[0]) if name[0].isdigit() else 0

user_df['lead_digits_name'] = user_df['user'].apply(find_lead_digits)

# Ratio of unique characters in name
def unique_ratio(string):
    return (len(set(string)) / len(string))

user_df['uniq_char_ratio_name'] = user_df['user'].apply(unique_ratio)
user_df['uniq_char_ratio_name'] = user_df['uniq_char_ratio_name'].round(3)

# Check for word 'bot' in name
user_df['bot_in_name'] = (user_df['user'].str.lower().str.contains('bot')).astype(int)

In [7]:
# Turn type into dummy variables
dummies_df = short_df.join(short_df['type'].str.get_dummies())
dummies_df = dummies_df.groupby('user').sum()
dummies_df = dummies_df.drop(['bot', 'requests'], axis=1).reset_index()
dummies_df = dummies_df.drop('142',axis=1)

In [8]:
# Work with Comment
comment_df = short_df[['user', 'comment']]

# Length of comment
comment_df['len_comment'] = comment_df['comment'].str.len()

# Ratio of alphanumeric chars to total chars
#find_alnum_num = lambda name: sum(el.isalnum() for el in name)
def find_alnum_num(name):
    for el in name:
        if type(el) != str:
            return 0
        else:
            return sum(el.isalnum() for el in name)
comment_df['alnum_ratio_comment'] = comment_df['comment'].astype("str").apply(find_alnum_num)/comment_df['comment'].str.len()

# Check for word 'bot' in the comment
comment_df['bot_in_comment'] = (comment_df['comment'].str.lower().str.contains('bot'))

comment_df = comment_df.drop('bot_in_comment', axis=1)

mean_df = comment_df.groupby('user').mean().rename(columns={'len_comment':'len_comment_avg', 'alnum_ratio_comment':'alnum_ratio_comment_avg'})
min_df = comment_df.groupby('user').min().rename(columns={'len_comment':'len_comment_min', 'alnum_ratio_comment':'alnum_ratio_comment_mix'})
max_df = comment_df.groupby('user').max().rename(columns={'len_comment':'len_comment_max', 'alnum_ratio_comment':'alnum_ratio_comment_max'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
# Merge all dfs into one
user_df = user_df.merge(dummies_df, on='user', how='outer')
user_df = user_df.merge(min_df, on='user', how='outer')
user_df = user_df.merge(mean_df, on='user', how='outer')
user_df = user_df.merge(max_df, on='user', how='outer')

# user_df.drop(['comment_x', 'comment_y'], axis=1, inplace=True)

In [10]:
len(user_df.where(user_df['bot']==True).dropna(subset=['bot'])['user'].unique()), 'unique bots in dataset'

(117, 'unique bots in dataset')

In [11]:
len(user_df.where(user_df['bot']==False).dropna(subset=['bot'])['user'].unique()), 'unique human in dataset'

(5359, 'unique human in dataset')

In [12]:
user_df.head(2)

Unnamed: 0,user,bot,requests,n_digits_name,lead_digits_name,uniq_char_ratio_name,bot_in_name,categorize,edit,log,new,len_comment_min,alnum_ratio_comment_mix,len_comment_avg,alnum_ratio_comment_avg,len_comment_max,alnum_ratio_comment_max
0,(:Julien:),0,2.0,0,0,0.9,0,2,6,0,0,26,0.676471,54.5,0.739567,116,0.827586
1,(a)nnihilation97,0,2.217391,2,0,0.688,0,90,61,0,2,16,0.625,83.96732,0.729419,494,0.810345


In [13]:
k = 1.5 #HAVE TO DECIDE

unique_bots = user_df.where(user_df['bot']==1).dropna(subset=['bot'])['user'].unique()
n_unique_bots = len(unique_bots)
unique_ppl = user_df.where(user_df['bot']==0).dropna(subset=['bot'])['user'].unique()
n_unique_ppl = len(unique_ppl)
print(f'There are {n_unique_bots} unique bots and {n_unique_ppl} unique people. % of bots: {round(n_unique_bots/(n_unique_bots+n_unique_ppl), 3)}')
n_downsmpld = int(n_unique_bots*k)

# Randomly sample n_downsmpld elements from your dataframe
df_elements = user_df.where(user_df['bot']==0).dropna().sample(n=n_downsmpld)
df_downsmpld = user_df.loc[user_df.index.isin(df_elements.index)]
df_bots = user_df.where(user_df['bot']==1).dropna()
user_df_downsmpld = pd.concat([df_bots, df_downsmpld])

There are 117 unique bots and 5359 unique people. % of bots: 0.021


In [15]:
user_df_downsmpld.bot.value_counts()

0.0    175
1.0    117
Name: bot, dtype: int64

## Modeling

In [64]:
import pickle
import xgboost as xgb

from collections import Counter
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier

random_seed = 8

In [28]:
test_size = 0.2 
X_train, X_test, y_train, y_test = train_test_split(user_df_downsmpld.drop(['bot'], axis=1), user_df_downsmpld['bot'], test_size=test_size, random_state=random_seed)
X_train = X_train.drop('user', axis=1)
X_test = X_test.drop('user', axis=1)

In [18]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((233, 15), (233,), (59, 15), (59,))

In [37]:
def evaluate_model(y_pred, y_true):
    print(classification_report(y_true, y_pred))
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    print(f'Confusion matrix: TP={tp}, TN={tn}, FP={fp}, FN={fn}')

### Random Forest

In [38]:
model_rf = RandomForestClassifier(random_state=random_seed)

model_rf.fit(X_train, y_train)
y_rf = model_rf.predict(X_test)

evaluate_model(y_test, y_rf)

              precision    recall  f1-score   support

         0.0       1.00      0.97      0.99        38
         1.0       0.95      1.00      0.98        21

    accuracy                           0.98        59
   macro avg       0.98      0.99      0.98        59
weighted avg       0.98      0.98      0.98        59

Confusion matrix: TP=21, TN=37, FP=1, FN=0


In [48]:
feature_importance = pd.DataFrame(model_rf.feature_importances_, X_train.columns).reset_index()
feature_importance.columns = ['feature_name', 'feature_importance']
feature_importance.sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_name,feature_importance
4,bot_in_name,0.500404
6,edit,0.093047
9,len_comment_min,0.057794
12,alnum_ratio_comment_avg,0.048829
3,uniq_char_ratio_name,0.044282
10,alnum_ratio_comment_mix,0.043221
13,len_comment_max,0.037686
1,n_digits_name,0.034467
14,alnum_ratio_comment_max,0.03431
11,len_comment_avg,0.033953


In [52]:
%%time
params_rf = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 50, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [100, 200, 500],
    }

grid = GridSearchCV(estimator=model_rf, param_grid=params_rf, cv = 5, scoring='f1')
grid.fit(X_train, y_train)

CPU times: user 12min 47s, sys: 3.31 s, total: 12min 50s
Wall time: 12min 49s


In [54]:
grid.best_params_

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [55]:
y_rf = grid.best_estimator_.predict(X_test)

evaluate_model(y_test, y_rf)

              precision    recall  f1-score   support

         0.0       1.00      0.97      0.99        38
         1.0       0.95      1.00      0.98        21

    accuracy                           0.98        59
   macro avg       0.98      0.99      0.98        59
weighted avg       0.98      0.98      0.98        59

Confusion matrix: TP=21, TN=37, FP=1, FN=0


### XGBoost

In [56]:
model_xgb = xgb.XGBClassifier(random_state=random_seed)

model_xgb.fit(X_train, y_train)
y_xgb = model_xgb.predict(X_test)

evaluate_model(y_test, y_xgb)

              precision    recall  f1-score   support

         0.0       1.00      0.97      0.99        38
         1.0       0.95      1.00      0.98        21

    accuracy                           0.98        59
   macro avg       0.98      0.99      0.98        59
weighted avg       0.98      0.98      0.98        59

Confusion matrix: TP=21, TN=37, FP=1, FN=0


In [57]:
feature_importance = pd.DataFrame(model_xgb.feature_importances_, X_train.columns).reset_index()
feature_importance.columns = ['feature_name', 'feature_importance']
feature_importance.sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_name,feature_importance
4,bot_in_name,0.781877
6,edit,0.035039
1,n_digits_name,0.034883
9,len_comment_min,0.026699
14,alnum_ratio_comment_max,0.022945
3,uniq_char_ratio_name,0.019672
11,len_comment_avg,0.014455
8,new,0.013094
13,len_comment_max,0.012898
0,requests,0.011307


In [58]:
%%time
params_xgb = {
    'model__max_depth': [3, 5, 7, 8, 10],
    'model__n_estimators': [3, 5, 10, 100],
    'learning_rate': [0.001, 0.1, 0.03, 0.05, 0.08],
    'colsample_bytree': [0.7, 0.5],
    'subsample': [0.6, 0.5]
}

grid = GridSearchCV(estimator=model_xgb, param_grid=params_xgb, cv = 5, scoring='f1')
grid.fit(X_train, y_train)

CPU times: user 48.7 s, sys: 1.07 s, total: 49.7 s
Wall time: 49.8 s


In [59]:
grid.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.001,
 'model__max_depth': 3,
 'model__n_estimators': 3,
 'subsample': 0.6}

In [60]:
y_xgb = grid.best_estimator_.predict(X_test)

evaluate_model(y_test, y_xgb)

              precision    recall  f1-score   support

         0.0       1.00      0.97      0.99        38
         1.0       0.95      1.00      0.98        21

    accuracy                           0.98        59
   macro avg       0.98      0.99      0.98        59
weighted avg       0.98      0.98      0.98        59

Confusion matrix: TP=21, TN=37, FP=1, FN=0


## Pickle best model

In [62]:
model_rf = RandomForestClassifier(random_state=random_seed,
                                  bootstrap=True,
                                  max_depth=10,
                                  max_features='auto',
                                  min_samples_leaf=1,
                                  min_samples_split=2,
                                  n_estimators=100)

model_rf.fit(X_train, y_train)
y_rf = model_rf.predict(X_test)

evaluate_model(y_test, y_rf)

              precision    recall  f1-score   support

         0.0       1.00      0.97      0.99        38
         1.0       0.95      1.00      0.98        21

    accuracy                           0.98        59
   macro avg       0.98      0.99      0.98        59
weighted avg       0.98      0.98      0.98        59

Confusion matrix: TP=21, TN=37, FP=1, FN=0


In [65]:
os.makedirs('model', exist_ok=True)

filename = '../model/best_model.sav'
pickle.dump(model_rf, open(filename, 'wb'))

# Feature Extraction in one function

In [None]:
def feature_extraction(datafile):
    if datafile.endswith('.gz'):
        df = pd.read_csv(datafile, sep='\t', compression='gzip',)
    else:
        df = pd.read_csv(datafile, sep='\t',)
    short_df = df[['timestamp', 'user', 'bot', 'type', 'comment']].copy()
    short_df.dropna(inplace=True)
    short_df['timestamp'] = pd.to_datetime(short_df['timestamp'].loc[:], unit='s')
    short_df = short_df.assign(requests=1)
    user_df = short_df.set_index('timestamp').groupby('user')[['bot','requests',]].resample("1S", label='right').sum()
    user_df['bot'] = [1 if x > 0.5 else 0 for x in user_df['bot']]
    user_df = user_df.reset_index()
    user_df = user_df[user_df['requests'] != 0]
    user_df = user_df.groupby('user', as_index=False)[['bot','requests']].mean()
    user_df['bot'] = user_df['bot'].astype(int)
    def count_digits(string): return sum(c.isdigit() for c in string)
    user_df['n_digits_name'] = user_df['user'].apply(count_digits)
    find_lead_digits = lambda name: len(re.findall('^\d+', name)[0]) if name[0].isdigit() else 0
    user_df['lead_digits_name'] = user_df['user'].apply(find_lead_digits)
    def unique_ratio(string): return (len(set(string)) / len(string))
    user_df['uniq_char_ratio_name'] = user_df['user'].apply(unique_ratio)
    user_df['uniq_char_ratio_name'] = user_df['uniq_char_ratio_name'].round(3)
    user_df['bot_in_name'] = (user_df['user'].str.lower().str.contains('bot')).astype(int)
    dummies_df = short_df.join(short_df['type'].str.get_dummies())
    dummies_df = dummies_df.groupby('user').sum()
    dummies_df = dummies_df.drop(['bot', 'requests'], axis=1).reset_index()
    dummies_df = dummies_df.drop('142',axis=1)
    comment_df = short_df[['user', 'comment']]
    comment_df['len_comment'] = comment_df['comment'].str.len()
    def find_alnum_num(name):
        for el in name:
            if type(el) != str:
                return 0
            else:
                return sum(el.isalnum() for el in name)
    comment_df['alnum_ratio_comment'] = comment_df['comment'].astype("str").apply(find_alnum_num)/comment_df['comment'].str.len()
    comment_df['bot_in_comment'] = (comment_df['comment'].str.lower().str.contains('bot'))
    comment_df = comment_df.drop('bot_in_comment', axis=1)
    mean_df = comment_df.groupby('user').mean().rename(columns={'len_comment':'len_comment_avg', 'alnum_ratio_comment':'alnum_ratio_comment_avg'})
    min_df = comment_df.groupby('user').min().rename(columns={'len_comment':'len_comment_min', 'alnum_ratio_comment':'alnum_ratio_comment_mix'})
    max_df = comment_df.groupby('user').max().rename(columns={'len_comment':'len_comment_max', 'alnum_ratio_comment':'alnum_ratio_comment_max'})
    user_df = user_df.merge(dummies_df, on='user', how='outer')
    user_df = user_df.merge(min_df, on='user', how='outer')
    user_df = user_df.merge(mean_df, on='user', how='outer')
    user_df = user_df.merge(max_df, on='user', how='outer')
    user_df.drop(['comment_x', 'comment_y'], axis=1, inplace=True)
    return user_df