In [118]:
import pandas as pd
import numpy as np
import re

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score,cross_val_predict
from sklearn.utils.validation import check_array,check_is_fitted
from user_agents import parse
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,precision_recall_curve,precision_recall_curve
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
import joblib
from clean_logs import log_list_to_df

import sklearn._config
sklearn.set_config(display='diagram')

In [119]:
class Dot_file_transformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.tolist()
        else:
            self.feature_names_in_ = []
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=['Path'])

        # Ensure 'Path' is string and fill NaNs
        path_series = X['Path'].fillna('').astype(str)

        # Apply regex to detect dotfiles
        dotfile_flag = path_series.apply(lambda p: 1 if re.search(r'/\.[^/]+', p) else 0)

        return pd.DataFrame({'dotfile_access': dotfile_flag}, index=X.index)

    def get_feature_names_out(self, input_features=None):
        return np.array(['dotfile_access'])


In [120]:
class Bad_user_agent_transformer(BaseEstimator, TransformerMixin):
    def __init__(self,user_agent_col = 'User_Agent'):
        self.user_agent_col = user_agent_col
        
    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.tolist()
        else:
            self.feature_names_in_ = []
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=[self.user_agent_col])

        bad_ua_flag = X[self.user_agent_col].fillna('').str.lower().str.contains(
            r"(?:bot|curl|scraper|wget|httpclient|requests|spider|crawler|expanse|censys|modat)"
        ).astype(int)

        return pd.DataFrame({'bad_user_agent': bad_ua_flag}, index=X.index)

    def get_feature_names_out(self, input_features=None):
        return np.array(['bad_user_agent'])


In [121]:
class has_referrer_transformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.tolist()
        else:
            self.feature_names_in_ = []
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=['Referrer'])

        has_ref = X['Referrer'].fillna('-').apply(lambda r: 0 if r == '-' else 1)
        return pd.DataFrame({'has_referrer': has_ref}, index=X.index)

    def get_feature_names_out(self, input_features=None):
        return np.array(['has_referrer'])


In [122]:
class Suspicious_path_transformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.tolist()
        else:
            self.feature_names_in_ = []
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=['Path'])

        suspicious_keywords = ['wp-admin', 'phpmyadmin', 'config', 'admin', 'setup',
                               'shell', 'login', 'cmd', 'api', 'backup']

        suspicious = X['Path'].fillna('').str.lower().apply(
            lambda path: int(any(keyword in path for keyword in suspicious_keywords))
        )

        return pd.DataFrame({'suspicious_path': suspicious}, index=X.index)

    def get_feature_names_out(self, input_features=None):
        return np.array(['suspicious_path'])


In [123]:
class BotLabelGenerator(BaseEstimator, TransformerMixin):
    def __init__(self,
                 dotfile_col='dotfile_access',
                 bad_ua_col='bad_user_agent',
                 suspicious_path_col='suspicious_path',
                 output_col='is_bot'):
        self.dotfile_col = dotfile_col
        self.bad_ua_col = bad_ua_col
        self.suspicious_path_col = suspicious_path_col
        self.output_col = output_col

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.tolist()
        else:
            self.feature_names_in_ = []
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame.")

        try:
            # Convert columns to boolean type first, before filling NA
            dotfile = X[self.dotfile_col]
            if dotfile.dtype == object:
                dotfile = dotfile.astype('bool')
            dotfile = dotfile.fillna(False)

            bad_ua = X[self.bad_ua_col]
            if bad_ua.dtype == object:
                bad_ua = bad_ua.astype('bool')
            bad_ua = bad_ua.fillna(False)

            suspicious = X[self.suspicious_path_col]
            if suspicious.dtype == object:
                suspicious = suspicious.astype('bool')
            suspicious = suspicious.fillna(False)

            # Combine conditions
            result = (dotfile | bad_ua | suspicious).astype(int)

        except KeyError as e:
            raise KeyError(f"Missing expected column in input DataFrame: {e}")

        return pd.DataFrame({self.output_col: result}, index=X.index)


    def get_feature_names_out(self, input_features=None):
        return np.array([self.output_col])


In [124]:
class UserAgentParser(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=['User_Agent'])
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.feature_names_in_)

        def parse_ua(ua):
            if pd.isna(ua):
                return pd.Series({
                    'ua_browser': 'no browser',
                    'ua_os': 'no OS',
                    'ua_is_mobile': 0,
                    'ua_is_pc': 0
                })
            parsed = parse(ua)
            return pd.Series({
                'ua_browser': parsed.browser.family,
                'ua_os': parsed.os.family,
                'ua_is_mobile': int(parsed.is_mobile),
                'ua_is_pc': int(parsed.is_pc),
            })

        ua_features = X.iloc[:, 0].apply(parse_ua)
        return ua_features

    def get_feature_names_out(self, input_features=None):
        return np.array(['ua_browser', 'ua_os', 'ua_is_mobile', 'ua_is_pc'])


In [125]:
class User_agent_browser_cleanup(BaseEstimator, TransformerMixin):
    def __init__(self, user_agent_browser='ua_browser'):
        self.user_agent_browser = user_agent_browser

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=[self.user_agent_browser])
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.feature_names_in_)

        col = X.columns[0]
        top_browsers = ['Chrome', 'Firefox', 'Edge', 'Chrome Mobile', 'Opera', 'no browser']
        X = X.copy()
        X[col] = X[col].apply(lambda x: x if x in top_browsers else 'Other')
        return X[[col]]

    def get_feature_names_out(self, input_features=None):
        return np.array([self.feature_names_in_[0]])


In [126]:
class User_agent_os_cleanup(BaseEstimator, TransformerMixin):
    def __init__(self, user_agent_os='ua_os'):
        self.user_agent_os = user_agent_os

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=[self.user_agent_os])
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.feature_names_in_)

        col = X.columns[0]
        top_os = ['Windows', 'iOS', 'Ubuntu', 'Linux', 'Mac OS X', 'Android', 'no OS']
        X = X.copy()
        X[col] = X[col].apply(lambda x: x if x in top_os else 'Other')
        return X[[col]]

    def get_feature_names_out(self, input_features=None):
        return np.array([self.feature_names_in_[0]])


In [127]:
class ArrayToDataFrame(BaseEstimator, TransformerMixin):
    def __init__(self, column_names=None):
        self.column_names = column_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        import pandas as pd
        return pd.DataFrame(X, columns=self.column_names)

    def get_feature_names_out(self, input_features=None):
        if self.column_names is not None:
            return np.array(self.column_names)
        else:
            # fallback if no column names were given
            if input_features is not None:
                return np.array(input_features)
            else:
                return np.array([])


In [128]:
cat_pipeline = Pipeline([
    ('impute',SimpleImputer(strategy = 'most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore'))
])

In [129]:
Dot_pipeline = Pipeline([
    ('impute',SimpleImputer(strategy = 'most_frequent')),
    ('dot_transform',Dot_file_transformer())
])

Bad_ua_pipeline = Pipeline([
    ('impute',SimpleImputer(missing_values='-',strategy='constant',fill_value = np.nan)),
    ('bad_agent',Bad_user_agent_transformer())
])

sus_path_pipeline = Pipeline([
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('sus_path',Suspicious_path_transformer())
])

parse_user_agent_pipeline = Pipeline([
    ('impute',SimpleImputer(missing_values='-',strategy='constant',fill_value = np.nan)),
    ('parse_ua',UserAgentParser())
])

has_referrer_pipeline = Pipeline([
    ('check_referrer',has_referrer_transformer())
])


# Below Pipelines will only be applied when above pipelines are completed
def bot_pipeline(dot_path,user_agent_path,sus_path):
    return Pipeline([
        ('label_bot',BotLabelGenerator(dot_path,user_agent_path,sus_path))
    ])
def browser_pipeline(browser_path):
    return Pipeline([
        ('clean_browser',User_agent_browser_cleanup(browser_path)),
        ('encode',OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
    ])

def os_pipeline(user_os_path):
    return Pipeline([
        ('clean_os',User_agent_os_cleanup(user_os_path)),
        ('encode',OneHotEncoder(sparse_output=False,handle_unknown='ignore'))
    ])


In [130]:
to_encode_extracting_categories = ['clean_browser__ua_browser','clean_os__ua_os']
data_featuring = ColumnTransformer([
    ('dot',Dot_pipeline,['Path']),
    ('bad_agent',Bad_ua_pipeline,['User_Agent']),
    ('sus_path',sus_path_pipeline,['Path']),
    ('parse_agent',parse_user_agent_pipeline,['User_Agent']),
    ('referrer_check',has_referrer_pipeline,['Referrer']),
    # ('encode',cat_pipeline,to_encode_featuring_categories)
])

data_extracting = ColumnTransformer([
    ('label',bot_pipeline('dot__dotfile_access','bad_agent__bad_user_agent','sus_path__suspicious_path'),['dot__dotfile_access','bad_agent__bad_user_agent','sus_path__suspicious_path']),
    ('clean_browser',browser_pipeline('parse_agent__ua_browser'),['parse_agent__ua_browser']),
    ('clean_os',os_pipeline('parse_agent__ua_os'),['parse_agent__ua_os']),
],remainder='passthrough')

In [131]:
from clean_logs import log_data_to_df

log_df = log_data_to_df('final_access.log')
log_df

Running.....


Unnamed: 0,IP,Date,Time,Time_zone,Request_method,Path,Protocol,Response_code,Response_size,Referrer,User_Agent
0,185.218.86.4,09/May/2025,00:07:20,0000,GET,/,HTTP/1.1,200,396,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
1,45.95.147.209,09/May/2025,00:14:13,0000,GET,/cgi-bin/luci/;stok=/locale?form=country&opera...,HTTP/1.1,404,162,-,Go-http-client/1.1
2,185.247.137.160,09/May/2025,00:16:26,0000,GET,/,HTTP/1.1,200,396,-,Mozilla/5.0 (compatible; InternetMeasurement/1...
3,154.81.156.7,09/May/2025,00:22:13,0000,GET,/,HTTP/1.1,200,612,-,-
4,198.235.24.17,09/May/2025,00:24:21,0000,GET,/,HTTP/1.1,200,396,-,"Expanse, a Palo Alto Networks company, searche..."
...,...,...,...,...,...,...,...,...,...,...,...
4218,185.218.84.178,10/May/2025,23:09:55,0000,GET,/,HTTP/1.1,200,396,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
4219,147.185.132.136,10/May/2025,23:22:09,0000,GET,/,HTTP/1.1,200,396,-,"Expanse, a Palo Alto Networks company, searche..."
4220,114.199.145.6,10/May/2025,23:32:48,0000,GET,/,HTTP/1.0,200,612,-,curl/7.88.1
4221,154.81.156.35,10/May/2025,23:32:53,0000,GET,/,HTTP/1.1,200,612,-,curl/7.81.0


In [132]:
train_set,test_set = train_test_split(log_df,test_size=0.2,stratify=log_df['Response_code'],random_state=42)
base_set = train_set.copy()

In [133]:
# data_featuring.fit(train_set)  # Or whatever your base DataFrame is
# featuring_columns = data_featuring.get_feature_names_out()
# data_featuring.get_feature_names_out()

In [134]:
def transform_df_for_random_forest(base_set , df_path):
    
    data_featuring.fit(base_set)  # Or whatever your base DataFrame is
    featuring_columns = data_featuring.get_feature_names_out()
    data_featuring.get_feature_names_out()

    to_df = ArrayToDataFrame(column_names=featuring_columns)

    full_pipeline = Pipeline([
        ('featuring', data_featuring),
        ('to_df', to_df),
        ('extracting', data_extracting),
    ])

    full_pipeline.fit(base_set)
    log_df_path = full_pipeline.transform(df_path)
    log_df_path_df = pd.DataFrame(log_df_path,columns = full_pipeline.get_feature_names_out(), index = df_path.index)
    # print(full_pipeline.get_feature_names_out())
    final_log_train_set = log_df_path_df.drop('label__is_bot',axis = 1).apply(pd.to_numeric)
    final_log_train_labels = log_df_path_df['label__is_bot'].astype(int)
    return final_log_train_set,final_log_train_labels


In [135]:
log_lines = [
    '192.168.1.10 - - [25/May/2025:14:05:32 +0000] "GET /index.html HTTP/1.1" 200 1024 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"',
    '203.0.113.45 - - [25/May/2025:14:05:34 +0000] "GET /.git/config HTTP/1.1" 403 512 "-" "python-requests/2.25.1"',
    '198.51.100.22 - - [25/May/2025:14:05:37 +0000] "GET /admin HTTP/1.1" 401 256 "-" "sqlmap/1.4.12"',
    '192.0.2.101 - - [25/May/2025:14:06:02 +0000] "POST /login HTTP/1.1" 200 890 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"',
    '203.0.113.99 - - [25/May/2025:14:06:10 +0000] "GET /search.php?q=../../etc/passwd HTTP/1.1" 400 300 "-" "curl/7.64.1"',
    '10.0.0.5 - - [25/May/2025:14:06:15 +0000] "GET /images/logo.png HTTP/1.1" 200 2048 "-" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64)"',
    '192.168.1.15 - - [25/May/2025:14:06:20 +0000] "GET /wp-login.php HTTP/1.1" 404 123 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64)"',
    '198.51.100.55 - - [25/May/2025:14:06:45 +0000] "GET /scripts/setup.php?cmd=whoami HTTP/1.1" 403 0 "-" "nikto/2.1.6 (Evasions)"',
    '172.16.0.20 - - [25/May/2025:14:07:01 +0000] "GET /contact HTTP/1.1" 200 640 "-" "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3)"',
    '203.0.113.77 - - [25/May/2025:14:07:10 +0000] "GET /.env HTTP/1.1" 403 230 "-" "Mozilla/5.0 zgrab/0.x"',
    '10.0.0.10 - - [25/May/2025:14:07:30 +0000] "GET /dashboard HTTP/1.1" 200 3000 "-" "Mozilla/5.0 (Linux; Android 10)"',
    '192.0.2.200 - - [25/May/2025:14:07:59 +0000] "GET /cgi-bin/test-cgi HTTP/1.1" 500 105 "-" "Wget/1.20.3 (linux-gnu)"',
    '192.168.1.100 - - [25/May/2025:14:08:12 +0000] "POST /api/v1/users HTTP/1.1" 201 980 "-" "Mozilla/5.0 (Windows NT 10.0; rv:89.0)"',
    '203.0.113.15 - - [25/May/2025:14:08:30 +0000] "GET /?s=<script>alert(1)</script> HTTP/1.1" 200 450 "-" "Mozilla/5.0 (X11; Kali Linux)"',
    '198.51.100.1 - - [25/May/2025:14:08:45 +0000] "GET /phpinfo.php HTTP/1.1" 200 1500 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"',
]

dummy_log_df = log_list_to_df(log_lines)
temp_X,temp_Y = transform_df_for_random_forest(base_set,dummy_log_df)
temp_X

Unnamed: 0,clean_browser__parse_agent__ua_browser_Chrome,clean_browser__parse_agent__ua_browser_Chrome Mobile,clean_browser__parse_agent__ua_browser_Edge,clean_browser__parse_agent__ua_browser_Firefox,clean_browser__parse_agent__ua_browser_Opera,clean_browser__parse_agent__ua_browser_Other,clean_browser__parse_agent__ua_browser_no browser,clean_os__parse_agent__ua_os_Android,clean_os__parse_agent__ua_os_Linux,clean_os__parse_agent__ua_os_Mac OS X,clean_os__parse_agent__ua_os_Other,clean_os__parse_agent__ua_os_Ubuntu,clean_os__parse_agent__ua_os_Windows,clean_os__parse_agent__ua_os_iOS,clean_os__parse_agent__ua_os_no OS,remainder__parse_agent__ua_is_mobile,remainder__parse_agent__ua_is_pc,remainder__referrer_check__has_referrer
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1,0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1,0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1,0
6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1,0
7,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
8,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0
9,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0


In [136]:
train_set

Unnamed: 0,IP,Date,Time,Time_zone,Request_method,Path,Protocol,Response_code,Response_size,Referrer,User_Agent
2425,154.83.103.144,07/May/2025,14:34:01,0000,GET,/?content=../../../../etc/mysql/my.cnf,HTTP/1.1,200,396,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
3629,122.173.30.2,12/May/2025,14:09:59,0000,GET,/,HTTP/1.1,200,51,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
2237,154.83.103.144,07/May/2025,14:33:27,0000,GET,/retry/payment_attempts.db,HTTP/1.1,404,197,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
1685,154.83.103.144,07/May/2025,14:31:47,0000,GET,/webhook/send,HTTP/1.1,404,197,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
1,45.95.147.209,09/May/2025,00:14:13,0000,GET,/cgi-bin/luci/;stok=/locale?form=country&opera...,HTTP/1.1,404,162,-,Go-http-client/1.1
...,...,...,...,...,...,...,...,...,...,...,...
1764,154.83.103.144,07/May/2025,14:32:01,0000,GET,/admin/users/1/roles,HTTP/1.1,404,197,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
2412,154.83.103.144,07/May/2025,14:33:58,0000,GET,/?template_file=../../../../etc/hostname,HTTP/1.1,200,396,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
2371,154.83.103.144,07/May/2025,14:33:51,0000,GET,/?view=../../../../etc/hosts,HTTP/1.1,200,396,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
976,154.83.103.144,07/May/2025,14:29:39,0000,GET,/ios/Podfile,HTTP/1.1,404,197,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...


In [137]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Separate features and target

X,Y = transform_df_for_random_forest(log_df,log_df)
X_test,Y_test = transform_df_for_random_forest(log_df,test_set)
forest_classifier = RandomForestClassifier(n_estimators=200,random_state=42,class_weight='balanced')

# Fit
model = forest_classifier.fit(X, Y)

In [138]:
X

Unnamed: 0,clean_browser__parse_agent__ua_browser_Chrome,clean_browser__parse_agent__ua_browser_Chrome Mobile,clean_browser__parse_agent__ua_browser_Edge,clean_browser__parse_agent__ua_browser_Firefox,clean_browser__parse_agent__ua_browser_Opera,clean_browser__parse_agent__ua_browser_Other,clean_browser__parse_agent__ua_browser_no browser,clean_os__parse_agent__ua_os_Android,clean_os__parse_agent__ua_os_Linux,clean_os__parse_agent__ua_os_Mac OS X,clean_os__parse_agent__ua_os_Other,clean_os__parse_agent__ua_os_Ubuntu,clean_os__parse_agent__ua_os_Windows,clean_os__parse_agent__ua_os_iOS,clean_os__parse_agent__ua_os_no OS,remainder__parse_agent__ua_is_mobile,remainder__parse_agent__ua_is_pc,remainder__referrer_check__has_referrer
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1,0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4218,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1,0
4219,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
4220,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
4221,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0


In [139]:
test_pred = forest_classifier.predict(X_test)

In [140]:
cm_test = confusion_matrix(Y_test,test_pred)
cm_test

array([[ 82, 308],
       [ 10, 445]])

In [141]:
print(X.dtypes.value_counts())

float64    15
int64       3
Name: count, dtype: int64


In [142]:
cross_val_score(forest_classifier,X,Y,cv = 5,scoring = 'f1')

array([0.73162675, 0.69397218, 0.6929621 , 0.7346586 , 0.80571429])

In [143]:
log_preds = cross_val_predict(forest_classifier,X,Y,cv = 5)
log_preds

array([0, 1, 1, ..., 1, 1, 0], shape=(4223,))

In [144]:
cm = confusion_matrix(Y,log_preds)
cm

array([[ 406, 1573],
       [  56, 2188]])

In [145]:
X

Unnamed: 0,clean_browser__parse_agent__ua_browser_Chrome,clean_browser__parse_agent__ua_browser_Chrome Mobile,clean_browser__parse_agent__ua_browser_Edge,clean_browser__parse_agent__ua_browser_Firefox,clean_browser__parse_agent__ua_browser_Opera,clean_browser__parse_agent__ua_browser_Other,clean_browser__parse_agent__ua_browser_no browser,clean_os__parse_agent__ua_os_Android,clean_os__parse_agent__ua_os_Linux,clean_os__parse_agent__ua_os_Mac OS X,clean_os__parse_agent__ua_os_Other,clean_os__parse_agent__ua_os_Ubuntu,clean_os__parse_agent__ua_os_Windows,clean_os__parse_agent__ua_os_iOS,clean_os__parse_agent__ua_os_no OS,remainder__parse_agent__ua_is_mobile,remainder__parse_agent__ua_is_pc,remainder__referrer_check__has_referrer
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1,0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4218,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1,0
4219,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
4220,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
4221,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0


In [146]:
precision_score(Y,log_preds)

0.5817601701675087

In [147]:
recall_score(Y,log_preds)

0.9750445632798574

In [148]:
Y

0       0
1       0
2       0
3       0
4       1
       ..
4218    0
4219    1
4220    1
4221    1
4222    0
Name: label__is_bot, Length: 4223, dtype: int64

In [149]:
Y.value_counts()

label__is_bot
1    2244
0    1979
Name: count, dtype: int64

In [150]:
precision_score(Y_test,test_pred)

0.5909694555112882

In [151]:
recall_score(Y_test,test_pred)

0.978021978021978

In [152]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'class_weight' : ['balanced'],
    'n_estimators': [250,260,270],
    'max_features': [5,6,7,10],
    'max_depth': [3,4,5,6]
}

grid_search = GridSearchCV(
    estimator=forest_classifier,
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    n_jobs= 3, 
    verbose=2   # Show progress
)

grid_search.fit(X, Y)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END class_weight=balanced, max_depth=3, max_features=5, n_estimators=250; total time=   0.4s
[CV] END class_weight=balanced, max_depth=3, max_features=5, n_estimators=250; total time=   0.4s
[CV] END class_weight=balanced, max_depth=3, max_features=5, n_estimators=250; total time=   0.3s
[CV] END class_weight=balanced, max_depth=3, max_features=5, n_estimators=260; total time=   0.4s
[CV] END class_weight=balanced, max_depth=3, max_features=5, n_estimators=260; total time=   0.4s
[CV] END class_weight=balanced, max_depth=3, max_features=5, n_estimators=260; total time=   0.3s
[CV] END class_weight=balanced, max_depth=3, max_features=5, n_estimators=270; total time=   0.4s
[CV] END class_weight=balanced, max_depth=3, max_features=5, n_estimators=270; total time=   0.5s
[CV] END class_weight=balanced, max_depth=3, max_features=5, n_estimators=270; total time=   0.5s
[CV] END class_weight=balanced, max_depth=3, max_feature

In [153]:
grid_search.best_params_

{'class_weight': 'balanced',
 'max_depth': 6,
 'max_features': 5,
 'n_estimators': 250}

In [154]:
grid_search.best_score_

np.float64(0.6543039629937925)

In [155]:
final_model = grid_search.best_estimator_

In [156]:
joblib.dump(final_model,'bot_detection_model.pkl')

['bot_detection_model.pkl']