In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

import optuna
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    precision_score,
    recall_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
    make_scorer
)

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE



  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
with tqdm() as bar:
    # do not skip any of the rows, but update the progress bar instead
    df = pd.read_csv('data/train.csv', skiprows=lambda x: bar.update(1) and False)

df.head()

26207it [00:00, 112630.00it/s]


Unnamed: 0,created_at,default_profile,default_profile_image,description,favourites_count,followers_count,friends_count,geo_enabled,id,lang,location,profile_background_image_url,profile_image_url,screen_name,statuses_count,verified,average_tweets_per_day,account_age_days,target
0,2012-01-15 23:40:09,True,False,Cosplayer/Fitness lover. Come to me https://t....,74,7,0,False,465096524,en,unknown,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/9666745212...,reml5477,20,False,0.006,3138,1
1,2016-10-04 00:44:39,False,False,pobody‚Äôs nerfect,50443,164,590,True,783105517673648132,cy,she/her,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1281752126...,kinlibra,6469,False,4.572,1415,0
2,2009-05-23 04:04:13,False,False,gracias por participar üèÖ,9394,208,189,False,41970759,es,La diaspora,http://abs.twimg.com/images/themes/theme17/bg.gif,http://pbs.twimg.com/profile_images/1233811596...,_delaualau,30296,False,7.378,4106,0
3,2009-05-17 04:31:31,False,False,Stand Up Comedian/Actor from North Philadelphi...,46,66180,1090,True,40607946,en,"Calabasas, CA",http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1184851104...,SpankHorton,164957,False,40.116,4112,0
4,2009-02-16 13:11:21,True,False,Assignment Editor at NBC10 and President of Ja...,1223,487,867,True,20983433,en,"Jenkintown, PA",http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/5234863934...,javelinjt,1752,False,0.417,4201,0


## Train/Val/Test Split

The proportion of Train/Val/Test Split is:
<ul>
    <li>
        Train: 70%
    </li>
    <li>
        Val: 20%
    </li>
    <li>
        Test: 10%
    </li>
</ul>

In [3]:
X = df.drop('target', axis = 1) 
y = df['target']

In [4]:
# Initial split: 80% training, 20% validation & test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

# Second split: 10% validation, 10% test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1, stratify=y_temp)

## Handling Missing Values and Type Conversion

In [5]:
df.isnull().sum()

created_at                         0
default_profile                    0
default_profile_image              0
description                     5091
favourites_count                   0
followers_count                    0
friends_count                      0
geo_enabled                        0
id                                 0
lang                            5588
location                           2
profile_background_image_url    3235
profile_image_url                  1
screen_name                        0
statuses_count                     0
verified                           0
average_tweets_per_day             0
account_age_days                   0
target                             0
dtype: int64

In [6]:
# === function definitions ===
def handle_missing_values(X): # to handle the missing values
    cols = ['description', 'lang', 'location', 'profile_background_image_url', 'profile_image_url']
    values = ['', 'unknown', 'unknown', 'unknown', 'http://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png']

    for i in range(len(cols)):
        X[ cols[i] ] = X[ cols[i] ].fillna( values[i] )

def type_conversion(X):
    # to boolean int
    cols = ['default_profile','default_profile_image', 'geo_enabled', 'verified']
    for col in cols:
        X[col] = X[col].astype(int)

    # to int
    cols = ['favourites_count','followers_count','friends_count', 'statuses_count', 'account_age_days']
    for col in cols:
        X[col] = X[col].astype(int)

    # to string
    cols = ['description', 'location','profile_background_image_url','profile_image_url','screen_name']
    for col in cols:
        X[col] = X[col].astype(str)

    # to category
    cols = ['lang'] 
    for col in cols:
        X[col] = X[col].astype('category')

    # to float
    cols = ['average_tweets_per_day']
    for col in cols:
        X[col] = X[col].astype(float)

In [7]:
# handle missing values
handle_missing_values(X_train)
handle_missing_values(X_val)

# type conversion
type_conversion(X_train)
type_conversion(X_val)

In [8]:
X_train.isnull().sum()

created_at                      0
default_profile                 0
default_profile_image           0
description                     0
favourites_count                0
followers_count                 0
friends_count                   0
geo_enabled                     0
id                              0
lang                            0
location                        0
profile_background_image_url    0
profile_image_url               0
screen_name                     0
statuses_count                  0
verified                        0
average_tweets_per_day          0
account_age_days                0
dtype: int64

In [9]:
X_val.isnull().sum()

created_at                      0
default_profile                 0
default_profile_image           0
description                     0
favourites_count                0
followers_count                 0
friends_count                   0
geo_enabled                     0
id                              0
lang                            0
location                        0
profile_background_image_url    0
profile_image_url               0
screen_name                     0
statuses_count                  0
verified                        0
average_tweets_per_day          0
account_age_days                0
dtype: int64

## Feature Engineering

In [10]:
# ==== function definitions ====
# log transform skewed data
def normalize(X):
    cols = ['favourites_count', 'followers_count', 'friends_count', 'statuses_count', 'average_tweets_per_day']

    for col in cols:
        X[col] = np.log(X[col] + 1)

# feature cross
def feature_cross(X):
    # favourites_count, followers_count, friends_count, statuses_count, average_tweets_per_day,account_age_days

    X['favourites_per_day'] = X['favourites_count']/X['account_age_days']
    X['followers_per_day'] = X['followers_count']/X['account_age_days']
    X['friends_per_day'] = X['friends_count']/X['account_age_days']

    X['followers_to_friends_ratio'] = X['followers_count']/(X['friends_count']+1)

# bucketize
def bucketize(X):
    # lang
    langs = ['en', 'unknown', 'es', 'ar']
    X['lang_bucket'] = X['lang'].apply(lambda x: x if x in langs else 'others')

    # location
    location_counts = X['location'].value_counts()
    low_val_locations = location_counts[location_counts == 1].index # bucketize when location count == 1
    X['location_bucket'] = X['location'].apply(lambda x: 'others' if x in low_val_locations else x)

# one hot
def one_hot(X):
    X = X.copy()  # avoid modifying original
    
    # one-hot for lang_bucket
    lang_dummies = pd.get_dummies(X['lang_bucket'], prefix='lang').astype(int)

    
    X['hasNoBackgroundImg'] = (X['profile_background_image_url'] == 'unknown').astype(int)
    X['hasBotInScreenName'] = X['screen_name'].str.contains(r'bot', case=False, na=False).astype(int)
    X['screenNameOnlyNumeric'] = X['screen_name'].str.contains(r'[0-9]+', case=False, na=False).astype(int)

    # location
    X['isLocationUnknown'] = (X['location_bucket'] == 'unknown').astype(int)
    X['isLocationOthers'] = (X['location_bucket'] == 'others').astype(int)
  
    X = pd.concat([X, lang_dummies], axis=1)
    
    return X

def text_analysis(X):
    X = X.copy()  # avoid modifying original
    
    # Count number of '#' in each description
    X['num_hashtags'] = X['description'].str.count('#')
    # Count number of 'http' (i.e., links)
    X['num_links'] = X['description'].str.count('http')

    return X

# drop columns that are not needed for predictions
def drop_cols(X):
    cols = ['created_at', 'description', 'id', 'lang', 'location', 'profile_background_image_url', 'profile_image_url', 'screen_name', 'lang_bucket', 'location_bucket']
    return X.drop(cols, axis = 1)

In [11]:
# normalize
normalize(X_train)
normalize(X_val)

# feature cross
feature_cross(X_train)
feature_cross(X_val)

# bucketize
bucketize(X_train)
bucketize(X_val)

# one hot
X_train = one_hot(X_train)
X_val = one_hot(X_val)

# text analysis
X_train = text_analysis(X_train)
X_val = text_analysis(X_val)

# drop columns
X_train = drop_cols(X_train)
X_val = drop_cols(X_val)

In [12]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20964 entries, 12800 to 10106
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   default_profile             20964 non-null  int64  
 1   default_profile_image       20964 non-null  int64  
 2   favourites_count            20964 non-null  float64
 3   followers_count             20964 non-null  float64
 4   friends_count               20964 non-null  float64
 5   geo_enabled                 20964 non-null  int64  
 6   statuses_count              20964 non-null  float64
 7   verified                    20964 non-null  int64  
 8   average_tweets_per_day      20964 non-null  float64
 9   account_age_days            20964 non-null  int64  
 10  favourites_per_day          20964 non-null  float64
 11  followers_per_day           20964 non-null  float64
 12  friends_per_day             20964 non-null  float64
 13  followers_to_friends_ratio  2096

## Model Training

### Logistic Regression

In [13]:
# Only show warning
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective_lr_smote(trial):
    C = trial.suggest_float('C', 1e-4, 10, log=True)
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=1)),
        ('model', LogisticRegression(C=C, solver=solver, max_iter=5000, random_state = 1))
    ])

    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='precision').mean()
    return score

study_lr = optuna.create_study(direction='maximize')
study_lr.optimize(objective_lr_smote, n_trials=50)

print("Best hyperparameters:", study_lr.best_params)

Best hyperparameters: {'C': 0.00010042365895934439, 'solver': 'saga'}


In [14]:
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=1)),
    ('model', LogisticRegression(
        C=study_lr.best_params['C'],
        solver=study_lr.best_params['solver'],
        max_iter=5000,
        random_state = 1
    ))
])

lr_pipeline.fit(X_train, y_train)

In [15]:
y_pred = lr_pipeline.predict(X_train)

In [16]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

acc = accuracy_score(y_train, y_pred)
precision, recall, fbscore, support = precision_recall_fscore_support(y_train, y_pred, average='micro')
print('Accuracy:', np.round(acc, 4))
print('Precision:', np.round(precision, 4))
print('Recall:', np.round(recall, 4))
print('f-beta score:', np.round(fbscore, 4))

[[11032  2906]
 [ 1501  5525]]
              precision    recall  f1-score   support

           0       0.88      0.79      0.83     13938
           1       0.66      0.79      0.71      7026

    accuracy                           0.79     20964
   macro avg       0.77      0.79      0.77     20964
weighted avg       0.80      0.79      0.79     20964

Accuracy: 0.7898
Precision: 0.7898
Recall: 0.7898
f-beta score: 0.7898


In [17]:
y_pred = lr_pipeline.predict(X_val)

In [18]:
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

acc = accuracy_score(y_val, y_pred)
precision, recall, fbscore, support = precision_recall_fscore_support(y_val, y_pred, average='micro')
print('Accuracy:', np.round(acc, 4))
print('Precision:', np.round(precision, 4))
print('Recall:', np.round(recall, 4))
print('f-beta score:', np.round(fbscore, 4))


[[1393  350]
 [ 194  684]]
              precision    recall  f1-score   support

           0       0.88      0.80      0.84      1743
           1       0.66      0.78      0.72       878

    accuracy                           0.79      2621
   macro avg       0.77      0.79      0.78      2621
weighted avg       0.81      0.79      0.80      2621

Accuracy: 0.7924
Precision: 0.7924
Recall: 0.7924
f-beta score: 0.7924


In [19]:
coefficients = lr_pipeline.named_steps['model'].coef_[0]

feature_names = X_train.columns
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': coefficients
}).sort_values(by='importance', ascending=False)

feature_importance

Unnamed: 0,feature,importance
0,default_profile,0.106011
23,lang_unknown,0.104604
15,hasBotInScreenName,0.062337
17,isLocationUnknown,0.061359
16,screenNameOnlyNumeric,0.059435
25,num_links,0.05856
19,lang_ar,0.044175
13,followers_to_friends_ratio,0.025974
8,average_tweets_per_day,0.011971
1,default_profile_image,0.011267


## XGBoost


In [20]:
# Only show warning
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective_xgb_smote(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.3, 0.8),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.8),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1.0, log=True),
    }

    model = XGBClassifier(**params, random_state = 1, n_jobs = -1, eval_metric= 'logloss')

    # Full pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=1)),
        ('clf', model)
    ])

    # Cross-validation
    # cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='precision').mean()
    
    return score

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb_smote, n_trials=50)

print("Best hyperparameters:", study_xgb.best_params)

Best hyperparameters: {'n_estimators': 284, 'max_depth': 7, 'learning_rate': 0.0646761364915007, 'subsample': 0.6471933025765114, 'colsample_bytree': 0.6741309220866841, 'reg_alpha': 0.09633925648298891, 'reg_lambda': 0.070236010722545}


In [21]:
xgb_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=1)),
    ('model', 
        XGBClassifier(**study_xgb.best_params, random_state = 1, n_jobs = -1, eval_metric= 'logloss')
    )
])

xgb_pipeline.fit(X_train, y_train)
y_pred = xgb_pipeline.predict(X_train)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

acc = accuracy_score(y_train, y_pred)
precision, recall, fbscore, support = precision_recall_fscore_support(y_train, y_pred, average='micro')
print('Accuracy:', np.round(acc, 4))
print('Precision:', np.round(precision, 4))
print('Recall:', np.round(recall, 4))
print('f-beta score:', np.round(fbscore, 4))

[[13387   551]
 [  488  6538]]
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     13938
           1       0.92      0.93      0.93      7026

    accuracy                           0.95     20964
   macro avg       0.94      0.95      0.94     20964
weighted avg       0.95      0.95      0.95     20964

Accuracy: 0.9504
Precision: 0.9504
Recall: 0.9504
f-beta score: 0.9504


In [22]:
y_pred = xgb_pipeline.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

acc = accuracy_score(y_val, y_pred)
precision, recall, fbscore, support = precision_recall_fscore_support(y_val, y_pred, average='micro')
print('Accuracy:', np.round(acc, 4))
print('Precision:', np.round(precision, 4))
print('Recall:', np.round(recall, 4))
print('f-beta score:', np.round(fbscore, 4))

[[1580  163]
 [ 147  731]]
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      1743
           1       0.82      0.83      0.83       878

    accuracy                           0.88      2621
   macro avg       0.87      0.87      0.87      2621
weighted avg       0.88      0.88      0.88      2621

Accuracy: 0.8817
Precision: 0.8817
Recall: 0.8817
f-beta score: 0.8817


## LightGBM

In [23]:
# Only show warning
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective_lgbm_smote(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 50),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'subsample': trial.suggest_float('subsample', 0.3, 0.8),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.8),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1.0, log=True),
    }

    model = LGBMClassifier(**params, random_state = 1, n_jobs = -1, verbosity=-1)

    # Full pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=1)),
        ('clf', model)
    ])

    # Cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    score = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='precision').mean()
    
    return score

study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm_smote, n_trials=50)

print("Best hyperparameters:", study_lgbm.best_params)



Best hyperparameters: {'n_estimators': 288, 'learning_rate': 0.07722643902288129, 'num_leaves': 46, 'max_depth': 8, 'subsample': 0.40309038111415124, 'colsample_bytree': 0.7644629486490249, 'reg_alpha': 0.455993386107609, 'reg_lambda': 0.0012431803227480212}




In [24]:
lgbm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=1)),
    ('model', 
         LGBMClassifier(**study_lgbm.best_params, random_state = 1, n_jobs = -1, verbosity=-1)
    )
])

lgbm_pipeline.fit(X_train, y_train)
y_pred = lgbm_pipeline.predict(X_train)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

acc = accuracy_score(y_train, y_pred)
precision, recall, fbscore, support = precision_recall_fscore_support(y_train, y_pred, average='micro')
print('Accuracy:', np.round(acc, 4))
print('Precision:', np.round(precision, 4))
print('Recall:', np.round(recall, 4))
print('f-beta score:', np.round(fbscore, 4))

[[13332   606]
 [  551  6475]]
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     13938
           1       0.91      0.92      0.92      7026

    accuracy                           0.94     20964
   macro avg       0.94      0.94      0.94     20964
weighted avg       0.94      0.94      0.94     20964

Accuracy: 0.9448
Precision: 0.9448
Recall: 0.9448
f-beta score: 0.9448




In [25]:
y_pred = lgbm_pipeline.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

acc = accuracy_score(y_val, y_pred)
precision, recall, fbscore, support = precision_recall_fscore_support(y_val, y_pred, average='micro')
print('Accuracy:', np.round(acc, 4))
print('Precision:', np.round(precision, 4))
print('Recall:', np.round(recall, 4))
print('f-beta score:', np.round(fbscore, 4))

[[1578  165]
 [ 153  725]]
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      1743
           1       0.81      0.83      0.82       878

    accuracy                           0.88      2621
   macro avg       0.86      0.87      0.86      2621
weighted avg       0.88      0.88      0.88      2621

Accuracy: 0.8787
Precision: 0.8787
Recall: 0.8787
f-beta score: 0.8787




## Random Forest

In [26]:
# Only show warning
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective_rf_smote(trial):

    params = {
        'n_estimators':trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
    }

    model = RandomForestClassifier(**params, random_state = 1, n_jobs = -1)

    # Full pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=1)),
        ('clf', model)
    ])

    # Cross-validation
    # cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='precision').mean()
    
    return score

study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf_smote, n_trials=50)

print("Best hyperparameters:", study_rf.best_params)

Best hyperparameters: {'n_estimators': 201, 'max_depth': 8, 'max_features': 'log2'}


In [27]:
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=1)),
    ('model', 
        RandomForestClassifier(**study_rf.best_params, random_state = 1, n_jobs = -1)
    )
])

rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_train)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

acc = accuracy_score(y_train, y_pred)
precision, recall, fbscore, support = precision_recall_fscore_support(y_train, y_pred, average='micro')
print('Accuracy:', np.round(acc, 4))
print('Precision:', np.round(precision, 4))
print('Recall:', np.round(recall, 4))
print('f-beta score:', np.round(fbscore, 4))

[[12406  1532]
 [ 1221  5805]]
              precision    recall  f1-score   support

           0       0.91      0.89      0.90     13938
           1       0.79      0.83      0.81      7026

    accuracy                           0.87     20964
   macro avg       0.85      0.86      0.85     20964
weighted avg       0.87      0.87      0.87     20964

Accuracy: 0.8687
Precision: 0.8687
Recall: 0.8687
f-beta score: 0.8687


In [28]:
y_pred = rf_pipeline.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))
print(precision_recall_fscore_support(y_val, y_pred, average='micro'))

acc = accuracy_score(y_val, y_pred)
precision, recall, fbscore, support = precision_recall_fscore_support(y_val, y_pred, average='micro')
print('Accuracy:', np.round(acc, 4))
print('Precision:', np.round(precision, 4))
print('Recall:', np.round(recall, 4))
print('f-beta score:', np.round(fbscore, 4))

[[1517  226]
 [ 176  702]]
              precision    recall  f1-score   support

           0       0.90      0.87      0.88      1743
           1       0.76      0.80      0.78       878

    accuracy                           0.85      2621
   macro avg       0.83      0.83      0.83      2621
weighted avg       0.85      0.85      0.85      2621

(0.8466234261732163, 0.8466234261732163, 0.8466234261732163, None)
Accuracy: 0.8466
Precision: 0.8466
Recall: 0.8466
f-beta score: 0.8466


## Testing

In [29]:
handle_missing_values(X_test)
type_conversion(X_test)

normalize(X_test)
feature_cross(X_test)
bucketize(X_test) 
X_test = one_hot(X_test) 
X_test = text_analysis(X_test) 
X_test = drop_cols(X_test)

In [30]:

y_pred = lr_pipeline.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

acc = accuracy_score(y_test, y_pred)
precision, recall, fbscore, support = precision_recall_fscore_support(y_test, y_pred, average='micro')
print('Accuracy:', np.round(acc, 4))
print('Precision:', np.round(precision, 4))
print('Recall:', np.round(recall, 4))
print('f-beta score:', np.round(fbscore, 4))


[[1386  356]
 [ 184  695]]
              precision    recall  f1-score   support

           0       0.88      0.80      0.84      1742
           1       0.66      0.79      0.72       879

    accuracy                           0.79      2621
   macro avg       0.77      0.79      0.78      2621
weighted avg       0.81      0.79      0.80      2621

Accuracy: 0.794
Precision: 0.794
Recall: 0.794
f-beta score: 0.794


In [31]:

y_pred = xgb_pipeline.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

acc = accuracy_score(y_test, y_pred)
precision, recall, fbscore, support = precision_recall_fscore_support(y_test, y_pred, average='micro')
print('Accuracy:', np.round(acc, 4))
print('Precision:', np.round(precision, 4))
print('Recall:', np.round(recall, 4))
print('f-beta score:', np.round(fbscore, 4))


[[1561  181]
 [ 140  739]]
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      1742
           1       0.80      0.84      0.82       879

    accuracy                           0.88      2621
   macro avg       0.86      0.87      0.86      2621
weighted avg       0.88      0.88      0.88      2621

Accuracy: 0.8775
Precision: 0.8775
Recall: 0.8775
f-beta score: 0.8775


In [32]:
y_pred = lgbm_pipeline.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

acc = accuracy_score(y_test, y_pred)
precision, recall, fbscore, support = precision_recall_fscore_support(y_test, y_pred, average='micro')
print('Accuracy:', np.round(acc, 4))
print('Precision:', np.round(precision, 4))
print('Recall:', np.round(recall, 4))
print('f-beta score:', np.round(fbscore, 4))

[[1571  171]
 [ 138  741]]
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      1742
           1       0.81      0.84      0.83       879

    accuracy                           0.88      2621
   macro avg       0.87      0.87      0.87      2621
weighted avg       0.88      0.88      0.88      2621

Accuracy: 0.8821
Precision: 0.8821
Recall: 0.8821
f-beta score: 0.8821




In [33]:
y_pred = rf_pipeline.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(precision_recall_fscore_support(y_test, y_pred, average='micro'))

acc = accuracy_score(y_test, y_pred)
precision, recall, fbscore, support = precision_recall_fscore_support(y_test, y_pred, average='micro')
print('Accuracy:', np.round(acc, 4))
print('Precision:', np.round(precision, 4))
print('Recall:', np.round(recall, 4))
print('f-beta score:', np.round(fbscore, 4))

[[1522  220]
 [ 168  711]]
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      1742
           1       0.76      0.81      0.79       879

    accuracy                           0.85      2621
   macro avg       0.83      0.84      0.84      2621
weighted avg       0.85      0.85      0.85      2621

(0.8519648988935521, 0.8519648988935521, 0.8519648988935521, None)
Accuracy: 0.852
Precision: 0.852
Recall: 0.852
f-beta score: 0.852


## Summary

|Model Name        |Precision of Train|Precision of Val|Precision of Test|
|----------------|----------------|----------------|----------------|
|Logistic Regression|0.7898 | 0.7924 | 0.794|
|XGBoost	         |0.9504 | 0.8817 | 0.8775|
|LightGBM	         |0.9448| 0.8787|0.8821| 
|Random Forest	     |0.8687| 0.8466|0.852| 

I use the Precision metric because I want to reduce false positives, which can be costly. Falsely identifying a normal user as a bot may result in the user being blocked or facing other consequences.
<br/>

Based on the results shown above, I chose the LightGBM model because it achieves a higher Precision on unseen data compared to other models. Although both XGBoost and LightGBM exhibit overfitting, LightGBM shows a smaller drop in Precision compared to XGBoost.

## Future Improvements

<ul>
    <li>Get more data to create a larger dataset, as the current limited size may be causing the model to overfit.</li>
    <li>Explore and apply ensemble methods, such as stacking and voting, to potentially improve model performance.</li>
    <li>Extract features from text descriptions by utilizing text embeddings like TF-IDF or transformer-based models.</li>
</ul>