In [115]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from scipy.stats import loguniform
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


In [116]:
df = pd.read_csv("twitter_bots_final.csv")

In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37438 entries, 0 to 37437
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   default_profile              37438 non-null  bool   
 1   default_profile_image        37438 non-null  bool   
 2   description                  37438 non-null  object 
 3   favourites_count             37438 non-null  int64  
 4   followers_count              37438 non-null  int64  
 5   friends_count                37438 non-null  int64  
 6   geo_enabled                  37438 non-null  bool   
 7   lang                         37438 non-null  object 
 8   verified                     37438 non-null  bool   
 9   average_tweets_per_day       37438 non-null  float64
 10  account_age_days             37438 non-null  int64  
 11  account_type                 37438 non-null  object 
 12  word_count                   37438 non-null  int64  
 13  mean_word_length

In [118]:
missing_values_per_column = df.isna().sum()
print(missing_values_per_column)

default_profile                    0
default_profile_image              0
description                        0
favourites_count                   0
followers_count                    0
friends_count                      0
geo_enabled                        0
lang                               0
verified                           0
average_tweets_per_day             0
account_age_days                   0
account_type                       0
word_count                         0
mean_word_length                   0
hashtag_count                      0
handle_count                       0
url_count                          0
description_language               0
description_en                 13341
description_en_embeddings          0
log_followers_friends_ratio        0
dtype: int64


In [119]:
df['account_type'].value_counts(normalize=True)

human    0.668118
bot      0.331882
Name: account_type, dtype: float64

In [None]:
df = df.drop(columns=['description_en', 'description'])

bool_cols = ['default_profile', 'default_profile_image', 'geo_enabled', 'verified']
cat_cols = ['lang', 'description_language']
num_cols = [
    'favourites_count', 'followers_count', 'friends_count',
    'average_tweets_per_day', 'account_age_days', 'word_count',
    'mean_word_length', 'hashtag_count', 'handle_count',
    'url_count', 'log_followers_friends_ratio'
]
target_col = 'account_type'

df[bool_cols] = df[bool_cols].astype(int) # convert to binary values
df = pd.get_dummies(df, columns=cat_cols, drop_first=True) # one-hot encoding 

scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols]) # scale numerical features

def str_to_array(s):
    s = s.strip("[]")
    return np.array(s.split(), dtype=float)
df['description_en_embeddings'] = df['description_en_embeddings'].apply(str_to_array) # convert embeddings into array

X_embed = np.vstack(df['description_en_embeddings'].values)

feature_cols = [c for c in df.columns if c not in ['description_en_embeddings', target_col]]
X_tabular = df[feature_cols].values # all features excleding embeddings

X = np.hstack([X_embed, X_tabular])
y = df[target_col].map({'human': 0, 'bot': 1}).values

print("Feature matrix shape:", X.shape)
print("Account Type distribution:", np.bincount(y))

Feature matrix shape: (37438, 578)
Account distribution: [25013 12425]


In [121]:
# split train into train (80%) and temp (20%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# split temp into validation (10%) and test (10%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

In [None]:
X_train_tabular = X_train[:, 384:]  # all cols after the first 384 embedding columns
X_val_tabular   = X_val[:, 384:]
X_test_tabular  = X_test[:, 384:]

# only use embeddings for model1
X_train_embed = X_train[:, :384]
X_val_embed = X_val[:, :384]
X_test_embed = X_test[:, :384]

model1 using logistic regression

In [None]:
# model1 using logistic regression
model1 = LogisticRegression(max_iter=1000)
model1.fit(X_train_embed, y_train)

y_val_pred = model1.predict(X_val_embed)
y_val_prob = model1.predict_proba(X_val_embed)[:, 1]

print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("AUC:", roc_auc_score(y_val, y_val_prob))
print(classification_report(y_val, y_val_pred))

Accuracy: 0.7094017094017094
AUC: 0.7578436043120966
              precision    recall  f1-score   support

           0       0.71      0.97      0.82      2501
           1       0.75      0.19      0.30      1243

    accuracy                           0.71      3744
   macro avg       0.73      0.58      0.56      3744
weighted avg       0.72      0.71      0.64      3744



In [None]:
# tuning parameters for model1
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'lbfgs'],
    'class_weight': [None, 'balanced'],
    'max_iter': [1000]
}

grid = GridSearchCV(
    LogisticRegression(),
    param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train_embed, y_train)

print("Best parameters:", grid.best_params_)
print("Best CV ROC-AUC:", grid.best_score_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END C=0.001, class_weight=None, max_iter=1000, penalty=l1, solver=lbfgs; total time=   0.1s
[CV] END C=0.001, class_weight=None, max_iter=1000, penalty=l1, solver=lbfgs; total time=   0.1s
[CV] END C=0.001, class_weight=None, max_iter=1000, penalty=l1, solver=lbfgs; total time=   0.1s
[CV] END C=0.001, class_weight=None, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.7s
[CV] END C=0.001, class_weight=None, max_iter=1000, penalty=l1, solver=lbfgs; total time=   0.1s
[CV] END C=0.001, class_weight=None, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.7s
[CV] END C=0.001, class_weight=None, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.8s
[CV] END C=0.001, class_weight=None, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.7s
[CV] END C=0.001, class_weight=None, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.7s
[CV] END C=0.001, class_weight=None, max_iter

60 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self

Best parameters: {'C': 1, 'class_weight': 'balanced', 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
Best CV ROC-AUC: 0.7376801915943437


In [124]:
# test on validation set
best_model1 = grid.best_estimator_
y_val_pred = best_model1.predict(X_val_embed)
y_val_prob = best_model1.predict_proba(X_val_embed)[:, 1]

print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation ROC-AUC:", roc_auc_score(y_val, y_val_prob))
print(classification_report(y_val, y_val_pred))
print(confusion_matrix(y_val, y_val_pred))

Validation Accuracy: 0.6733440170940171
Validation ROC-AUC: 0.757705928087333
              precision    recall  f1-score   support

           0       0.85      0.62      0.72      2501
           1       0.51      0.79      0.62      1243

    accuracy                           0.67      3744
   macro avg       0.68      0.70      0.67      3744
weighted avg       0.74      0.67      0.68      3744

[[1543  958]
 [ 265  978]]


In [125]:
# test on test set
y_test_pred = best_model1.predict(X_test_embed)
y_test_prob = best_model1.predict_proba(X_test_embed)[:, 1]

print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test ROC-AUC:", roc_auc_score(y_test, y_test_prob))
print(classification_report(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

Test Accuracy: 0.6495726495726496
Test ROC-AUC: 0.734433065463893
              precision    recall  f1-score   support

           0       0.83      0.60      0.69      2502
           1       0.48      0.76      0.59      1242

    accuracy                           0.65      3744
   macro avg       0.66      0.68      0.64      3744
weighted avg       0.72      0.65      0.66      3744

[[1494 1008]
 [ 304  938]]


In [126]:
y_train_prob = best_model1.predict_proba(X_train_embed)[:, 1]
print("Train ROC-AUC:", roc_auc_score(y_train, y_train_prob))
print("Validation ROC-AUC:", roc_auc_score(y_val, y_val_prob))

Train ROC-AUC: 0.7497565553239478
Validation ROC-AUC: 0.757705928087333


AUC are very close, no overfitting

In [127]:
# get probabilities from model1
train_prob = best_model1.predict_proba(X_train_embed)[:, 1]
val_prob   = best_model1.predict_proba(X_val_embed)[:, 1]
test_prob  = best_model1.predict_proba(X_test_embed)[:, 1]

# combine probability with tabular features
X_train_model2 = np.hstack([train_prob.reshape(-1, 1), X_train_tabular])
X_val_model2   = np.hstack([val_prob.reshape(-1, 1), X_val_tabular])
X_test_model2  = np.hstack([test_prob.reshape(-1, 1), X_test_tabular])

model2 = logistic regression

In [128]:
# initialise model2 using logistic regression
model2_lr = LogisticRegression(max_iter=1000, class_weight='balanced')

# train on training set
model2_lr.fit(X_train_model2, y_train)

# predict on validation
y_val_pred = model2_lr.predict(X_val_model2)
y_val_prob = model2_lr.predict_proba(X_val_model2)[:, 1]

# evaluate
print("LR Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("LR Validation ROC-AUC:", roc_auc_score(y_val, y_val_prob))
print(classification_report(y_val, y_val_pred))
print(confusion_matrix(y_val, y_val_pred))

LR Validation Accuracy: 0.7769764957264957
LR Validation ROC-AUC: 0.8648955542481318
              precision    recall  f1-score   support

           0       0.89      0.76      0.82      2501
           1       0.63      0.81      0.71      1243

    accuracy                           0.78      3744
   macro avg       0.76      0.79      0.76      3744
weighted avg       0.80      0.78      0.78      3744

[[1901  600]
 [ 235 1008]]


In [129]:
# predict on test set
y_test_pred = model2_lr.predict(X_test_model2)
y_test_prob = model2_lr.predict_proba(X_test_model2)[:, 1]

print("LR Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("LR Test AUC:", roc_auc_score(y_test, y_test_prob))
print(classification_report(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

LR Test Accuracy: 0.7534722222222222
LR Test AUC: 0.8480696923942328
              precision    recall  f1-score   support

           0       0.87      0.74      0.80      2502
           1       0.60      0.78      0.68      1242

    accuracy                           0.75      3744
   macro avg       0.74      0.76      0.74      3744
weighted avg       0.78      0.75      0.76      3744

[[1847  655]
 [ 268  974]]


model2 = random forest 

In [130]:
# initialise model2 using random forest
model2_rf = RandomForestClassifier(n_estimators=200, max_depth=None, class_weight='balanced', random_state=42)
model2_rf.fit(X_train_model2, y_train)

# predict on validation set 
y_val_pred = model2_rf.predict(X_val_model2)
y_val_prob = model2_rf.predict_proba(X_val_model2)[:, 1]

print("RF Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("RF Validation ROC-AUC:", roc_auc_score(y_val, y_val_prob))
print(classification_report(y_val, y_val_pred))
print(confusion_matrix(y_val, y_val_pred))


RF Validation Accuracy: 0.8774038461538461
RF Validation ROC-AUC: 0.9363287669646543
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      2501
           1       0.88      0.73      0.80      1243

    accuracy                           0.88      3744
   macro avg       0.88      0.84      0.86      3744
weighted avg       0.88      0.88      0.87      3744

[[2374  127]
 [ 332  911]]


In [131]:
# predict on test set
y_test_pred = model2_rf.predict(X_test_model2)
y_test_prob = model2_rf.predict_proba(X_test_model2)[:, 1]

print("RF Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("RF Test ROC-AUC:", roc_auc_score(y_test, y_test_prob))
print(classification_report(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

RF Test Accuracy: 0.8664529914529915
RF Test ROC-AUC: 0.9290438824463778
              precision    recall  f1-score   support

           0       0.87      0.94      0.90      2502
           1       0.85      0.73      0.78      1242

    accuracy                           0.87      3744
   macro avg       0.86      0.83      0.84      3744
weighted avg       0.87      0.87      0.86      3744

[[2343  159]
 [ 341  901]]


model2 = decision tree

In [132]:
# initialise model2 using decision tree
model2_dt = DecisionTreeClassifier(max_depth=None, class_weight='balanced', random_state=42)
model2_dt.fit(X_train_model2, y_train)

# predict on validation set 
y_val_pred = model2_dt.predict(X_val_model2)
y_val_prob = model2_dt.predict_proba(X_val_model2)[:, 1]

print("DT Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("DT Validation ROC-AUC:", roc_auc_score(y_val, y_val_prob))
print(classification_report(y_val, y_val_pred))
print(confusion_matrix(y_val, y_val_pred))

DT Validation Accuracy: 0.8261217948717948
DT Validation ROC-AUC: 0.8006543159083912
              precision    recall  f1-score   support

           0       0.87      0.88      0.87      2501
           1       0.74      0.72      0.73      1243

    accuracy                           0.83      3744
   macro avg       0.80      0.80      0.80      3744
weighted avg       0.83      0.83      0.83      3744

[[2192  309]
 [ 342  901]]


In [133]:
# predict on test set 
y_test_pred = model2_dt.predict(X_test_model2)
y_test_prob = model2_dt.predict_proba(X_test_model2)[:, 1]

print("DT Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("DT Test ROC-AUC:", roc_auc_score(y_test, y_test_prob))
print(classification_report(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

DT Test Accuracy: 0.8215811965811965
DT Test ROC-AUC: 0.7951435952687126
              precision    recall  f1-score   support

           0       0.86      0.87      0.87      2502
           1       0.74      0.72      0.73      1242

    accuracy                           0.82      3744
   macro avg       0.80      0.80      0.80      3744
weighted avg       0.82      0.82      0.82      3744

[[2186  316]
 [ 352  890]]
