In [4]:
# %% [code]

# !pip install transformers tensorflow scikit-learn nltk

import os
import re
import json
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import TFDistilBertModel, DistilBertTokenizerFast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, classification_report)
import nltk
nltk.download('vader_lexicon', quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Record the TensorFlow version in the cell output so the run can be matched to an environment.
print("TensorFlow version:", tf.__version__)


TensorFlow version: 2.18.0


In [5]:
# %% [code]

# Load the raw dataset from a CSV file expected in the current working directory.
data = pd.read_csv('bot_detection_data.csv')
print("Dataset shape:", data.shape)
# Last expression of the cell: renders the first rows as the cell output.
data.head()


Dataset shape: (50000, 11)


Unnamed: 0,User ID,Username,Tweet,Retweet Count,Mention Count,Follower Count,Verified,Bot Label,Location,Created At,Hashtags
0,132131,flong,Station activity person against natural majori...,85,1,2353,False,1,Adkinston,2020-05-11 15:29:50,
1,289683,hinesstephanie,Authority research natural life material staff...,55,5,9617,True,0,Sanderston,2022-11-26 05:18:10,both live
2,779715,roberttran,Manage whose quickly especially foot none to g...,6,2,4363,True,0,Harrisonfurt,2022-08-08 03:16:54,phone ahead
3,696168,pmason,Just cover eight opportunity strong policy which.,54,5,2242,True,1,Martinezberg,2021-08-14 22:27:05,ever quickly new I
4,704441,noah87,Animal sign six data good or.,26,3,8438,False,1,Camachoville,2020-04-13 21:24:21,foreign mention


In [6]:
# %% [code]

# Cast the 'Verified' flag to an integer so it can be used as a numeric feature.
data['Verified'] = data['Verified'].astype(int)


# Tweet length is measured in whitespace-separated tokens of the raw tweet, not characters.
# NOTE(review): str(x) turns a missing tweet (NaN) into the literal "nan", which counts as one token.
data['Tweet_Length'] = data['Tweet'].apply(lambda x: len(str(x).split()))

def count_hashtags(hashtag_str):
    """Return how many hashtags a raw 'Hashtags' cell contains.

    Tags are taken to be separated by commas and/or spaces. Missing values
    (None/NaN) and blank strings yield zero.

    Args:
        hashtag_str: Raw cell value; a string or a missing-value marker.

    Returns:
        int: Number of non-empty tags found.
    """
    if pd.isnull(hashtag_str):
        return 0

    trimmed = hashtag_str.strip()
    if not trimmed:
        return 0

    # A run of commas/spaces acts as a single separator; empty pieces
    # (e.g. produced by a leading comma) are not counted.
    return sum(1 for piece in re.split('[, ]+', trimmed) if piece)

# Derive a per-row hashtag count from the raw 'Hashtags' column (missing cells count as 0).
data['Hashtag_Count'] = data['Hashtags'].apply(count_hashtags)

def clean_text(text):
    """Normalise a raw tweet into lowercase, punctuation-free plain text.

    Processing steps, in order: lowercase, drop URLs, drop @mentions, drop
    #hashtags, drop every character that is neither a word character nor
    whitespace, drop digits, then collapse whitespace.

    Args:
        text: Raw tweet. Non-string values are converted with ``str``;
            missing values (None/NaN) are treated as an empty tweet.

    Returns:
        str: Cleaned text with single spaces between words and no
        leading/trailing whitespace. Empty string for missing input.
    """
    # Without this guard str(NaN) would inject the literal word "nan" into the text.
    if pd.isnull(text):
        return ""

    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)   # URLs
    text = re.sub(r'@\w+', '', text)      # @mentions
    text = re.sub(r'#\w+', '', text)      # inline #hashtags
    text = re.sub(r'[^\w\s]', '', text)   # punctuation / symbols
    text = re.sub(r'\d+', '', text)       # digits

    # The removals above leave runs of spaces where tokens used to be;
    # squeeze them so downstream word counts and tokenisation see clean text.
    return re.sub(r'\s+', ' ', text).strip()

# Normalised tweet text; this column is what the tokenizer and the sentiment scorer consume.
data['Clean_Tweet'] = data['Tweet'].apply(clean_text)

# Score each cleaned tweet with NLTK's VADER analyser and keep only the 'compound' value.
# NOTE(review): VADER is documented to use capitalisation and punctuation as intensity cues;
# scoring the cleaned text discards both - confirm this is intended.
sid = SentimentIntensityAnalyzer()
data['Sentiment'] = data['Clean_Tweet'].apply(lambda x: sid.polarity_scores(x)['compound'])

# Last expression of the cell: preview the engineered columns next to the raw tweet.
data[['Tweet', 'Clean_Tweet', 'Tweet_Length', 'Hashtag_Count', 'Sentiment']].head()


Unnamed: 0,Tweet,Clean_Tweet,Tweet_Length,Hashtag_Count,Sentiment
0,Station activity person against natural majori...,station activity person against natural majori...,12,0,0.3612
1,Authority research natural life material staff...,authority research natural life material staff...,10,2,0.6597
2,Manage whose quickly especially foot none to g...,manage whose quickly especially foot none to g...,10,2,0.0
3,Just cover eight opportunity strong policy which.,just cover eight opportunity strong policy which,7,4,0.7269
4,Animal sign six data good or.,animal sign six data good or,6,2,0.4404


In [7]:
# %% [code]

# Column roles used for modelling: one text column, seven numeric columns, one binary label.
text_column = 'Clean_Tweet'
numeric_features = ['Retweet Count', 'Mention Count', 'Follower Count',
                    'Verified', 'Tweet_Length', 'Hashtag_Count', 'Sentiment']
target = 'Bot Label'

# Pull plain arrays out of the frame so the three views can be split together.
X_text = data[text_column].astype(str).values
X_numeric = data[numeric_features].values
y = data[target].values

# A single call keeps text, numeric features and labels row-aligned. 20% of the rows are
# held out, stratified on the label, with a fixed random_state so the split is repeatable.
X_text_train, X_text_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    X_text, X_numeric, y, test_size=0.2, random_state=42, stratify=y)

print("Train size:", len(y_train), "Test size:", len(y_test))


Train size: 40000 Test size: 10000


In [8]:
# %% [code]
# Fit the scaler on the training split only, then reuse the fitted statistics for the
# test split so no information from the held-out rows enters the transform.
scaler = StandardScaler()
X_num_train_scaled = scaler.fit_transform(X_num_train)
X_num_test_scaled = scaler.transform(X_num_test)

# Persist the fitted scaler to disk. As the last expression of the cell, the value
# returned by joblib.dump is shown as the cell output.
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [9]:
# %% [code]
max_length = 64
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


def tokenize_texts(texts):
    """Tokenise a collection of tweets into fixed-width TensorFlow tensors.

    Args:
        texts: Object exposing ``.tolist()`` that yields the tweet strings
            (the callers below pass a pandas Series).

    Returns:
        The tokenizer's encoding for the batch, with every sequence padded
        or truncated to ``max_length`` tokens and returned as TF tensors.
    """
    sentences = texts.tolist()
    encoded = tokenizer(
        sentences,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )
    return encoded


# Encode the train split first, then the test split, with identical settings.
train_encodings, test_encodings = (
    tokenize_texts(pd.Series(split)) for split in (X_text_train, X_text_test)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [10]:
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, Lambda
from tensorflow.keras.models import Model
from transformers import TFDistilBertModel
distilbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# DistilBERT is used purely as a frozen feature extractor; only the dense head
# defined further down is meant to be trained.
for layer in distilbert_model.layers:
    layer.trainable = False

# Seed the Python, NumPy and TensorFlow RNGs so that the head's weight
# initialisation, the dropout masks and fit()'s shuffling are repeatable
# under Restart & Run All. 42 matches the random_state of the data split.
tf.keras.utils.set_random_seed(42)

# `max_length` (tokenizer cell) and `numeric_features` (split cell) are reused
# here rather than re-declared, so the model's input shapes cannot drift away
# from the tokenised / scaled data that is fed to it.
input_ids = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")


def bert_layer(inputs):
    """Run the frozen DistilBERT encoder and return the first-token embedding.

    Args:
        inputs: Pair ``(input_ids, attention_mask)`` of integer tensors.

    Returns:
        Tensor holding, for each row, the last hidden state at sequence
        position 0 (the slot the tokenizer fills with its leading special
        token).
    """
    token_ids, token_mask = inputs

    # Lambda may hand over tensors of another integer dtype; normalise to int32.
    token_ids = tf.cast(token_ids, tf.int32)
    token_mask = tf.cast(token_mask, tf.int32)

    # training=False keeps the encoder's internal dropout disabled.
    encoded = distilbert_model(token_ids, attention_mask=token_mask, training=False)
    return encoded.last_hidden_state[:, 0, :]


# NOTE(review): the encoder is called through a Lambda layer, so it is captured
# by closure rather than registered as a sub-layer of `model` - confirm this is
# acceptable for saving/reloading the model.
cls_output = Lambda(bert_layer, name="bert_layer")([input_ids, attention_mask])

numeric_input = Input(shape=(len(numeric_features),), dtype=tf.float32, name="numeric_input")

# Classification head: sentence embedding + scaled numeric features -> bot probability.
combined = Concatenate()([cls_output, numeric_input])
x = Dense(128, activation='relu')(combined)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[input_ids, attention_mask, numeric_input], outputs=output)
# NOTE(review): 2e-5 is a rate commonly used when fine-tuning transformer
# weights; here the encoder is frozen and only the freshly initialised head
# learns, and the recorded run ends at chance-level accuracy - consider
# revisiting this value.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [11]:
# %% [code]

def _build_model_inputs(encodings, numeric_block):
    """Bundle tokenizer output and scaled numeric features under the model's input names."""
    return {
        "input_ids": encodings['input_ids'],
        "attention_mask": encodings['attention_mask'],
        "numeric_input": numeric_block,
    }


train_inputs = _build_model_inputs(train_encodings, X_num_train_scaled)
test_inputs = _build_model_inputs(test_encodings, X_num_test_scaled)

# Train the model for a single pass over the training split.
# NOTE(review): the held-out test split doubles as validation data here.
history = model.fit(
    train_inputs,
    y_train,
    batch_size=10,
    epochs=1,
    validation_data=(test_inputs, y_test),
)


[1m4000/4000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 23ms/step - accuracy: 0.4992 - loss: 0.6995 - val_accuracy: 0.4982 - val_loss: 0.6936


In [12]:
# %% [code]

# Predicted positive-class probabilities, flattened to 1-D, then thresholded at 0.5.
y_pred_prob = model.predict(test_inputs).ravel()
y_pred = (y_pred_prob > 0.5).astype(int)

# Metrics based on hard labels are computed in one sweep; AUC needs the raw probabilities.
acc, prec, rec, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
auc = roc_auc_score(y_test, y_pred_prob)

for caption, score in (
    ("Accuracy:", acc),
    ("Precision:", prec),
    ("Recall:", rec),
    ("F1 Score:", f1),
    ("AUC-ROC:", auc),
):
    print(caption, score)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 63ms/step
Accuracy: 0.4982
Precision: 0.4989024772656005
Recall: 0.6358912869704236
F1 Score: 0.5591284484273414
AUC-ROC: 0.4990585793974909

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.36      0.42      4996
           1       0.50      0.64      0.56      5004

    accuracy                           0.50     10000
   macro avg       0.50      0.50      0.49     10000
weighted avg       0.50      0.50      0.49     10000



In [13]:
# %% [code]

# Persist the trained Keras model to an HDF5 (.h5) file in the working directory.
# NOTE(review): the model contains a Lambda layer that closes over the module-level
# `distilbert_model`; confirm this file can be reloaded in a fresh process and that
# the encoder weights are available there.
model.save('improved_bot_detection_model.h5')

# Save the tokenizer files to ./tokenizer. As the last expression of the cell,
# the tuple of written file paths is shown as the cell output.
tokenizer.save_pretrained('./tokenizer')




('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')