In [1]:
# %% [code]
# Install required packages (uncomment if needed)
# !pip install transformers tensorflow scikit-learn nltk

import os
import re
import json
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import TFDistilBertModel, DistilBertTokenizerFast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, classification_report)
import nltk
nltk.download('vader_lexicon', quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer

print("TensorFlow version:", tf.__version__)


TensorFlow version: 2.18.0


In [2]:
# %% [code]
# Load dataset
data = pd.read_csv('bot_detection_data[1].csv')
print("Dataset shape:", data.shape)
data.head()


Dataset shape: (50000, 11)


Unnamed: 0,User ID,Username,Tweet,Retweet Count,Mention Count,Follower Count,Verified,Bot Label,Location,Created At,Hashtags
0,132131,flong,Station activity person against natural majori...,85,1,2353,False,1,Adkinston,2020-05-11 15:29:50,
1,289683,hinesstephanie,Authority research natural life material staff...,55,5,9617,True,0,Sanderston,2022-11-26 05:18:10,both live
2,779715,roberttran,Manage whose quickly especially foot none to g...,6,2,4363,True,0,Harrisonfurt,2022-08-08 03:16:54,phone ahead
3,696168,pmason,Just cover eight opportunity strong policy which.,54,5,2242,True,1,Martinezberg,2021-08-14 22:27:05,ever quickly new I
4,704441,noah87,Animal sign six data good or.,26,3,8438,False,1,Camachoville,2020-04-13 21:24:21,foreign mention


In [3]:
# %% [code]
# Convert 'Verified' to numeric
data['Verified'] = data['Verified'].astype(int)

# Tweet Length: number of words
data['Tweet_Length'] = data['Tweet'].apply(lambda x: len(str(x).split()))

# Function to count hashtags
def count_hashtags(hashtag_str):
    if pd.isnull(hashtag_str) or hashtag_str.strip() == "":
        return 0
    hashtags = re.split('[, ]+', hashtag_str.strip())
    return len([tag for tag in hashtags if tag != ""])

data['Hashtag_Count'] = data['Hashtags'].apply(count_hashtags)

# Function to clean tweet text (basic cleaning)
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)   # remove URLs
    text = re.sub(r'@\w+', '', text)      # remove mentions
    text = re.sub(r'#\w+', '', text)      # remove hashtags (if desired)
    text = re.sub(r'[^\w\s]', '', text)   # remove punctuation
    text = re.sub(r'\d+', '', text)       # remove digits
    return text.strip()

data['Clean_Tweet'] = data['Tweet'].apply(clean_text)

# Sentiment analysis using VADER
sid = SentimentIntensityAnalyzer()
data['Sentiment'] = data['Clean_Tweet'].apply(lambda x: sid.polarity_scores(x)['compound'])

# Display sample engineered features
data[['Tweet', 'Clean_Tweet', 'Tweet_Length', 'Hashtag_Count', 'Sentiment']].head()


Unnamed: 0,Tweet,Clean_Tweet,Tweet_Length,Hashtag_Count,Sentiment
0,Station activity person against natural majori...,station activity person against natural majori...,12,0,0.3612
1,Authority research natural life material staff...,authority research natural life material staff...,10,2,0.6597
2,Manage whose quickly especially foot none to g...,manage whose quickly especially foot none to g...,10,2,0.0
3,Just cover eight opportunity strong policy which.,just cover eight opportunity strong policy which,7,4,0.7269
4,Animal sign six data good or.,animal sign six data good or,6,2,0.4404


In [4]:
# %% [code]
# Define feature names
text_column = 'Clean_Tweet'
numeric_features = ['Retweet Count', 'Mention Count', 'Follower Count',
                    'Verified', 'Tweet_Length', 'Hashtag_Count', 'Sentiment']
target = 'Bot Label'

# Create X and y
X_text = data[text_column].astype(str).values
X_numeric = data[numeric_features].values
y = data[target].values

# Split into train and test sets (stratified split)
X_text_train, X_text_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    X_text, X_numeric, y, test_size=0.2, random_state=42, stratify=y)

print("Train size:", len(y_train), "Test size:", len(y_test))


Train size: 40000 Test size: 10000


In [5]:
# %% [code]
scaler = StandardScaler()
X_num_train_scaled = scaler.fit_transform(X_num_train)
X_num_test_scaled = scaler.transform(X_num_test)

# Save the scaler for later use in production
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [6]:
# %% [code]
# Load DistilBERT tokenizer and set maximum sequence length
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
max_length = 64  # adjust as needed

def tokenize_texts(texts):
    return tokenizer(texts.tolist(),
                     padding='max_length',
                     truncation=True,
                     max_length=max_length,
                     return_tensors="tf")

# Tokenize training and testing texts
train_encodings = tokenize_texts(pd.Series(X_text_train))
test_encodings = tokenize_texts(pd.Series(X_text_test))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, Lambda
from tensorflow.keras.models import Model
from transformers import TFDistilBertModel

# Load the DistilBERT model (as before)
distilbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# (Optional) Freeze DistilBERT layers for initial training
for layer in distilbert_model.layers:
    layer.trainable = False

# Define the maximum sequence length used during tokenization
max_length = 64

# Define text input layers
input_ids = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

# Wrap the DistilBERT call inside a Lambda layer
def bert_layer(inputs):
    # Unpack inputs
    input_ids, attention_mask = inputs
    # Ensure the inputs are standard Tensors (cast if necessary)
    input_ids = tf.cast(input_ids, tf.int32)
    attention_mask = tf.cast(attention_mask, tf.int32)
    # Call DistilBERT (set training=False for inference mode)
    outputs = distilbert_model(input_ids, attention_mask=attention_mask, training=False)
    # Extract the [CLS] token representation (first token)
    cls_token = outputs.last_hidden_state[:, 0, :]
    return cls_token

# Use Lambda to wrap the call
cls_output = Lambda(bert_layer, name="bert_layer")([input_ids, attention_mask])

# Define numeric input layer (for engineered features)
numeric_features = ['Retweet Count', 'Mention Count', 'Follower Count',
                    'Verified', 'Tweet_Length', 'Hashtag_Count', 'Sentiment']
numeric_input = Input(shape=(len(numeric_features),), dtype=tf.float32, name="numeric_input")

# Combine the text representation and numeric features
combined = Concatenate()([cls_output, numeric_input])
x = Dense(128, activation='relu')(combined)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)
output = Dense(1, activation='sigmoid')(x)

# Build and compile the model
model = Model(inputs=[input_ids, attention_mask, numeric_input], outputs=output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [9]:
# %% [code]
# Prepare training and testing inputs as dictionaries
train_inputs = {
    "input_ids": train_encodings['input_ids'],
    "attention_mask": train_encodings['attention_mask'],
    "numeric_input": X_num_train_scaled
}

test_inputs = {
    "input_ids": test_encodings['input_ids'],
    "attention_mask": test_encodings['attention_mask'],
    "numeric_input": X_num_test_scaled
}

# Train the model
history = model.fit(
    train_inputs,
    y_train,
    validation_data=(test_inputs, y_test),
    epochs=1,  # adjust epochs based on dataset size and convergence
    batch_size=4
)


[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 517ms/step - accuracy: 0.5014 - loss: 0.7004

KeyboardInterrupt: 

In [10]:
# %% [code]
# Predict on the test set
y_pred_prob = model.predict(test_inputs).ravel()
y_pred = (y_pred_prob > 0.5).astype(int)

# Compute evaluation metrics
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_prob)

print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1 Score:", f1)
print("AUC-ROC:", auc)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1172s[0m 4s/step
Accuracy: 0.4975
Precision: 0.49296718017414604
Recall: 0.14708233413269384
F1 Score: 0.2265661074342004
AUC-ROC: 0.4954100570624365

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.85      0.63      4996
           1       0.49      0.15      0.23      5004

    accuracy                           0.50     10000
   macro avg       0.50      0.50      0.43     10000
weighted avg       0.50      0.50      0.43     10000



In [11]:
# %% [code]
# Save the Keras model
model.save('improved_bot_detection_model.h5')

# Save the tokenizer for later use
tokenizer.save_pretrained('./tokenizer')




('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')