<a href="https://colab.research.google.com/github/asmi1711/Bot-Detection-on-X/blob/main/Bot_Detection_on_X.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Data Preprocessing

Dataset Analysis

Target Variable: account_type (human → 0, bot → 1)

Numerical Features: favourites_count, followers_count, friends_count, statuses_count, average_tweets_per_day, account_age_days

Boolean Features (Converted to Numeric 0/1): default_profile, default_profile_image, geo_enabled, verified

Text Features (would require vectorization; dropped in this notebook): description, screen_name, location

Columns to Drop: Unnamed: 0, created_at, id, profile_image_url, profile_background_image_url, lang (not relevant for prediction), plus the text features description, screen_name, location

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Load the raw dataset (path as uploaded to the Colab session)
file_path = "/content/twitter_human_bots_dataset (2).csv"
df = pd.read_csv(file_path)

# Drop identifiers, URLs and free-text columns. The text fields
# (screen_name, location, description) would need vectorization to be
# usable, so they are left out here.
columns_to_drop = [
    "Unnamed: 0", "created_at", "id", "screen_name", "profile_image_url",
    "profile_background_image_url", "location", "description", "lang"
]
df_cleaned = df.drop(columns=columns_to_drop)

# Encode categorical target variable: human -> 0, bot -> 1
label_map = {"human": 0, "bot": 1}
encoded_target = df_cleaned["account_type"].map(label_map)

# .map() silently turns any label outside the mapping (or a missing label)
# into NaN, which would otherwise only surface later as an opaque error
# inside SMOTE. Fail early with the offending values instead.
unmapped_labels = df_cleaned.loc[encoded_target.isna(), "account_type"]
if not unmapped_labels.empty:
    raise ValueError(
        "Unexpected account_type values: "
        f"{sorted(unmapped_labels.astype(str).unique())}"
    )
df_cleaned["account_type"] = encoded_target.astype(int)

# Separate features and target
X = df_cleaned.drop(columns=["account_type"])
y = df_cleaned["account_type"]

# Make boolean flags explicitly numeric (0/1) rather than relying on the
# scaler's implicit bool -> float coercion.
bool_columns = X.select_dtypes(include="bool").columns
X = X.astype({column: int for column in bool_columns})

# NOTE(review): the scaler and SMOTE below are fitted on the full dataset,
# and the model cells split the saved CSV only afterwards. Their validation
# and test sets therefore contain synthetic samples and share scaling
# statistics with the training rows, so reported scores are likely
# optimistic. Consider splitting before scaling/resampling.

# Standardize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Convert back to DataFrame (the scaler returns a bare array, so the column
# names are restored from X)
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
y_resampled_df = pd.DataFrame(y_resampled, columns=["account_type"])

# Concatenate balanced dataset: features first, target as the last column
balanced_df = pd.concat([X_resampled_df, y_resampled_df], axis=1)

# Save the balanced dataset; the model cells read this file back
balanced_df.to_csv("balanced_twitter_dataset.csv", index=False)

print("SMOTE applied successfully. Balanced dataset saved as 'balanced_twitter_dataset.csv'.")


#XGBoost

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the balanced dataset written by the preprocessing cell
file_path = "/content/balanced_twitter_dataset.csv"
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop(columns=["account_type"])
y = df["account_type"]

# Split data into training, validation, and testing sets.
# First hold out 30% for testing, then take 33% of the remainder for
# validation (roughly 47% train / 23% validation / 30% test overall).
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.33, random_state=42, stratify=y_train_temp
)

# Initialize the XGBoost model.
# `use_label_encoder` is intentionally not passed: it is deprecated and
# ignored by current XGBoost releases, where it only triggers an
# "unused parameter" warning. The target is already integer-encoded.
xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    eval_metric='logloss'
)

# Fit the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_val = xgb_model.predict(X_val)
y_pred_test = xgb_model.predict(X_test)

# Evaluate the model (score() returns mean accuracy)
train_acc = xgb_model.score(X_train, y_train)
val_acc = xgb_model.score(X_val, y_val)
test_acc = xgb_model.score(X_test, y_test)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

print("\nClassification Report (Validation):")
print(classification_report(y_val, y_pred_val))
print("\nConfusion Matrix (Validation):")
print(confusion_matrix(y_val, y_pred_val))

print("\nClassification Report (Test):")
print(classification_report(y_test, y_pred_test))
print("\nConfusion Matrix (Test):")
print(confusion_matrix(y_test, y_pred_test))

# Feature Importance Visualization
feature_importances = xgb_model.feature_importances_

# Plotting the bar graph. The figure has to be wide enough for the feature
# names on the y-axis; a 2x2 inch canvas makes the labels unreadable.
plt.figure(figsize=(8, 5))
plt.barh(X.columns, feature_importances, color='black')
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('Feature Importance from XGBoost Model')
plt.gca().invert_yaxis()  # First feature at the top
plt.tight_layout()
plt.show()


#Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# This cell uses `pd` but does not import it in its own import block;
# importing here keeps the cell runnable on its own in a fresh session.
import pandas as pd

# Load the balanced dataset written by the preprocessing cell
file_path = "/content/balanced_twitter_dataset.csv"
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop(columns=["account_type"])
y = df["account_type"]

# Split data into training, validation, and testing sets.
# Uses the same scheme as the other model cells (hold out 30% for testing,
# then 33% of the remainder for validation, same random_state) so that all
# models are trained and evaluated on identical partitions and their scores
# are directly comparable.
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.33, random_state=42, stratify=y_train_temp
)

# Random Forest Model
rf_model = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_val = rf_model.predict(X_val)     # Validation predictions
y_pred_test = rf_model.predict(X_test)   # Test predictions

# Evaluate the model (score() returns mean accuracy)
train_acc = rf_model.score(X_train, y_train)
val_acc = rf_model.score(X_val, y_val)
test_acc = rf_model.score(X_test, y_test)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

# Classification reports
print("\nClassification Report (Validation):")
print(classification_report(y_val, y_pred_val))

print("\nClassification Report (Test):")
print(classification_report(y_test, y_pred_test))

# Confusion matrices
print("\nConfusion Matrix (Validation):")
print(confusion_matrix(y_val, y_pred_val))

print("\nConfusion Matrix (Test):")
print(confusion_matrix(y_test, y_pred_test))

# Feature Importance: one value per input column, in column order
feature_importances = rf_model.feature_importances_
print("\nFeature Importances:")
for feature_name, importance in zip(X.columns, feature_importances):
    print(f"{feature_name}: {importance:.4f}")


#KNN

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the balanced dataset written by the preprocessing cell
file_path = "/content/balanced_twitter_dataset.csv"
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop(columns=["account_type"])
y = df["account_type"]

# Split data into training, validation, and testing sets
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)  # Initial split: 30% held out for testing

X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.33, random_state=42, stratify=y_train_temp
)  # Split train_temp into train and validation

# Initialize the KNN model
# n_neighbors = 5 is the default value (you can tune this)
knn_model = KNeighborsClassifier(n_neighbors=5)

# Fit the model (on the training set)
knn_model.fit(X_train, y_train)

# Make predictions (on the validation set)
y_pred_val = knn_model.predict(X_val)

# Evaluate the model (on the validation set).
# score() is accuracy_score over a fresh predict() call, and KNN inference is
# the expensive step, so the accuracy is derived from the predictions that
# already exist instead of predicting the same rows a second time.
accuracy_val = accuracy_score(y_val, y_pred_val)
train_acc = knn_model.score(X_train, y_train)  # Training accuracy
val_acc = accuracy_val                         # Validation accuracy

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"KNN Model Accuracy (Validation): {accuracy_val:.4f}")
print("\nClassification Report (KNN - Validation):")
print(classification_report(y_val, y_pred_val))
print("\nConfusion Matrix (KNN - Validation):")
print(confusion_matrix(y_val, y_pred_val))


# Make predictions (on the test set)
y_pred_test = knn_model.predict(X_test)

# Evaluate the model (on the test set), again reusing the predictions
accuracy_test = accuracy_score(y_test, y_pred_test)
test_acc = accuracy_test  # Test accuracy

print(f"\nTest Accuracy: {test_acc:.4f}")
print(f"KNN Model Accuracy (Test): {accuracy_test:.4f}")
print("\nClassification Report (KNN - Test):")
print(classification_report(y_test, y_pred_test))
print("\nConfusion Matrix (KNN - Test):")
print(confusion_matrix(y_test, y_pred_test))


#Logistic Regression

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# Load the balanced dataset written by the preprocessing cell
file_path = "/content/balanced_twitter_dataset.csv"
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop(columns=["account_type"])
y = df["account_type"]

# Split data into training, validation, and testing sets
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.33, random_state=42, stratify=y_train_temp
)

# Create a pipeline with scaling, feature selection, and Logistic Regression.
# Keeping the scaler inside the pipeline means it is re-fitted on the
# training portion of every cross-validation fold.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selector', SelectKBest(score_func=f_classif, k='all')),  # Use all features
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Hyperparameter grid for tuning
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'classifier__penalty': ['l1', 'l2'],  # Regularization type
    'classifier__solver': ['liblinear'],  # Supports both l1 and l2 penalties
}

# Grid search with cross-validation
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit the model
print("Training Logistic Regression with hyperparameter tuning...")
grid_search.fit(X_train, y_train)

# Best model from grid search (refitted on the whole training set)
best_lr = grid_search.best_estimator_

# Make predictions
y_pred_val = best_lr.predict(X_val)
y_pred_test = best_lr.predict(X_test)

# Probability estimates for ROC curve (column 1 = probability of class 1)
y_proba_val = best_lr.predict_proba(X_val)[:, 1]
y_proba_test = best_lr.predict_proba(X_test)[:, 1]

# Evaluate the model
train_acc = best_lr.score(X_train, y_train)
val_acc = best_lr.score(X_val, y_val)
test_acc = best_lr.score(X_test, y_test)

print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

# AUC-ROC Score (the test value is reused for the ROC plot legend below)
val_auc = roc_auc_score(y_val, y_proba_val)
test_auc = roc_auc_score(y_test, y_proba_test)
print(f"\nValidation AUC-ROC: {val_auc:.4f}")
print(f"Test AUC-ROC: {test_auc:.4f}")

print("\nClassification Report (Validation):")
print(classification_report(y_val, y_pred_val))
print("\nConfusion Matrix (Validation):")
print(confusion_matrix(y_val, y_pred_val))

print("\nClassification Report (Test):")
print(classification_report(y_test, y_pred_test))
print("\nConfusion Matrix (Test):")
print(confusion_matrix(y_test, y_pred_test))

# Feature Importance (coefficients)
feature_importance = best_lr.named_steps['classifier'].coef_[0]
# Take the names from the selector's support mask so they stay aligned with
# the coefficients even if SelectKBest is later configured to drop features.
selected_mask = best_lr.named_steps['feature_selector'].get_support()
feature_names = X.columns[selected_mask]

# Create DataFrame for visualization, ordered by coefficient magnitude
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': feature_importance
}).sort_values('Coefficient', key=abs, ascending=False)

# Plot top 15 important features. The bars show the signed coefficients
# (direction of the effect); only the ordering uses the absolute value.
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'][:15], importance_df['Coefficient'][:15], color='skyblue')
plt.xlabel('Coefficient Value (signed; sorted by absolute magnitude)')
plt.title('Top 15 Important Features (Logistic Regression)')
plt.gca().invert_yaxis()
plt.show()

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_proba_test)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'Logistic Regression (AUC = {test_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # Chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

#CNN

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

# Load the balanced dataset written by the preprocessing cell
file_path = "/content/balanced_twitter_dataset.csv"
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop(columns=["account_type"])
y = df["account_type"]

# Encode labels (if not already numerical)
le = LabelEncoder()
y = le.fit_transform(y)  # 0 for human, 1 for bot

# Split data into training, validation, and testing sets BEFORE scaling, so
# the scaler never sees validation or test rows.
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.33, random_state=42, stratify=y_train_temp
)

# Normalize features: fit on the training split only, then apply the same
# transformation to the validation and test splits (avoids leaking their
# statistics into training).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Reshape data for CNN (samples, timesteps, features)
# We'll treat each feature as a timestep in 1D convolution, with one channel
X_train = X_train[..., np.newaxis]
X_val = X_val[..., np.newaxis]
X_test = X_test[..., np.newaxis]

# Convert labels to one-hot encoding (required by categorical_crossentropy)
y_train_cat = to_categorical(y_train)
y_val_cat = to_categorical(y_val)
y_test_cat = to_categorical(y_test)

# CNN Model Architecture: two Conv1D/BatchNorm/MaxPool/Dropout stages
# followed by a small dense head.
model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu',
           input_shape=(X_train.shape[1], X_train.shape[2])),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    Conv1D(filters=128, kernel_size=3, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(2, activation='softmax')  # 2 output classes
])

optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Early stopping to prevent overfitting: stop after 10 epochs without
# val_loss improvement and roll back to the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train_cat,
                    epochs=100,
                    batch_size=64,
                    validation_data=(X_val, y_val_cat),
                    callbacks=[early_stop],
                    verbose=1)

# Evaluate the model
train_loss, train_acc = model.evaluate(X_train, y_train_cat, verbose=0)
val_loss, val_acc = model.evaluate(X_val, y_val_cat, verbose=0)
test_loss, test_acc = model.evaluate(X_test, y_test_cat, verbose=0)

print(f"\nTrain Accuracy: {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

# Make predictions (argmax over the two softmax outputs -> class index)
y_pred_val = np.argmax(model.predict(X_val), axis=1)
y_pred_test = np.argmax(model.predict(X_test), axis=1)

print("\nClassification Report (Validation):")
print(classification_report(y_val, y_pred_val))
print("\nConfusion Matrix (Validation):")
print(confusion_matrix(y_val, y_pred_val))

print("\nClassification Report (Test):")
print(classification_report(y_test, y_pred_test))
print("\nConfusion Matrix (Test):")
print(confusion_matrix(y_test, y_pred_test))

# Plot training history: accuracy on the left, loss on the right
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()

#LSTM

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

# Load the balanced dataset written by the preprocessing cell
file_path = "/content/balanced_twitter_dataset.csv"
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop(columns=["account_type"])
y = df["account_type"]

# Encode labels (if not already numerical)
le = LabelEncoder()
y = le.fit_transform(y)  # 0 for human, 1 for bot (or vice versa)

# Split data into training, validation, and testing sets BEFORE scaling, so
# the scaler never sees validation or test rows.
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.33, random_state=42, stratify=y_train_temp
)

# Normalize features: fit on the training split only, then apply the same
# transformation to the validation and test splits (avoids leaking their
# statistics into training).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Reshape data for LSTM (samples, timesteps, features)
# Since we don't have natural sequences, each row becomes a sequence with a
# single timestep that carries all features at once.
X_train = X_train[:, np.newaxis, :]
X_val = X_val[:, np.newaxis, :]
X_test = X_test[:, np.newaxis, :]

# Convert labels to one-hot encoding (required by categorical_crossentropy)
y_train_cat = to_categorical(y_train)
y_val_cat = to_categorical(y_val)
y_test_cat = to_categorical(y_test)

# LSTM Model Architecture
model = Sequential([
    LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2])),  # Single timestep, many features
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(2, activation='softmax')  # 2 output classes
])

optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Early stopping to prevent overfitting: stop after 10 epochs without
# val_loss improvement and roll back to the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train_cat,
                    epochs=100,
                    batch_size=64,
                    validation_data=(X_val, y_val_cat),
                    callbacks=[early_stop],
                    verbose=1)

# Evaluate the model
train_loss, train_acc = model.evaluate(X_train, y_train_cat, verbose=0)
val_loss, val_acc = model.evaluate(X_val, y_val_cat, verbose=0)
test_loss, test_acc = model.evaluate(X_test, y_test_cat, verbose=0)

print(f"\nTrain Accuracy: {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

# Make predictions (argmax over the two softmax outputs -> class index)
y_pred_val = np.argmax(model.predict(X_val), axis=1)
y_pred_test = np.argmax(model.predict(X_test), axis=1)

print("\nClassification Report (Validation):")
print(classification_report(y_val, y_pred_val))
print("\nConfusion Matrix (Validation):")
print(confusion_matrix(y_val, y_pred_val))

print("\nClassification Report (Test):")
print(classification_report(y_test, y_pred_test))
print("\nConfusion Matrix (Test):")
print(confusion_matrix(y_test, y_pred_test))

# Plot training history: accuracy on the left, loss on the right
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()