# Load the dataset

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack


file_path = './twitter_replies_data.csv'  # Update if needed
df = pd.read_csv(file_path)



Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Accuracy: 0.95
Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.97        28
           1       1.00      0.78      0.88         9

    accuracy                           0.95        37
   macro avg       0.97      0.89      0.92        37
weighted avg       0.95      0.95      0.94        37



# Pre-processing

In [None]:
# Check for missing values
df.dropna(inplace=True)

# Define features and target variable
text_feature = df['clean_text']  # Text-based feature
y = df['is_bot?']  # Target (0 = human, 1 = bot)

# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object']).columns.difference(['clean_text'])
numeric_cols = df.select_dtypes(include=['number']).columns

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))  # Convert all categories to numbers
    label_encoders[col] = le

# Extract numeric and encoded categorical features
numeric_features = df[numeric_cols.union(categorical_cols)]

# Convert text data to numerical format using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to 5000 features for efficiency
X_tfidf = vectorizer.fit_transform(text_feature)

# Standardize numeric features
scaler = StandardScaler()
X_numeric = scaler.fit_transform(numeric_features)

# Combine text and numeric features
X_combined = hstack([X_tfidf, X_numeric])



# Training the Model

In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Define a Random Forest model with hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Get the best model
best_rf = grid_search.best_estimator_




# Evaluate the model

In [None]:
# Make predictions
y_pred = best_rf.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n', report)