In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

### Loading combined dataset - genuine and bot accounts

In [None]:
# Persist the combined genuine + bot accounts table to disk, without the index
# column, so it can be reloaded later. Adjust the destination path if necessary.
combined_data.to_csv(path_or_buf="combined_users_full.csv", index=False)

In [19]:
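# Only use this step if you are loading the full dataset directly from disk.
# NOTE(review): the save cell in this notebook writes "combined_users_full.csv",
# whereas the line below reads "combined_data_full.csv" - confirm which filename
# is intended before uncommenting.
# combined_data = pd.read_csv("combined_data_full.csv")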

In [28]:
# Inspect the geo_enabled column on its own to see how sparsely it is populated.
combined_data.loc[:, "geo_enabled"]

0        NaN
1        1.0
2        NaN
3        1.0
4        NaN
        ... 
11012    NaN
11013    NaN
11014    NaN
11015    NaN
11016    NaN
Name: geo_enabled, Length: 11017, dtype: float64

In [20]:
# Preview the first five rows of the combined accounts table.
combined_data.head(n=5)

Unnamed: 0,id,name,screen_name,statuses_count,followers_count,friends_count,favourites_count,listed_count,url,lang,...,description,contributors_enabled,following,created_at,timestamp,crawled_at,updated,test_set_1,test_set_2,Type
0,531256710,Christel Martillo,Martillodig,12515,787,1947,1,0,,en,...,Lover of life - need I say more!,,,Tue Mar 20 12:07:58 +0000 2012,2012-03-20 13:07:58,2014-05-05 23:23:09,2016-03-15 15:39:59,,1.0,Bot
1,72081097,TMJ-ON CstSrv Jobs,tmj_on_cstsrv,117,300,247,0,30,https://t.co/DByWt45HZj,en,...,Follow this account for geo-targeted Customer ...,,,Sun Sep 06 17:29:17 +0000 2009,2009-09-06 19:29:17,2016-03-15 13:49:10,2016-03-15 13:49:10,,,Bot
2,327060670,Federico Floria,Airolf,25930,10854,10606,418,66,http://t.co/0mQHlgxEmD,it,...,"Mika e' la mia vita, Supernatural la mia passi...",,,Thu Jun 30 22:59:43 +0000 2011,2011-07-01 00:59:43,2014-04-27 23:20:27,2016-03-15 14:13:49,1.0,,Bot
3,2398910547,Southern Fields,southern_fields,730,1132,914,557,1,,en,...,"Just a plain ol' country boy. I love God, Geor...",,,Mon Mar 10 07:48:09 +0000 2014,2014-03-10 08:48:09,2014-05-02 23:22:50,2016-03-15 14:16:12,1.0,,Bot
4,69218476,Vanessa,Vanessa8w3v,1,153,568,0,0,http://xurl.jp/pc3a,,...,,,,1251353539000L,2009-08-27 08:12:19,2010-11-07 11:10:52,2016-03-14 17:08:43,,,Bot


In [21]:
# Summarise every column's dtype and non-null count; used to spot columns that
# are too sparsely populated to be useful as features.
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11017 entries, 0 to 11016
Data columns (total 43 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   id                                  11017 non-null  int64  
 1   name                                11016 non-null  object 
 2   screen_name                         11017 non-null  object 
 3   statuses_count                      11017 non-null  int64  
 4   followers_count                     11017 non-null  int64  
 5   friends_count                       11017 non-null  int64  
 6   favourites_count                    11017 non-null  int64  
 7   listed_count                        11017 non-null  int64  
 8   url                                 3508 non-null   object 
 9   lang                                10017 non-null  object 
 10  time_zone                           5015 non-null   object 
 11  location                            4908 

### Data preprocessing and encoding

In [None]:
# Columns excluded from modelling because they are unlikely to be useful or
# have too many missing values. Only the activity counts, the free-text
# 'description' and the 'Type' label remain after this step.
columns_to_drop = [
    # account identity, links and locale
    "id", "name", "screen_name", "url", "lang", "time_zone", "location",
    # default-profile flags and geo setting
    "default_profile", "default_profile_image", "geo_enabled",
    # profile appearance (images, banners, colours)
    "profile_image_url", "profile_banner_url", "profile_use_background_image",
    "profile_background_image_url_https", "profile_text_color",
    "profile_image_url_https", "profile_sidebar_border_color",
    "profile_background_tile", "profile_sidebar_fill_color",
    "profile_background_image_url", "profile_background_color",
    "profile_link_color",
    # account settings and relationship flags
    "utc_offset", "is_translator", "follow_request_sent", "protected",
    "verified", "notifications", "contributors_enabled", "following",
    # timestamps
    "created_at", "timestamp", "crawled_at", "updated",
    # source test-set membership markers
    "test_set_1", "test_set_2",
]

# drop() returns a new frame, so combined_data itself is left untouched.
combined_data_dropped = combined_data.drop(labels=columns_to_drop, axis="columns")
combined_data_dropped.head()

Unnamed: 0,statuses_count,followers_count,friends_count,favourites_count,listed_count,description,Type
0,12515,787,1947,1,0,Lover of life - need I say more!,Bot
1,117,300,247,0,30,Follow this account for geo-targeted Customer ...,Bot
2,25930,10854,10606,418,66,"Mika e' la mia vita, Supernatural la mia passi...",Bot
3,730,1132,914,557,1,"Just a plain ol' country boy. I love God, Geor...",Bot
4,1,153,568,0,0,,Bot


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Turn the free-text 'description' column into TF-IDF features and encode the
# 'Type' label as 1 (Bot) / 0 (Genuine).
#
# NOTE(review): the vectorizer is fitted on every row *before* the train/test
# split done in the modelling cell, so the vocabulary and IDF weights are
# learned from rows that later end up in the test set. Consider fitting on the
# training rows only (e.g. inside a Pipeline) to avoid this leakage.

# Instantiate a TF-IDF Vectorizer limited to the 1000 most frequent terms
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=1000)

# Missing descriptions are treated as empty strings so they vectorize to all zeros
description_text = combined_data_dropped['description'].fillna('')
tfidf_matrix = tfidf_vectorizer.fit_transform(description_text)

# Convert the TF-IDF matrix to a DataFrame. The index is copied from the source
# frame because pd.concat(axis=1) aligns on index labels; relying on a fresh
# RangeIndex would misalign rows if the source index is not 0..n-1.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=combined_data_dropped.index,
)

# Encode 'Type' and fail loudly on labels outside the mapping instead of
# silently passing NaN targets on to the models.
type_codes = combined_data_dropped['Type'].map({'Bot': 1, 'Genuine': 0})
if type_codes.isna().any():
    unexpected = combined_data_dropped.loc[type_codes.isna(), 'Type'].unique()
    raise ValueError(f"Unexpected values in 'Type' column: {unexpected!r}")

# Drop the raw 'description' column *before* concatenating so that a TF-IDF
# feature whose token happens to be "description" is not removed with it.
combined_data_dropped = pd.concat(
    [
        combined_data_dropped.drop(columns='description').assign(Type=type_codes),
        tfidf_df,
    ],
    axis=1,
)

combined_data_dropped.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/harrychang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,statuses_count,followers_count,friends_count,favourites_count,listed_count,Type,03,10,100,11,...,xx,ya,yeah,year,years,yo,york,young,youth,youtube
0,12515,787,1947,1,0,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,117,300,247,0,30,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,25930,10854,10606,418,66,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,730,1132,914,557,1,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,153,568,0,0,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Modelling with XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix


# 'Type' is the binary target (1 = Bot, 0 = Genuine); every other column is a feature
y = combined_data_dropped['Type']
X = combined_data_dropped.drop(columns='Type')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# The later model cells reuse these same train/test partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit an XGBoost classifier with default hyper-parameters (fit returns the estimator)
model = XGBClassifier().fit(X_train, y_train)

# Predict labels for the held-out rows
predictions = model.predict(X_test)

# Score the predictions against the true labels
accuracy, recall, precision, f1 = (
    metric(y_test, predictions)
    for metric in (accuracy_score, recall_score, precision_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, predictions)

# Report the evaluation metrics, one per line
print(
    f"Accuracy: {accuracy}",
    f"Recall: {recall}",
    f"Precision: {precision}",
    f"F1 Score: {f1}",
    "Confusion Matrix:",
    conf_matrix,
    sep="\n",
)

Accuracy: 0.984573502722323
Recall: 0.9856020942408377
Precision: 0.9920948616600791
F1 Score: 0.9888378200919239
Confusion Matrix:
[[ 664   12]
 [  22 1506]]


### Modelling with Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

# Train a Random Forest on the existing X_train / y_train partitions; the seed
# fixes the forest's randomness (fit returns the estimator itself).
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows
rf_predictions = rf_model.predict(X_test)

# Score the predictions against the true labels
rf_accuracy, rf_recall, rf_precision, rf_f1 = (
    metric(y_test, rf_predictions)
    for metric in (accuracy_score, recall_score, precision_score, f1_score)
)
rf_conf_matrix = confusion_matrix(y_test, rf_predictions)

# Report the evaluation metrics, one per line
print(
    f"Random Forest Accuracy: {rf_accuracy}",
    f"Random Forest Recall: {rf_recall}",
    f"Random Forest Precision: {rf_precision}",
    f"Random Forest F1 Score: {rf_f1}",
    "Random Forest Confusion Matrix:",
    rf_conf_matrix,
    sep="\n",
)

Random Forest Accuracy: 0.985934664246824
Random Forest Recall: 0.987565445026178
Random Forest Precision: 0.9921104536489151
Random Forest F1 Score: 0.9898327320432928
Random Forest Confusion Matrix:
[[ 664   12]
 [  19 1509]]


### Modelling with Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a Logistic Regression baseline on the existing X_train / y_train
# partitions, allowing up to 1000 solver iterations (fit returns the estimator).
# NOTE(review): the features are passed in unscaled - raw count columns sit
# next to TF-IDF weights - which may be hurting this linear model; consider
# standardising the features before fitting.
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict labels for the held-out rows
lr_predictions = lr_model.predict(X_test)

# Score the predictions against the true labels
lr_accuracy, lr_recall, lr_precision, lr_f1 = (
    metric(y_test, lr_predictions)
    for metric in (accuracy_score, recall_score, precision_score, f1_score)
)
lr_conf_matrix = confusion_matrix(y_test, lr_predictions)

# Report the evaluation metrics, one per line
print(
    f"Logistic Regression Accuracy: {lr_accuracy}",
    f"Logistic Regression Recall: {lr_recall}",
    f"Logistic Regression Precision: {lr_precision}",
    f"Logistic Regression F1 Score: {lr_f1}",
    "Logistic Regression Confusion Matrix:",
    lr_conf_matrix,
    sep="\n",
)

Logistic Regression Accuracy: 0.896551724137931
Logistic Regression Recall: 0.9882198952879581
Logistic Regression Precision: 0.877906976744186
Logistic Regression F1 Score: 0.9298029556650245
Logistic Regression Confusion Matrix:
[[ 466  210]
 [  18 1510]]
