<a href="https://colab.research.google.com/github/ankit-genzeon/AI-ML-Bootcamp-Genzeon-2023/blob/master/Bot_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
Dataset = '/content/drive/MyDrive/Colab Notebooks/DataSet/bot_detection_data.csv'
df = pd.read_csv(Dataset)

In [None]:
df.head()

In [None]:
df.describe()

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# # Load the dataset
# df = pd.read_csv('bot_detection_dataset.csv')

# Preprocess the data
# Drop unnecessary columns
df = df.drop(['User ID', 'Username', 'Created At'], axis=1)

# Handling missing values
df['Location'].fillna('Unknown', inplace=True)  # Replace missing locations with 'Unknown'

# Feature engineering
df['Tweet_Length'] = df['Tweet'].apply(len)  # Add a new feature 'Tweet_Length' with the length of each tweet

# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer()
tweet_vectors = vectorizer.fit_transform(df['Tweet'])
df = pd.concat([df, pd.DataFrame(tweet_vectors.toarray(), columns=vectorizer.get_feature_names_out())], axis=1)
df = df.drop('Tweet', axis=1)

# One-hot encoding for 'Location' column
column_transformer = ColumnTransformer([('encoder', OneHotEncoder(), ['Location'])], remainder='passthrough')
df = pd.DataFrame(column_transformer.fit_transform(df), columns=column_transformer.get_feature_names_out())

# Visualize the data
sns.countplot(x='Bot Label', data=df)
plt.title('Distribution of Bot and Non-Bot Users')
plt.show()

# Split the data into training and testing sets
X = df.drop(['Bot Label'], axis=1)  # Features
y = df['Bot Label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose a Machine Learning Algorithm: Random Forest
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

# # Predict Bot or Not on new data
# new_data = pd.read_csv('new_data.csv')  # Replace 'new_data.csv' with your new data file

# # Preprocess the new data
# new_data = new_data.drop(['User ID', 'Username', 'Created At'], axis=1)
# new_data['Location'].fillna('Unknown', inplace=True)
# new_data['Tweet_Length'] = new_data['Tweet'].apply(len)

# new_tweet_vectors = vectorizer.transform(new_data['Tweet'])
# new_data = pd.concat([new_data, pd.DataFrame(new_tweet_vectors.toarray(), columns=vectorizer.get_feature_names_out())], axis=1)
# new_data = pd.DataFrame(column_transformer.transform(new_data), columns=column_transformer.get_feature_names_out())

# new_predictions = model.predict(new_data)
# print(new_predictions)


ValueError: ignored