In [1]:

import pandas as pd
import numpy as np

# Read the raw tweet dataset from the working directory.
data = pd.read_csv("cyberbullying_tweets.csv")

# Shuffle every row (frac=1) with a fixed seed so the order is reproducible,
# then renumber the index from 0.
data_shuffled = (
    data
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Persist the shuffled rows (without the index column) and continue the
# notebook from the file that was just written.
data_shuffled.to_csv("cyberbullying_tweets_shuffled.csv", index=False)
data = pd.read_csv("cyberbullying_tweets_shuffled.csv")
data.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,@Goree_JuhssGuns hahaha he ain't even worth my...,ethnicity
1,RT @hsaymssik: Sucks to have the smile wiped o...,gender
2,"Just a reminder, it's absolutely disgusting to...",ethnicity
3,RT @BuzzFeedUK: When you accidentally open you...,other_cyberbullying
4,Loving the look of the fritters! #mkr,not_cyberbullying


In [2]:

import re
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available, then keep the English
# list as a set for fast membership checks.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Normalise every tweet: lower-case it, replace each non-letter character with
# a space, split on whitespace, drop stop words and re-join the kept tokens.
corpus = []
for raw_tweet in data['tweet_text']:
    letters_only = re.sub(r'[^a-zA-Z]', ' ', raw_tweet.lower())
    kept_words = [word for word in letters_only.split() if word not in stop_words]
    corpus.append(' '.join(kept_words))

# Store the cleaned text alongside the original column and preview the frame.
data['clean_text'] = corpus
data.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbookpro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,tweet_text,cyberbullying_type,clean_text
0,@Goree_JuhssGuns hahaha he ain't even worth my...,ethnicity,goree juhssguns hahaha even worth tweets dumb ...
1,RT @hsaymssik: Sucks to have the smile wiped o...,gender,rt hsaymssik sucks smile wiped face huh kat gl...
2,"Just a reminder, it's absolutely disgusting to...",ethnicity,reminder absolutely disgusting see people woul...
3,RT @BuzzFeedUK: When you accidentally open you...,other_cyberbullying,rt buzzfeeduk accidentally open front camera h...
4,Loving the look of the fritters! #mkr,not_cyberbullying,loving look fritters mkr


In [3]:

from sklearn.feature_extraction.text import CountVectorizer

# Target labels: the cyberbullying category recorded for each tweet.
Y = data['cyberbullying_type']

# Bag-of-words features with the vocabulary capped at 20,000 terms. The sparse
# count matrix is converted to a dense array before being handed to the models.
# NOTE(review): the dense array has one row per tweet and up to 20,000 columns,
# which may need a lot of memory on the full dataset — confirm it fits.
cv = CountVectorizer(max_features=20000)
bow_sparse = cv.fit_transform(corpus)
X = bow_sparse.toarray()


In [4]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

: 

In [None]:
# Train a Gaussian Naive Bayes classifier on the training split (fit returns
# the estimator itself) and predict labels for the held-out test rows.
nb = GaussianNB().fit(X_train, y_train)
nb_preds = nb.predict(X_test)

In [None]:
# Train a 5-nearest-neighbours classifier on the training split (fit returns
# the estimator itself) and predict labels for the held-out test rows.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn_preds = knn.predict(X_test)

In [None]:
# Train logistic regression with the iteration limit set to 1000 (fit returns
# the estimator itself) and predict labels for the held-out test rows.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
lr_preds = lr.predict(X_test)

In [None]:
# Headline metrics for the Naive Bayes predictions on the test split.
print(f"\n{'='*30}\nModel: Naive Bayes\n{'='*30}")
print("Accuracy:", accuracy_score(y_test, nb_preds))
print("Classification Report:\n", classification_report(y_test, nb_preds))

# Confusion matrix drawn as an annotated heatmap on an explicit figure/axes pair.
nb_cm = confusion_matrix(y_test, nb_preds)
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(nb_cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix for Naive Bayes")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


In [None]:
# Evaluate the KNN predictions on the held-out test split.
print(f"\n{'='*30}\nModel: KNN\n{'='*30}")
print("Accuracy:", accuracy_score(y_test, knn_preds))
# Per-class precision/recall/F1 computed from knn_preds, so the report matches
# the accuracy and confusion matrix shown in this cell.
print("Classification Report:\n", classification_report(y_test, knn_preds))


# Confusion matrix of actual vs. predicted labels as an annotated heatmap.
plt.figure(figsize=(10,6))
sns.heatmap(confusion_matrix(y_test, knn_preds), annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix for KNN")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Evaluate the logistic regression predictions on the held-out test split.
print(f"\n{'='*30}\nModel: Logistic Regressions\n{'='*30}")
print("Accuracy:", accuracy_score(y_test, lr_preds))
# Per-class precision/recall/F1 computed from lr_preds, so the report matches
# the accuracy and confusion matrix shown in this cell.
print("Classification Report:\n", classification_report(y_test, lr_preds))


# Confusion matrix of actual vs. predicted labels as an annotated heatmap.
plt.figure(figsize=(10,6))
sns.heatmap(confusion_matrix(y_test, lr_preds), annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix for Logistic Regressions")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:

# Hand-written examples used to spot-check the trained classifiers.
# NOTE(review): "balck" in the last sample looks like a misspelling of "black";
# a token missing from the fitted vocabulary presumably contributes nothing to
# the feature vector — confirm whether the spelling is intentional.
sample_tweets = [
    "You're so stupid and annoying!",
    "I hope you have a great day :)",
    "Go back to where you came from.",
    "I hate balck people"]

# Apply the same cleaning steps used for the training corpus: lower-case,
# replace non-letters with spaces, split, drop stop words, re-join.
sample_cleaned = []
for raw_sample in sample_tweets:
    letters_only = re.sub(r'[^a-zA-Z]', ' ', raw_sample.lower())
    kept_words = [word for word in letters_only.split() if word not in stop_words]
    sample_cleaned.append(' '.join(kept_words))

# Encode the cleaned samples with the already-fitted vectorizer (transform
# only, no refit) and densify to match the training feature format.
sample_vec = cv.transform(sample_cleaned).toarray()



In [None]:
# Classify the hand-written samples with the Naive Bayes model.
predictions = nb.predict(sample_vec)

# Show each original (uncleaned) tweet next to its predicted category.
labelled_samples = zip(sample_tweets, predictions)
for tweet, pred in labelled_samples:
    print(f"Tweet: {tweet}\nPrediction: {pred}\n")

In [None]:
# Classify the hand-written samples with the KNN model.
predictions = knn.predict(sample_vec)

# Show each original (uncleaned) tweet next to its predicted category.
labelled_samples = zip(sample_tweets, predictions)
for tweet, pred in labelled_samples:
    print(f"Tweet: {tweet}\nPrediction: {pred}\n")


In [None]:
# Classify the hand-written samples with the logistic regression model.
predictions = lr.predict(sample_vec)

# Show each original (uncleaned) tweet next to its predicted category.
labelled_samples = zip(sample_tweets, predictions)
for tweet, pred in labelled_samples:
    print(f"Tweet: {tweet}\nPrediction: {pred}\n")
