In [77]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (accuracy_score, 
                             precision_score,
                             recall_score, 
                             f1_score)
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

In [78]:
data = pd.read_csv("../../data/normalized_tweets.csv")
data.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,In words katandandre food crapilicious,not_cyberbullying
1,Why aussietv white,not_cyberbullying
2,classy whore Or red velvet cupcakes,not_cyberbullying
3,meh thanks heads concerned another angry dud...,not_cyberbullying
4,This ISIS account pretending Kurdish account ...,not_cyberbullying


In [79]:
data['cyberbullying_type'].value_counts()

religion               7996
age                    7992
ethnicity              7958
gender                 7906
not_cyberbullying      7816
other_cyberbullying    7613
Name: cyberbullying_type, dtype: int64

In [80]:
LE = LabelEncoder()
LE.fit(data['cyberbullying_type'])
data['cyberbullying_type'] = LE.transform(data['cyberbullying_type'])

In [81]:
x = data['tweet_text']
y = data['cyberbullying_type']

## Multinomial Naive Bayes Model

In [82]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
# Transform text data into TF-IDF features
X_tfidf = tfidf_vectorizer.fit_transform(x)

# Create Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Define K-fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store F1 scores and accuracy scores for each fold
f1_scores = []
accuracy_scores = []

# Perform K-fold cross-validation
for train_index, test_index in kfold.split(X_tfidf):
    X_train, X_test = X_tfidf[train_index], X_tfidf[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train classifier
    nb_classifier.fit(X_train, y_train)
    
    # Predict on test set
    y_pred = nb_classifier.predict(X_test)
    
    # Calculate F1 score and accuracy
    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    
    # Append scores to lists
    f1_scores.append(f1)
    accuracy_scores.append(accuracy)

# Calculate mean F1 score and accuracy across all folds
mean_f1_score = np.mean(f1_scores)
mean_accuracy = np.mean(accuracy_scores)

print("Mean F1 Score:", mean_f1_score)
print("Mean Accuracy:", mean_accuracy)

Mean F1 Score: 0.7125384027333365
Mean Accuracy: 0.7389649038384982


## Binary Naive Bayes

In [83]:
from sklearn.naive_bayes import BernoulliNB

In [84]:
# religion = 5
# age = 0
# ethnicity = 1
# gender = 2
# not_cyberbullying = 3
# other_cyberbullying = 4
data['cyberbullying_type'].value_counts()

5    7996
0    7992
1    7958
2    7906
3    7816
4    7613
Name: cyberbullying_type, dtype: int64

In [85]:
# select religion and not_cyberbullyng
data = data[(data["cyberbullying_type"] == 5) | (data["cyberbullying_type"] == 3)]

In [86]:
data['cyberbullying_type'].value_counts()

5    7996
3    7816
Name: cyberbullying_type, dtype: int64

In [87]:
data

Unnamed: 0,tweet_text,cyberbullying_type
0,In words katandandre food crapilicious,3
1,Why aussietv white,3
2,classy whore Or red velvet cupcakes,3
3,meh thanks heads concerned another angry dud...,3
4,This ISIS account pretending Kurdish account ...,3
...,...,...
23713,Can imagine Christians came together like 5 ti...,5
23714,So support justice initial problem It morphed...,5
23715,If harbour doubts Muslims believe sharia note...,5
23716,One thing Muslims want exterminate everyone M...,5


In [88]:
# Separate features and target
X = data['tweet_text']
y = data['cyberbullying_type']

# Map labels to binary: 1 for cyberbullying, 0 for non-cyberbullying
y_binary = (y == 5).astype(int)

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform text data into TF-IDF features
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_binary, test_size=0.2, random_state=42)

# Create Bernoulli Naive Bayes classifier
nb_classifier = BernoulliNB()

# Train classifier
nb_classifier.fit(X_train, y_train)

# Predict on test set
y_pred = nb_classifier.predict(X_test)

# Calculate F1 score and accuracy
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("F1 Score:", f1)
print("Accuracy:", accuracy)


F1 Score: 0.94336
Accuracy: 0.9440404679102118
