In [7]:
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
import re
import string
from sklearn.svm import LinearSVC
import numpy as np
import joblib  # for saving the trained model


df = pd.read_csv("preprocessed_data.csv")

In [32]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove digits
    text = re.sub(r'\d+', '', text)
    return text

def preprocess_and_split_data(df):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(df["tokenized_text"], df["cyberbullying_type"], test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

def evaluate_classifier_performance(y_test, y_pred):
    # Evaluate classifier performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    return accuracy, precision, recall, f1

def naive_bayes(df, input_text):
    # Preprocess input text
    input_text_processed = preprocess_text(input_text)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = preprocess_and_split_data(df)

    # Initialize TfidfVectorizer to convert text into feature vectors
    tfidf_vectorizer = TfidfVectorizer()

    # Fit and transform the training data
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

    # Initialize Naive Bayes classifier
    classifier = MultinomialNB()

    # Train the classifier
    classifier.fit(X_train_tfidf, y_train)

    # Transform the preprocessed input text using the TF-IDF vectorizer
    input_text_tfidf = tfidf_vectorizer.transform([input_text_processed])

    # Predict the label for the input text
    nb_pred = classifier.predict(input_text_tfidf)

    # Evaluate classifier performance
    accuracy, precision, recall, f1 = evaluate_classifier_performance(y_test, classifier.predict(tfidf_vectorizer.transform(X_test)))

    return nb_pred[0], accuracy, precision, recall, f1

def svm_classifier(df, input_text):
    # Preprocess input text
    input_text_processed = preprocess_text(input_text)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = preprocess_and_split_data(df)

    # Initialize the TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Fit and transform the text data
    X_tfidf = tfidf_vectorizer.fit_transform(X_train)  # Use X_train for fitting

    # Initialize the SVM classifier
    svm_classifier = SVC(kernel='linear')

    # Train the classifier
    svm_classifier.fit(X_tfidf, y_train)  # Use X_tfidf for training

    # Transform the preprocessed input text using the TF-IDF vectorizer
    input_text_tfidf = tfidf_vectorizer.transform([input_text_processed])

    # Predict the label for the input text
    svm_pred = svm_classifier.predict(input_text_tfidf)

    # Evaluate classifier performance
    accuracy, precision, recall, f1 = evaluate_classifier_performance(y_test, svm_classifier.predict(tfidf_vectorizer.transform(X_test)))

    return svm_pred[0], accuracy, precision, recall, f1

def logistic_regression(df, input_text):
    # Preprocess input text
    input_text_processed = preprocess_text(input_text)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = preprocess_and_split_data(df)

    # Initialize the TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Fit and transform the text data
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)  # Use X_train for fitting

    # Initialize the Logistic Regression classifier
    lr_classifier = LogisticRegression()

    # Train the classifier
    lr_classifier.fit(X_train_tfidf, y_train)  # Use X_train_tfidf for training

    # Transform the preprocessed input text using the TF-IDF vectorizer
    input_text_tfidf = tfidf_vectorizer.transform([input_text_processed])

    # Predict the label for the input text
    lr_pred = lr_classifier.predict(input_text_tfidf)

    # Evaluate classifier performance
    accuracy, precision, recall, f1 = evaluate_classifier_performance(y_test, lr_classifier.predict(tfidf_vectorizer.transform(X_test)))

    return lr_pred[0], accuracy, precision, recall, f1

def classify_input_text(input_text, df):
    # Call each classifier function with the input text
    nb_pred, nb_accuracy, nb_precision, nb_recall, nb_f1 = naive_bayes(df, input_text)
    svm_pred, svm_accuracy, svm_precision, svm_recall, svm_f1 = svm_classifier(df, input_text)
    lr_pred, lr_accuracy, lr_precision, lr_recall, lr_f1 = logistic_regression(df, input_text)

    # Print the predicted cyberbullying type and performance metrics
    print("Input Text:", input_text)
    print("\nNaive Bayes Predicted Label:", nb_pred)
    print("Naive Bayes Metrics:")
    print("Accuracy:", nb_accuracy)
    print("Precision:", nb_precision)
    print("Recall:", nb_recall)
    print("F1 Score:", nb_f1)

    print("\nSVM Predicted Label:", svm_pred)
    print("SVM Metrics:")
    print("Accuracy:", svm_accuracy)
    print("Precision:", svm_precision)
    print("Recall:", svm_recall)
    print("F1 Score:", svm_f1)

    print("\nLogistic Regression Predicted Label:", lr_pred)
    print("Logistic Regression Metrics:")
    print("Accuracy:", lr_accuracy)
    print("Precision:", lr_precision)
    print("Recall:", lr_recall)
    print("F1 Score:", lr_f1)

In [37]:
input_text = "how are you"
classify_input_text(input_text, df)

Input Text: how are you

Naive Bayes Predicted Label: gender
Naive Bayes Metrics:
Accuracy: 0.7153789705419855
Precision: 0.7013574981123638
Recall: 0.7153789705419855
F1 Score: 0.6833930118695727

SVM Predicted Label: not_cyberbullying
SVM Metrics:
Accuracy: 0.8107768109864766
Precision: 0.8131701936348158
Recall: 0.8107768109864766
F1 Score: 0.8112505657948316

Logistic Regression Predicted Label: not_cyberbullying
Logistic Regression Metrics:
Accuracy: 0.8057448369850089
Precision: 0.809715340986911
Recall: 0.8057448369850089
F1 Score: 0.8073268050413585


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
input_text = "you are black ass nigger"
classify_input_text(input_text, df)

Input Text: you are black ass nigger

Naive Bayes Predicted Label: ethnicity
Naive Bayes Metrics:
Accuracy: 0.7153789705419855
Precision: 0.7013574981123638
Recall: 0.7153789705419855
F1 Score: 0.6833930118695727

SVM Predicted Label: ethnicity
SVM Metrics:
Accuracy: 0.8107768109864766
Precision: 0.8131701936348158
Recall: 0.8107768109864766
F1 Score: 0.8112505657948316

Logistic Regression Predicted Label: ethnicity
Logistic Regression Metrics:
Accuracy: 0.8057448369850089
Precision: 0.809715340986911
Recall: 0.8057448369850089
F1 Score: 0.8073268050413585


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
input_text = "all the girl are slutty bitch!!!"
classify_input_text(input_text, df)

Input Text: all the girl are slutty bitch!!!

Naive Bayes Predicted Label: gender
Naive Bayes Metrics:
Accuracy: 0.7153789705419855
Precision: 0.7013574981123638
Recall: 0.7153789705419855
F1 Score: 0.6833930118695727

SVM Predicted Label: gender
SVM Metrics:
Accuracy: 0.8107768109864766
Precision: 0.8131701936348158
Recall: 0.8107768109864766
F1 Score: 0.8112505657948316

Logistic Regression Predicted Label: gender
Logistic Regression Metrics:
Accuracy: 0.8057448369850089
Precision: 0.809715340986911
Recall: 0.8057448369850089
F1 Score: 0.8073268050413585


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
input_text = "are you muslim? please don't bomb me"
classify_input_text(input_text, df)

Input Text: are you muslim? please don't bomb me

Naive Bayes Predicted Label: not_cyberbullying
Naive Bayes Metrics:
Accuracy: 0.7153789705419855
Precision: 0.7013574981123638
Recall: 0.7153789705419855
F1 Score: 0.6833930118695727

SVM Predicted Label: religion
SVM Metrics:
Accuracy: 0.8107768109864766
Precision: 0.8131701936348158
Recall: 0.8107768109864766
F1 Score: 0.8112505657948316

Logistic Regression Predicted Label: religion
Logistic Regression Metrics:
Accuracy: 0.8057448369850089
Precision: 0.809715340986911
Recall: 0.8057448369850089
F1 Score: 0.8073268050413585


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
