<h1>Logistic Regression Model</h1>

In [1]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support, f1_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import pickle
import numpy as np

In [2]:
#0, 1, 2
multi_names = ['Hate', 'Neutral', 'Offensive']
#0, 1
binary_names = ["Hate", "Not Hate"]

In [3]:
with open('../../hatexplain_data.pickle', 'rb') as file:
    data = pickle.load(file)

In [4]:
y_train = data["y_train"]
y_test = data["y_test"]
y_train_binary = data["y_train_binary"]
y_test_binary = data["y_test_binary"]

<h4> TF-IDF training and testing variables</h4>

In [5]:
with open('../../saved_embeddings/tfidf_embeddings.pickle', 'rb') as file:
    data = pickle.load(file)

In [6]:
X_train_tfidf = data["X_train_tfidf"]
X_test_tfidf = data["X_test_tfidf"]
X_train_tfidf_binary = data["X_train_tfidf_binary"]
X_test_tfidf_binary = data["X_test_tfidf_binary"]

<h4>Hyperparameter Fine-tuning</h4>

In [None]:
# param_grid = {
#     'C': [0.1, 1, 10],
#     'kernel': ['linear', 'rbf'],
#     'gamma': [10, 1, 0.1],
#     'class_weight': [None, 'balanced']
# }

# cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
# grid_search.fit(train_X[:5000], train_y[:5000])

# print("Best Hyperparameters: ", grid_search.best_params_)

<h1>TF-IDF Model</h1>

<h3>Variance in performance</h3> 

In [7]:
lr_classifier = LogisticRegression(random_state=42)

In [None]:
#Using validation set to test for variance in accuracy results using k fold validation

# stratified_10_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# cross_val_scores = cross_val_score(svm_classifier, train_X, train_y, cv=stratified_10_fold, scoring='accuracy')
# accuracy_variance = np.var(cross_val_scores)

# # Print the accuracy scores for each fold
# print("Accuracy scores for each fold:", cross_val_scores)

# # Print the variance of the accuracy scores
# print("Variance in accuracy scores:", accuracy_variance)

<h3>Final Classifier</h3>

In [8]:
lr_classifier.fit(X_train_tfidf, y_train)

In [9]:
# Prediciting document labels for test data
y_lr_pred_tfidf = lr_classifier.predict(X_test_tfidf)

lr_report = classification_report(y_test, y_lr_pred_tfidf, target_names=multi_names)
print("Report: \n", lr_report)

Report: 
               precision    recall  f1-score   support

        Hate       0.74      0.71      0.73      1187
     Neutral       0.66      0.79      0.72      1563
   Offensive       0.57      0.43      0.49      1096

    accuracy                           0.66      3846
   macro avg       0.66      0.64      0.64      3846
weighted avg       0.66      0.66      0.66      3846



In [11]:
# Generate the confusion matrix


# Plot the confusion matrix using Seaborn
cm = confusion_matrix(y_true=y_test_binary, y_pred=y_lr_pred_tfidf)

# Display the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='gray_r', cbar=False, xticklabels=multi_names, yticklabels=multi_names)
plt.title('LR (with TF-IDF) Hate Speech Detection Model')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('lr_tfidf_cm.png')

ValueError: Found input variables with inconsistent numbers of samples: [1187, 3846]

<h3>Final Classifier (Binary Classification)</h3>

In [None]:
lr_classifier = LogisticRegression(max_iter=10000, random_state=42)

In [None]:
lr_classifier.fit(X_train_tfidf_binary, y_train_binary)

In [None]:
# Prediciting document labels for test data
y_pred_binary = lr_classifier.predict(X_test_tfidf_binary)

svm_report = classification_report(y_test_binary, y_pred_binary, target_names=binary_names)
print("Report: \n", svm_report)

In [None]:
# Generate the confusion matrix


# Plot the confusion matrix using Seaborn
cm = confusion_matrix(y_true=y_test_binary, y_pred=y_pred_binary)

# Display the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='gray_r', cbar=False, xticklabels=binary_names, yticklabels=binary_names)
plt.title('LR (with TF-IDF) Binary Hate Speech Detection Model')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('lr_tfidf_binary_cm.png')