<h1>Logistic Regression Model</h1>

In [2]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support, f1_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import pickle
import numpy as np

In [3]:
# Class labels listed in the order of their integer ids: 0, 1, 2.
multi_names = ['Hate', 'Neutral', 'Offensive']

# Row labels passed to classification_report(target_names=...) by the
# evaluation cells below. Spelled exactly as they appear in the reports
# recorded in this notebook, and kept in the same 0/1/2 class order.
target_names = ['hateful', 'normal', 'offensive']

In [4]:
# Unpickle the dataset object; the next cell reads "y_train" and "y_test" from it.
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
with open('../../hatexplain_data.pickle', 'rb') as file:
    data = pickle.load(file)

In [5]:
# Labels shared by the training/evaluation cells below.
y_train, y_test = data["y_train"], data["y_test"]

<h4> ELMo training and testing variables</h4>

In [None]:
# Read the ELMo train/test feature arrays from the .npz archive.
# The context manager closes the archive even if a key lookup raises, and a
# dedicated name keeps the archive from overwriting the shared `data` variable.
with np.load('elmo_embeddings_big.npz') as elmo_archive:
    # Indexing the archive reads each array into memory, so both remain
    # usable after the archive is closed.
    X_train_elmo = elmo_archive['X_train_elmo']
    X_test_elmo = elmo_archive['X_test_elmo']

<h4> Doc2Vec training and testing variables</h4>

In [None]:
# Unpickle the Doc2Vec embeddings object; the next cell reads the train/test arrays from it.
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
# NOTE(review): this path is relative to the working directory, while the W2V and
# TF-IDF cells read from ../saved_embeddings/ -- confirm the file location.
with open('d2v_embeddings.pickle', 'rb') as file:
    data = pickle.load(file)

In [None]:
# Doc2Vec features for the train and test splits.
X_train_d2v, X_test_d2v = data["X_train_d2v"], data["X_test_d2v"]

<h4> fastText training and testing variables</h4>

In [None]:
# Unpickle the fastText embeddings object; the next cell reads the train/test arrays from it.
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
# NOTE(review): this path is relative to the working directory, while the W2V and
# TF-IDF cells read from ../saved_embeddings/ -- confirm the file location.
with open('fastText_embeddings.pickle', 'rb') as file:
    data = pickle.load(file)

In [None]:
# fastText features for the train and test splits.
X_train_ft, X_test_ft = data["X_train_ft"], data["X_test_ft"]

<h4> W2V training and testing variables</h4>

In [6]:
# Unpickle the Word2Vec embeddings object; the next cell reads the train/test arrays from it.
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
with open('../saved_embeddings/w2v_embeddings.pickle', 'rb') as file:
    data = pickle.load(file)

In [7]:
# Word2Vec features for the train and test splits.
X_train_w2v, X_test_w2v = data["X_train_w2v"], data["X_test_w2v"]

<h4> TF-IDF training and testing variables</h4>

In [11]:
# Unpickle the TF-IDF features object; the next cell reads the train/test matrices from it.
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
with open('../saved_embeddings/tfidf_embeddings.pickle', 'rb') as file:
    data = pickle.load(file)

In [12]:
# TF-IDF features for the train and test splits.
X_train_tfidf, X_test_tfidf = data["X_train_tfidf"], data["X_test_tfidf"]

<h4>Hyperparameter Fine-tuning</h4>

In [None]:
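# NOTE: this disabled grid search was written for an SVC estimator (see the
# SVC(...) call below); 'kernel' and 'gamma' are not LogisticRegression
# parameters, so the grid must be rewritten before it is re-enabled here.
# param_grid = {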
#     'C': [0.1, 1, 10],
#     'kernel': ['linear', 'rbf'],
#     'gamma': [10, 1, 0.1],
#     'class_weight': [None, 'balanced']
# }

# cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
# grid_search.fit(train_X[:5000], train_y[:5000])

# print("Best Hyperparameters: ", grid_search.best_params_)

<h1>Word2Vec Model</h1>

<h3>Variance in performance</h3> 

In [8]:
# Logistic-regression classifier for the Word2Vec features: max_iter=1000 sets
# the solver's iteration limit and random_state=42 fixes the seed.
lr_classifier = LogisticRegression(max_iter=1000, random_state=42)

In [None]:
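# Disabled: estimate the variance in accuracy with stratified 10-fold cross-validation on the training data.
# The commented code refers to svm_classifier, train_X and train_y rather than this notebook's lr_classifier, X_train_w2v and y_train.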

# stratified_10_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# cross_val_scores = cross_val_score(svm_classifier, train_X, train_y, cv=stratified_10_fold, scoring='accuracy')
# accuracy_variance = np.var(cross_val_scores)

# # Print the accuracy scores for each fold
# print("Accuracy scores for each fold:", cross_val_scores)

# # Print the variance of the accuracy scores
# print("Variance in accuracy scores:", accuracy_variance)

<h3>Final Classifier</h3>

In [9]:
# Train the logistic-regression classifier on the Word2Vec training features.
lr_classifier.fit(X_train_w2v, y_train)

In [10]:
# Predicting document labels for the test data
y_lr_pred_w2v = lr_classifier.predict(X_test_w2v)

# Per-class precision/recall/F1 against the true test labels; target_names
# supplies the row labels of the printed report.
rf_report = classification_report(y_test, y_lr_pred_w2v, target_names=target_names)
print("Report: \n", rf_report)

Report: 
               precision    recall  f1-score   support

     hateful       0.64      0.64      0.64      1187
      normal       0.61      0.76      0.68      1563
   offensive       0.53      0.35      0.42      1096

    accuracy                           0.60      3846
   macro avg       0.59      0.58      0.58      3846
weighted avg       0.60      0.60      0.59      3846



In [None]:
# Confusion matrix of the Word2Vec predictions against the true test labels.
cm = confusion_matrix(y_test, y_lr_pred_w2v)

# Draw the matrix as an annotated heatmap; the same class labels are used on
# both axes.
class_labels = ['Hateful', 'Normal', 'Offensive']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.show()

<h1>TF-IDF Model</h1>

<h3>Variance in performance</h3> 

In [13]:
# Fresh logistic-regression classifier for the TF-IDF features (seed fixed at 42).
# NOTE(review): unlike the Word2Vec section, max_iter is left at its default
# here -- confirm the solver converges on these features.
lr_classifier = LogisticRegression(random_state=42)

In [None]:
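# Disabled: estimate the variance in accuracy with stratified 10-fold cross-validation on the training data.
# The commented code refers to svm_classifier, train_X and train_y rather than this notebook's lr_classifier, X_train_tfidf and y_train.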

# stratified_10_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# cross_val_scores = cross_val_score(svm_classifier, train_X, train_y, cv=stratified_10_fold, scoring='accuracy')
# accuracy_variance = np.var(cross_val_scores)

# # Print the accuracy scores for each fold
# print("Accuracy scores for each fold:", cross_val_scores)

# # Print the variance of the accuracy scores
# print("Variance in accuracy scores:", accuracy_variance)

<h3>Final Classifier</h3>

In [14]:
# Train the logistic-regression classifier on the TF-IDF training features.
lr_classifier.fit(X_train_tfidf, y_train)

In [15]:
# Predicting document labels for the test data
y_lr_pred_tfidf = lr_classifier.predict(X_test_tfidf)

# Per-class precision/recall/F1 against the true test labels; target_names
# supplies the row labels of the printed report.
rf_report = classification_report(y_test, y_lr_pred_tfidf, target_names=target_names)
print("Report: \n", rf_report)

Report: 
               precision    recall  f1-score   support

     hateful       0.74      0.71      0.73      1187
      normal       0.66      0.79      0.72      1563
   offensive       0.58      0.43      0.50      1096

    accuracy                           0.66      3846
   macro avg       0.66      0.65      0.65      3846
weighted avg       0.66      0.66      0.66      3846



In [None]:
# Confusion matrix of the TF-IDF predictions against the true test labels.
cm = confusion_matrix(y_test, y_lr_pred_tfidf)

# Draw the matrix as an annotated heatmap; the same class labels are used on
# both axes.
class_labels = ['Hateful', 'Normal', 'Offensive']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.show()

<h1>fastText Model</h1>

<h3>Variance in performance</h3> 

In [None]:
# Fresh logistic-regression classifier for the fastText features (seed fixed at 42).
# NOTE(review): unlike the Word2Vec section, max_iter is left at its default
# here -- confirm the solver converges on these features.
lr_classifier = LogisticRegression(random_state=42)

In [None]:
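# Disabled: estimate the variance in accuracy with stratified 10-fold cross-validation on the training data.
# The commented code refers to svm_classifier, train_X and train_y rather than this notebook's lr_classifier, X_train_ft and y_train.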

# stratified_10_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# cross_val_scores = cross_val_score(svm_classifier, train_X, train_y, cv=stratified_10_fold, scoring='accuracy')
# accuracy_variance = np.var(cross_val_scores)

# # Print the accuracy scores for each fold
# print("Accuracy scores for each fold:", cross_val_scores)

# # Print the variance of the accuracy scores
# print("Variance in accuracy scores:", accuracy_variance)

<h3>Final Classifier</h3>

In [None]:
# Train the logistic-regression classifier created for this section on the
# fastText training features. The labels are the shared `y_train` array: it is
# the only training-label array loaded above, and no fastText-specific labels
# are read from the embeddings pickle.
lr_classifier.fit(X_train_ft, y_train)

In [None]:
# Predicting document labels for the test data with the logistic-regression
# classifier of this section.
y_lr_pred_ft = lr_classifier.predict(X_test_ft)

# Per-class precision/recall/F1 against the shared `y_test` labels (the only
# test-label array loaded above); target_names supplies the row labels.
rf_report = classification_report(y_test, y_lr_pred_ft, target_names=target_names)
print("Report: \n", rf_report)

In [None]:
# Confusion matrix of the fastText predictions against the shared `y_test`
# labels (the only test-label array loaded above).
cm = confusion_matrix(y_test, y_lr_pred_ft)

# Display the confusion matrix using seaborn; the same class labels are used
# on both axes.
class_labels = ['Hateful', 'Normal', 'Offensive']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.show()

<h1>Doc2Vec Model</h1>

<h3>Variance in performance</h3> 

In [21]:
# Logistic-regression classifier for the Doc2Vec features: max_iter=1000 sets
# the solver's iteration limit and random_state=42 fixes the seed.
# (RandomForestClassifier is not imported in this notebook and does not accept
# a max_iter argument, so the estimator this notebook is about is used instead.)
lr_classifier = LogisticRegression(max_iter=1000, random_state=42)

# Alias kept so that any cell still referring to the classifier by the old
# name operates on the same estimator object.
rf_classifier = lr_classifier

In [None]:
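# Disabled: estimate the variance in accuracy with stratified 10-fold cross-validation on the training data.
# The commented code refers to svm_classifier, train_X and train_y rather than this notebook's lr_classifier, X_train_d2v and y_train.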

# stratified_10_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# cross_val_scores = cross_val_score(svm_classifier, train_X, train_y, cv=stratified_10_fold, scoring='accuracy')
# accuracy_variance = np.var(cross_val_scores)

# # Print the accuracy scores for each fold
# print("Accuracy scores for each fold:", cross_val_scores)

# # Print the variance of the accuracy scores
# print("Variance in accuracy scores:", accuracy_variance)

<h3>Final Classifier</h3>

In [22]:
# Train the logistic-regression classifier on the Doc2Vec training features.
# The labels are the shared `y_train` array: it is the only training-label
# array loaded above, and no Doc2Vec-specific labels are read from the
# embeddings pickle.
lr_classifier.fit(X_train_d2v, y_train)

In [23]:
# Predicting document labels for the test data with the logistic-regression
# classifier.
y_lr_pred_d2v = lr_classifier.predict(X_test_d2v)

# Per-class precision/recall/F1 against the shared `y_test` labels (the only
# test-label array loaded above); target_names supplies the row labels.
rf_report = classification_report(y_test, y_lr_pred_d2v, target_names=target_names)
print("Report: \n", rf_report)

Report: 
               precision    recall  f1-score   support

     hateful       0.64      0.60      0.62      1187
      normal       0.58      0.78      0.67      1563
   offensive       0.53      0.30      0.39      1096

    accuracy                           0.59      3846
   macro avg       0.58      0.56      0.56      3846
weighted avg       0.58      0.59      0.57      3846



In [None]:
# Confusion matrix of the Doc2Vec predictions against the shared `y_test`
# labels (the only test-label array loaded above).
cm = confusion_matrix(y_test, y_lr_pred_d2v)

# Display the confusion matrix using seaborn; the same class labels are used
# on both axes.
class_labels = ['Hateful', 'Normal', 'Offensive']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.show()