In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support, f1_score
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import pickle
import numpy as np

In [2]:
#0, 1, 2
multi_names = ['Hate', 'Neutral', 'Offensive']
#0, 1
binary_names = ["Non-Hate", "Hate"]

In [3]:
with open('../../hatexplain_data.pickle', 'rb') as file:
    data = pickle.load(file)

In [4]:
y_train = data["y_train"]
y_test = data["y_test"]
y_train_binary = data["y_train_binary"]
y_test_binary = data["y_test_binary"]

<h4> W2V training and testing variables</h4>

In [6]:
with open('../../saved_embeddings/w2v_embeddings.pickle', 'rb') as file:
    data = pickle.load(file)

In [7]:
X_train_w2v = data["X_train_w2v"]
X_test_w2v = data["X_test_w2v"]

<h4>Hyperparameter Fine-tuning</h4>

In [10]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': ['sqrt', 'log2', None],
    'class_weight': [None, 'balanced']
}

rf_model = RandomForestClassifier(random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(rf_model, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1)
grid_search.fit(X_train_w2v[:2000], y_train[:2000])

print("Best Hyperparameters: ", grid_search.best_params_)

Best Hyperparameters:  {'class_weight': None, 'criterion': 'entropy', 'max_features': None, 'n_estimators': 200}


In [11]:
cv_results = grid_search.cv_results_['mean_test_score']
best_score = grid_search.best_score_

print(cv_results)
print(best_score)

[0.45418905 0.44586767 0.46291727 0.45229941 0.44677949 0.44819877
 0.47120569 0.47488107 0.47631036 0.45678517 0.45637957 0.45905841
 0.44426431 0.43739846 0.43955895 0.46832476 0.47979807 0.46959487
 0.45678517 0.45637957 0.45905841 0.44426431 0.43739846 0.43955895
 0.46832476 0.47979807 0.46959487 0.42529899 0.44552636 0.44505465
 0.44344911 0.43838267 0.44621281 0.46463197 0.46054148 0.46288987
 0.43921983 0.44848348 0.44863711 0.44351188 0.43840412 0.43941707
 0.46821042 0.45974052 0.46172919 0.43921983 0.44848348 0.44863711
 0.44351188 0.43840412 0.43941707 0.46821042 0.45974052 0.46172919]
0.47979806784881757


<h1>Word2Vec Random Forest Model</h1>

<h3>Variance in performance</h3> 

In [13]:
rf_classifier = RandomForestClassifier(random_state=42)

In [None]:
#Using validation set to test for variance in accuracy results using k fold validation

# stratified_10_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# cross_val_scores = cross_val_score(svm_classifier, train_X, train_y, cv=stratified_10_fold, scoring='accuracy')
# accuracy_variance = np.var(cross_val_scores)

# # Print the accuracy scores for each fold
# print("Accuracy scores for each fold:", cross_val_scores)

# # Print the variance of the accuracy scores
# print("Variance in accuracy scores:", accuracy_variance)

<h3>Final Classifier</h3>

In [14]:
rf_classifier.fit(X_train_w2v, y_train)

In [15]:
# Prediciting document labels for test data
y_rf_pred_w2v = rf_classifier.predict(X_test_w2v)

rf_report = classification_report(y_test, y_rf_pred_w2v, target_names=multi_names)
print("Report: \n", rf_report)

Report: 
               precision    recall  f1-score   support

        Hate       0.59      0.54      0.56      1187
     Neutral       0.53      0.79      0.64      1563
   Offensive       0.47      0.19      0.27      1096

    accuracy                           0.54      3846
   macro avg       0.53      0.51      0.49      3846
weighted avg       0.53      0.54      0.51      3846



In [None]:
# Generate the confusion matrix


cm = confusion_matrix(y_test, y_rf_pred_w2v)

# Display the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=multi_names, yticklabels=multi_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.show()