<h1>SVM Model</h1>

In [1]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support, f1_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import pickle#
from skopt import BayesSearchCV

In [2]:
#0, 1, 2
target_names = ['hateful', 'normal', 'offensive']

In [3]:
with open('../hatexplain_data.pickle', 'rb') as file:
    data = pickle.load(file)

In [4]:
y_train = data["y_train"]
y_test = data["y_test"]

<h4> ELMo training and testing variables</h4>

In [None]:
data = np.load('../saved_embeddings/elmo_embeddings_big.npz')

X_train_elmo = data['X_train_elmo']
X_test_elmo = data['X_test_elmo']

data.close()

<h4> Doc2Vec training and testing variables</h4>

In [None]:
with open('../saved_embeddings/d2v_embeddings.pickle', 'rb') as file:
    data = pickle.load(file)

In [None]:
X_train_d2v = data["X_train_d2v"]
X_test_d2v = data["X_test_d2v"]

<h4> fastText training and testing variables</h4>

In [None]:
with open('../saved_embeddings/fastText_embeddings.pickle', 'rb') as file:
    data = pickle.load(file)

In [None]:
X_train_ft = data["X_train_ft"]
X_test_ft = data["X_test_ft"]

<h4> W2V training and testing variables</h4>

In [5]:
with open('../saved_embeddings/w2v_embeddings.pickle', 'rb') as file:
    data = pickle.load(file)

In [6]:
X_train_w2v = data["X_train_w2v"]
X_test_w2v = data["X_test_w2v"]

<h4> TF-IDF training and testing variables</h4>

In [None]:
with open('../saved_embeddings/tfidf_embeddings.pickle', 'rb') as file:
    data = pickle.load(file)

In [None]:
X_train_tfidf = data["X_train_tfidf"]
X_test_tfidf = data["X_test_tfidf"]

<h4> TF-IDF + W2V Weighted training and testing variables</h4>

In [None]:
with open('../saved_embeddings/tfidf+w2v_embeddings.pickle', 'rb') as file:
    data = pickle.load(file)

In [None]:
X_train_tfidf_w2v = data["X_train_tfidf_w2v"]
X_test_tfidf_w2v = data["X_test_tfidf_w2v"]

<h4>Hyperparameter Fine-tuning</h4>

In [8]:
svm_model = SVC(random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': [10, 1, 0.1],
    'class_weight': [None, 'balanced']
}

grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_w2v[:5000], y_train[:5000])
print("Best Hyperparameters: ", grid_search.best_params_)

# search_spaces = {
#     'C': (1e-4, 1e+6, 'log-uniform'),
#     'gamma': (1e-4, 1e+1, 'log-uniform'),
#     'kernel': ['linear', 'rbf']
# }
# np.int = int
# bayes_search = BayesSearchCV(svm_model, search_spaces, n_iter=50, cv=cv, scoring='accuracy', n_jobs=4)
# bayes_search.fit(X_train_w2v[:2000], y_train[:2000])

Best Hyperparameters:  {'C': 0.1, 'class_weight': None, 'gamma': 10, 'kernel': 'linear'}


In [None]:
best_params = bayes_search.best_params_
best_score = bayes_search.best_score_

print(best_params)
print(best_score)

<h1>Word2Vec Model</h1>

<h3>Variance in performance</h3> 

In [None]:
svm_classifier = SVC(C=0.04, kernel='rbf', random_state=42)

In [None]:
#Using validation set to test for variance in accuracy results using k fold validation

# stratified_10_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# cross_val_scores = cross_val_score(svm_classifier, train_X, train_y, cv=stratified_10_fold, scoring='accuracy')
# accuracy_variance = np.var(cross_val_scores)

# # Print the accuracy scores for each fold
# print("Accuracy scores for each fold:", cross_val_scores)

# # Print the variance of the accuracy scores
# print("Variance in accuracy scores:", accuracy_variance)

<h3>Final Classifier</h3>

In [None]:
svm_classifier.fit(X_train_w2v, y_train)

In [None]:
# Prediciting document labels for test data
y_svm_pred_w2v = svm_classifier.predict(X_test_w2v)

svm_report = classification_report(y_test, y_svm_pred_w2v, target_names=target_names)
print("Report: \n", svm_report)

In [None]:
# Generate the confusion matrix


cm = confusion_matrix(y_test, y_svm_pred_w2v)

# Display the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Hateful', 'Normal', 'Offensive'], yticklabels=['Hateful', 'Normal', 'Offensive'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.show()

<h1>TF-IDF Model</h1>

<h3>Variance in performance</h3> 

In [None]:
svm_classifier = SVC(random_state=42)

In [None]:
#Using validation set to test for variance in accuracy results using k fold validation

# stratified_10_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# cross_val_scores = cross_val_score(svm_classifier, train_X, train_y, cv=stratified_10_fold, scoring='accuracy')
# accuracy_variance = np.var(cross_val_scores)

# # Print the accuracy scores for each fold
# print("Accuracy scores for each fold:", cross_val_scores)

# # Print the variance of the accuracy scores
# print("Variance in accuracy scores:", accuracy_variance)

<h3>Final Classifier</h3>

In [None]:
svm_classifier.fit(X_train_tfidf, y_train)

In [None]:
# Prediciting document labels for test data
y_svm_pred_tfidf = svm_classifier.predict(X_test_tfidf)

svm_report = classification_report(y_test, y_svm_pred_tfidf, target_names=target_names)
print("Report: \n", svm_report)

In [None]:
# Generate the confusion matrix


cm = confusion_matrix(y_test, y_svm_pred_tfidf)

# Display the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Hateful', 'Normal', 'Offensive'], yticklabels=['Hateful', 'Normal', 'Offensive'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.show()

<h1>TF-IDF + W2V Model</h1>

<h3>Variance in performance</h3> 

In [None]:
svm_classifier = SVC(random_state=42)

In [None]:
#Using validation set to test for variance in accuracy results using k fold validation

# stratified_10_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# cross_val_scores = cross_val_score(svm_classifier, train_X, train_y, cv=stratified_10_fold, scoring='accuracy')
# accuracy_variance = np.var(cross_val_scores)

# # Print the accuracy scores for each fold
# print("Accuracy scores for each fold:", cross_val_scores)

# # Print the variance of the accuracy scores
# print("Variance in accuracy scores:", accuracy_variance)

<h3>Final Classifier</h3>

In [None]:
svm_classifier.fit(X_train_tfidf_w2v, y_train)

In [None]:
# Prediciting document labels for test data
y_svm_pred_tfidf_w2v = svm_classifier.predict(X_test_tfidf_w2v)

svm_report = classification_report(y_test, y_svm_pred_tfidf_w2v, target_names=target_names)
print("Report: \n", svm_report)

In [None]:
# Generate the confusion matrix


cm = confusion_matrix(y_test, y_svm_pred_tfidf_w2v)

# Display the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Hateful', 'Normal', 'Offensive'], yticklabels=['Hateful', 'Normal', 'Offensive'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.show()

<h1>fastText SVM Model</h1>

<h3>Variance in performance</h3> 

In [None]:
svm_classifier = SVC(random_state=42)

In [None]:
#Using validation set to test for variance in accuracy results using k fold validation

# stratified_10_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# cross_val_scores = cross_val_score(svm_classifier, train_X, train_y, cv=stratified_10_fold, scoring='accuracy')
# accuracy_variance = np.var(cross_val_scores)

# # Print the accuracy scores for each fold
# print("Accuracy scores for each fold:", cross_val_scores)

# # Print the variance of the accuracy scores
# print("Variance in accuracy scores:", accuracy_variance)

<h3>Final Classifier</h3>

In [None]:
svm_classifier.fit(X_train_ft, y_train)

In [None]:
# Prediciting document labels for test data
y_svm_pred_ft = svm_classifier.predict(X_test_ft)

svm_report = classification_report(y_test, y_svm_pred_ft, target_names=target_names)
print("Report: \n", svm_report)

In [None]:
# Generate the confusion matrix


cm = confusion_matrix(y_test, y_svm_pred_ft)

# Display the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Hateful', 'Normal', 'Offensive'], yticklabels=['Hateful', 'Normal', 'Offensive'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.show()

<h1>Doc2Vec SVM Model</h1>

<h3>Variance in performance</h3> 

In [None]:
svm_classifier = SVC(random_state=42)

In [None]:
#Using validation set to test for variance in accuracy results using k fold validation

# stratified_10_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# cross_val_scores = cross_val_score(svm_classifier, train_X, train_y, cv=stratified_10_fold, scoring='accuracy')
# accuracy_variance = np.var(cross_val_scores)

# # Print the accuracy scores for each fold
# print("Accuracy scores for each fold:", cross_val_scores)

# # Print the variance of the accuracy scores
# print("Variance in accuracy scores:", accuracy_variance)

<h3>Final Classifier</h3>

In [None]:
svm_classifier.fit(X_train_d2v, y_train)

In [None]:
# Prediciting document labels for test data
y_svm_pred_d2v = svm_classifier.predict(X_test_d2v)

svm_report = classification_report(y_test, y_svm_pred_d2v, target_names=target_names)
print("Report: \n", svm_report)

In [None]:
# Generate the confusion matrix


cm = confusion_matrix(y_test, y_svm_pred_d2v)

# Display the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Hateful', 'Normal', 'Offensive'], yticklabels=['Hateful', 'Normal', 'Offensive'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.show()

<h1>ELMo SVM Model</h1>

<h3>Variance in performance</h3> 

In [None]:
svm_classifier = SVC(random_state=42)

In [None]:
#Using validation set to test for variance in accuracy results using k fold validation

# stratified_10_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# cross_val_scores = cross_val_score(svm_classifier, train_X, train_y, cv=stratified_10_fold, scoring='accuracy')
# accuracy_variance = np.var(cross_val_scores)

# # Print the accuracy scores for each fold
# print("Accuracy scores for each fold:", cross_val_scores)

# # Print the variance of the accuracy scores
# print("Variance in accuracy scores:", accuracy_variance)

<h3>Final Classifier</h3>

In [None]:
svm_classifier.fit(X_train_elmo, y_train)

In [None]:
# Prediciting document labels for test data
y_svm_pred_elmo = svm_classifier.predict(X_test_elmo)

svm_report = classification_report(y_test, y_svm_pred_elmo, target_names=target_names)
print("Report: \n", svm_report)

In [None]:
# Generate the confusion matrix


cm = confusion_matrix(y_test, y_svm_pred_elmo)

# Display the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Hateful', 'Normal', 'Offensive'], yticklabels=['Hateful', 'Normal', 'Offensive'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.show()