# Implementation of Other Models

In [3]:
# Load dataset
import os

import numpy as np
import pandas as pd

# Feature tables read from CSV. Both have an "id" column; the training table
# also has the "label" target column.
train_df = pd.read_csv("./dataset/train_tfidf_features.csv")
test_df = pd.read_csv("./dataset/test_tfidf_features.csv")

# Model inputs are every column except the identifier (and, for training, the target).
X_train = train_df.drop(["id", "label"], axis=1)
y_train = train_df["label"]
X_test = test_df.drop(["id"], axis=1)

# Directory that receives the prediction CSVs. exist_ok=True keeps the cell
# idempotent on re-run and avoids the check-then-create race of
# `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs('./predictions', exist_ok=True)

def save_predictions(y_pred, filename):
    """Write predictions to ``filename`` as a CSV with "id" and "label" columns.

    The "id" values are taken from the notebook-level ``test_df``; ``y_pred``
    supplies the "label" column. The DataFrame index is not written.
    """
    columns = {"id": test_df["id"], "label": y_pred}
    pd.DataFrame(columns).to_csv(filename, index=False)

In [4]:
# Functions for self-evaluation
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels of equal length. Lists, NumPy
        arrays and pandas Series are accepted.

    Returns
    -------
    float
        Share of matching positions; 0.0 when the input is empty.
    """
    # Coerce to arrays so the comparison is element-wise. Comparing two plain
    # lists with == yields a single bool and would silently give a wrong score.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Nothing to score: return 0.0 rather than dividing by zero (NaN + warning).
    if y_true.size == 0:
        return 0.0

    return np.sum(y_true == y_pred) / len(y_true)

def f1_score(y_true, y_pred, class_label):
    """Return the F1 score of ``class_label`` treated as the positive class.

    Computed as ``tp / (tp + 0.5 * (fp + fn))``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels of equal length. Lists, NumPy
        arrays and pandas Series are accepted.
    class_label : scalar
        Label counted as "positive" for this score.

    Returns
    -------
    float
        The F1 score, or 0.0 when the class never occurs in either input
        (tp, fp and fn are all zero).
    """
    # Coerce to arrays so `== class_label` is element-wise. On a plain list the
    # comparison is a single False and every count would collapse to zero.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    tp = np.sum((y_true == class_label) & (y_pred == class_label))
    fp = np.sum((y_true != class_label) & (y_pred == class_label))
    fn = np.sum((y_true == class_label) & (y_pred != class_label))

    denominator = tp + 0.5 * (fp + fn)
    if denominator == 0:
        # Degenerate case: return a float so the return type is uniform.
        return 0.0

    return tp / denominator

def macro_f1_score(y_true, y_pred):
    """Return the unweighted mean of the F1 scores for class 1 (hateful) and class 0 (non-hateful)."""
    per_class = [f1_score(y_true, y_pred, class_label=label) for label in (1, 0)]
    return sum(per_class) / 2

In [8]:
# PCA dimensionality reduction and logistic regression
# 2000 components seemed to be too many features (see task 2), so 1000, 500 and 100
# are also tried to reduce time complexity.
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegressionCV

components_list = [2000, 1000, 500, 100]

for n_components in components_list:
    # Seed PCA like every other estimator in this notebook. The default
    # svd_solver='auto' can select the randomized solver for large inputs, and
    # without a seed the projection (and the saved predictions) vary between runs.
    pca = PCA(n_components=n_components, random_state=0)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # A fresh estimator per setting, so no fitted state is carried between iterations.
    log_reg_cv = LogisticRegressionCV(cv=5, random_state=0) # max_iter 1000 no dif
    log_reg_cv.fit(X_train_pca, y_train)

    y_pred = log_reg_cv.predict(X_test_pca)

    # Self evaluation on training set (the model has seen this data, so the
    # scores are optimistic)
    y_train_pred = log_reg_cv.predict(X_train_pca)

    print("Components:", n_components)
    print("Training set accuracy:", accuracy(y_train, y_train_pred))
    print("F1 Score for Hateful (class 1):", f1_score(y_train, y_train_pred, 1))
    print("F1 Score for Non-Hateful (class 0):", f1_score(y_train, y_train_pred, 0))
    print("Macro F1 Score:", macro_f1_score(y_train, y_train_pred))

    save_predictions(y_pred, f'./predictions/LogisticRegressionCV_PCA_{n_components}_components_Predictions.csv')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Components: 2000
Training set accuracy: 0.7832867783985102
F1 Score for Hateful (class 1): 0.684085510688836
F1 Score for Non-Hateful (class 0): 0.8350752878653676
Macro F1 Score: 0.7595803992771017


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Components: 1000
Training set accuracy: 0.7543063314711359
F1 Score for Hateful (class 1): 0.6357832988267771
F1 Score for Non-Hateful (class 0): 0.8146294344924482
Macro F1 Score: 0.7252063666596127
Components: 500
Training set accuracy: 0.7371973929236499
F1 Score for Hateful (class 1): 0.60886887233674
F1 Score for Non-Hateful (class 0): 0.8021207606695294
Macro F1 Score: 0.7054948165031347
Components: 100
Training set accuracy: 0.6881401303538175
F1 Score for Hateful (class 1): 0.4970436414828719
F1 Score for Non-Hateful (class 0): 0.7740058195926285
Macro F1 Score: 0.6355247305377502


In [5]:
# PCA dimensionality reduction and SVM, as SVMs typically perform better with high-dimensional and unstructured datasets, such as image and text data, compared to logistic regression.
from sklearn.decomposition import PCA
from sklearn.svm import SVC

# Seed PCA like every other estimator in this notebook. The default
# svd_solver='auto' can select the randomized solver for large inputs, and
# without a seed the projection (and the saved predictions) vary between runs.
pca = PCA(n_components=500, random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

kernels = ['linear', 'poly', 'rbf', 'sigmoid']

# Every kernel is trained on the same 500-component projection.
for kernel in kernels:
    svm = SVC(kernel=kernel, random_state=0)
    svm.fit(X_train_pca, y_train)

    y_pred = svm.predict(X_test_pca)

    # Self evaluation on training set (the model has seen this data, so the
    # scores are optimistic)
    y_train_pred = svm.predict(X_train_pca)

    print("Kernel:", kernel)
    print("Training set accuracy:", accuracy(y_train, y_train_pred))
    print("F1 Score for Hateful (class 1):", f1_score(y_train, y_train_pred, 1))
    print("F1 Score for Non-Hateful (class 0):", f1_score(y_train, y_train_pred, 0))
    print("Macro F1 Score:", macro_f1_score(y_train, y_train_pred))

    save_predictions(y_pred, f'./predictions/SVM_PCA_500_components_{kernel}_Predictions.csv')

Kernel: linear
Training set accuracy: 0.7314944134078212
F1 Score for Hateful (class 1): 0.5715081723625557
F1 Score for Non-Hateful (class 0): 0.8044915254237288
Macro F1 Score: 0.6879998488931423
Kernel: poly
Training set accuracy: 0.8564944134078212
F1 Score for Hateful (class 1): 0.7715397443023903
F1 Score for Non-Hateful (class 0): 0.8953932298294731
Macro F1 Score: 0.8334664870659316
Kernel: rbf
Training set accuracy: 0.8780842644320298
F1 Score for Hateful (class 1): 0.8254893794252395
F1 Score for Non-Hateful (class 0): 0.9063184724768591
Macro F1 Score: 0.8659039259510493
Kernel: sigmoid
Training set accuracy: 0.6504888268156425
F1 Score for Hateful (class 1): 0.4891988433407042
F1 Score for Non-Hateful (class 0): 0.7343653250773994
Macro F1 Score: 0.6117820842090518


In [9]:
# Decision Tree Classifier, no tuning
from sklearn.tree import DecisionTreeClassifier

# Untuned tree trained on the full feature matrix (no PCA).
decision_tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Test-set predictions for the output file, training-set predictions for self evaluation.
y_pred = decision_tree.predict(X_test)
y_train_pred = decision_tree.predict(X_train)

# Self evaluation on training set. Shows a lot of overfitting
for caption, score in (
    ("Training set accuracy:", accuracy(y_train, y_train_pred)),
    ("F1 Score for Hateful (class 1):", f1_score(y_train, y_train_pred, 1)),
    ("F1 Score for Non-Hateful (class 0):", f1_score(y_train, y_train_pred, 0)),
    ("Macro F1 Score:", macro_f1_score(y_train, y_train_pred)),
):
    print(caption, score)

save_predictions(y_pred, './predictions/DecisionTree_Predictions.csv')

Training set accuracy: 0.996566573556797
F1 Score for Hateful (class 1): 0.9954903309638462
F1 Score for Non-Hateful (class 0): 0.9972280949025135
Macro F1 Score: 0.9963592129331799


In [15]:
# Decision Tree Classifier with max_depth after considering overfitting. Performs worse
max_depth_values = [10, 100, 500]

# One depth-limited tree per setting; each writes its own predictions file.
for max_depth in max_depth_values:
    decision_tree = DecisionTreeClassifier(max_depth=max_depth, random_state=0).fit(X_train, y_train)
    y_pred = decision_tree.predict(X_test)
    save_predictions(y_pred, f'./predictions/DecisionTree_max_depth_{max_depth}_Predictions.csv')

In [17]:
# Random Forest Implementation, no tuning
from sklearn.ensemble import RandomForestClassifier

# 200-tree forest trained on the full feature matrix (no PCA).
rf = RandomForestClassifier(random_state=0, n_estimators=200)
rf.fit(X_train, y_train)

# Test-set predictions for the output file, training-set predictions for self evaluation.
y_pred = rf.predict(X_test)
y_train_pred = rf.predict(X_train)

# Self evaluation on training set
for caption, score in (
    ("Training set accuracy:", accuracy(y_train, y_train_pred)),
    ("F1 Score for Hateful (class 1):", f1_score(y_train, y_train_pred, 1)),
    ("F1 Score for Non-Hateful (class 0):", f1_score(y_train, y_train_pred, 0)),
    ("Macro F1 Score:", macro_f1_score(y_train, y_train_pred)),
):
    print(caption, score)

save_predictions(y_pred, './predictions/RandomForest_Predictions.csv')

Training set accuracy: 0.996566573556797
F1 Score for Hateful (class 1): 0.9954965269826731
F1 Score for Non-Hateful (class 0): 0.9972257488127145
Macro F1 Score: 0.9963611378976938
Random Forest predictions saved to ./predictions/RandomForest_Predictions.csv


In [7]:
# Random Forest Hyperparameter tuning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
# Import scikit-learn's f1_score under an alias. Importing it as `f1_score`
# would rebind the notebook's own f1_score(y_true, y_pred, class_label) helper
# and break macro_f1_score and every later cell that calls the helper.
from sklearn.metrics import make_scorer, f1_score as sk_f1_score
import numpy as np

param_distributions = {
    'n_estimators': [100, 200, 300, 400, 500],
    # 'auto' is intentionally not listed: it was deprecated and later removed
    # from scikit-learn, and for classifiers it was the same as 'sqrt'.
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=0)

# Select candidates by macro-averaged F1.
scorer = make_scorer(sk_f1_score, average='macro')

# 20 sampled candidates x 3 folds = 60 fits, run on all available cores.
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions,
                                   n_iter=20, cv=3, scoring=scorer, verbose=2, random_state=0, n_jobs=-1)

random_search.fit(X_train, y_train)

best_params = random_search.best_params_
best_rf = random_search.best_estimator_

print("Best Parameters:", best_params)

y_pred = best_rf.predict(X_test)

# Save predictions
save_predictions(y_pred, './predictions/RandomForest_Tuned_Predictions.csv')

# Self evaluation on training set (the model has seen this data, so the
# scores are optimistic)
y_train_pred = best_rf.predict(X_train)
print("Training set accuracy:", accuracy(y_train, y_train_pred))
print("F1 Score for Hateful (class 1):", sk_f1_score(y_train, y_train_pred, pos_label=1))
print("F1 Score for Non-Hateful (class 0):", sk_f1_score(y_train, y_train_pred, pos_label=0))
print("Macro F1 Score:", sk_f1_score(y_train, y_train_pred, average='macro'))

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [6]:
# Gradient Boosting using XGBoost
import xgboost as xgb

# Default XGBoost classifier with log-loss as the evaluation metric, trained
# on the full feature matrix (no PCA).
xgb_model = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=0)
xgb_model.fit(X_train, y_train)

# Test-set predictions for the output file, training-set predictions for self evaluation.
y_pred = xgb_model.predict(X_test)
y_train_pred = xgb_model.predict(X_train)

# Self evaluation on training set
for caption, score in (
    ("Training set accuracy:", accuracy(y_train, y_train_pred)),
    ("F1 Score for Hateful (class 1):", f1_score(y_train, y_train_pred, 1)),
    ("F1 Score for Non-Hateful (class 0):", f1_score(y_train, y_train_pred, 0)),
    ("Macro F1 Score:", macro_f1_score(y_train, y_train_pred)),
):
    print(caption, score)

save_predictions(y_pred, './predictions/XGBoost_Predictions.csv')

ModuleNotFoundError: No module named 'xgboost'