In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from keras.models import Sequential
from keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
# Load data into a pandas DataFrame
data = pd.read_csv('combined_data.csv')
data = data.dropna()

# Separate the feature (textual data) and target (binary labels) columns
X = data['text']
y = data['isHate']

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Vectorize the textual data using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [6]:
# Oversample the texts class
smote = SMOTE(random_state=42)
X_train_tfidf_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

In [7]:
# Train Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf_resampled, y_train_resampled)

LogisticRegression()

In [8]:
# Train Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf_resampled, y_train_resampled)

MultinomialNB()

In [9]:
# Train Support Vector Machine
svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(X_train_tfidf_resampled, y_train_resampled)

SVC(kernel='linear')

In [10]:
X_train_tfidf_resampled = X_train_tfidf_resampled.toarray()

In [11]:
# Simple Neural Network architecture
nn_model = Sequential()
nn_model.add(Dense(64, input_shape=(X_train_tfidf_resampled.shape[1],), activation='relu'))
nn_model.add(Dropout(0.5)) 
nn_model.add(Dense(32, activation='relu'))
nn_model.add(Dropout(0.5)) 
nn_model.add(Dense(1, activation='sigmoid'))

In [12]:
# Compile the model
nn_model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])

In [13]:
es = EarlyStopping(patience=50, restore_best_weights=True, monitor="loss")

In [14]:
# Train the model
nn_model.fit(X_train_tfidf_resampled, y_train_resampled, epochs=10, batch_size=32, verbose=1, callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ffbf6cb5d60>

In [15]:
# Make predictions on the testing set
lr_pred = lr_model.predict(X_test_tfidf)
nb_pred = nb_model.predict(X_test_tfidf)
svm_pred = svm_model.predict(X_test_tfidf)
nn_pred = nn_model.predict_classes(X_test_tfidf)



In [16]:
# Print the classification report and accuracy score for each model
print("Tf-Idf Results")
print()

print('Logistic Regression:')
print(classification_report(y_test, lr_pred))
print('Accuracy score:', accuracy_score(y_test, lr_pred))
print()


print('Naive Bayes:')
print(classification_report(y_test, nb_pred))
print('Accuracy score:', accuracy_score(y_test, nb_pred))
print()


print('Support Vector Machine:')
print(classification_report(y_test, svm_pred))
print('Accuracy score:', accuracy_score(y_test, svm_pred))
print()

print('Neural Network:')
print(classification_report(y_test, nn_pred))
print('Accuracy score:', accuracy_score(y_test, nn_pred))

Tf-Idf Results

Logistic Regression:
              precision    recall  f1-score   support

           0       0.97      0.89      0.93      4763
           1       0.32      0.61      0.42       390

    accuracy                           0.87      5153
   macro avg       0.64      0.75      0.68      5153
weighted avg       0.92      0.87      0.89      5153

Accuracy score: 0.8734717640209587

Naive Bayes:
              precision    recall  f1-score   support

           0       0.97      0.85      0.91      4763
           1       0.27      0.65      0.38       390

    accuracy                           0.84      5153
   macro avg       0.62      0.75      0.64      5153
weighted avg       0.91      0.84      0.87      5153

Accuracy score: 0.8379584707937124

Support Vector Machine:
              precision    recall  f1-score   support

           0       0.96      0.90      0.93      4763
           1       0.30      0.52      0.38       390

    accuracy                        