In [17]:
# imports
import pandas as pd

from keras import models
from keras import layers
from keras import optimizers
from keras import losses
from keras import metrics

from keras.callbacks import EarlyStopping
from keras import regularizers
from tensorflow.keras.models import load_model

import numpy as np

from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from Data_Pre_Process import pre_process_corpus, tfidf, bert_encoded

In [4]:
# reading the preprocessed dataset
DF = pd.read_csv('my_dataset.csv')

In [5]:
df_train=DF[:round(0.7*len(DF))]
df_test=DF[round(0.7*len(DF)):]

In [6]:
# load bert encoded data
X = np.load('encoded_train_tweet.npy')
X_test = np.load('encoded_test_tweet.npy')

In [7]:
# load tfidf encoded data
tfidf_train = np.load("tfidf_train_tweet.npy")
tfidf_test = np.load("tfidf_test_tweet.npy")

In [8]:
y_test=df_test.label

In [9]:
y= df_train.label

# DNN model

In [24]:
# Configure early stopping based on validation accuracy with a patience of 10 epochs and restore the best weights
callback = EarlyStopping(monitor='val_acc', patience=10, restore_best_weights=True)

# Build a sequential neural network model
model = models.Sequential()

# Add dense layers with dropout and L2 regularization to prevent overfitting
model.add(layers.Dense(128, activation='relu', input_shape=(np.hstack((tfidf_train, X)).shape[1],)))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(l=0.01)))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(l=0.01)))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(1, activation='sigmoid'))

# Compile the model with RMSprop optimizer and binary crossentropy loss function
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Output the model summary to understand the architecture
print(model.summary())

# Train the model using combined TF-IDF features and additional features 'X', with callbacks for early stopping
history = model.fit(np.hstack((tfidf_train, X)), y, epochs=50, batch_size=512, 
                    validation_data=(np.hstack((tfidf_test, X_test)), y_test), callbacks=[callback])


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 128)               4214272   
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_6 (Dense)             (None, 128)               16512     
                                                                 
 dropout_5 (Dropout)         (None, 128)               0         
                                                                 
 dense_7 (Dense)             (None, 128)               16512     
                                                                 
 dropout_6 (Dropout)         (None, 128)               0         
                                                                 
 dense_8 (Dense)             (None, 1)                

In [25]:
# save the model
model.save('dnn_model.keras')

In [10]:
# load the model
dnn_model = load_model('dnn_model.keras')
dnn_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 128)               4214272   
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_6 (Dense)             (None, 128)               16512     
                                                                 
 dropout_5 (Dropout)         (None, 128)               0         
                                                                 
 dense_7 (Dense)             (None, 128)               16512     
                                                                 
 dropout_6 (Dropout)         (None, 128)               0         
                                                                 
 dense_8 (Dense)             (None, 1)                

In [48]:
# Define a function to calculate the F1 score for model predictions above a specified threshold
def get_f1_score(model, test_data, true_labels, threshold=0.5):
    
    pred= model.predict(test_data)
    y_hat= np.zeros_like(pred)
    y_hat_idx= np.where(pred>threshold)[0]
    
    for i in range(len(pred)):
        if i in y_hat_idx:
            y_hat[i]=1
    
    return f1_score(true_labels, y_hat)

In [49]:
# calculating the optimal threshold for our model
f1_scores = [get_f1_score(model, np.hstack((tfidf_test, X_test)), y_test, threshold/100) for threshold in range(90)]
f1_scores = np.array(f1_scores)
optimal_threshold = np.argmax(f1_scores) / 100

print(f"Optimal Threshold: {optimal_threshold}")

Optimal Threshold: 0.42


In [58]:
# Retrieve the F1 score corresponding to the optimal threshold from the dictionary 'f1_scores'
f1_scores[int(optimal_threshold * 100)]


0.6935123042505593

# Logistic Regression

In [64]:
# Initialize a logistic regression model and train it using only TF-IDF features
log_reg = LogisticRegression()
tfidf_test_dense = np.asarray(tfidf_test)  
log_reg.fit(tfidf_train, y) 

# Predict and evaluate using the TF-IDF features on the test set
predictions_tfidf_only = log_reg.predict(tfidf_test)
f1_tfidf_only = f1_score(y_test, predictions_tfidf_only) 
print("F1 Score with TF-IDF only:", f1_tfidf_only) 

# Retrain the logistic regression model using combined TF-IDF and additional features
log_reg.fit(np.hstack((tfidf_train, X)), y) 
predictions_combined = log_reg.predict(np.hstack((tfidf_test, X_test))) 
f1_combined = f1_score(y_test, predictions_combined)
print("F1 Score with combined features:", f1_combined) 


F1 Score with TF-IDF only: 0.42403628117913833


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


F1 Score with combined features: 0.6330196749358425


# Random Forest

In [None]:
# Initialize a RandomForestClassifier and train it using only TF-IDF features
rf_classifier = RandomForestClassifier()
rf_classifier.fit(tfidf_train, y)  

# Predict using the TF-IDF features on the test set and calculate the F1 score
predictions_tfidf_only = rf_classifier.predict(tfidf_test)
f1_tfidf_only = f1_score(y_test, predictions_tfidf_only)
print("F1 Score with TF-IDF only:", f1_tfidf_only)  

# Retrain the RandomForestClassifier using combined TF-IDF and additional features
rf_classifier.fit(np.hstack((tfidf_train, X)), y)  
predictions_combined = rf_classifier.predict(np.hstack((tfidf_test, X_test))) 
f1_combined = f1_score(y_test, predictions_combined)  
print("F1 Score with combined features:", f1_combined) 


In [18]:
# Define a function to classify input text as hate speech or not using a pre-trained deep neural network (DNN) model
def classify_text():

    input_message = input("Enter a text to classify as hate speech or not: ")
    input_text = pd.Series([input_message]) 

    preprocessed_text_tfidf = pre_process_corpus(input_text)[0]
    tfidf_features = tfidf.transform([preprocessed_text_tfidf]).todense()

    bert_features = bert_encoded.encode([preprocessed_text_tfidf])

    combined_features = np.hstack((tfidf_features, bert_features))

    prediction = dnn_model.predict(combined_features)
    predicted_class = 'Hate Speech' if prediction[0] >= 0.5 else 'Not Hate Speech'  

    print(f"Prediction: {predicted_class}")


In [19]:
classify_text()

Enter a text to classify as hate speech or not: It's just a fact that Black people are less intelligent than whites
Prediction: Hate Speech
