# Data Preparation

In [1]:
import pandas as pd

df = pd.read_csv('/content/reddit_dataset.csv')

# Count the occurrences of 0 and 1 in the target column
value_counts = df['label'].value_counts()

# Get the counts for 0 and 1, defaulting to 0 if not present
count_0 = value_counts.get(0, 0)
count_1 = value_counts.get(1, 0)

print(f"Number of Normal comments: {count_0}")
print(f"Number of Hate comments: {count_1}")

Number of Normal comments: 16462
Number of Hate comments: 5192


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('/content/reddit_dataset.csv')

X = df.drop('label', axis=1)
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

print(f"Training set size: {train_df.shape[0]}")
print(f"Testing set size: {test_df.shape[0]}")

Training set size: 17323
Testing set size: 4331


In [3]:
print("===TRAINING DATASET===")
print(f"Number of Normal comments in train_df: {(train_df['label'] == 0).sum()}")
print(f"Number of Hate comments in train_df: {(train_df['label'] == 1).sum()}")
print("===TESTING DATASET===")
print(f"Number of Normal comments in test_df: {(test_df['label'] == 0).sum()}")
print(f"Number of Hate comments in test_df: {(test_df['label'] == 1).sum()}")

===TRAINING DATASET===
Number of Normal comments in train_df: 13169
Number of Hate comments in train_df: 4154
===TESTING DATASET===
Number of Normal comments in test_df: 3293
Number of Hate comments in test_df: 1038


In [4]:
train_sentences = train_df["reddit_comment"].to_numpy()
val_sentences = test_df["reddit_comment"].to_numpy()

train_labels = train_df["label"].to_numpy()
val_labels = test_df["label"].to_numpy()

In [5]:
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(17323, 17323, 4331, 4331)

# Converting Text into Numbers

## Text vectorization (tokenization)

In [6]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

text_vectorizer = TextVectorization(max_tokens=None, # how many words in the vocabulary (all of the different words in your text)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None) # how long should the output sequence of tokens be?
                                    # pad_to_max_tokens=True) # Not valid if using max_tokens=None

In [7]:
# Find average number of tokens (words) in training Tweets
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

47

In [8]:
# Setup text vectorization with custom variables
max_vocab_length = 10000 # max number of words to have in our vocabulary
max_length = 47 # max length our sequences will be (e.g. how many words from a comment does the model see)

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [9]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(train_sentences)

In [10]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common tokens ([UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}")
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'the', 'to', 'a']
Bottom 5 least common words: ['distraction', 'distracting', 'distant', 'dissect', 'disprove']


## Creating an Embedding using an Embedding Layer

In [11]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             name="embedding_1")

# Model 0: Naive Bayes (baseline)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()),
                    ("clf", MultinomialNB())
])

model_0.fit(train_sentences, train_labels)

In [13]:
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Naive Bayes (baseline) model achieves an accuracy of: {baseline_score*100:.2f}%")

Naive Bayes (baseline) model achieves an accuracy of: 77.26%


In [14]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  # Convert TensorFlow tensors to NumPy arrays if necessary
  y_true = y_true.numpy() if isinstance(y_true, tf.Tensor) else y_true
  y_pred = y_pred.numpy() if isinstance(y_pred, tf.Tensor) else y_pred

  model_accuracy = accuracy_score(y_true, y_pred) * 100
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results

In [15]:
baseline_preds = model_0.predict(val_sentences)
baseline_results = calculate_results(y_true=val_labels,
                                     y_pred=baseline_preds)
baseline_results

{'accuracy': 77.25698453013162,
 'precision': 0.8033889160716947,
 'recall': 0.7725698453013161,
 'f1': 0.6868947580662076}

# Model 1: Feed-forward Neural Network (Dense Model)

In [16]:
# Build model with the Functional API
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")

In [17]:
# Compile model
model_1.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [18]:
# Get a summary of the model
model_1.summary()

In [19]:
# Fit the model
model_1_history = model_1.fit(train_sentences, # input sentences can be a list of strings due to text preprocessing layer built-in model
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              )

Epoch 1/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - accuracy: 0.7582 - loss: 0.5559 - val_accuracy: 0.7638 - val_loss: 0.4916
Epoch 2/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.7823 - loss: 0.4516 - val_accuracy: 0.8340 - val_loss: 0.3915
Epoch 3/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.8698 - loss: 0.3355 - val_accuracy: 0.8735 - val_loss: 0.3512
Epoch 4/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8993 - loss: 0.2741 - val_accuracy: 0.8832 - val_loss: 0.3459
Epoch 5/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9118 - loss: 0.2406 - val_accuracy: 0.8850 - val_loss: 0.3533


In [20]:
model_1.evaluate(val_sentences, val_labels)

[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8855 - loss: 0.3561


[0.35332340002059937, 0.8850150108337402]

In [21]:
# Make predictions (these come back in the form of probabilities)
model_1_pred_probs = model_1.predict(val_sentences)
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs)) # squeeze removes single dimensions
model_1_results = calculate_results(y_true=val_labels,
                                    y_pred=model_1_preds)
model_1_results

[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


{'accuracy': 88.50150080812746,
 'precision': 0.8819131223644537,
 'recall': 0.8850150080812745,
 'f1': 0.879411615271778}

# Model 2: LSTM

In [22]:
# Set random seed and create embedding layer (new embedding layer for each model)
tf.random.set_seed(42)
from tensorflow.keras import layers
model_2_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     name="embedding_2")


# Create LSTM model
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_2_embedding(x)
print(x.shape)
x = layers.LSTM(64)(x) # return vector for whole sequence
print(x.shape)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

(None, 47, 128)
(None, 64)


In [23]:
# Compile model
model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [24]:
model_2.summary()

In [25]:
# Fit model
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              )

Epoch 1/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.7605 - loss: 0.5577 - val_accuracy: 0.8488 - val_loss: 0.4128
Epoch 2/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.8497 - loss: 0.4026 - val_accuracy: 0.8714 - val_loss: 0.3667
Epoch 3/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.8936 - loss: 0.3061 - val_accuracy: 0.8783 - val_loss: 0.3414
Epoch 4/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.9149 - loss: 0.2549 - val_accuracy: 0.8497 - val_loss: 0.3821
Epoch 5/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.9342 - loss: 0.2114 - val_accuracy: 0.8333 - val_loss: 0.4689


In [26]:
# Make predictions on the validation dataset
model_2_pred_probs = model_2.predict(val_sentences)
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_results = calculate_results(y_true=val_labels,
                                    y_pred=model_2_preds)
model_2_results

[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


{'accuracy': 83.32948510736551,
 'precision': 0.846904581989087,
 'recall': 0.8332948510736551,
 'f1': 0.8380883561847527}

# Model 3: GRU

In [27]:
# Set random seed and create embedding layer (new embedding layer for each model)
tf.random.set_seed(42)
from tensorflow.keras import layers
model_3_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     name="embedding_3")

# Build an RNN using the GRU cell
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_3_embedding(x)
x = layers.GRU(64)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3_GRU")

In [28]:
# Compile GRU model
model_3.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [29]:
# Get a summary of the GRU model
model_3.summary()

In [30]:
# Fit model
model_3_history = model_3.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              )

Epoch 1/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.7600 - loss: 0.5654 - val_accuracy: 0.7606 - val_loss: 0.5488
Epoch 2/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.8105 - loss: 0.4572 - val_accuracy: 0.8878 - val_loss: 0.3398
Epoch 3/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.9035 - loss: 0.2846 - val_accuracy: 0.8693 - val_loss: 0.3637
Epoch 4/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.9292 - loss: 0.2230 - val_accuracy: 0.8543 - val_loss: 0.4320
Epoch 5/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.9461 - loss: 0.1644 - val_accuracy: 0.8474 - val_loss: 0.5253


In [31]:
# Make predictions on the validation data
model_3_pred_probs = model_3.predict(val_sentences)
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_results = calculate_results(y_true=val_labels,
                                    y_pred=model_3_preds)
model_3_results

[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step


{'accuracy': 84.73793581159086,
 'precision': 0.850581979701888,
 'recall': 0.8473793581159086,
 'f1': 0.8487983703217429}

# Model 4: Bidirectonal RNN

In [32]:
# Set random seed and create embedding layer (new embedding layer for each model)
tf.random.set_seed(42)
from tensorflow.keras import layers
model_4_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     name="embedding_4")

# Build a Bidirectional RNN in TensorFlow
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_4_embedding(x)
x = layers.Bidirectional(layers.LSTM(64))(x) # bidirectional goes both ways so has double the parameters of a regular LSTM layer
outputs = layers.Dense(1, activation="sigmoid")(x)
model_4 = tf.keras.Model(inputs, outputs, name="model_4_Bidirectional")

In [33]:
# Compile
model_4.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [34]:
# Get a summary of our bidirectional model
model_4.summary()

In [35]:
# Fit the model (takes longer because of the bidirectional layers)
model_4_history = model_4.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              )

Epoch 1/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.8148 - loss: 0.4625 - val_accuracy: 0.8926 - val_loss: 0.3211
Epoch 2/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.8974 - loss: 0.2846 - val_accuracy: 0.8906 - val_loss: 0.3186
Epoch 3/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.9217 - loss: 0.2208 - val_accuracy: 0.8751 - val_loss: 0.3608
Epoch 4/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.9478 - loss: 0.1545 - val_accuracy: 0.8670 - val_loss: 0.4680
Epoch 5/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.9620 - loss: 0.1103 - val_accuracy: 0.8580 - val_loss: 0.5372


In [36]:
# Make predictions with bidirectional RNN on the validation data
model_4_pred_probs = model_4.predict(val_sentences)
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_results = calculate_results(val_labels, model_4_preds)
model_4_results

[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


{'accuracy': 85.80004617871161,
 'precision': 0.8557309395429343,
 'recall': 0.8580004617871161,
 'f1': 0.8567071594728467}

# Model 5: 1D Convolutional Neural Network

In [37]:
# Set random seed and create embedding layer (new embedding layer for each model)
tf.random.set_seed(42)
from tensorflow.keras import layers
model_5_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     name="embedding_5")

# Create 1-dimensional convolutional layer to model sequences
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_5_embedding(x)
x = layers.Conv1D(filters=32, kernel_size=5, activation="relu")(x)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_5 = tf.keras.Model(inputs, outputs, name="model_5_Conv1D")

In [38]:
# Compile Conv1D model
model_5.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [39]:
# Get a summary of our 1D convolution model
model_5.summary()

In [40]:
# Fit the model
model_5_history = model_5.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              )

Epoch 1/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.7680 - loss: 0.4729 - val_accuracy: 0.8943 - val_loss: 0.3154
Epoch 2/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.8987 - loss: 0.2862 - val_accuracy: 0.8864 - val_loss: 0.3249
Epoch 3/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.9215 - loss: 0.2148 - val_accuracy: 0.8719 - val_loss: 0.3619
Epoch 4/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.9570 - loss: 0.1322 - val_accuracy: 0.8652 - val_loss: 0.4202
Epoch 5/5
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9802 - loss: 0.0734 - val_accuracy: 0.8626 - val_loss: 0.4892


In [41]:
# Make predictions with model_5
model_5_pred_probs = model_5.predict(val_sentences)
model_5_preds = tf.squeeze(tf.round(model_5_pred_probs))
model_5_results = calculate_results(y_true=val_labels,
                                    y_pred=model_5_preds)
model_5_results

[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


{'accuracy': 86.26183329485107,
 'precision': 0.8568710444556901,
 'recall': 0.8626183329485108,
 'f1': 0.8567280988046929}

# Combining Models (Ensembling/Stacking)

In [43]:
import numpy as np

# Get mean pred probs for 3 models
baseline_pred_probs = np.max(model_0.predict_proba(val_sentences), axis=1) # get the prediction probabilities from baseline model
combined_pred_probs = baseline_pred_probs + tf.squeeze(model_1_pred_probs, axis=1) + tf.squeeze(model_5_pred_probs)
combined_preds = tf.round(combined_pred_probs/3) # average and round the prediction probabilities to get prediction classes

In [44]:
# Calculate results from averaging the prediction probabilities
ensemble_results = calculate_results(val_labels, combined_preds)
ensemble_results

{'accuracy': 87.11613945970907,
 'precision': 0.8694510682837723,
 'recall': 0.8711613945970907,
 'f1': 0.8701940071871923}

# Comparing the performance of all the models

In [46]:
# Combine model results into a DataFrame
all_model_results = pd.DataFrame({"baseline": baseline_results,
                                  "simple_dense": model_1_results,
                                  "lstm": model_2_results,
                                  "gru": model_3_results,
                                  "bidirectional": model_4_results,
                                  "conv1d": model_5_results,
                                  "ensemble": ensemble_results})

all_model_results = all_model_results.transpose()
all_model_results

Unnamed: 0,accuracy,precision,recall,f1
baseline,77.256985,0.803389,0.77257,0.686895
simple_dense,88.501501,0.881913,0.885015,0.879412
lstm,83.329485,0.846905,0.833295,0.838088
gru,84.737936,0.850582,0.847379,0.848798
bidirectional,85.800046,0.855731,0.858,0.856707
conv1d,86.261833,0.856871,0.862618,0.856728
ensemble,87.116139,0.869451,0.871161,0.870194


# Saving a trained model

In [48]:
# Save Simple Dense model
model_1.save("model_1.keras")