In [13]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split

In [14]:
import pandas as pd

df = pd.read_csv("./train.csv")

print("Missing values per column:")
print(df.isnull().sum())

# This is the attempt to replace the odd unicode characters with something else, but it doesnt seem to work yet
df["comment_text"] = df["comment_text"].str.replace("\n", " ")

df_cleaned = df.dropna()

X_train, X_test, y_train, y_test = train_test_split(df_cleaned["comment_text"], df["toxic"], test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
# No longer using this
#df_cleaned.to_csv("cleaned_train.csv", index=False)

Missing values per column:
id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64


**Train Loop**

In [9]:
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

train_sequences = tokenizer.texts_to_sequences(X_train)
val_sequences = tokenizer.texts_to_sequences(X_val)
test_sequences = tokenizer.texts_to_sequences(X_test)

max_length = 500
X_train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
X_val_padded = pad_sequences(val_sequences, maxlen=max_length, padding="post", truncating="post")
X_test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_length))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation="sigmoid"))

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

model.fit(X_train_padded, y_train, epochs=5, batch_size=32, validation_data=(X_val_padded, y_val))

test_loss, test_accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_accuracy:.4f}")

# Print model summary
model.summary()

Epoch 1/5




[1m3192/3192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1931s[0m 604ms/step - accuracy: 0.9388 - loss: 0.1819 - val_accuracy: 0.9584 - val_loss: 0.1134
Epoch 2/5
[1m3192/3192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1712s[0m 536ms/step - accuracy: 0.9657 - loss: 0.0934 - val_accuracy: 0.9593 - val_loss: 0.1122
Epoch 3/5
[1m3192/3192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1687s[0m 528ms/step - accuracy: 0.9707 - loss: 0.0773 - val_accuracy: 0.9593 - val_loss: 0.1097
Epoch 4/5
[1m3192/3192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1612s[0m 505ms/step - accuracy: 0.9749 - loss: 0.0647 - val_accuracy: 0.9546 - val_loss: 0.1245
Epoch 5/5
[1m3192/3192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1641s[0m 514ms/step - accuracy: 0.9813 - loss: 0.0497 - val_accuracy: 0.9581 - val_loss: 0.1308
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 183ms/step - accuracy: 0.9617 - loss: 0.1240
Test Loss: 0.1261, Test Acc: 0.9605


In [12]:
model.save("toxic_comment_rnn_model.keras")