# Setup

In [4]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

In [5]:
# Read the cleaned training CSV from the Kaggle input mount into a DataFrame.
data = pd.read_csv(
    "/kaggle/input/toxic-message-classifier-dataset/train_cleaned.csv"
)

# Actual Model

## Creating the encoder

In [15]:
# Experiment-wide knobs.
NUM_ROWS = 10000   # only the first NUM_ROWS rows of `data` are used
BATCH_SIZE = 64

# Text vectorizer settings.
MAX_TOKENS = 5000  # vocabulary cap passed to the vectorizer
MAX_LENGTH = 200   # fixed output sequence length per comment

encoder = tf.keras.layers.TextVectorization(
    max_tokens=MAX_TOKENS,
    output_sequence_length=MAX_LENGTH,
)
# Fit the vocabulary on the first NUM_ROWS comments only.
encoder.adapt(data["comment_text"].head(NUM_ROWS).tolist())

# Materialise the learned vocabulary for inspection.
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

# Last expression of the cell: the vocabulary size is what gets displayed.
len(vocab)

5000

## Creating the model

In [16]:
# Seed TensorFlow, NumPy and the Keras helper with one shared value so that
# reruns start from the same random state.
SEED = 1
tf.random.set_seed(SEED)
np.random.seed(SEED)
tf.keras.utils.set_random_seed(SEED)

# Labels the network is trained to predict. The six-label variant is kept
# below for reference; only "toxic" is active.
# TOXIC_CATEGORIES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
TOXIC_CATEGORIES = ["toxic"]

# Baseline: raw text -> token ids -> embeddings -> two stacked BiLSTMs ->
# dense head with one sigmoid output per entry in TOXIC_CATEGORIES.
model = tf.keras.Sequential(
    [
        encoder,
        tf.keras.layers.Embedding(
            input_dim=len(encoder.get_vocabulary()),
            output_dim=512,
            mask_zero=True,
        ),
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(64, return_sequences=True)
        ),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(len(TOXIC_CATEGORIES), activation="sigmoid"),
    ]
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"],
)


### Training the model

In [17]:
# Display the dimensions of the full training frame before it is subsampled.
data.shape

(149348, 9)

In [18]:
# Build a stratified train/validation split over the first NUM_ROWS comments.
# Only the labels listed in TOXIC_CATEGORIES are kept as targets.
multiDf = data.head(NUM_ROWS)[["comment_text"] + TOXIC_CATEGORIES]

x = multiDf["comment_text"]
y = multiDf[TOXIC_CATEGORIES]

splitter = StratifiedShuffleSplit(random_state=1, test_size=0.2)

min_train_size = 1000

# Stratify on the first label and take the first split the generator yields.
# test_size is a fixed fraction, so every split has the same training size:
# if the first one is too small, scanning the remaining splits cannot help.
train, test = next(splitter.split(x, y[TOXIC_CATEGORIES[0]]))
if len(train) < min_train_size:
    # Fail loudly here instead of leaving the split variables undefined (or
    # stale from a previous run) and crashing later inside model.fit.
    raise ValueError(
        f"Training split has {len(train)} rows, fewer than the required "
        f"{min_train_size}; increase NUM_ROWS or lower min_train_size."
    )

training_data = x.iloc[train]
training_target = y.iloc[train]
validation_data = x.iloc[test]
validation_target = y.iloc[test]


In [19]:
# Stop training once validation accuracy has failed to improve for 2 epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=2)

# Train the baseline model; `history` holds the object returned by fit.
history = model.fit(
    training_data,
    training_target,
    batch_size=32,
    epochs=10,
    validation_data=(validation_data, validation_target),
    callbacks=[callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


In [26]:
# Regularised variant of the baseline: wider BiLSTMs (128/64 units), an
# L2(0.01) kernel penalty on the recurrent and hidden dense layers, and a
# Dropout(0.5) after each of them.
model_1 = tf.keras.Sequential(
    [
        encoder,
        tf.keras.layers.Embedding(
            input_dim=len(encoder.get_vocabulary()),
            output_dim=512,
            mask_zero=True,
        ),
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(
                128,
                return_sequences=True,
                kernel_regularizer=tf.keras.regularizers.l2(0.01),
            )
        ),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(
                64,
                kernel_regularizer=tf.keras.regularizers.l2(0.01),
            )
        ),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(
            64,
            activation="relu",
            kernel_regularizer=tf.keras.regularizers.l2(0.01),
        ),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(len(TOXIC_CATEGORIES), activation="sigmoid"),
    ]
)

model_1.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"],
)

# Early stopping on validation accuracy, with a patience of 4 epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=4)

history = model_1.fit(
    training_data,
    training_target,
    batch_size=32,
    epochs=10,
    validation_data=(validation_data, validation_target),
    callbacks=[callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
# Deeper-head variant of the baseline: the recurrent stack is unchanged
# (BiLSTM 64 -> BiLSTM 32), but the single Dense(64) hidden layer is replaced
# by three Dense(128) layers, each followed by Dropout(0.5).
model_2 = tf.keras.Sequential(
    [
        encoder,
        tf.keras.layers.Embedding(
            input_dim=len(encoder.get_vocabulary()),
            output_dim=512,
            mask_zero=True,
        ),
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(64, return_sequences=True)
        ),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(len(TOXIC_CATEGORIES), activation="sigmoid"),
    ]
)

model_2.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"],
)

# Early stopping on validation accuracy, with a patience of 4 epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=4)

history = model_2.fit(
    training_data,
    training_target,
    batch_size=32,
    epochs=10,
    validation_data=(validation_data, validation_target),
    callbacks=[callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [29]:
# Increase the number of LSTM cells in the bidirectional LSTM layers
# (128/64 instead of 64/32). This can help the model learn more complex
# temporal dependencies.
#
# Dropout rates also differ per layer here (0.2 / 0.3 / 0.4).
model_3 = tf.keras.Sequential(
    [
        encoder,
        tf.keras.layers.Embedding(
            input_dim=len(encoder.get_vocabulary()),
            output_dim=512,
            mask_zero=True,
        ),
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(128, return_sequences=True)
        ),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(len(TOXIC_CATEGORIES), activation="sigmoid"),
    ]
)

model_3.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"],
)

# Early stopping on validation accuracy, with a patience of 4 epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=4)

history = model_3.fit(
    training_data,
    training_target,
    batch_size=32,
    epochs=10,
    validation_data=(validation_data, validation_target),
    callbacks=[callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


### Getting test data

In [None]:
# Load the test comments and their labels, then join the two on the comment id.
test_data = pd.read_csv("/kaggle/input/toxic-message-classifier-dataset/test.csv")
test_labels = pd.read_csv("/kaggle/input/toxic-message-classifier-dataset/test_labels.csv")

merged_df = test_labels.merge(test_data, on="id")

# Keep only rows where none of the six label columns equals -1.
# NOTE(review): presumably -1 marks rows without a usable label; confirm
# against the dataset description.
merged_df = merged_df.loc[
    (
        merged_df[
            ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
        ]
        != -1
    ).all(axis=1)
]

In [None]:
# Evaluate the baseline model on every row left in merged_df, i.e. rows whose
# labels are all different from -1.
test_df = merged_df["comment_text"]
test_target = merged_df[TOXIC_CATEGORIES]

model.evaluate(x=test_df, y=test_target)

In [None]:
# Evaluate only on rows where at least one tracked label equals 1.
test_df = merged_df["comment_text"]

# OR the per-label conditions together: a single positive label is enough.
query = " | ".join(f"({category} == 1)" for category in TOXIC_CATEGORIES)
filtered_df = merged_df.query(query)

filtered_test_dataset = filtered_df["comment_text"]
filtered_df_target = filtered_df[TOXIC_CATEGORIES]

model.evaluate(x=filtered_test_dataset, y=filtered_df_target)

In [None]:
# Evaluate only on rows where every tracked toxicity label is 0.
test_df = merged_df["comment_text"]

# "All labels are 0" requires AND-ing the per-label conditions. Joining them
# with OR would also admit rows where just one label is 0, which is a
# different subset as soon as TOXIC_CATEGORIES holds more than one label.
query = " & ".join([f"({label} == 0)" for label in TOXIC_CATEGORIES])
filtered_df = merged_df.query(query)

filtered_test_dataset = filtered_df["comment_text"]
filtered_df_target = filtered_df[TOXIC_CATEGORIES]
model.evaluate(filtered_test_dataset, filtered_df_target)

In [None]:
# Re-read the raw test comments; these are the rows the submission must cover.
submission_set = pd.read_csv(
    "/kaggle/input/toxic-message-classifier-dataset/test.csv"
)
submission_set.head()

# Run the baseline model over every test comment.
x_test = submission_set["comment_text"].values
y_testing = model.predict(
    x_test,
    batch_size=BATCH_SIZE,
    verbose=1,
)

# Last expression of the cell: display the raw model outputs.
y_testing

In [None]:
# Assemble the submission file: one row per test comment, one 0/1 column per
# toxicity label.
submission_df = pd.DataFrame(columns = ['id','toxic','severe_toxic','obscene','threat','insult','identity_hate'])

submission_df['id'] = submission_set['id']

# The model emits one output column per entry in TOXIC_CATEGORIES, in that
# order, so hard-coded positions 0..5 raise IndexError whenever fewer than six
# labels are trained. Threshold once (< 0.5 -> 0, otherwise 1) and look each
# label's column up by its position in TOXIC_CATEGORIES instead.
predicted_flags = np.where(np.asarray(y_testing) < 0.5, 0, 1)
for label in ['toxic','severe_toxic','obscene','threat','insult','identity_hate']:
    if label in TOXIC_CATEGORIES:
        submission_df[label] = predicted_flags[:, TOXIC_CATEGORIES.index(label)]
    else:
        # Label was not trained in this run: write 0 as a placeholder so the
        # file still has the full six-column schema.
        submission_df[label] = 0

submission_df.head()

submission_df.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
# Print the layer-by-layer summary of the baseline model.
model.summary()