# Retrieving The Data

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np

In [None]:
# Load the training data from "train.csv" (resolved against the working directory).
df = pd.read_csv("train.csv")
# Preview the first rows of the frame.
df.head()

# Preprocess

In [None]:
from tensorflow.keras.layers import TextVectorization

MAX_FEATURES = 200000  # upper bound on the vocabulary size

# Inputs are the raw comment strings; every column from the third one
# onwards is used as a label column.
X = df['comment_text']
y = df[df.columns[2:]].values

# Maps text to integer token ids, with an output sequence length of 1800.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=1800,
)

In [None]:
# Learn the vocabulary from every comment in X (this runs on the full data,
# before any train/val/test split is made).
vectorizer.adapt(X.values)

In [None]:
# Convert all comments into integer token sequences in a single call.
vectorized_text = vectorizer(X.values)

In [None]:
# Input pipeline: from_tensor_slices -> cache -> shuffle -> batch -> prefetch.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))  # (token ids, labels) pairs
dataset = dataset.cache()  # keep the elements cached after the first pass
# The train/val/test splits are carved out of this dataset with take()/skip().
# shuffle() reshuffles on every iteration by default, which would move samples
# between the splits from one epoch to the next (validation/test leakage),
# so the shuffled order is frozen here.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)  # group 16 samples per batch
dataset = dataset.prefetch(8)  # prepare up to 8 batches ahead of the consumer

In [None]:
# Split by batch count: first 70% for training, next 20% for validation
# (to detect overfitting), last 10% for testing.
n_batches = len(dataset)
train = dataset.take(int(n_batches*.7))
val = dataset.skip(int(n_batches*.7)).take(int(n_batches*.2))
test = dataset.skip(int(n_batches*.9)).take(int(n_batches*.1))

# Create Sequential Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding

model = Sequential([
    # Embedding: maps each token id to a 32-dimensional vector.
    Embedding(MAX_FEATURES+1, output_dim=32, input_length=1800),
    # Bidirectional LSTM: reads the sequence in both directions.
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature-extraction stack.
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Output: six sigmoid units, one per label (multi-label classification).
    Dense(6, activation='sigmoid'),
])

In [None]:
# Every label is an independent yes/no decision, hence binary cross-entropy;
# Adam is used as the optimizer.
model.compile(optimizer='Adam', loss='BinaryCrossentropy')
# Build the layers for inputs of shape (batch, 1800): the batch size is left
# open (None), the sequence length is fixed at 1800 tokens per comment.
model.build(input_shape=(None, 1800))

In [None]:
# Print an overview of the model's layers.
model.summary()

In [None]:
# Train for 10 epochs on `train`, using `val` as validation data; the returned
# history object is kept so the recorded values can be plotted afterwards.
history = model.fit(train, epochs=10, validation_data=val)

In [None]:
from matplotlib import pyplot as plt

# DataFrame.plot() opens its own figure when no `ax` is given, so a preceding
# plt.figure(...) would only leave an empty extra figure behind and its
# figsize would never apply to the curves. Pass the size to plot() directly.
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.show()

# Make Predictions

In [None]:
# Vectorize one sentence, add a leading batch axis, and run it through the model.
input_text = vectorizer('You freaking suck! I am going to hit you.')
res = model.predict(np.expand_dims(input_text, axis=0))
print(res)

In [None]:
# Threshold each predicted score at 0.5 and show the result as 0/1 integers.
(res > 0.5).astype(int)

In [None]:
# Pull a single (inputs, labels) batch out of the test split.
batch_X, batch_y = next(test.as_numpy_iterator())

In [None]:
# Predict every comment in the batch and threshold the scores at 0.5 into 0/1 flags.
(model.predict(batch_X) > 0.5).astype(int)

# Evaluate Model

In [None]:
from tensorflow.keras.metrics import BinaryAccuracy, CategoricalAccuracy, Precision, Recall

# Running metrics; they accumulate state across update_state() calls.
pre = Precision()
re = Recall()
# The labels are independent 0/1 flags that are flattened before scoring.
# CategoricalAccuracy compares argmax positions and is meant for one-hot
# targets, so it is not meaningful here; BinaryAccuracy thresholds each
# prediction (0.5 by default) and compares it with its 0/1 label.
acc = BinaryAccuracy()

In [None]:
# Walk the test split batch by batch and feed every metric.
for X_true, y_true in test.as_numpy_iterator():
    # Flatten labels and predictions into 1D arrays before scoring.
    y_true = y_true.flatten()
    yhat = model.predict(X_true).flatten()

    for metric in (pre, re, acc):
        metric.update_state(y_true, yhat)

In [None]:
# Report the values accumulated by the three metric objects.
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

# Testing

In [None]:
import tensorflow as tf

In [None]:
# Persist the trained model to disk.
# NOTE(review): the ".h5" inside the filename is just part of the name; the
# file format is presumably chosen by the ".keras" suffix — confirm.
model.save('toxicity.h5.keras')

In [None]:
# Reload the saved model from disk, replacing the in-memory `model`.
model = tf.keras.models.load_model('toxicity.h5.keras')

In [None]:
# Vectorize a sample sentence with the already-adapted vectorizer.
input_str = vectorizer('I hate you!')

In [None]:
# Wrap the single sequence in a list so the model receives a batch of one.
res = model.predict(np.array([input_str]))

In [None]:
# Display the raw prediction scores.
res

In [None]:
def score_comment(comment):
    """Classify a single comment against every label column.

    Args:
        comment: Raw comment text.

    Returns:
        dict mapping each label column name (``df.columns[2:]``) to whether
        the model's predicted score for that label is above 0.5.
    """
    # The vectorizer and the model are called with a batch, so the text is
    # wrapped in a list and row 0 of the prediction is read back.
    scores = model.predict(vectorizer([comment]))[0]
    return {col: scores[idx] > 0.5 for idx, col in enumerate(df.columns[2:])}

In [None]:
# Run the helper on a sample comment and print one verdict per label.
comment = "I'm going to kill you"
result = score_comment(comment)

print("Toxicity Analysis Result:")
report_lines = [
    f"{label}: {'Toxic' if value else 'Not Toxic'}"
    for label, value in result.items()
]
for line in report_lines:
    print(line)