In [None]:
# %%
# Import the pandas library, commonly used for data manipulation and analysis, aliased as 'pd'.
import pandas as pd
# Import TensorFlow, the core open-source library for machine learning and deep learning.
import tensorflow as tf
# From TensorFlow's Keras API, import the Tokenizer class for converting text into numerical sequences.
from tensorflow.keras.preprocessing.text import Tokenizer
# Import the pad_sequences function to ensure all text sequences have the same length.
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Import the Sequential model type, which allows us to build a neural network layer by layer.
from tensorflow.keras.models import Sequential
# Import the specific types of layers we'll use in our neural network.
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
# Import a utility from scikit-learn to easily split our data into training and testing sets.
from sklearn.model_selection import train_test_split
# Import the pickle library to save our tokenizer object for later use in the web app.
import pickle

# [cite_start]Use pandas to read the dataset from a CSV file into a DataFrame called 'df'. [cite: 18]
df = pd.read_csv('data/train.csv')

# Define a list containing the names of all columns that indicate different types of toxicity.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Create a new, single 'toxic' column. If a comment is flagged in ANY of the label_cols, this new column will be 1 (True), otherwise 0 (False).
# .any(axis=1) checks for any True value across the columns for each row.
# .astype(int) converts the resulting boolean (True/False) to an integer (1/0).
df['toxic'] = df[label_cols].any(axis=1).astype(int)

# To make training faster for this example, we'll work with a smaller, random subset of the data.
# First, we select only the columns we need: 'comment_text' (our feature) and 'toxic' (our label).
# Then, we .sample() 50,000 rows randomly.
# random_state=42 ensures that we get the exact same "random" sample every time we run this code, making our results reproducible.
df = df[['comment_text', 'toxic']].sample(50000, random_state=42)

# Display the first 5 rows of our processed DataFrame to verify the changes.
df.head()

Unnamed: 0,comment_text,toxic
119105,"Geez, are you forgetful! We've already discus...",0
131631,Carioca RFA \n\nThanks for your support on my ...,0
125326,"""\n\n Birthday \n\nNo worries, It's what I do ...",0
111256,Pseudoscience category? \n\nI'm assuming that ...,0
83590,"(and if such phrase exists, it would be provid...",0


In [None]:
# %%
# Extract the 'comment_text' column from the DataFrame and convert it into a NumPy array. This will serve as our features (X).
comments = df['comment_text'].values
# Extract the 'toxic' column and convert it into a NumPy array. This will be our target labels (y).
labels = df['toxic'].values

# Define the maximum number of unique words to include in our vocabulary. We'll use the 10,000 most frequent words.
vocab_size = 10000
# Create an instance of the Tokenizer.
# num_words specifies the vocabulary size.
# oov_token='<OOV>' creates a special token for any words that are not in the vocabulary (Out-Of-Vocabulary words).
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
# "Fits" the tokenizer on our text data. This process builds the word index, mapping each unique word to an integer.
tokenizer.fit_on_texts(comments)

# Convert each comment text into a sequence of integers based on the word index created by the tokenizer.
sequences = tokenizer.texts_to_sequences(comments)

# Define the maximum length for each sequence. Comments longer than this will be cut, and shorter ones will be padded.
max_length = 200
# Apply padding to the sequences to ensure they all have the same length (max_length).
# 'padding="post"' adds zeros at the end of shorter sequences.
# 'truncating="post"' removes words from the end of longer sequences.
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

In [5]:
# %%
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


Training data shape: (40000, 200)
Testing data shape: (10000, 200)


In [None]:
# %%
# Define the dimensionality of the word embedding vectors. Each word will be represented by a 16-dimensional vector.
embedding_dim = 16

# Create a Sequential model, which is a linear stack of layers. [cite_start]We will add layers one by one. [cite: 21]
model = Sequential([
    # The first layer is an Embedding layer. It turns the integer-encoded vocabulary into dense vectors of a fixed size (embedding_dim).
    # vocab_size: The total number of words in our vocabulary.
    # embedding_dim: The size of the vector for each word.
    # input_length: The length of input sequences (200 in our case).
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    
    # [cite_start]A Bidirectional LSTM layer. Bidirectional means the input is processed in both forward and backward directions, capturing context from both past and future words. [cite: 22]
    # LSTM(64, ...): An LSTM layer with 64 internal units (neurons).
    # return_sequences=True: This is important because the next layer is also an LSTM, which requires a sequence as input. This argument makes the layer output the full sequence of hidden states.
    Bidirectional(LSTM(64, return_sequences=True)),
    
    # Another Bidirectional LSTM layer, this time with 32 units.
    # By default, return_sequences is False, so this layer will only output the final hidden state, which summarizes the entire sequence's meaning.
    Bidirectional(LSTM(32)),
    
    # A standard fully-connected (Dense) layer with 16 neurons.
    # activation='relu': Uses the Rectified Linear Unit activation function to introduce non-linearity, allowing the model to learn more complex patterns.
    Dense(16, activation='relu'),
    
    # The final output layer. It has a single neuron because this is a binary classification problem (toxic or not toxic).
    # activation='sigmoid': The sigmoid activation function squashes the output to a value between 0 and 1, which can be interpreted as the probability of the comment being toxic.
    Dense(1, activation='sigmoid')
])

# Configure the model for training.
model.compile(loss='binary_crossentropy', # Specifies the loss function. 'binary_crossentropy' is the standard for two-class classification problems.
              optimizer='adam',             # Specifies the optimization algorithm. 'adam' is a popular and effective choice.
              metrics=['accuracy'])         # Specifies the metric to monitor during training. Here, we want to see the classification accuracy.

# Print a summary of the model's architecture, including each layer's type, output shape, and number of parameters.
model.summary()



In [7]:
# %%
# Train the model
num_epochs = 5
history = model.fit(X_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test), verbose=1)

Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 144ms/step - accuracy: 0.9334 - loss: 0.1978 - val_accuracy: 0.9588 - val_loss: 0.1320
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 88ms/step - accuracy: 0.9589 - loss: 0.1186 - val_accuracy: 0.9565 - val_loss: 0.1298
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 92ms/step - accuracy: 0.9679 - loss: 0.0903 - val_accuracy: 0.9583 - val_loss: 0.1219
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 102ms/step - accuracy: 0.9735 - loss: 0.0764 - val_accuracy: 0.9596 - val_loss: 0.1278
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m222s[0m 178ms/step - accuracy: 0.9768 - loss: 0.0664 - val_accuracy: 0.9606 - val_loss: 0.1336


In [8]:
# %%
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 33ms/step - accuracy: 0.9606 - loss: 0.1336
Test Accuracy: 96.06%


In [9]:
# %%
# Save the trained model
model.save('toxicity_model.h5')

# Save the tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Model and tokenizer saved successfully!")



Model and tokenizer saved successfully!
