In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, GRU, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split

# --- Data loading and preparation -------------------------------------------
# Load the dataset: a CSV with a 'comment_text' column and one column per
# toxicity label (label values are presumably 0/1 -- confirm against the file).
df = pd.read_csv('/content/extended_toxic_comments.csv')

# Replace missing comments with empty strings.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: the in-place form is a
# chained assignment, which emits a FutureWarning on pandas 2.x and no longer
# updates `df` once copy-on-write is enabled.
df['comment_text'] = df['comment_text'].fillna("")

# Tokenization and sequence padding
MAX_NUM_WORDS = 10000  # Maximum number of words to consider in the tokenizer
MAX_SEQUENCE_LENGTH = 100  # Maximum length of each comment (in tokens)
EMBEDDING_DIM = 100  # Embedding vector size

# Fit the tokenizer on the comments, then map each comment to a sequence of
# integer word indices.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(df['comment_text'].values)
sequences = tokenizer.texts_to_sequences(df['comment_text'].values)

# Pad/truncate every sequence to MAX_SEQUENCE_LENGTH so the model sees a
# fixed-size input.
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Multi-label targets: one column per toxicity type. The column order here
# defines the order of the model's six outputs.
y = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

# Hold out 20% of the rows for validation; random_state fixes the split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Hybrid CNN + RNN model --------------------------------------------------
# Layer stack, in order:
#   Embedding -> Conv1D + MaxPooling1D -> Bidirectional LSTM -> GRU
#   -> Dense(relu) + Dropout -> Dense(sigmoid)
model = Sequential()

# Embedding layer: maps word indices to EMBEDDING_DIM-sized vectors
model.add(Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))

# CNN part (convolutional layer followed by max-pooling)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# RNN part: bidirectional LSTM that returns the full sequence so the GRU
# below can consume it
model.add(Bidirectional(LSTM(100, return_sequences=True)))

# Second recurrent layer; returns only its final state
model.add(GRU(100))

# Dense head
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))  # Dropout to prevent overfitting

# Output layer: one independent sigmoid unit per label (6 labels)
model.add(Dense(6, activation='sigmoid'))

# Compile for multi-label classification.
# 'binary_accuracy' is requested explicitly: with a 6-unit output the generic
# 'accuracy' string resolves to categorical (argmax) accuracy, which compares
# only the highest-scoring label and is meaningless for multi-label targets.
# NOTE: binary accuracy is still inflated when most labels are 0; consider
# per-label precision/recall or AUC when judging the model.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])

# Train the model, evaluating on the held-out validation split after each epoch
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_val, y_val),
    verbose=1,
)

# Persist both artifacts needed at inference time: the trained network and the
# fitted tokenizer (the same word index must be used to encode new comments).
import pickle

model.save('toxic_comment_model.h5')

# Pickle the tokenizer with the newest protocol this interpreter supports
newest_protocol = pickle.HIGHEST_PROTOCOL
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=newest_protocol)

print("Model and tokenizer saved!")




Epoch 1/5
[1m2197/2197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m851s[0m 384ms/step - accuracy: 0.7002 - loss: 0.1531 - val_accuracy: 0.9900 - val_loss: 0.0762
Epoch 2/5
[1m2197/2197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m860s[0m 384ms/step - accuracy: 0.9603 - loss: 0.0759 - val_accuracy: 0.9900 - val_loss: 0.0706
Epoch 3/5
[1m2197/2197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m864s[0m 385ms/step - accuracy: 0.9581 - loss: 0.0660 - val_accuracy: 0.9900 - val_loss: 0.0671
Epoch 4/5
[1m2197/2197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m863s[0m 385ms/step - accuracy: 0.9028 - loss: 0.0546 - val_accuracy: 0.9895 - val_loss: 0.0635
Epoch 5/5
[1m2197/2197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m846s[0m 385ms/step - accuracy: 0.8212 - loss: 0.0461 - val_accuracy: 0.9880 - val_loss: 0.0617




Model and tokenizer saved!


In [None]:
!pip install gradio


Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from g

In [None]:
!pip install gradio --upgrade #Upgrade Gradio to the latest version

import numpy as np
import gradio as gr
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Inference-time constants. These mirror the training cell: the same padding
# length and the same label order as the columns used to build `y`.
MAX_SEQUENCE_LENGTH = 100
toxicity_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Restore the trained network from disk
model = load_model('toxic_comment_model.h5')

# Restore the fitted tokenizer.
# NOTE: unpickling can execute arbitrary code -- only load a tokenizer.pkl
# that was produced by the training cell of this notebook.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Function to predict toxicity of a comment
def predict_toxicity(comment):
    """Score a single comment against every toxicity label.

    Args:
        comment: Raw comment text. ``None`` is treated as an empty string.

    Returns:
        dict: Maps each entry of ``toxicity_labels`` to the model's score for
        that label, as a built-in ``float`` rounded to two decimals.
    """
    # The tokenizer expects strings; guard against a missing value.
    text = "" if comment is None else str(comment)

    # Tokenize and pad exactly as during training
    encoded = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(encoded, maxlen=MAX_SEQUENCE_LENGTH)

    # One row in, one row of per-label scores out. verbose=0 stops Keras from
    # printing a progress bar on every request.
    scores = model.predict(padded, verbose=0)[0]

    # Convert each NumPy scalar to a Python float *before* rounding: rounding
    # a float32 yields another float32, which is not a plain JSON number and
    # shows up as e.g. 0.11999999731779099 once widened.
    return {label: round(float(score), 2) for label, score in zip(toxicity_labels, scores)}

# Gradio UI: a two-line text box feeds predict_toxicity and the returned
# label -> score dictionary is rendered as JSON.
# Components are referenced from the top-level package (gr.Textbox, gr.Label, ...)
# rather than the older gr.inputs / gr.outputs namespaces.
comment_box = gr.Textbox(lines=2, placeholder="Enter a comment to check for toxicity")

interface = gr.Interface(
    fn=predict_toxicity,
    inputs=comment_box,
    outputs="json",
    title="Comment Toxicity Detector",
    description="Enter a comment, and this tool will predict if the comment contains various types of toxicity like 'toxic', 'severe toxic', 'obscene', 'threat', 'insult', or 'identity hate'.",
)

# Start the app
interface.launch()





Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://529c059bbbcf5f7947.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


