In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPool1D, Dense, Dropout, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from google.colab import files

In [None]:
def upload_files():
    """Open the Colab upload widget and return the name of the uploaded file.

    Only the first entry of the mapping returned by ``files.upload()`` is
    used, so if several files are selected in one dialog the rest are ignored.

    Returns:
        The first key of the upload result, which the caller uses as a path
        for ``pd.read_csv``.

    Raises:
        ValueError: If the upload result is empty (for example the dialog was
            dismissed without choosing a file).
    """
    uploaded = files.upload()
    # An empty result would otherwise surface as a bare StopIteration from
    # next(iter(...)), which gives the notebook user no hint about the cause.
    if not uploaded:
        raise ValueError("No file was uploaded; please run the cell again and select a file.")
    return next(iter(uploaded))

In [None]:
def load_and_preprocess_data(train_path=None, test_path=None):
    """Load the training and test CSV files and fill missing comment text.

    Args:
        train_path: Path (or any object ``pd.read_csv`` accepts) of the
            training CSV. When ``None`` the user is prompted to upload the
            file through ``upload_files()``.
        test_path: Same as ``train_path`` but for the test CSV.

    Returns:
        A ``(train, test)`` tuple of DataFrames whose ``comment_text`` column
        has every missing value replaced by the string ``'unknown'``.

    Raises:
        KeyError: If either file has no ``comment_text`` column.
    """
    # Only fall back to the interactive upload when no path was supplied, so
    # the notebook can also be re-run unattended with known file locations.
    if train_path is None:
        print("Please upload your training CSV file:")
        train_path = upload_files()
    if test_path is None:
        print("Please upload your test CSV file:")
        test_path = upload_files()

    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Missing comments become a placeholder string so the tokenizer always
    # receives text rather than NaN.
    train['comment_text'] = train['comment_text'].fillna('unknown')
    test['comment_text'] = test['comment_text'].fillna('unknown')

    return train, test

In [None]:
# Interactive step: prompts for the training CSV first, then the test CSV,
# and returns both as DataFrames with missing comment text filled in.
train, test = load_and_preprocess_data()

Please upload your training CSV file:


Saving train.csv to train.csv
Please upload your test CSV file:


Saving test.csv to test.csv


In [None]:
# Character-level tokenizer: with char_level=True every character is a token.
tokenizer = Tokenizer(char_level=True)
# The vocabulary is fitted on the training comments only; test text is not used here.
tokenizer.fit_on_texts(train['comment_text'])

In [None]:
def get_sequences(tokenizer, texts, maxlen=512):
    """Encode texts with ``tokenizer`` and force every sequence to ``maxlen``.

    Args:
        tokenizer: Fitted object exposing ``texts_to_sequences``.
        texts: Iterable of strings to encode.
        maxlen: Length every returned sequence is padded or truncated to.

    Returns:
        The result of ``pad_sequences`` called with its default padding and
        truncation settings (Keras documents these defaults as "pre" with a
        padding value of 0).
    """
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)

In [None]:

# Encode the comment text of both splits with the same fitted tokenizer;
# get_sequences pads/truncates each sequence to its default maxlen.
X_train = get_sequences(tokenizer, train['comment_text'])
X_test = get_sequences(tokenizer, test['comment_text'])

# Multi-label targets: one column per label, taken from the training frame.
y_train = train[
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
].values


In [None]:
def build_model(input_dim, output_dim, max_length, num_classes=6,
                dilation_rates=(1, 2, 4, 8), dropout_rate=0.1):
    """Build and compile a multi-branch dilated 1D-CNN multi-label classifier.

    The embedded sequence is fed to one Conv1D branch per dilation rate. Each
    branch is global-max-pooled, the pooled vectors are concatenated, and two
    dense layers lead to a sigmoid output with one unit per label.

    Args:
        input_dim: Vocabulary size passed to the Embedding layer.
        output_dim: Embedding vector size (the Embedding layer's
            ``output_dim``, not the number of model outputs).
        max_length: Length of each input sequence.
        num_classes: Number of sigmoid output units. Defaults to 6.
        dilation_rates: Iterable of dilation rates; one Conv1D branch
            (64 filters, kernel size 3) is created per entry.
        dropout_rate: Rate used by every Dropout layer in the model.

    Returns:
        The compiled Keras ``Model`` (Adam optimizer, binary cross-entropy
        loss, ``'accuracy'`` metric).

    Raises:
        ValueError: If ``dilation_rates`` is empty.
    """
    dilation_rates = tuple(dilation_rates)
    if not dilation_rates:
        raise ValueError("dilation_rates must contain at least one rate.")

    inp = Input(shape=(max_length,))
    x = Embedding(input_dim=input_dim, output_dim=output_dim)(inp)
    x = Dropout(dropout_rate)(x)

    # One convolutional branch per dilation rate, each reduced to a fixed-size
    # vector by taking the maximum over the sequence axis.
    conv_blocks = []
    for dilation_rate in dilation_rates:
        conv = Conv1D(filters=64, kernel_size=3, dilation_rate=dilation_rate, activation='relu')(x)
        conv = GlobalMaxPool1D()(conv)
        conv_blocks.append(conv)

    # A single branch needs no merging; Concatenate is only applied when
    # there are several pooled vectors to join.
    x = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
    x = Dense(64, activation='relu')(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(dropout_rate)(x)
    # Sigmoid (not softmax): every label is predicted independently.
    output = Dense(num_classes, activation='sigmoid')(x)

    model = Model(inputs=inp, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Vocabulary size is the number of tokens the tokenizer learned plus one;
# NOTE(review): the extra slot presumably covers index 0 used for padding — confirm against the Keras Tokenizer docs.
# output_dim=64 is the embedding width; max_length=512 must equal the maxlen used when building X_train / X_test.
model = build_model(input_dim=len(tokenizer.word_index) + 1, output_dim=64, max_length=512)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 512)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 512, 64)              135360    ['input_1[0][0]']             
                                                                                                  
 dropout (Dropout)           (None, 512, 64)              0         ['embedding[0][0]']           
                                                                                                  
 conv1d (Conv1D)             (None, 510, 64)              12352     ['dropout[0][0]']             
                                                                                              

In [None]:
callbacks = [
    # Stop training once val_loss has failed to improve for 3 epochs.
    EarlyStopping(
        monitor='val_loss',
        mode='min',
        patience=3,
        verbose=1,
    ),
    # Write best_model.h5 only when val_loss reaches a new minimum.
    ModelCheckpoint(
        'best_model.h5',
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        verbose=1,
    ),
]

# Train for up to 5 epochs, holding out 20% of the training data for validation.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=5,
    validation_split=0.2,
    callbacks=callbacks,
)

# Restore the weights from the best checkpoint and predict on the test data.
# No evaluation metric is computed in this cell.
model.load_weights('best_model.h5')
predictions = model.predict(X_test)

Epoch 1/5
Epoch 1: val_loss improved from inf to 0.08244, saving model to best_model.h5
Epoch 2/5


  saving_api.save_model(


Epoch 2: val_loss improved from 0.08244 to 0.06933, saving model to best_model.h5
Epoch 3/5
Epoch 3: val_loss improved from 0.06933 to 0.06465, saving model to best_model.h5
Epoch 4/5
Epoch 4: val_loss improved from 0.06465 to 0.06283, saving model to best_model.h5
Epoch 5/5
Epoch 5: val_loss improved from 0.06283 to 0.06207, saving model to best_model.h5
