# Toxic comment classification

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Read the labelled training comments and the competition test file from the
# mounted Drive folder. NOTE(review): `test` is not used by any cell visible here.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./drive/MyDrive/datasets/toxic_comments/train.csv",
        "./drive/MyDrive/datasets/toxic_comments/test.csv",
    )
)

In [None]:
# Shuffle all rows. The seed is fixed so that a fresh "Restart & Run All"
# produces the same row order, and therefore the same split and metrics;
# 42 matches the seed already used for train_test_split below.
train = train.sample(frac=1, random_state=42)

In [None]:
# The six binary label columns; a comment may carry any combination of them.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Raw comment strings, with missing comments replaced by the "NODATA" placeholder.
X = train["comment_text"].fillna("NODATA").to_numpy()
# Label matrix with one column per entry of list_classes.
y = train[list_classes].to_numpy()

In [None]:
# Sanity check: number of comments and the label-matrix dimensions.
print(*(arr.shape for arr in (X, y)))

(159571,) (159571, 6)


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the labelled comments for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: sizes of the train / held-out partitions.
print(*(arr.shape for arr in (X_train, X_test, y_train, y_test)))

(127656,) (31915,) (127656, 6) (31915, 6)


In [None]:
# Vocabulary cap passed to the tokenizer (num_words) and the embedding input size.
max_features = 20_000
# Fixed sequence length (in tokens) every comment is padded/truncated to.
maxlen = 100

In [None]:
# Aliases for the raw-text partitions consumed by the tokenizer cell.
list_sentences_train, list_sentences_test = X_train, X_test

In [None]:
# Fit the vocabulary on the training comments only, so the held-out split
# does not influence which words are indexed.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Training split: text -> lists of word indexes -> fixed-length integer matrix
# (pad_sequences fills short sequences with zeros up to maxlen).
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
V_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)

# Held-out split: same transformation using the already-fitted tokenizer.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
V_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [None]:
# Sanity check: both matrices should have maxlen columns.
print(*(mat.shape for mat in (V_train, V_test)))

(127656, 100) (31915, 100)


In [None]:
def get_model(embed_size=128, lstm_units=50, dense_units=50,
              dropout_rate=0.1, num_classes=6):
    """Build and compile the bidirectional-LSTM multi-label classifier.

    The input shape and vocabulary size are read from the module-level
    ``maxlen`` and ``max_features`` constants. Calling ``get_model()`` with
    no arguments builds the same architecture as before the hyperparameters
    were exposed.

    Args:
        embed_size: Dimension of the learned word embeddings.
        lstm_units: Units per direction in the bidirectional LSTM.
        dense_units: Width of the hidden fully connected layer.
        dropout_rate: Dropout rate applied after pooling and after the
            hidden dense layer.
        num_classes: Number of sigmoid outputs (one per label column).

    Returns:
        A compiled Keras ``Model`` mapping a ``(maxlen,)`` sequence of word
        indexes to ``num_classes`` independent sigmoid outputs.
    """
    inp = Input(shape=(maxlen, ))  # one padded sequence of word indexes
    x = Embedding(max_features, embed_size)(inp)  # index -> dense vector
    # return_sequences=True keeps every timestep so pooling can look at all of them.
    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)  # max over the time axis for each feature
    x = Dropout(dropout_rate)(x)  # regularization
    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    # Sigmoid (not softmax): every label is scored independently.
    x = Dense(num_classes, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [None]:
# Training configuration.
batch_size = 32  # samples per gradient update
epochs = 2
file_path = "weights_base.best.hdf5"  # checkpoint written by ModelCheckpoint

model = get_model()

# Overwrite the checkpoint only when the validation loss improves.
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Stop when val_loss has not improved for `patience` epochs.
# NOTE(review): patience (20) exceeds epochs (2), so this cannot trigger as configured.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

callbacks_list = [checkpoint, early]

# validation_split=0.1 holds out part of V_train for the val_loss monitor.
model.fit(
    V_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callbacks_list,
)

# Restore the weights from the best checkpoint before evaluating.
model.load_weights(file_path)

Epoch 1/2
Epoch 00001: val_loss improved from inf to 0.04921, saving model to weights_base.best.hdf5
Epoch 2/2
Epoch 00002: val_loss improved from 0.04921 to 0.04897, saving model to weights_base.best.hdf5


In [None]:
# Sigmoid scores for every held-out comment, one column per label.
y_pred = model.predict(x=V_test)

In [None]:
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [None]:
# Round the sigmoid scores to 0/1 and macro-average the per-label metrics;
# zero_division=0 scores a label with no predicted positives as 0 instead of warning.
precision, recall, f1score, _ = precision_recall_fscore_support(
    y_test,
    np.round(y_pred),
    average='macro',
    zero_division=0,
)

In [None]:
# Report the macro-averaged scores rounded to six decimals.
for metric_label, metric_value in (
    ('Precision: ', precision),
    ('Recall: ', recall),
    ('F1-score: ', f1score),
):
    print(metric_label, round(metric_value, 6))

Precision:  0.575797
Recall:  0.344494
F1-score:  0.391629


In [None]:
# Per-label breakdown; rows 0-5 follow the column order of list_classes.
report = classification_report(
    y_test,
    y_pred.round(),
    digits=6,
    zero_division=0,
)
print(report)

              precision    recall  f1-score   support

           0   0.913004  0.650571  0.759764      3065
           1   0.882353  0.049020  0.092879       306
           2   0.871935  0.770620  0.818153      1661
           3   0.000000  0.000000  0.000000        87
           4   0.787489  0.596753  0.678980      1540
           5   0.000000  0.000000  0.000000       289

   micro avg   0.870141  0.605642  0.714189      6948
   macro avg   0.575797  0.344494  0.391629      6948
weighted avg   0.824608  0.605642  0.685331      6948
 samples avg   0.058841  0.053096  0.053671      6948

