In [27]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [2]:
# Locations of the pre-trained GloVe vectors and the labelled training CSV.
path = '../content/'
# `path` already ends with a separator; os.path.join avoids the doubled slash
# that f'{path}/glove...' produced and builds both paths the same way.
EMBEDDING_FILE = os.path.join(path, 'glove.6B.50d.txt')
TRAIN_DATA_FILE = os.path.join(path, 'train.csv')

train = pd.read_csv(TRAIN_DATA_FILE)
# Missing comments are replaced by the literal string "null" so every entry is text.
list_sentences_train = train["comment_text"].fillna("null").values
# The six label columns; their order fixes the column order of `y`.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

In [3]:
# Hyper-parameters shared with the cells below.
max_features = 20000  # passed to the tokenizer as num_words
maxlen = 100          # length every sequence is padded/truncated to
embed_size = 50       # width of one embedding vector

# Fit the vocabulary on the raw comments, then map each comment to word indices.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)

# Fixed-length integer matrix, then a seeded 80/20 train/test split.
X_padded = pad_sequences(tokenized_train, maxlen=maxlen)
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y,
    test_size=0.2,
    random_state=247,
)

In [4]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 array.

    Args:
        word: the token (first field of an embedding line).
        *arr: the remaining fields, converted element-wise to float32.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding file into {word: float32 vector}. The context manager
# closes the handle deterministically, and the explicit encoding keeps the read
# independent of the platform's default locale.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_file:
    embeddings_index = dict(get_coefs(*line.strip().split()) for line in glove_file)

# np.stack requires a real sequence; passing the dict_values view directly is
# deprecated (FutureWarning) and is rejected by newer NumPy releases.
all_embs = np.stack(list(embeddings_index.values()))
# Global mean/std over every coefficient, used to initialise unknown words.
emb_mean, emb_std = all_embs.mean(), all_embs.std()

In [5]:
# emb_mean / emb_std were already computed in the previous cell; recomputing
# them here was redundant. Evaluate the pair so the cell displays the values.
emb_mean, emb_std

  if self.run_code(code, result):


(0.020940498, 0.6441043)

In [6]:
# Build the weight matrix handed to the Embedding layer: one row per word index.
word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (index 0 is reserved), so a vocabulary of
# len(word_index) words needs len(word_index) + 1 rows. Without the +1 the last
# word's index falls outside the matrix whenever the vocabulary is smaller than
# max_features.
nb_words = min(max_features, len(word_index) + 1)
# Rows without a pre-trained vector keep this random draw, which follows the
# mean/std of the pre-trained coefficients.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue  # word is outside the retained vocabulary
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [7]:
# Bidirectional-LSTM classifier over pre-trained embeddings with six
# independent sigmoid outputs (one per label).
inp = Input(shape=(maxlen,))
# input_dim is taken from the weight matrix itself so the layer and its initial
# weights always agree on the number of rows.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
# Collapse the per-timestep outputs to one vector by max-pooling over time.
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.15)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
# Binary cross-entropy treats each of the six outputs as its own yes/no task.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
# Train on the training split, holding out 10% of it for validation.
# The trailing semicolon suppresses the returned object's repr in the cell output.
model.fit(
    X_train,
    y_train,
    batch_size=80,
    epochs=10,
    validation_split=0.1,
);

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
# Predicted probabilities for both splits, produced with identical settings.
y_pred_train, y_pred_test = (
    model.predict([features], batch_size=1024, verbose=1)
    for features in (X_train, X_test)
)



In [20]:
from sklearn import metrics
from sklearn.metrics import classification_report

# Row 0 holds train AUCs, row 1 holds test AUCs; one column per label.
AUC = np.zeros((2, len(list_classes)))

# `class_name` (rather than `x`) keeps the loop from overwriting the model
# tensor that an earlier cell bound to `x`.
for i, class_name in enumerate(list_classes):
    auc = np.array([metrics.roc_auc_score(y_train[:, i], y_pred_train[:, i]),
                    metrics.roc_auc_score(y_test[:, i], y_pred_test[:, i])])
    print(class_name)
    print("Train AUC:",auc[0],"Test AUC:",auc[1])
    AUC[:, i] = auc

    # Threshold the test probabilities at 0.5 for the per-class report.
    y_pred = np.where(y_pred_test[:, i] > 0.5, 1, 0)

    print(classification_report(y_test[:, i], y_pred))

# Averaged once, after every column of AUC has been filled in.
avg_auc = AUC.mean(axis=1)
print("Average Train AUC:",avg_auc[0],"Average Test AUC:",avg_auc[1])

toxic
Train AUC: 0.9955593481369647 Test AUC: 0.9605371697993328
              precision    recall  f1-score   support

         0.0       0.98      0.97      0.97     28844
         1.0       0.74      0.78      0.76      3071

    accuracy                           0.95     31915
   macro avg       0.86      0.87      0.87     31915
weighted avg       0.95      0.95      0.95     31915

severe_toxic
Train AUC: 0.9955340397734171 Test AUC: 0.9821836051019912
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     31614
         1.0       0.44      0.46      0.45       301

    accuracy                           0.99     31915
   macro avg       0.72      0.73      0.72     31915
weighted avg       0.99      0.99      0.99     31915

obscene
Train AUC: 0.9977446363704712 Test AUC: 0.9832292544667296
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     30274
         1.0       0.78      0.78

In [24]:
from keras import backend as K
def recall_m(y_true, y_pred):
    """Recall computed with backend ops: true positives / actual positives.

    Both the product ``y_true * y_pred`` and ``y_true`` are clipped to [0, 1]
    and rounded before being summed; ``K.epsilon()`` in the denominator guards
    against division by zero when there are no positives.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed with backend ops: true positives / predicted positives.

    Both the product ``y_true * y_pred`` and ``y_pred`` are clipped to [0, 1]
    and rounded before being summed; ``K.epsilon()`` in the denominator guards
    against division by zero when nothing is predicted positive.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator so the result is defined when
    precision and recall are both zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    return 2*((p*r)/(p+r+K.epsilon()))

In [25]:
# Cast test labels and predicted probabilities to float64 -- presumably so the
# backend ops in f1_m receive operands of one dtype (TODO confirm).
y_test, y_pred_test = (arr.astype('float64') for arr in (y_test, y_pred_test))

In [26]:
# Per-class F1 on the test split plus the unweighted mean across classes.
# The class count comes from list_classes instead of a hard-coded 6, so the
# loop bound, the printed labels and the divisor cannot drift apart.
total = 0
for i, class_name in enumerate(list_classes):
    curr_f1 = f1_m(y_test[:, i], y_pred_test[:, i]).numpy()
    print(str(class_name) + " f1 score: " + str(curr_f1))
    total += curr_f1
avg_f1 = total / len(list_classes)
avg_f1


toxic f1 score: 0.7602543220225894
severe_toxic f1 score: 0.44732571973868995
obscene f1 score: 0.7825029876199965
threat f1 score: 0.3648648167092831
insult f1 score: 0.6872963669239817
identity_hate f1 score: 0.4989690219951161


0.5902022058349429