In [35]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from time import time

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GRU, Embedding, Dropout, Activation, LSTM
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
# import gensim.models.keyedvectors as word2vec
# import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score


In [10]:
path = './data/'
EMBEDDING_FILE=f'{path}glove.6B.50d.txt'
TRAIN_DATA_FILE=f'{path}train.csv'
TEST_DATA_FILE=f'{path}test.csv'

In [11]:
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

Train shape: (159571, 8)
Test shape: (153164, 2)


In [14]:
train_sample = train.sample(frac=.5, random_state=29)

train_smaple.shape

(79786, 8)

In [15]:
list_sentences_train = train_sample["comment_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train_sample[list_classes].values

In [16]:
embed_size = [50, 100, 150, 200, 250, 300]
max_features = [10000, 20000, 30000, 40000, 50000, 60000]
max_len = [50, 100, 150, 200, 250, 300]

In [17]:
x_train, x_test, y_train, y_test = train_test_split(list_sentences_train, y, test_size=0.5, random_state=29)

In [18]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

In [19]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(0.020940498, 0.6441043)

First run for LSTM

In [None]:
# Sould i also tune drop out?
resul = pd.DataFrame(columns=['layer','embed_size','max_features','max_len','time_taken','toxic','severe_toxic','obscene','threat','insult','identity_have','average'])
rown = 0
for es in embed_size:
    for mf in max_features:
        for ml in max_len:
            start = time()
            tokenizer = Tokenizer(num_words=mf)
            tokenizer.fit_on_texts(list(list_sentences_train))
            list_tokenized_train = tokenizer.texts_to_sequences(x_train)
            list_tokenized_test = tokenizer.texts_to_sequences(x_test)
            X_t = pad_sequences(list_tokenized_train, maxlen=ml)
            X_te = pad_sequences(list_tokenized_test, maxlen=ml)
            
            word_index = tokenizer.word_index
            nb_words = min(mf, len(word_index))
            embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, es))
            for word, i in word_index.items():
                if i >= mf: continue
                embedding_vector = embeddings_index.get(word)
                if embedding_vector is not None: embedding_matrix[i] = embedding_vector
            
            inp = Input(shape=(ml,))
            x = Embedding(mf, es, weights=[embedding_matrix])(inp)
            x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
            x = GlobalMaxPool1D()(x)
            x = Dense(50, activation="relu")(x)
            x = Dropout(0.1)(x)
            x = Dense(6, activation="sigmoid")(x)
            model = Model(inputs=inp, outputs=x)
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
            
            print(f'Fitting Embed Size: {es}, Max Features: {mf}, Max Len: {ml}\n')
            model.fit(X_t, y_train, batch_size=32, epochs=2)
            print('\n')
            print(f'Predicting Embed Size: {es}, Max Features: {mf}, Max Len: {ml}\n')
            y_pred = model.predict([X_te], batch_size=1024, verbose=1)
            print('\n')
            print(f'ROC AUC for Embed Size: {es}, Max Features: {mf}, Max Len: {ml}\n')
            
            tox_rocauc = roc_auc_score(y_test[:,0], y_pred[:,0])
            stox_rocauc = roc_auc_score(y_test[:,1], y_pred[:,1])
            obs_rocauc = roc_auc_score(y_test[:,2], y_pred[:,2])
            thr_rocauc = roc_auc_score(y_test[:,3], y_pred[:,3])
            ins_rocauc = roc_auc_score(y_test[:,4], y_pred[:,4])
            idh_rocauc = roc_auc_score(y_test[:,5], y_pred[:,5])
            avg_rocauc = (tox_rocauc+stox_rocauc+obs_rocauc+thr_rocauc+ins_rocauc+idh_rocauc)/6

            print(f'Toxic: {tox_rocauc}')
            print(f'S Tox: {stox_rocauc}')
            print(f'Obs:   {obs_rocauc}')
            print(f'Thr:   {thr_rocauc}')
            print(f'Ins:   {ins_rocauc}')
            print(f'IDH:   {idh_rocauc}')
            print(f'Avrg:  {avg_rocauc}')
            print('\n')
            
            end = time()
            
            resul.loc[rown] = ['LSTM', es, mf, ml, end-start, tox_rocauc, stox_rocauc, obs_rocauc, thr_rocauc, ins_rocauc, idh_rocauc, avg_rocauc]
            rown+=1
            

resul.to_csv('./results.csv', float_format='%.3f', index=False)

Fitting Embed Size: 50, Max Features: 10000, Max Len: 50

Epoch 1/2
Epoch 2/2


Predicting Embed Size: 50, Max Features: 10000, Max Len: 50



ROC AUC for Embed Size: 50, Max Features: 10000, Max Len: 50

Toxic: 0.9667095997557349
S Tox: 0.9869042313502779
Obs:   0.9807350476276513
Thr:   0.9427846152127126
Ins:   0.9747962550028565
IDH:   0.9615397264190366
Avrg:  0.968911579228045


Fitting Embed Size: 50, Max Features: 10000, Max Len: 100

Epoch 1/2
Epoch 2/2


Predicting Embed Size: 50, Max Features: 10000, Max Len: 100



ROC AUC for Embed Size: 50, Max Features: 10000, Max Len: 100

Toxic: 0.971423183485109
S Tox: 0.9882872429386084
Obs:   0.9843643612066674
Thr:   0.9565841146316809
Ins:   0.9779196733324673
IDH:   0.9677877979680576
Avrg:  0.9743943955937651


Fitting Embed Size: 50, Max Features: 10000, Max Len: 150

Epoch 1/2
Epoch 2/2


Predicting Embed Size: 50, Max Features: 10000, Max Len: 150



ROC AUC for Embed Size: 50, Max Features: 10000, Max Len: 150

Toxic: 0.9732