In [1]:
import pandas as pd
import numpy as np

In [61]:
import os
import sys
import re
import string
# Make the project's ../src directory importable from this notebook.
# The path is normalised (no '..' component) and only added when missing, so
# re-running this cell does not keep appending duplicate sys.path entries.
src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [89]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, confusion_matrix
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Flatten, MaxPooling1D, Conv1D, Reshape, concatenate,\
Embedding, BatchNormalization, Activation, Dropout, GRU, Bidirectional
from keras.models import Model
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from Attention import *
import tensorflow as tf

In [56]:
# Load the training set from ../data/train.csv into a DataFrame.
data = pd.read_csv('../data/train.csv')

In [57]:
# Preview the first rows of the training data.
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [58]:
# Names of the label columns read from the training frame; the order here is
# the column order of the label matrix `y`.
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [90]:
# Comment texts as a NumPy array; missing comments become the placeholder 'UNK'.
comment_series = data['comment_text'].fillna('UNK')
comments = comment_series.values

# Label matrix with one column per entry of `classes`.
label_frame = data[classes]
y = label_frame.values

In [85]:
# Hold out 10% of the rows for validation. The split is seeded so that the
# validation set, and every metric computed on it further down, is the same
# after a kernel restart.
SPLIT_SEED = 42
train_comments, val_comments, y_train, y_val = train_test_split(
    comments, y, test_size=0.1, random_state=SPLIT_SEED)

In [71]:
# Text-preprocessing / embedding hyper-parameters used by the cells below.
MAX_SEQUENCE_LENGTH = 100  # passed to pad_sequences as maxlen; also the model's input length
EMBEDDING_DIM = 300        # number of columns of the embedding matrix
MAX_NB_WORDS = 200000      # passed to the Tokenizer as num_words; caps the embedding rows

In [72]:
# Build the tokenizer vocabulary from the training comments only
# (the validation comments are not passed to fit_on_texts).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts=train_comments)

In [None]:
# Convert both splits to lists of token ids with the fitted tokenizer ...
train_sequences, val_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_comments, val_comments)
)
# ... and bring every sequence to MAX_SEQUENCE_LENGTH entries.
X_train, X_val = (
    pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    for seqs in (train_sequences, val_sequences)
)

In [74]:
# Mapping token -> integer id learned by the tokenizer; report its size.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % (len(word_index),))

Found 26 unique tokens.


In [75]:
# Shell escape: list the files available in ../embeddings.
! ls ../embeddings

glove.840B.300d.txt  glove.840B.300d.zip


In [1]:
# Load the pre-trained GloVe vectors into a dict: token -> float32 vector.
# The path points at the file that `ls ../embeddings` reports as present.
GLOVE_PATH = '../embeddings/glove.840B.300d.txt'

embeddings_index = {}
# `with` closes the file even when parsing raises. The encoding is pinned so the
# result does not depend on the machine's locale (GloVe text files are
# expected to be UTF-8 — confirm for other embedding files).
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line in f:
        # Split from the right: the last EMBEDDING_DIM fields are the vector,
        # whatever precedes them is the token (which may itself contain spaces).
        values = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # Only malformed numeric fields are skipped; any other error surfaces.
            print("error reading word", word)
            continue
        if coefs.shape[0] != EMBEDDING_DIM:
            # Truncated line: a short vector could not be copied into the matrix later.
            print("error reading word", word)
            continue
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

FileNotFoundError: [Errno 2] No such file or directory: '../embeddings/glove.840B.300d.txt'

In [None]:

# Alternative embedding-matrix initialisation: rows of words without a
# pre-trained vector are drawn from a normal distribution with the mean and
# standard deviation of all pre-trained coefficients.
# NOTE: the following cell rebuilds `embedding_matrix` with zero initialisation,
# so whichever of the two cells runs last determines the matrix that is used.
max_features = MAX_NB_WORDS
embed_size = EMBEDDING_DIM

# Statistics of the pre-trained vectors (materialises them once as a 2-D array).
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
del all_embs  # free the temporary copy

word_index = tokenizer.word_index
# +1 because the largest index in word_index equals len(word_index); this is the
# same sizing the zero-initialised variant in the next cell uses.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Build the (nb_words x EMBEDDING_DIM) weight matrix for the Embedding layer:
# row r holds the pre-trained vector of the token whose index is r.
print('Preparing embedding matrix')
vocab_size = len(word_index) + 1
nb_words = MAX_NB_WORDS if MAX_NB_WORDS < vocab_size else vocab_size
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token) if row < MAX_NB_WORDS else None
    if vector is None:
        # Index beyond the cap, or no pre-trained vector: the row stays all-zero.
        continue
    embedding_matrix[row] = vector

# Count the rows whose coefficients sum to exactly zero.
row_sums = embedding_matrix.sum(axis=1)
print('Null word embeddings: %d' % np.sum(row_sums == 0))

In [None]:
# Display the dimensions of the embedding matrix built above.
embedding_matrix.shape

In [97]:
# CNN text classifier: frozen pre-trained embeddings -> three
# Conv1D / MaxPooling1D / Dropout stages -> dense head with one sigmoid unit
# per label in `classes`.
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Device placement is kept as in the original: the embedding lookup on
# '/gpu:1', everything else on '/gpu:0'.
with tf.device('/gpu:1'):
    x = Embedding(nb_words, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False)(comment_input)

with tf.device('/gpu:0'):
    # Three identical stages that differ only in the number of filters.
    for n_filters in (32, 64, 128):
        x = Conv1D(n_filters, kernel_size=3, padding='same', activation='relu')(x)
        x = MaxPooling1D(pool_size=3)(x)
        x = Dropout(0.1)(x)

    # Recurrent layer left disabled, as in the original cell:
    #x = Bidirectional(GRU(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = Flatten()(x)

    x = Dense(128, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = BatchNormalization()(x)

    # Output width follows the label list instead of a hard-coded 6, so the
    # model stays consistent with `y` if `classes` changes.
    predictions = Dense(len(classes), activation='sigmoid')(x)

In [98]:
# Wrap the layer graph into a trainable model: token ids in, label probabilities out.
model = Model(inputs=comment_input, outputs=predictions)

In [99]:
# Binary cross-entropy over the sigmoid outputs, Adam optimiser, accuracy as metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [100]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 100)               0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 100, 300)          8100      
_________________________________________________________________
conv1d_28 (Conv1D)           (None, 100, 32)           28832     
_________________________________________________________________
max_pooling1d_28 (MaxPooling (None, 33, 32)            0         
_________________________________________________________________
dropout_41 (Dropout)         (None, 33, 32)            0         
_________________________________________________________________
conv1d_29 (Conv1D)           (None, 33, 64)            6208      
_________________________________________________________________
max_pooling1d_29 (MaxPooling (None, 11, 64)            0         
__________

In [None]:
# Training callbacks, all driven by the validation loss.
# The checkpoint directory is created up front so saving cannot fail on a
# fresh checkout.
os.makedirs('../models', exist_ok=True)

# Stop when val_loss has not improved for 5 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
# Keep only the best weights seen so far on disk.
model_ckpt = ModelCheckpoint(filepath='../models/conv1d_rnn.h5', monitor='val_loss',
                             save_best_only=True)
# Lower the learning rate after 2 epochs without improvement.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=2)

# early_stopping was previously created but never registered, so it had no effect.
callbacks = [model_ckpt, reduce_lr, early_stopping]

In [None]:
# Train for up to 20 epochs in batches of 512, evaluating on the held-out
# validation split after every epoch and running the callbacks defined above.
model.fit(
    X_train, y_train,
    batch_size=512,
    epochs=20,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20

In [32]:
# Predicted label probabilities for the validation split.
# NOTE(review): this uses whatever weights the model holds when the cell runs;
# the checkpoint file is only loaded further down, before the test-set
# prediction — confirm that evaluating these weights is intended.
y_pred_val = model.predict(X_val)

In [47]:
# Display the dimensions of the validation predictions.
y_pred_val.shape

(31915, 6)

In [54]:
# Scratch check of two-decimal float formatting.
a = 0.1212121
two_decimals = "{0:.2f}".format(a)
print(two_decimals)

0.12


In [117]:
# Per-label validation report: log loss and ROC AUC on the predicted
# probabilities, accuracy and confusion matrix on predictions thresholded at
# DECISION_THRESHOLD. The metrics are also collected in `val_df`, indexed by label.
DECISION_THRESHOLD = .5

# Collect one record per label and build the frame once; growing a DataFrame
# with DataFrame.append inside the loop is quadratic and unavailable in pandas >= 2.0.
records = []
for i, col in enumerate(classes):
    y_true = y_val[:, i]
    y_prob = y_pred_val[:, i]
    # Hard 0/1 predictions, computed once and reused for accuracy and the confusion matrix.
    y_hat = (y_prob > DECISION_THRESHOLD).astype(int)

    loss = log_loss(y_true, y_prob)
    auc = roc_auc_score(y_true, y_prob)
    acc = accuracy_score(y_true, y_hat)
    records.append({'class': col, 'auc': auc, 'loss': loss, 'acc': acc})

    print('{:15} log_loss: {:.2f} auc: {:.2f} acc: {:.2f}'.format(col, loss, auc, acc))
    print()
    print(confusion_matrix(y_true, y_hat))
    print()

# Explicit column order keeps the acc / auc / loss layout of the earlier output.
val_df = pd.DataFrame(records, columns=['class', 'acc', 'auc', 'loss']).set_index('class')

toxic           log_loss: 0.12 auc: 0.97 acc: 0.96

[[28606   219]
 [ 1150  1940]]

severe_toxic    log_loss: 0.03 auc: 0.98 acc: 0.99

[[31541    40]
 [  273    61]]

obscene         log_loss: 0.06 auc: 0.98 acc: 0.98

[[30090   111]
 [  601  1113]]

threat          log_loss: 0.01 auc: 0.97 acc: 1.00

[[31820     2]
 [   88     5]]

insult          log_loss: 0.08 auc: 0.98 acc: 0.97

[[30147   148]
 [  840   780]]

identity_hate   log_loss: 0.02 auc: 0.98 acc: 0.99

[[31631    13]
 [  229    42]]



In [104]:
# Average each metric column of val_df across the labels.
val_df.mean()

acc     0.980605
auc     0.977894
loss    0.054529
dtype: float64

In [105]:
# Load the test set from ../data/test.csv into a DataFrame.
test = pd.read_csv('../data/test.csv')

In [106]:
# Prepare the test comments with the same steps as the training text:
# 'UNK' for missing comments, the already-fitted tokenizer, and padding to
# MAX_SEQUENCE_LENGTH.
test_comments = test['comment_text'].fillna('UNK')
test_sequences = tokenizer.texts_to_sequences(texts=test_comments)
X_test = pad_sequences(sequences=test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [109]:
# Restore the weights stored in the checkpoint file written by ModelCheckpoint
# (created with save_best_only=True in the callbacks cell).
model.load_weights('../models/conv1d_rnn.h5')

In [110]:
# Predicted label probabilities for the test set.
y_pred = model.predict(X_test)

In [111]:
# Load the sample submission file, used as the template for the output frame.
sub = pd.read_csv('../data/sample_submission.csv')

In [112]:
# Overwrite every column after the first with the predictions (positional assignment).
# NOTE(review): presumably the columns of sample_submission.csv after the first
# are in the same order as `classes` — verify, since nothing here checks it.
sub.iloc[:, 1:] = y_pred

In [113]:
# Write the submission to ../submissions/conv1d_rnn.csv without the index column.
sub.to_csv('../submissions/conv1d_rnn.csv', index=False)

In [114]:
# Also write the same frame to cnn.csv in the current working directory.
sub.to_csv('cnn.csv', index=False)

In [115]:
# Read the written submission back and preview its first rows.
pd.read_csv('../submissions/conv1d_rnn.csv').head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.899014,0.06560973,0.636133,0.01711239,0.498962,0.04544874
1,0000247867823ef7,0.005194,8.586932e-08,0.000779,9.83027e-09,0.000243,4.422942e-06
2,00013b17ad220c46,0.000398,6.701796e-09,0.000135,3.856752e-10,2.6e-05,3.213974e-07
3,00017563c3f7919a,0.000285,6.2189e-11,1.7e-05,1.383176e-08,1.4e-05,1.032655e-08
4,00017695ad8997eb,0.004565,6.080754e-07,0.000887,1.510305e-06,0.000159,8.54721e-06
