# Model - RNN + CNN

## Import Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.callbacks import Callback
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

import sys
import re

path = "C:/Users/alex_/Google Drive/Kaggle_ToxicComment/"
sys.path.append(path)

EMBEDDING_FILE = 'C:/Users/alex_/Documents/Pre_Trained_Models/word_embeddings/glove/glove.840B.300d.txt'


Using TensorFlow backend.


## Import data

In [2]:
train = pd.read_csv(path + "/data/train_2.csv",
skipinitialspace=True,
header = 0,
sep = ',',
encoding='utf8')

test = pd.read_csv(path + "/data/test_2.csv",
skipinitialspace=True,
header = 0,
sep = ',',
encoding='utf8')

train["comment_text"].fillna(" ")
test["comment_text"].fillna(" ")

#X_train = train["comment_text"].str.lower()
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

idx = pd.DataFrame(train["id"]).reset_index(drop=True)
idx_train, idx_val = train_test_split(np.asarray(idx.index), train_size=0.85, random_state=17)

idx_train = list(idx_train)
idx_val = list(idx_val)



## Some preprocessing

* Replace regular expressions <br>
* lowrcase <br>
* http +  www <br>
* "!" and "?"

In [3]:
repl = {
    "&lt;3": " good ",
    ":d": " good ",
    ":dd": " good ",
    ":p": " good ",
    "8)": " good ",
    ":-)": " good ",
    ":)": " good ",
    ";)": " good ",
    "(-:": " good ",
    "(:": " good ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    ":/": " bad ",
    ":&gt;": " sad ",
    ":')": " sad ",
    ":-(": " bad ",
    ":(": " bad ",
    ":s": " bad ",
    ":-s": " bad ",
    "&lt;3": " heart ",
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "m": "am",
    "i'll" : "i will",
    "its" : "it is",
    "it's" : "it is",
    "'s" : " is",
    "that's" : "that is",
    "weren't" : "were not",
}

keys = [i for i in repl.keys()]

new_train_data = []
new_test_data = []
ltr = train["comment_text"].tolist()
lte = test["comment_text"].tolist()
for i in ltr:
    arr = str(i).split()
    xx = ""
    for j in arr:
        j = str(j).lower()
        if j[:4] == 'http' or j[:3] == 'www':
            continue
        if j in keys:
            # print("inn")
            j = repl[j]
        xx += j + " "
    new_train_data.append(xx)
for i in lte:
    arr = str(i).split()
    xx = ""
    for j in arr:
        j = str(j).lower()
        if j[:4] == 'http' or j[:3] == 'www':
            continue
        if j in keys:
            # print("inn")
            j = repl[j]
        xx += j + " "
    new_test_data.append(xx)
train["new_comment_text"] = new_train_data
test["new_comment_text"] = new_test_data
print("crap removed")
trate = train["new_comment_text"].tolist()
tete = test["new_comment_text"].tolist()
for i, c in enumerate(trate):
    trate[i] = re.sub('[^a-zA-Z ?!]+', '', str(trate[i]).lower())
for i, c in enumerate(tete):
    tete[i] = re.sub('[^a-zA-Z ?!]+', '', tete[i])
train["comment_text"] = trate
test["comment_text"] = tete
print('only alphabets')


embed_size=0

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_t = train[list_classes].values

list_sentences_train = train["comment_text"]
list_sentences_test = test["comment_text"]

crap removed
only alphabets


## Define ... functions

In [4]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))


## Define Input Parameters

In [5]:
max_features=150000
maxlen=200
embed_size=300

Adjust the input data to the selected parameters

In [6]:
tok=text.Tokenizer(num_words=max_features,lower=True)
tok.fit_on_texts(list(trate)+list(tete))
X_train=tok.texts_to_sequences(trate)
X_test=tok.texts_to_sequences(tete)
x_train=sequence.pad_sequences(X_train,maxlen=maxlen)
x_test=sequence.pad_sequences(X_test,maxlen=maxlen)   

## Load the Word embeddings

In [7]:
embeddings_index = {}
with open(EMBEDDING_FILE,encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float64')
        embeddings_index[word] = coefs   
        
word_index = tok.word_index
#prepare embedding matrix
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

## Model 3 - Word Embeddings + RNN + CNN

### Model's artchitecture

In [8]:
sequence_input = Input(shape=(maxlen, ))
x = Embedding(max_features, embed_size, weights=[embedding_matrix],trainable = False)(sequence_input)
x = SpatialDropout1D(0.2)(x)

x = Bidirectional(GRU(128, return_sequences=True,dropout=0.15,recurrent_dropout=0.15))(x)
conv1 = Conv1D(64, kernel_size = 3, padding = "valid")(x)
avg_pool_conv1 = GlobalAveragePooling1D()(conv1)
max_pool_conv1 = GlobalMaxPooling1D()(conv1)

x = concatenate([avg_pool_conv1, max_pool_conv1]) 
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
preds = Dense(6, activation="sigmoid")(x)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',optimizer='Adam',metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     45000000    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 200, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 200, 256)     329472      spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
conv1d_1 (

...

In [9]:
batch_size = 128
epochs = 5

#xtrain, xval, ytrain, yval = train_test_split(x_train, y_train, train_size=0.85, random_state=17)

xtrain = x_train[idx_train]
xval = x_train[idx_val]
ytrain = y_train[idx_train]
yval = y_train[idx_val]

filepath = path + "weights_base.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
ra_val = RocAucEvaluation(validation_data=(xval, yval), interval = 1)
def schedule(ind):
    a = [0.0008, 0.0010, 0.0008, 0.0010, 0.0008] # Adam
    #a = [0.25, 0.5, 0.25] # Adadelta
    return a[ind] 
lr = callbacks.LearningRateScheduler(schedule)
callbacks_list = [ra_val, checkpoint, early, lr]


### Fit model

In [10]:
model.fit(xtrain, ytrain, 
          batch_size=batch_size, 
          epochs=epochs, 
          validation_data=(xval, yval),
          callbacks = callbacks_list,
          verbose=1,
          shuffle=True)

Train on 135635 samples, validate on 23936 samples
Epoch 1/5
 ROC-AUC - epoch: 1 - score: 0.980673

Epoch 00001: val_loss improved from inf to 0.04750, saving model to C:/Users/alex_/Google Drive/Kaggle_ToxicComment/weights_base.best.hdf5
Epoch 2/5
 ROC-AUC - epoch: 2 - score: 0.986057

Epoch 00002: val_loss improved from 0.04750 to 0.04407, saving model to C:/Users/alex_/Google Drive/Kaggle_ToxicComment/weights_base.best.hdf5
Epoch 3/5
 ROC-AUC - epoch: 3 - score: 0.987035

Epoch 00003: val_loss improved from 0.04407 to 0.04257, saving model to C:/Users/alex_/Google Drive/Kaggle_ToxicComment/weights_base.best.hdf5
Epoch 4/5
 ROC-AUC - epoch: 4 - score: 0.987054

Epoch 00004: val_loss did not improve
Epoch 5/5
 ROC-AUC - epoch: 5 - score: 0.987031

Epoch 00005: val_loss did not improve


<keras.callbacks.History at 0x1dabaa74d68>

## Model Assessment - Predict on the validation set

In [11]:
model.save(path + '\\Emb_RNN_CNN.h5')
#model.load_weights(filepath)
print('Predicting....')
ypred = model.predict(xval,batch_size=1024,verbose=1)

Predicting....


In [12]:
# Prepare the file
df_subm = pd.DataFrame(ypred)
df_subm.columns = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# Export to a csv 
df_subm.to_csv(path + '\\submission_2\\val_wordembeddings_RNN_CNN.csv', index=True, header = True)

### % of comments well categorized for all the categories 

In [13]:
from sklearn.metrics import confusion_matrix, classification_report
from utils import unique_category

ypred_cat = np.where((ypred >= 0.5), 1, 0) 

YCompare = np.hstack([yval, ypred])

TotalAccuracy = float(np.sum(yval == ypred_cat, axis=0)[0])/len(yval) * 100
print(TotalAccuracy)

96.44886363636364




### Confusion Matrix for each category

In [14]:
# Keep columns reffering to categories
arr_colnames = train.columns.values.tolist()[2:8]

unique_category('toxic', arr_colnames, yval, ypred_cat)
unique_category('severe_toxic', arr_colnames, yval, ypred_cat)
unique_category('obscene', arr_colnames, yval, ypred_cat)
unique_category('threat', arr_colnames, yval, ypred_cat)
unique_category('insult', arr_colnames, yval, ypred_cat)
unique_category('identity_hate', arr_colnames, yval, ypred_cat)

Name of the category : toxic
Accuracy : 96.44886363636364
[[21231   403]
 [  447  1855]]
             precision    recall  f1-score   support

          0       0.98      0.98      0.98     21634
          1       0.82      0.81      0.81      2302

avg / total       0.96      0.96      0.96     23936

Name of the category : severe_toxic
Accuracy : 98.9346590909091
[[23629    45]
 [  210    52]]
             precision    recall  f1-score   support

          0       0.99      1.00      0.99     23674
          1       0.54      0.20      0.29       262

avg / total       0.99      0.99      0.99     23936

Name of the category : obscene
Accuracy : 98.19100935828877
[[22391   233]
 [  200  1112]]
             precision    recall  f1-score   support

          0       0.99      0.99      0.99     22624
          1       0.83      0.85      0.84      1312

avg / total       0.98      0.98      0.98     23936

Name of the category : threat
Accuracy : 99.7033756684492
[[23847    11]
 [   60

## Export results - test set

In [15]:
from keras.models import load_model
model.load_weights(path + '\\Emb_RNN_CNN.h5')
#print('Predicting....')
y_test_pred = model.predict(x_test,batch_size=1024,verbose=1)



In [16]:
# Prepare the file
df_subm = pd.DataFrame(y_test_pred, index=test.id)
df_subm.columns = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# Export to a csv 
df_subm.to_csv(path + '\\submission_2\\wordembeddings_RNN_CNN.csv', index=True, header = True)

# Model 1 - Words and Character gram + Logistic Regression

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

Vectorize the comment : <br>
* Vectorize the words
* Vectorize the characters
* Append the 2 !

In [18]:
train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])
#print(all_text[0:10])

In [19]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=25000)

word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(1, 5),
    max_features=35000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

###  Fit the model and Predict on the validation set

In [20]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
target = train[class_names]
validation = pd.DataFrame(train['id'][idx_val])

#idx_train = np.reshape(idx_train, (len(idx_train), 1))
#idx_val = np.reshape(idx_val, (len(idx_val), 1))


In [21]:
for class_name in class_names:
    
    target = np.asarray(train[class_name])
    target = np.reshape(target, len(target),1)
    
    ytrain = target[idx_train]
    yval = target[idx_val]
    
    train_features2 = train_features.tocsr()  
    xtrain = train_features2[idx_train,:]
    xval = train_features2[idx_val,:]
    
    classifier = LogisticRegression(C=1, solver='sag')
    classifier.fit(xtrain, ytrain)
    validation[class_name] = classifier.predict_proba(xval)[:, 1]
    
    pred = round(validation[class_name])
    pred = np.reshape(pred, (len(pred), 1))
    
    cm = confusion_matrix(yval, pred)
    cr = classification_report(yval, pred)

    print('Name of the category : {} '.format(class_name))
    print("Accuracy : {}".format(format((float(cm[0,0]) + float(cm[1,1]))/len(yval)*100)))
    print(cm)
    print(cr)
    
validation.to_csv(path + '\\submission_2\\val_logreg_words_chars.csv', index=False, header = True)
print(validation[0:9])

  return getattr(obj, method)(*args, **kwds)


Name of the category : toxic 
Accuracy : 96.39455213903744
[[21485   149]
 [  714  1588]]
             precision    recall  f1-score   support

          0       0.97      0.99      0.98     21634
          1       0.91      0.69      0.79      2302

avg / total       0.96      0.96      0.96     23936

Name of the category : severe_toxic 
Accuracy : 98.9346590909091
[[23612    62]
 [  193    69]]
             precision    recall  f1-score   support

          0       0.99      1.00      0.99     23674
          1       0.53      0.26      0.35       262

avg / total       0.99      0.99      0.99     23936

Name of the category : obscene 
Accuracy : 98.03225267379679
[[22529    95]
 [  376   936]]
             precision    recall  f1-score   support

          0       0.98      1.00      0.99     22624
          1       0.91      0.71      0.80      1312

avg / total       0.98      0.98      0.98     23936

Name of the category : threat 
Accuracy : 99.70755347593582
[[23848    10]
 [

In [22]:
from sklearn.metrics import confusion_matrix, classification_report
from utils import unique_category

target = np.asarray(train[class_names])[idx_val]
ypred_cat = np.where((validation[class_names] >= 0.5), 1, 0) 

TotalAccuracy = float(np.sum(target == ypred_cat, axis=0)[0])/len(yval) * 100
print(TotalAccuracy)

96.39455213903744


### Predict  on test set + export results

In [None]:
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]
    print('Class "{}" done'.format(class_name))
    
print(submission[0:9])
submission.to_csv(path + '\\submission_2\\logreg_words_chars.csv', index=False, header = True)

# Model 2 - Word Embeddings + DeepCNN

In [23]:
from keras.layers import BatchNormalization,PReLU
from keras.layers.merge import add, Concatenate

max_features=150000
maxlen=200
embed_size=300

In [24]:
sequence_input = Input(shape=(maxlen, ))
x = Embedding(max_features, embed_size, weights=[embedding_matrix],trainable = False)(sequence_input)
x = SpatialDropout1D(0.2)(x)

def dpcnn(embedded_sequences):
    filter_nr = 64
    filter_size = 3
    max_pool_size = 3
    max_pool_strides = 2
    dense_nr = 256
    dense_dropout = 0.5
    
    block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(embedded_sequences)
    block1 = BatchNormalization()(block1)
    block1 = PReLU()(block1)
    block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block1)
    block1 = BatchNormalization()(block1)
    block1 = PReLU()(block1)
    
    #we pass embedded comment through conv1d with filter size 1 because it needs to have the same shape as block output
    #if you choose filter_nr = embed_size (300 in this case) you don't have to do this part and can add emb_comment directly to block1_output
    resize_emb = Conv1D(filter_nr, kernel_size=1, padding='same', activation='linear')(embedded_sequences)
    resize_emb = PReLU()(resize_emb)
        
    block1_output = add([block1, resize_emb])
    block1_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block1_output)
    
    block2 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block1_output)
    block2 = BatchNormalization()(block2)
    block2 = PReLU()(block2)
    block2 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block2)
    block2 = BatchNormalization()(block2)
    block2 = PReLU()(block2)
        
    block2_output = add([block2, block1_output])
    block2_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block2_output)
    
    block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block2_output)
    block3 = BatchNormalization()(block3)
    block3 = PReLU()(block3)
    block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block3)
    block3 = BatchNormalization()(block3)
    block3 = PReLU()(block3)
        
    block3_output = add([block3, block2_output])
    block3_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block3_output)
    
    block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block3_output)
    block4 = BatchNormalization()(block4)
    block4 = PReLU()(block4)
    block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear')(block4)
    block4 = BatchNormalization()(block4)
    block4 = PReLU()(block4)
    
    output = add([block4, block3_output])
    output = GlobalMaxPooling1D()(output)
    
    output = Dense(dense_nr, activation='linear')(output)
    output = BatchNormalization()(output)
    output = PReLU()(output)
    output = Dropout(dense_dropout)(output)
    output = Dense(6, activation='sigmoid')(output)
    
    return output

output = dpcnn(x)
model_dpcnn = Model(inputs=sequence_input, outputs=output)
model_dpcnn.compile(loss='binary_crossentropy',
                  optimizer='Adam',
                  metrics=['accuracy'])

model_dpcnn.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 200, 300)     45000000    input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, 200, 300)     0           embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 200, 64)      57664       spatial_dropout1d_2[0][0]        
__________________________________________________________________________________________________
batch_norm

In [25]:
batch_size = 128
epochs = 5

#xtrain, xval, ytrain, yval = train_test_split(x_train, y_train, train_size=0.85, random_state=17)

xtrain = x_train[idx_train]
xval = x_train[idx_val]
ytrain = y_train[idx_train]
yval = y_train[idx_val]

checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
ra_val = RocAucEvaluation(validation_data=(xval, yval), interval = 1)
def schedule(ind):
    a = [0.0008, 0.0010, 0.0008, 0.0010, 0.0008] # Adam
    #a = [0.25, 0.5, 0.25] # Adadelta
    return a[ind] 
lr = callbacks.LearningRateScheduler(schedule)
callbacks_list = [ra_val, checkpoint, early, lr]

In [26]:
model_dpcnn.fit(xtrain, ytrain, 
          batch_size=batch_size, 
          epochs=epochs, 
          validation_data=(xval, yval),
          callbacks = callbacks_list,
          verbose=1,
          shuffle=True)

Train on 135635 samples, validate on 23936 samples
Epoch 1/5
 ROC-AUC - epoch: 1 - score: 0.977076

Epoch 00001: val_loss improved from inf to 0.04795, saving model to C:/Users/alex_/Google Drive/Kaggle_ToxicComment/weights_base.best.hdf5
Epoch 2/5
 ROC-AUC - epoch: 2 - score: 0.983659

Epoch 00002: val_loss improved from 0.04795 to 0.04565, saving model to C:/Users/alex_/Google Drive/Kaggle_ToxicComment/weights_base.best.hdf5
Epoch 3/5
 ROC-AUC - epoch: 3 - score: 0.986566

Epoch 00003: val_loss improved from 0.04565 to 0.04471, saving model to C:/Users/alex_/Google Drive/Kaggle_ToxicComment/weights_base.best.hdf5
Epoch 4/5
 ROC-AUC - epoch: 4 - score: 0.984780

Epoch 00004: val_loss did not improve
Epoch 5/5
 ROC-AUC - epoch: 5 - score: 0.987077

Epoch 00005: val_loss improved from 0.04471 to 0.04270, saving model to C:/Users/alex_/Google Drive/Kaggle_ToxicComment/weights_base.best.hdf5


<keras.callbacks.History at 0x1dab431da58>

In [42]:
model_dpcnn.save(path + '\\Emb_dpcnn.h5')
model_dpcnn.load_weights(path + '\\Emb_dpcnn.h5')
print('Predicting....')
ypred = model_dpcnn.predict(xval,batch_size=1024,verbose=1)

Predicting....


In [43]:
# Prepare the file
df_subm = pd.DataFrame(ypred)
df_subm.columns = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# Export to a csv 
df_subm.to_csv(path + '\\submission_2\\val_wordembeddings_dpCNN.csv', index=True, header = True)

### % of comments well categorized for all the categories 

In [44]:
from sklearn.metrics import confusion_matrix, classification_report
from utils import unique_category

ypred_cat = np.where((ypred >= 0.5), 1, 0) 

YCompare = np.hstack([yval, ypred])

TotalAccuracy = float(np.sum(yval == ypred_cat, axis=0)[0])/len(yval) * 100
print(TotalAccuracy)

96.44468582887701


In [45]:
# Keep columns reffering to categories
arr_colnames = train.columns.values.tolist()[2:8]

unique_category('toxic', arr_colnames, yval, ypred_cat)
unique_category('severe_toxic', arr_colnames, yval, ypred_cat)
unique_category('obscene', arr_colnames, yval, ypred_cat)
unique_category('threat', arr_colnames, yval, ypred_cat)
unique_category('insult', arr_colnames, yval, ypred_cat)
unique_category('identity_hate', arr_colnames, yval, ypred_cat)

Name of the category : toxic
Accuracy : 96.44468582887701
[[21327   307]
 [  544  1758]]
             precision    recall  f1-score   support

          0       0.98      0.99      0.98     21634
          1       0.85      0.76      0.81      2302

avg / total       0.96      0.96      0.96     23936

Name of the category : severe_toxic
Accuracy : 98.91377005347593
[[23612    62]
 [  198    64]]
             precision    recall  f1-score   support

          0       0.99      1.00      0.99     23674
          1       0.51      0.24      0.33       262

avg / total       0.99      0.99      0.99     23936

Name of the category : obscene
Accuracy : 98.18683155080214
[[22402   222]
 [  212  1100]]
             precision    recall  f1-score   support

          0       0.99      0.99      0.99     22624
          1       0.83      0.84      0.84      1312

avg / total       0.98      0.98      0.98     23936

Name of the category : threat
Accuracy : 99.72426470588235
[[23851     7]
 [   

In [35]:
## Export results - test set

In [36]:
model_dpcnn.load_weights(path + '\\Emb_dpcnn.h5')
y_test_pred = model_dpcnn.predict(x_test,batch_size=1024,verbose=1)

# Prepare the file
df_subm = pd.DataFrame(y_test_pred, index=test.id)
df_subm.columns = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# Export to a csv 
df_subm.to_csv(path + '\\submission_2\\wordembeddings_dpCNN.csv', index=True, header = True)



# Model Ensemble

In [31]:
from keras.models import load_model

path = "C:/Users/alex_/Google Drive/Kaggle_ToxicComment/"
sys.path.append(path)


### Predict on validation set

In [32]:
simple_logreg = pd.read_csv(path + "/submission_2/val_logreg_words_chars.csv",
skipinitialspace=True,
header = 0,
sep = ',',
encoding='utf8')

wordembedding_dpcnn = pd.read_csv(path + "/submission_2/val_wordembeddings_dpcnn.csv",
skipinitialspace=True,
header = 0,
sep = ',',
encoding='utf8')
 
wordembedding_gru_cnn = pd.read_csv(path + "/submission_2/val_wordembeddings_RNN_CNN.csv",
skipinitialspace=True,
header = 0,
sep = ',',
encoding='utf8')

In [38]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

model_ens = simple_logreg.copy()

model_ens[label_cols] = (simple_logreg[label_cols] +
                         wordembedding_dpcnn[label_cols] + 
                         wordembedding_gru_cnn[label_cols]) / 3

model_ens = model_ens[label_cols]

print(model_ens.columns)
print(type(model_ens))
yval = y_train[idx_val]
print(model_ens.shape)
print(yval.shape)

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
(23936, 6)
(23936, 6)


In [39]:
from sklearn.metrics import confusion_matrix, classification_report
from utils import unique_category

ypred_cat = np.where((model_ens >= 0.5), 1, 0) 

YCompare = np.hstack([yval, ypred])

TotalAccuracy = float(np.sum(yval == ypred_cat, axis=0)[0])/len(yval) * 100
print(TotalAccuracy)

96.75802139037432


In [41]:
# Keep columns reffering to categories
arr_colnames = train.columns.values.tolist()[2:8]

unique_category('toxic', arr_colnames, yval, ypred_cat)
unique_category('severe_toxic', arr_colnames, yval, ypred_cat)
unique_category('obscene', arr_colnames, yval, ypred_cat)
unique_category('threat', arr_colnames, yval, ypred_cat)
unique_category('insult', arr_colnames, yval, ypred_cat)
unique_category('identity_hate', arr_colnames, yval, ypred_cat)

Name of the category : toxic
Accuracy : 96.75802139037432
[[21404   230]
 [  546  1756]]
             precision    recall  f1-score   support

          0       0.98      0.99      0.98     21634
          1       0.88      0.76      0.82      2302

avg / total       0.97      0.97      0.97     23936

Name of the category : severe_toxic
Accuracy : 98.96808155080214
[[23628    46]
 [  201    61]]
             precision    recall  f1-score   support

          0       0.99      1.00      0.99     23674
          1       0.57      0.23      0.33       262

avg / total       0.99      0.99      0.99     23936

Name of the category : obscene
Accuracy : 98.32052139037432
[[22469   155]
 [  247  1065]]
             precision    recall  f1-score   support

          0       0.99      0.99      0.99     22624
          1       0.87      0.81      0.84      1312

avg / total       0.98      0.98      0.98     23936

Name of the category : threat
Accuracy : 99.73262032085562
[[23854     4]
 [   

### Predict on test set

In [None]:
simple_logreg = pd.read_csv(path + "/submission_2/logreg_words_chars.csv",
skipinitialspace=True,
header = 0,
sep = ',',
encoding='utf8')

wordembedding_dpcnn = pd.read_csv(path + "/submission_2/wordembeddings_dpcnn.csv",
skipinitialspace=True,
header = 0,
sep = ',',
encoding='utf8')
 
wordembedding_gru_cnn = pd.read_csv(path + "/submission_2/wordembeddings_RNN_CNN.csv",
skipinitialspace=True,
header = 0,
sep = ',',
encoding='utf8')

In [164]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

model_ens = simple_logreg.copy()
model_ens[label_cols] = (simple_logreg[label_cols] +
                         wordembedding_dpcnn[label_cols] + 
                         wordembedding_gru_cnn[label_cols]) / 3

df_subm = pd.DataFrame(model_ens)
df_subm = df_subm[['id', 'toxic','severe_toxic','obscene','threat','insult','identity_hate']]
df_subm.to_csv(path + '\\submission_2\\model_ensemble_1.csv', index=False, header = True)