In [47]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D,Bidirectional, Flatten, SimpleRNN
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import matplotlib.pyplot as plt
import chakin
import gensim.models.keyedvectors as word2vec
import gc

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
embed_size=0

In [36]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_train = train["comment_text"]
list_sentences_test = test["comment_text"]

In [4]:
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

In [5]:
maxlen = 200
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [6]:
def loadEmbeddingMatrix():
        word2vecDict = word2vec.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
        embed_size = 300
        
        embeddings_index = dict()
        for word in word2vecDict.wv.vocab:
            embeddings_index[word] = word2vecDict.word_vec(word)
        print('Loaded %s word vectors.' % len(embeddings_index))
            
        gc.collect()
        #We get the mean and standard deviation of the embedding weights so that we could maintain the 
        #same statistics for the rest of our own random generated weights. 
        all_embs = np.stack(list(embeddings_index.values()))
        emb_mean,emb_std = all_embs.mean(), all_embs.std()
        
        nb_words = len(tokenizer.word_index)
        #We are going to set the embedding size to the pretrained dimension as we are replicating it.
        #the size will be Number of Words in Vocab X Embedding Size
        embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
        gc.collect()

        #With the newly created embedding matrix, we'll fill it up with the words that we have in both 
        #our own dictionary and loaded pretrained embedding. 
        embeddedCount = 0
        for word, i in tokenizer.word_index.items():
            i-=1
            #then we see if this word is in glove's dictionary, if yes, get the corresponding weights
            embedding_vector = embeddings_index.get(word)
            #and store inside the embedding matrix that we will train later on.
            if embedding_vector is not None: 
                embedding_matrix[i] = embedding_vector
                embeddedCount+=1
        print('total embedded:',embeddedCount,'common words')
        
        del(embeddings_index)
        gc.collect()
        
        #finally, return the embedding matrix
        return embedding_matrix

In [7]:
embedding_matrix = loadEmbeddingMatrix()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  


Loaded 3000000 word vectors.
total embedded: 66078 common words


In [13]:
inp = Input(shape=(maxlen, ))
x = Embedding(len(tokenizer.word_index), embedding_matrix.shape[1],weights=[embedding_matrix],trainable=False)(inp)
x = Bidirectional(LSTM(60, return_sequences=True,name='lstm_layer',dropout=0.1,recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 300)          63101100  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 120)          173280    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 120)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 120)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                6050      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
__________

In [14]:
batch_size = 32
epochs = 4
hist = model.fit(X_t,y, batch_size=batch_size, epochs=epochs, validation_split=0.1)

Train on 143613 samples, validate on 15958 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [15]:
y = model.predict(X_te)
y = np.round(y,0)

In [16]:
test_label_df = pd.read_csv('test_labels.csv')
test_pred_df = pd.DataFrame(data=y, columns=['toxic_pred','severe_toxic_pred','obscene_pred','threat_pred','insult_pred','identity_hate_pred'])
test_df = pd.concat([test_label_df,test_pred_df], axis=1, sort=False)

In [17]:
dum = tokenizer.word_index
dum = {k:v for k,v in dum.items()}
id_to_word = {value:key for key,value in dum.items()}
id_to_word[0]=''

In [21]:
comment = []
for LIST in X_te:
    comment.append(' '.join(id_to_word[id] for id in LIST))
test_df['comment_text'] = comment

In [22]:
data = test_df[test_df['toxic']!=-1]

In [23]:
data = test_df[test_df['toxic']!=-1]
field_LIST = [['toxic_pred','toxic'],
['severe_toxic_pred','severe_toxic'],
['obscene_pred','obscene'],
['threat_pred','threat'],
['insult_pred','insult'],
['identity_hate_pred','identity_hate']]

for field_LIST_ITEM in field_LIST:
    print(field_LIST_ITEM[1]+'\n')
    print(len(data[(data[field_LIST_ITEM[0]]==1)]))
    TP = len(data[(data[field_LIST_ITEM[0]]==1) & (data[field_LIST_ITEM[1]]==1)])
    FP = len(data[(data[field_LIST_ITEM[0]]==1) & (data[field_LIST_ITEM[1]]==0)])
    TN = len(data[(data[field_LIST_ITEM[0]]==0) & (data[field_LIST_ITEM[1]]==0)])
    FN = len(data[(data[field_LIST_ITEM[0]]==0) & (data[field_LIST_ITEM[1]]==1)])
    #print("TP: "+str(TP))
    #print("FP: "+str(FP))
    #print("TN: "+str(TN))
    #print("FN: "+str(FN))
    #print('Total test records: '+str(len(data)))
    P = TP/(TP+FP)
    A = (TP+TN)/(TP+TN+FP+FN)
    R = TP/(TP+FN)
    print('Precission = '+str(round(P*100,2))+'%')
    print('Accuracy = '+str(round(A*100,2))+'%')
    print('Recall = '+str(round(R*100,2))+'%\n\n')

toxic

6932
Precission = 64.73%
Accuracy = 93.67%
Recall = 73.68%


severe_toxic

397
Precission = 33.25%
Accuracy = 99.22%
Recall = 35.97%


obscene

3859
Precission = 67.09%
Accuracy = 96.29%
Recall = 70.14%


threat

91
Precission = 53.85%
Accuracy = 99.68%
Recall = 23.22%


insult

2542
Precission = 73.17%
Accuracy = 96.48%
Recall = 54.27%


identity_hate

246
Precission = 76.02%
Accuracy = 99.09%
Recall = 26.26%




In [38]:
max_features = 20000
tokenizer1 = Tokenizer(num_words=max_features)
tokenizer1.fit_on_texts(list(list_sentences_train))
list_tokenized_train1 = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test1 = tokenizer.texts_to_sequences(list_sentences_test)

In [39]:
maxlen = 100
X_t1 = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te1 = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [40]:
def loadEmbeddingMatrix1():
        word2vecDict = word2vec.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
        embed_size = 300
        
        embeddings_index = dict()
        for word in word2vecDict.wv.vocab:
            embeddings_index[word] = word2vecDict.word_vec(word)
        print('Loaded %s word vectors.' % len(embeddings_index))
            
        gc.collect()
        #We get the mean and standard deviation of the embedding weights so that we could maintain the 
        #same statistics for the rest of our own random generated weights. 
        all_embs = np.stack(list(embeddings_index.values()))
        emb_mean,emb_std = all_embs.mean(), all_embs.std()
        
        nb_words = len(tokenizer1.word_index)
        #We are going to set the embedding size to the pretrained dimension as we are replicating it.
        #the size will be Number of Words in Vocab X Embedding Size
        embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
        gc.collect()

        #With the newly created embedding matrix, we'll fill it up with the words that we have in both 
        #our own dictionary and loaded pretrained embedding. 
        embeddedCount = 0
        for word, i in tokenizer1.word_index.items():
            i-=1
            #then we see if this word is in glove's dictionary, if yes, get the corresponding weights
            embedding_vector = embeddings_index.get(word)
            #and store inside the embedding matrix that we will train later on.
            if embedding_vector is not None: 
                embedding_matrix[i] = embedding_vector
                embeddedCount+=1
        print('total embedded:',embeddedCount,'common words')
        
        del(embeddings_index)
        gc.collect()
        
        #finally, return the embedding matrix
        return embedding_matrix

In [41]:
embedding_matrix = loadEmbeddingMatrix()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  


Loaded 3000000 word vectors.
total embedded: 66078 common words


In [None]:
inp1 = Input(shape=(maxlen, ))
x1 = Embedding(len(tokenizer1.word_index), embedding_matrix1.shape[1],weights=[embedding_matrix1],trainable=False)(inp1)
x1 = Bidirectional(LSTM(60, return_sequences=True,name='lstm_layer',dropout=0.1,recurrent_dropout=0.1))(x1)
x1 = GlobalMaxPool1D()(x1)
#x1 = Dropout(0.1)(x1)
x1 = Dense(50, activation="relu")(x1)
#x1 = Dropout(0.1)(x1)
x1 = Dense(6, activation="sigmoid")(x1)

In [None]:
model1 = Model(inputs=inp1, outputs=x1)
model1.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model1.summary()
batch_size = 32
epochs = 3
with tf.device('/device:GPU:2'):
    hist1 = model1.fit(X_t1,y, batch_size=batch_size, epochs=epochs, validation_split=0.1)

In [None]:
y1 = model1.predict(X_te1)
y1 = np.round(y1,0)

In [None]:
test_pred_df1 = pd.DataFrame(data=y1, columns=['toxic_pred','severe_toxic_pred','obscene_pred','threat_pred','insult_pred','identity_hate_pred'])
test_df1 = pd.concat([test_label_df,test_pred_df1], axis=1, sort=False)

In [None]:
data = test_df1[test_df['toxic']!=-1]
field_LIST = [['toxic_pred','toxic'],
['severe_toxic_pred','severe_toxic'],
['obscene_pred','obscene'],
['threat_pred','threat'],
['insult_pred','insult'],
['identity_hate_pred','identity_hate']]

for field_LIST_ITEM in field_LIST:
    print(field_LIST_ITEM[1]+'\n')
    TP = len(data[(data[field_LIST_ITEM[0]]==1) & (data[field_LIST_ITEM[1]]==1)])
    FP = len(data[(data[field_LIST_ITEM[0]]==1) & (data[field_LIST_ITEM[1]]==0)])
    TN = len(data[(data[field_LIST_ITEM[0]]==0) & (data[field_LIST_ITEM[1]]==0)])
    FN = len(data[(data[field_LIST_ITEM[0]]==0) & (data[field_LIST_ITEM[1]]==1)])
    #print("TP: "+str(TP))
    #print("FP: "+str(FP))
    #print("TN: "+str(TN))
    #print("FN: "+str(FN))
    #print('Total test records: '+str(len(data)))
    P = TP/(TP+FP)
    A = (TP+TN)/(TP+TN+FP+FN)
    R = TP/(TP+FN)
    print('Precission = '+str(round(P*100,2))+'%')
    print('Accuracy = '+str(round(A*100,2))+'%')
    print('Recall = '+str(round(R*100,2))+'%\n\n')

In [42]:
inp1 = Input(shape=(maxlen, ))
x1 = Embedding(len(tokenizer.word_index), embedding_matrix.shape[1],weights=[embedding_matrix],trainable=False)(inp1)
x1 = LSTM(60, return_sequences=True,name='lstm_layer',dropout=0.1,recurrent_dropout=0.1)(x1)
x1 = Flatten()(x1)
x1 = Dense(6, activation="sigmoid")(x1)
model1 = Model(inputs=inp1, outputs=x1)
model1.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 100, 300)          63101100  
_________________________________________________________________
lstm_layer (LSTM)            (None, 100, 60)           86640     
_________________________________________________________________
flatten_2 (Flatten)          (None, 6000)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 36006     
Total params: 63,223,746
Trainable params: 122,646
Non-trainable params: 63,101,100
_________________________________________________________________


In [45]:
batch_size = 32
epochs = 3
hist1 = model1.fit(X_t1,y, batch_size=batch_size, epochs=epochs, validation_split=0.1)

Train on 143613 samples, validate on 15958 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [48]:
inp2 = Input(shape=(maxlen, ))
x2 = Embedding(len(tokenizer.word_index), embedding_matrix.shape[1],weights=[embedding_matrix],trainable=False)(inp2)
x2 = SimpleRNN(60, return_sequences=True,name='rnn_layer',dropout=0.1,recurrent_dropout=0.1)(x2)
x2 = Flatten()(x2)
x2 = Dense(6, activation="sigmoid")(x2)
model2 = Model(inputs=inp2, outputs=x2)
model2.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 100, 300)          63101100  
_________________________________________________________________
rnn_layer (SimpleRNN)        (None, 100, 60)           21660     
_________________________________________________________________
flatten_3 (Flatten)          (None, 6000)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 6)                 36006     
Total params: 63,158,766
Trainable params: 57,666
Non-trainable params: 63,101,100
_________________________________________________________________


In [49]:
batch_size = 32
epochs = 3
hist2 = model2.fit(X_t1,y, batch_size=batch_size, epochs=epochs, validation_split=0.1)

Train on 143613 samples, validate on 15958 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [50]:
y1 = model1.predict(X_te1)
y1 = np.round(y1,0)

In [51]:
test_pred_df1 = pd.DataFrame(data=y1, columns=['toxic_pred','severe_toxic_pred','obscene_pred','threat_pred','insult_pred','identity_hate_pred'])
test_df1 = pd.concat([test_label_df,test_pred_df1], axis=1, sort=False)

In [52]:
data = test_df1[test_df1['toxic']!=-1]
field_LIST = [['toxic_pred','toxic'],
['severe_toxic_pred','severe_toxic'],
['obscene_pred','obscene'],
['threat_pred','threat'],
['insult_pred','insult'],
['identity_hate_pred','identity_hate']]

for field_LIST_ITEM in field_LIST:
    print(field_LIST_ITEM[1]+'\n')
    TP = len(data[(data[field_LIST_ITEM[0]]==1) & (data[field_LIST_ITEM[1]]==1)])
    FP = len(data[(data[field_LIST_ITEM[0]]==1) & (data[field_LIST_ITEM[1]]==0)])
    TN = len(data[(data[field_LIST_ITEM[0]]==0) & (data[field_LIST_ITEM[1]]==0)])
    FN = len(data[(data[field_LIST_ITEM[0]]==0) & (data[field_LIST_ITEM[1]]==1)])
    #print("TP: "+str(TP))
    #print("FP: "+str(FP))
    #print("TN: "+str(TN))
    #print("FN: "+str(FN))
    #print('Total test records: '+str(len(data)))
    P = TP/(TP+FP)
    A = (TP+TN)/(TP+TN+FP+FN)
    R = TP/(TP+FN)
    print('Precission = '+str(round(P*100,2))+'%')
    print('Accuracy = '+str(round(A*100,2))+'%')
    print('Recall = '+str(round(R*100,2))+'%\n\n')

toxic

Precission = 62.85%
Accuracy = 93.19%
Recall = 69.7%


severe_toxic

Precission = 36.44%
Accuracy = 99.33%
Recall = 22.34%


obscene

Precission = 73.14%
Accuracy = 96.47%
Recall = 61.37%


threat

Precission = 51.72%
Accuracy = 99.67%
Recall = 21.33%


insult

Precission = 69.9%
Accuracy = 96.21%
Recall = 51.5%


identity_hate

Precission = 56.7%
Accuracy = 98.93%
Recall = 15.45%




In [53]:
y2 = model2.predict(X_te1)
y2 = np.round(y2,0)

In [54]:
test_pred_df2 = pd.DataFrame(data=y2, columns=['toxic_pred','severe_toxic_pred','obscene_pred','threat_pred','insult_pred','identity_hate_pred'])
test_df2 = pd.concat([test_label_df,test_pred_df2], axis=1, sort=False)


In [55]:
data = test_df2[test_df2['toxic']!=-1]
field_LIST = [['toxic_pred','toxic'],
['severe_toxic_pred','severe_toxic'],
['obscene_pred','obscene'],
['threat_pred','threat'],
['insult_pred','insult'],
['identity_hate_pred','identity_hate']]

for field_LIST_ITEM in field_LIST:
    print(field_LIST_ITEM[1]+'\n')
    TP = len(data[(data[field_LIST_ITEM[0]]==1) & (data[field_LIST_ITEM[1]]==1)])
    FP = len(data[(data[field_LIST_ITEM[0]]==1) & (data[field_LIST_ITEM[1]]==0)])
    TN = len(data[(data[field_LIST_ITEM[0]]==0) & (data[field_LIST_ITEM[1]]==0)])
    FN = len(data[(data[field_LIST_ITEM[0]]==0) & (data[field_LIST_ITEM[1]]==1)])
    #print("TP: "+str(TP))
    #print("FP: "+str(FP))
    #print("TN: "+str(TN))
    #print("FN: "+str(FN))
    #print('Total test records: '+str(len(data)))
    P = TP/(TP+FP)
    A = (TP+TN)/(TP+TN+FP+FN)
    R = TP/(TP+FN)
    print('Precission = '+str(round(P*100,2))+'%')
    print('Accuracy = '+str(round(A*100,2))+'%')
    print('Recall = '+str(round(R*100,2))+'%\n\n')

toxic

Precission = 61.44%
Accuracy = 91.77%
Recall = 36.47%


severe_toxic

Precission = 35.82%
Accuracy = 99.3%
Recall = 27.52%


obscene

Precission = 62.78%
Accuracy = 95.19%
Recall = 40.94%


threat

Precission = 17.57%
Accuracy = 99.6%
Recall = 6.16%


insult

Precission = 53.25%
Accuracy = 94.89%
Recall = 37.82%


identity_hate

Precission = 27.52%
Accuracy = 98.81%
Recall = 4.21%


