In [1]:
from __future__ import print_function, division
from builtins import range 

In [2]:
#for os and paths
import os
import sys
# mathematical 
import numpy as np
# datahandling 
import pandas as pd
# to plot graphs and visualize
import seaborn as sns
import matplotlib.pyplot as plt
# natural language processing
import gensim



In [3]:
# Route Keras through the PlaidML backend for GPU training.
# NOTE(review): presumably this has to run before the first `keras` import to
# take effect; the saved output of the keras import cell still reports
# "Using TensorFlow backend." -- confirm this cell was actually executed.
import plaidml.keras
plaidml.keras.install_backend()

In [3]:
'''
keras for deep learning models

Preprocessing Imports
'''
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
'''
Different Neural Network Layers
'''
from keras.layers import Dense,  Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPool1D, Embedding
from keras.layers import Dropout 
'''
Build Model
'''
from keras.models import Model

Using TensorFlow backend.


In [4]:
# ROC Curve
from sklearn.metrics import roc_auc_score

In [5]:
# Notebook-wide configuration: every tunable value lives here.

# --- text / embedding dimensions ---
MAX_VOCAB_SIZE = 20000       # vocabulary cap given to the Tokenizer and the embedding matrix
MAX_SEQUENCE_LENGTH = 200    # fixed length every comment is padded / truncated to
EMBEDDING_DIM = 100          # width of the word vectors; also selects which GloVe file is read

# --- training settings (all passed to model.fit) ---
BATCH_SIZE = 128
EPOCHS = 10
VALIDATION_SPLIT = 0.2       # fraction of the training data held out for validation


In [6]:
# Path to GloVe: the file name encodes the vector width, so it follows EMBEDDING_DIM
glove_path = './glove.6B/glove.6B.{0}d.txt'.format(EMBEDDING_DIM)

# Paths to the train / test CSV files of the toxic comments dataset
train_data_path = './toxic_comments_dataset/train.csv'
test_data_path = './toxic_comments_dataset/test.csv'


In [7]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} lookup.
# Each line of the file is parsed as: the word, followed by its vector components.
print ('loading word2vec...')

word2vec = {}

# glove_path is already a complete path, so it is opened directly
# (os.path.join with a single argument returns it unchanged).
with open(glove_path, encoding='utf8') as fs:
    for line in fs:
        values = line.split()
        if not values:
            # tolerate blank lines (e.g. a stray trailing newline) instead of
            # failing with IndexError on values[0]
            continue
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
print ('number of vectors : {0}'.format(len(word2vec)))

loading word2vec...
number of vectors : 400000


In [8]:
# Read the training and the test CSV files (in that order) into DataFrames.
train_data, test_data = [
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
]

In [9]:
# Quick look at the training frame: first rows, then its (rows, columns) shape.
print(train_data.head(), train_data.shape, sep='\n')

                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
(159571, 8)


In [11]:
# Names of the label columns used as training targets
possible_labels = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# Comment texts as an array; missing comments are replaced by a placeholder string
comment_column = train_data['comment_text']
sentences = comment_column.fillna('DUMMY_VALUES').values

# Label matrix: one row per comment, one column per entry of possible_labels
label_frame = train_data[possible_labels]
targets = label_frame.values

In [12]:

# Convert the comments into integer sequences.
# The tokenizer is created with num_words = MAX_VOCAB_SIZE, fitted on the
# training comments, and then used to encode those same comments.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
    

In [13]:
# Sanity check: container type of the encoded corpus, then the first encoded comment.
print('sequences type', type(sequences))
first_sequence = sequences[0]
print(first_sequence)

sequences type <class 'list'>
[688, 75, 1, 126, 130, 177, 29, 672, 4511, 12052, 1116, 86, 331, 51, 2278, 11448, 50, 6864, 15, 60, 2756, 148, 7, 2937, 34, 117, 1221, 15190, 2825, 4, 45, 59, 244, 1, 365, 31, 1, 38, 27, 143, 73, 3462, 89, 3085, 4583, 2273, 985]


In [14]:
# Length statistics of the encoded comments (helps judge MAX_SEQUENCE_LENGTH).
# The lengths are sorted up front so the middle element can be read off directly.
len_seq = sorted(map(len, sequences))
print('maximum sequence length : {0}'.format(max(len_seq)))
print('minimum sequence length : {0}'.format(min(len_seq)))
# middle position of the sorted lengths (upper middle when the count is even)
idx = len(len_seq) // 2
print ('median sequences length : {0}'.format(len_seq[idx]))

maximum sequence length : 1400
minimum sequence length : 0
median sequences length : 35


In [15]:
# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index
# show the number of distinct words, then the container type
print(len(word_index), type(word_index), sep='\n')

210337
<class 'dict'>


In [16]:
# Bring every encoded comment to the same length (MAX_SEQUENCE_LENGTH) so the
# whole corpus fits into a single 2-D array.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# report the resulting array shape
print('shape of data {0}'.format(data.shape))

shape of data (159571, 200)


In [17]:
# Build the weight matrix for the Embedding layer: row i holds the pre-trained
# GloVe vector of the word whose tokenizer index is i.
print('Filling pre-trained embeddings...')

# +1 because the Keras Tokenizer numbers words from 1, which leaves row 0 unused.
num_words = min(MAX_VOCAB_SIZE, len(word_index) + 1)

# Start from all zeros; rows of words without a GloVe vector stay zero.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i >= num_words:
        # index falls outside the matrix (word is beyond the vocabulary cap)
        continue
    # Look the word up in the GloVe table. The previous code queried word_index
    # here, which wrote the word's integer index into every column of the row
    # instead of its pre-trained vector.
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
        # words which are found will be updated
        embedding_matrix[i] = embedding_vector

# shape of embedding_matrix
print('shape of embedding matrix is {0}'.format(embedding_matrix.shape))

Filling pre-trained embeddings...
shape of embedding matrix is (20000, 100)


In [18]:
# Embedding layer initialised from the pre-built embedding_matrix.
# trainable=False keeps these weights fixed during training.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [23]:
# Progress marker printed before the model graph is assembled.
print('Building the Model...')

Building the Model...


In [24]:
# 1-D convolutional classifier over the embedded comment.
# The input is one padded integer sequence of length MAX_SEQUENCE_LENGTH.
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))

# Layers are listed in the order they are applied: embedding, three
# conv blocks (the first two followed by max-pooling), dropout, global
# max-pooling over the sequence axis, and a dense layer with dropout.
hidden_layers = [
    embedding_layer,
    Conv1D(128, 3, activation='relu'),
    MaxPool1D(3),
    Conv1D(128, 3, activation='relu'),
    MaxPool1D(3),
    Conv1D(128, 3, activation='relu'),
    Dropout(0.3),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.3),
]

# Chain the layers: each one consumes the previous layer's output tensor.
x = input_
for layer in hidden_layers:
    x = layer(x)

# One sigmoid unit per label in possible_labels.
output = Dense(len(possible_labels), activation='sigmoid')(x)


In [25]:
# Wrap the graph from the input tensor to the sigmoid output into a Keras Model.
model = Model(input_, output)

In [26]:
# Configure training: binary cross-entropy loss, RMSprop optimizer,
# accuracy reported as the metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [27]:
print('Training Model...')

# Training options, all taken from the configuration constants.
fit_options = dict(
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
)

# `r` keeps whatever model.fit returns so it can be inspected afterwards.
r = model.fit(data, targets, **fit_options)

Training Model...
Train on 127656 samples, validate on 31915 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
# Peek at the first test rows; left as the cell's last expression so the notebook renders it.
test_data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [30]:
# Encode the test comments with the tokenizer that was fitted on the training
# comments; missing comments get the same placeholder string as in training.
test_comments = test_data['comment_text']
test_data_sentences = test_comments.fillna('DUMMY_VALUES').values
test_data_sequences = tokenizer.texts_to_sequences(test_data_sentences)

In [31]:
# Bring the test sequences to the same fixed length (MAX_SEQUENCE_LENGTH) as the training data.
test_data_feed =  pad_sequences(test_data_sequences, maxlen = MAX_SEQUENCE_LENGTH)

In [32]:
# Model outputs for the test comments; the output layer has one sigmoid unit
# per entry of possible_labels.
predict_test = model.predict(test_data_feed)

In [38]:
# Start from the sample submission file and overwrite its label columns
# with the model's predictions for the test comments.
submission_path = './toxic_comments_dataset/sample_submission.csv'
submission = pd.read_csv(submission_path)
submission[possible_labels] = predict_test

In [42]:
# Write the submission file. The output folder is created first so that
# to_csv does not fail on a fresh checkout where ./submissions is missing.
submission_out_path = './submissions/first_submission.csv'
os.makedirs(os.path.dirname(submission_out_path), exist_ok=True)
submission.to_csv(submission_out_path, index=False)