In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
kaggle_train = pd.read_csv('train.csv')
kaggle_test = pd.read_csv('test.csv')
OUTPUT = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [3]:
len(kaggle_train)

159571

In [4]:
from sklearn.utils import shuffle
kaggle_train = shuffle(kaggle_train)

In [5]:
X_train = kaggle_train['comment_text']
y_train = kaggle_train[OUTPUT]

In [6]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
split_text = []
max_len = 0
for line in X_train:
    split_line = text_to_word_sequence(line)
    max_len = max(max_len, len(split_line))
    split_text.extend(split_line)
unique_word_set = set(split_text)

print('Max comment length:', max_len)
print('No. of unique words:', len(unique_word_set))

Max comment length: 1403
No. of unique words: 210337


In [7]:
tokenizer = Tokenizer(num_words=35000)
tokenizer.fit_on_texts(X_train)
tX_train = tokenizer.texts_to_sequences(X_train)
padded_X_train = pad_sequences(tX_train, maxlen = 1500)
word_index = tokenizer.word_index

#Print word_index
word_index

{'the': 1,
 'to': 2,
 'of': 3,
 'and': 4,
 'a': 5,
 'you': 6,
 'i': 7,
 'is': 8,
 'that': 9,
 'in': 10,
 'it': 11,
 'for': 12,
 'this': 13,
 'not': 14,
 'on': 15,
 'be': 16,
 'as': 17,
 'have': 18,
 'are': 19,
 'your': 20,
 'with': 21,
 'if': 22,
 'article': 23,
 'was': 24,
 'or': 25,
 'but': 26,
 'page': 27,
 'wikipedia': 28,
 'my': 29,
 'an': 30,
 'from': 31,
 'by': 32,
 'do': 33,
 'at': 34,
 'me': 35,
 'about': 36,
 'so': 37,
 'talk': 38,
 'what': 39,
 'can': 40,
 'there': 41,
 'all': 42,
 'has': 43,
 'will': 44,
 'please': 45,
 'no': 46,
 'would': 47,
 'one': 48,
 'like': 49,
 'just': 50,
 'they': 51,
 'he': 52,
 'which': 53,
 'any': 54,
 'been': 55,
 'should': 56,
 'more': 57,
 'we': 58,
 "don't": 59,
 'some': 60,
 'other': 61,
 'who': 62,
 'here': 63,
 'see': 64,
 'also': 65,
 'his': 66,
 'think': 67,
 'because': 68,
 'know': 69,
 'how': 70,
 'edit': 71,
 'am': 72,
 "i'm": 73,
 'people': 74,
 'why': 75,
 'up': 76,
 'only': 77,
 "it's": 78,
 'out': 79,
 'articles': 80,
 'use': 81,

In [8]:
vocab_size = len(word_index)
vocab_size

210337

In [9]:
dim = 300
e_index = {}
f = open('wiki-news-300d-1M.vec', encoding='utf-8')
for line in f:
    text = line.rstrip().rsplit(' ', dim)
    word = text[0]
    coefs = np.asarray(text[1:], dtype='float32')
    e_index[word] = coefs
f.close()
print('Found word vectors: ', len(e_index))

Found word vectors:  999995


In [10]:
e_matrix = np.zeros((len(word_index) + 1, dim))
for word, i in word_index.items():
    vector = e_index.get(word)
    if vector is not None:
        e_matrix[i] = vector
        
len(e_matrix)
e_matrix.shape

(210338, 300)

In [11]:
import h5py
with h5py.File('embedding-2.h5', 'w') as hf:
    hf.create_dataset('fasttext', data=e_matrix)

#### READING FROM THE embedding-2.h5 files:

In [13]:
import h5py
with h5py.File('embedding-2.h5', 'r') as hf:
    mat = hf['fasttext'][:]
mat.shape

(210338, 300)

In [15]:
import keras.backend
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D
from keras.layers import Dropout, GlobalMaxPooling1D, BatchNormalization
from keras.layers.embeddings import Embedding

model = Sequential()
dim = 300
model.add(Embedding(vocab_size + 1, dim, weights=[e_matrix], input_length=1500, trainable=True))

# CNN LAYERS
model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(3))
model.add(GlobalMaxPooling1D())
model.add(BatchNormalization())

# FULLY CONNECTED LAYERS
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(6, activation='sigmoid'))

model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1500, 300)         63101400  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1500, 128)         192128    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 500, 128)          0         
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 128)               512       
_________________________________________________________________
dense_1 (Dense)              (None, 50)                6450      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)              

In [16]:
from sklearn.model_selection import train_test_split
[X, x_test_data, y, y_test_data] = train_test_split(padded_X_train, y_train, test_size=0.2, shuffle=False)
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
model.fit(X, y, batch_size = 128, epochs = 2, validation_data = (x_test_data, y_test_data), verbose = 1, shuffle=False)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 127656 samples, validate on 31915 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x267680657c8>

In [18]:
new_input = ['I am doing good']
new_input = tokenizer.texts_to_sequences(new_input)
new_input = pad_sequences(new_input, maxlen = 1500)
prediction = model.predict(new_input)

print('Toxic:         {:.0%}'.format(prediction[0][0]))
print('Severe Toxic:  {:.0%}'.format(prediction[0][1]))
print('Obscene:       {:.0%}'.format(prediction[0][2]))
print('Threat:        {:.0%}'.format(prediction[0][3]))
print('Insult:        {:.0%}'.format(prediction[0][4]))
print('Identity Hate: {:.0%}'.format(prediction[0][5]))

Toxic:         0%
Severe Toxic:  0%
Obscene:       0%
Threat:        0%
Insult:        0%
Identity Hate: 0%


In [19]:
def toxicity_level(string):
    new_input = [string]
    new_input = tokenizer.texts_to_sequences(new_input)
    new_input = pad_sequences(new_input, maxlen = 1500)
    prediction = model.predict(new_input)

    print('Toxic:         {:.0%}'.format(prediction[0][0]))
    print('Severe Toxic:  {:.0%}'.format(prediction[0][1]))
    print('Obscene:       {:.0%}'.format(prediction[0][2]))
    print('Threat:        {:.0%}'.format(prediction[0][3]))
    print('Insult:        {:.0%}'.format(prediction[0][4]))
    print('Identity Hate: {:.0%}'.format(prediction[0][5]))

In [24]:
toxicity_level('You are wise')

Toxic:         11%
Severe Toxic:  0%
Obscene:       0%
Threat:        0%
Insult:        2%
Identity Hate: 0%


In [25]:
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
# # loading
# with open('tokenizer.pickle', 'rb') as handle:
#     tokenizer = pickle.load(handle)

In [26]:
model.save('second_iter.hdf5')