In [None]:
import sys, os, re, csv, codecs, io, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, model_from_json
from keras import initializers, regularizers, constraints, optimizers, layers

### Read Training Data

In [None]:
# Directory that holds the CSV files used by this notebook.
BASE_DIR = "./data"

# Read the training CSV first, then the test CSV, from BASE_DIR.
train, test = (
    pd.read_csv(os.path.join(BASE_DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# DataFrame.size is the total number of cells (rows x columns).
print(train.size)

In [None]:
# Preview the first rows of the training frame (shown as this cell's output).
train.head()

In [None]:
# Replace missing comments with a single space in both data sets.
for frame in (train, test):
    frame["comment_text"] = frame["comment_text"].fillna(" ")

In [None]:
# Names of the six target columns in the training frame.
classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Target matrix: the values of the six class columns for every training row.
labels = train[classes].values

# Show one example comment (entry 6) followed by its label row.
print(train["comment_text"][6], labels[6], sep='\n')

### Preprocess data

In [None]:
# spaCy is used below but is not imported in the notebook's import cell, so on
# a fresh kernel this cell would fail with NameError; import it here.
import spacy

# Load the 'en' pipeline with the parser, NER and text-categoriser components
# disabled; they are not needed for the token attributes used in preprocessing.
nlp = spacy.load('en', disable=['parser', 'ner', 'textcat'])

In [None]:
def reduce_to_double_max(text):
    """Collapse exaggerated character repetition in ``text``.

    Two substitutions are applied, in this order:
        1. A word character repeated three or more times in a row is
           shortened to exactly two occurrences ("soooo" -> "soo").
        2. A non-word character (punctuation, whitespace, ...) repeated two
           or more times in a row is shortened to a single one ("!!!" -> "!").

    Returns the normalised string.
    """
    word_runs_capped = re.sub(r'(\w)\1{2,}', r'\1\1', text)
    symbol_runs_collapsed = re.sub(r'(\W)\1+', r'\1', word_runs_capped)
    return symbol_runs_collapsed

In [None]:
def preprocess_corpus(corpus):
    """Normalise every text in ``corpus`` and return lemmatised strings.

    Each text is lower-cased and passed through ``reduce_to_double_max``,
    then streamed through the module-level ``nlp`` pipeline.  For every
    document the pipeline yields, the lemmas of its alphabetic tokens are
    joined with single spaces.

    Returns a list with one string per document yielded by the pipeline.
    """
    normalised = map(lambda raw: reduce_to_double_max(raw.lower()), corpus)
    lemmatised = []
    for doc in nlp.pipe(normalised, batch_size=1000, n_threads=4):
        lemmas = [token.lemma_ for token in doc if token.is_alpha]
        lemmatised.append(' '.join(lemmas))
    return lemmatised

In [None]:
# Cache file holding one preprocessed training comment per line.
fname_train_processed = './data/train_processed.txt'

if os.path.isfile(fname_train_processed):
    # Reuse the cached result. Every line is kept, including empty ones
    # (comments that produced no alphabetic tokens), so that the rows stay
    # aligned with `train`. UTF-8 is given explicitly so the cache does not
    # depend on the platform's default encoding.
    with open(fname_train_processed, 'r', encoding='utf-8') as fin:
        train_processed = [line.strip() for line in fin]

else:
    # No cache yet: preprocess the raw comments and store the result.
    train_processed = preprocess_corpus(train['comment_text'])

    with open(fname_train_processed, 'w', encoding='utf-8') as fout:
        for doc in train_processed:
            fout.write('{}\n'.format(doc))

# Fail with an actionable message if the cache no longer matches the data
# (for example a stale or partially written file).
if len(train_processed) != len(train):
    raise ValueError(
        'Found {} preprocessed comments for {} training rows; delete {} '
        'and re-run this cell to rebuild the cache.'.format(
            len(train_processed), len(train), fname_train_processed))

train['comment_text_processed'] = train_processed

In [None]:
# Cache file holding one preprocessed test comment per line.
fname_test_processed = './data/test_processed.txt'

if os.path.isfile(fname_test_processed):
    # Reuse the cached result. Every line is kept, including empty ones
    # (comments that produced no alphabetic tokens), so that the rows stay
    # aligned with `test`. UTF-8 is given explicitly so the cache does not
    # depend on the platform's default encoding.
    with open(fname_test_processed, 'r', encoding='utf-8') as fin:
        test_processed = [line.strip() for line in fin]

else:
    # No cache yet: preprocess the raw comments and store the result.
    test_processed = preprocess_corpus(test['comment_text'])

    with open(fname_test_processed, 'w', encoding='utf-8') as fout:
        for doc in test_processed:
            fout.write('{}\n'.format(doc))

# Fail with an actionable message if the cache no longer matches the data
# (for example a stale or partially written file).
if len(test_processed) != len(test):
    raise ValueError(
        'Found {} preprocessed comments for {} test rows; delete {} '
        'and re-run this cell to rebuild the cache.'.format(
            len(test_processed), len(test), fname_test_processed))

test['comment_text_processed'] = test_processed

### Tokenise the data

In [None]:
# Length every encoded comment is brought to by pad_sequences below.
max_length = 100

# Fit the word -> integer index mapping on the training comments only.
t = Tokenizer()
t.fit_on_texts(train['comment_text_processed'])

# One more than the number of distinct words the tokenizer indexed.
vocab_size = len(t.word_index) + 1

# Training data: integer-encode, then pad at the end (padding='post').
encoded_docs = t.texts_to_sequences(train['comment_text_processed'])
padded_docs = pad_sequences(encoded_docs, padding='post', maxlen=max_length)

# Test data: encoded with the same fitted tokenizer and padded the same way.
test_encoded_docs = t.texts_to_sequences(test['comment_text_processed'])
test_padded_docs = pad_sequences(test_encoded_docs, padding='post', maxlen=max_length)

### Load the word embeddings

In [None]:
# Path of the pretrained word-vector file read when building the embedding
# index (the file name indicates GloVe 6B, 50-dimensional vectors).
EMBEDDING_PATH = "./data/glove.6B.50d.txt"

In [None]:
# Maps each word in the embeddings file to its float32 coefficient vector.
embeddings_index = {}

# Open through a context manager so the handle is closed even if parsing
# raises, and decode as UTF-8 explicitly instead of relying on the platform's
# default encoding.
with open(EMBEDDING_PATH, encoding='utf-8') as f:
    for line in f:
        # Each line: the word first, then its whitespace-separated coefficients.
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

In [None]:
# Build the embedding weight matrix: row i receives the pretrained vector of
# the word the tokenizer mapped to index i. Rows for words that have no
# pretrained vector are left as zeros.
embedding_matrix = np.zeros((vocab_size, 50))
for word, i in t.word_index.items():
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

In [None]:
# Show the vocabulary size used for the embedding matrix and Embedding layer.
print(vocab_size)

### Define the Model

In [None]:
model = Sequential()

# Embedding layer: outputs the word vector for each token index in a comment.
# The vector width and the sequence length are taken from embedding_matrix and
# max_length instead of being repeated as literals, so they cannot drift from
# the tokenisation and embedding cells. The pretrained weights are frozen.
model.add(Embedding(vocab_size,
                    embedding_matrix.shape[1],
                    weights=[embedding_matrix],
                    input_length=max_length,
                    trainable=False))

# Bidirectional LSTM that returns only its final output (no sequence), then a
# small dense layer with light dropout.
model.add(Bidirectional(LSTM(units=50, return_sequences=False, dropout=0.1, recurrent_dropout=0.1)))
model.add(Dense(50, activation="relu"))
model.add(Dropout(0.1))

# One sigmoid output per target class.
model.add(Dense(6, activation='sigmoid'))

optimizer = optimizers.Adam(lr=0.001)
model.compile(loss='binary_crossentropy',
            optimizer=optimizer,
            metrics=['accuracy'])

### Start Training

In [None]:
# Print the layer-by-layer summary of the compiled model before training.
model.summary()

In [None]:
# Train on the padded training sequences against the six-column label matrix.
model.fit(x=padded_docs, y=labels, batch_size=32, epochs=2)

In [None]:
# Shapes of the training inputs and of the label matrix, one per line.
print(padded_docs.shape, labels.shape, sep='\n')

### Create the submission.csv file

In [None]:
# Model outputs for the padded test sequences.
y_test = model.predict(test_padded_docs, batch_size=1024, verbose=1)

# Fill the class columns of the sample submission with the predictions and
# write the result. Paths are built from BASE_DIR, like the other CSV reads in
# this notebook, so the file written here is the one the final cell reads back.
sample_submission = pd.read_csv(os.path.join(BASE_DIR, 'sample_submission.csv'))
sample_submission[classes] = y_test
sample_submission.to_csv(os.path.join(BASE_DIR, 'submission.csv'), index=False)

### Save the model

In [None]:
# serialize model to JSON
model_json = model.to_json()
with open("./trainedModel/model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("./trainedModel/model.h5")
print("Saved model to disk")

In [None]:
# Read the submission file back from BASE_DIR and preview its first rows as a
# quick sanity check of what was written.
submission = pd.read_csv(os.path.join(BASE_DIR, 'submission.csv'))
submission.head()