In [72]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np

### Get Training data

In [73]:
# Integer class id for each author's initials; ids start at 1, so 0 is never used.
classes = {"EAP": 1, "HPL": 2, "MWS": 3}

# Load the training set and swap the author initials for their class ids.
data = pd.read_csv('./train.csv').replace({"author": classes})

In [74]:
# Peek at the first rows to check that `author` now holds the integer class ids.
data.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",1
1,id17569,It never once occurred to me that the fumbling...,2
2,id11008,"In his left hand was a gold snuff box, from wh...",1
3,id27763,How lovely is spring As we looked from Windsor...,3
4,id12958,"Finding nothing else, not even gold, the Super...",2


In [75]:
# Split the frame into model inputs (the raw sentences) and targets
# (the author column, already mapped to integer class ids).
text = data["text"]
raw_labels = data["author"]

In [76]:
# One-hot encode the integer class ids.
raw_labels = np.array(raw_labels)
(m,) = raw_labels.shape  # number of training samples; also enforces a 1-D array

# Row k of the 4x4 identity matrix is the one-hot vector for class id k.
# Four columns are needed because the ids run 1..3 and index 0 is unused.
labels = np.eye(4)[raw_labels]

# Drop the unused column 0 so the three remaining columns are ids 1, 2, 3.
labels = labels[:, 1:]

### Tokenise data

In [77]:
#t = Tokenizer()

In [78]:
%%time
# Fit a word-level tokenizer on the training sentences.
# Per the Keras Tokenizer docs, `num_words` caps the vocabulary used when texts
# are later converted to sequences, keeping only the most frequent words.
num_words=10000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(text)
# NOTE(review): in the visible cells `word_index` is only read by the
# commented-out pre-trained-embedding code.
word_index = tokenizer.word_index

CPU times: user 665 ms, sys: 10.4 ms, total: 676 ms
Wall time: 675 ms


In [79]:
# Convert every training sentence into its sequence of integer token ids.
text_tokens = tokenizer.texts_to_sequences(text)

In [80]:
# Token count of every training sentence.
num_tokens = np.array([len(seq) for seq in text_tokens])

# Cap the sequence length at mean + 2 standard deviations (truncated to an int)
# so a few very long sentences do not dictate the padded width.
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())

# Share of sentences strictly shorter than the cap. Sentences of exactly
# `max_tokens` tokens are not truncated by pad_sequences either, but the strict
# comparison leaves them out of this figure.
print((num_tokens < max_tokens).sum() / num_tokens.size)

# Pad and truncate at the end of each sequence.
pad = 'post'

# Fixed-width training matrix: one row per sentence, `max_tokens` columns.
text_pad = pad_sequences(
    text_tokens,
    maxlen=max_tokens,
    padding=pad,
    truncating=pad,
)


0.9654732110935186


### Prepare Embedding layer

In [35]:
# embeddings_index = {}
# embedding_size = 50
# f = open('./glove.6B.50d.txt')
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
# f.close()

# print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [36]:
# embedding_matrix = np.zeros((len(word_index) + 1, embedding_size))
# for word, i in word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # words not found in embedding index will be all-zeros.
#         embedding_matrix[i] = embedding_vector

### Create Model

In [97]:
# Disabled alternative for the first layer: a frozen embedding initialised
# from a pre-computed `embedding_matrix`.
# Embedding(input_dim=len(word_index) + 1,
#           output_dim=embedding_size,
#           weights=[embedding_matrix],
#           input_length=max_tokens,
#           trainable=False,
#           name='embedding_layer')

# Layers are listed in the order they are stacked.
model = Sequential([
    # Token ids -> 8-dimensional vectors learned during training.
    Embedding(input_dim=num_words,
              output_dim=8,
              input_length=max_tokens,
              name='embedding_layer'),
    # First recurrent layer emits the full sequence so the next one can consume it.
    Bidirectional(LSTM(units=16,
                       return_sequences=True,
                       dropout=0.1,
                       recurrent_dropout=0.1)),
    # Second recurrent layer emits a single vector per sentence.
    Bidirectional(LSTM(units=8, return_sequences=False)),
    Dropout(0.1),
    # One softmax probability per author class.
    Dense(3, activation='softmax'),
])

# Categorical cross-entropy matches the one-hot label encoding.
optimizer = Adam(lr=0.01)
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [98]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 61, 8)             80000     
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 61, 32)            3200      
_________________________________________________________________
bidirectional_14 (Bidirectio (None, 16)                2624      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 51        
Total params: 85,875
Trainable params: 85,875
Non-trainable params: 0
_________________________________________________________________


### Start Training

In [99]:
# Train on the whole padded training set for 10 epochs in mini-batches of 64.
# No validation data is passed, so only training metrics are reported.
model.fit(text_pad, labels, epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x14b41ba90>

### Save the model

In [100]:
# Persist the trained model as two files in the working directory:
# the architecture as JSON and the weights as HDF5.
# The JSON file is written to "./model.json", next to "./model.h5", and not
# to a hidden dot-file.
model_json = model.to_json()
with open("./model.json", "w") as json_file:
    json_file.write(model_json)

# Weights are stored separately from the architecture.
model.save_weights("./model.h5")
print("Saved model to disk")

Saved model to disk


### Create submission file for the test data

In [101]:
# Load the test set for which predictions will be submitted.
test_data = pd.read_csv('./test.csv')

In [102]:
# Keep only the raw sentences. This rebinds `test_data` to the text column, so
# re-running this cell alone (without reloading the CSV) indexes the extracted
# column again. NOTE(review): that will most likely raise a KeyError.
test_data = test_data["text"]

In [103]:
# Reuse the tokenizer fitted on the training text so test sentences are mapped
# to the same token ids.
test_tokens = tokenizer.texts_to_sequences(test_data)

In [104]:
# Pad/truncate to the same length and with the same mode as the training data,
# so the input width matches what the model was built for.
test_pad = pad_sequences(test_tokens, maxlen=max_tokens,
                            padding=pad, truncating=pad)

In [105]:
# Run inference on the padded test set; each row of `pred` holds the softmax
# outputs of the final 3-unit layer.
pred = model.predict(test_pad, batch_size=1024, verbose=1)



In [106]:
# Sanity check: one row per test sentence, one column per class.
pred.shape

(8392, 3)

In [107]:
# Use the provided sample submission as a template for the output file, and
# display its shape for comparison with `pred`.
sample_submission = pd.read_csv('./sample_submission.csv')
sample_submission.shape

(8392, 4)

In [108]:
# Names of the submission's probability columns. This rebinds `classes`, which
# the data-loading cell defined as the initials -> class-id dict.
classes = ["EAP", "HPL", "MWS"]

In [109]:
# Write the predicted probabilities into the template. The columns of `pred`
# follow the training label order (class ids 1, 2, 3 = EAP, HPL, MWS), so they
# line up with these column names.
sample_submission[classes] = pred

In [110]:
# The shape should be unchanged: the assignment overwrote existing columns
# rather than adding new ones.
sample_submission.shape

(8392, 4)

In [111]:
# Write the submission file; index=False keeps the DataFrame's row index out of
# the CSV.
sample_submission.to_csv('./submission.csv', index=False)