In [1]:
import csv

import numpy as np
import spacy
from keras import backend as K
from keras.callbacks import TensorBoard
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Dense, Input, GlobalMaxPooling1D, LSTM, SpatialDropout1D
from keras.models import Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

Using TensorFlow backend.


In [3]:
# Load the spaCy pipeline; later cells call it on individual vocabulary words
# to obtain the vector stored in the embedding matrix.
nlp = spacy.load('en_core_web_sm')

In [4]:
# Read the tab-separated training file into a 2-D array of strings.
# Row 0 is the header; every following row is one sample.
path = "data/train.tsv"
# newline='' is what the csv module asks for on files passed to csv.reader:
# it lets the reader do its own line-ending handling instead of relying on
# text-mode newline translation.
with open(path, 'rt', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    data = np.array(list(reader))

In [9]:
# Peek at the first row of the loaded file (the column header).
data[0]

array(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='<U283')

In [5]:
# Skip the header row (index 0). Column 2 holds the phrase text and
# column 3 the sentiment label.
xtrain = data[1:, 2]
ytrain = data[1:, 3]
# One-hot encode the labels so they can be used with a categorical loss.
ytrain_categorical = to_categorical(ytrain)

In [6]:
# Vocabulary cap handed to the Keras tokenizer.
MAX_NUM_WORDS = 16000

# Learn the word index from the training phrases, then encode every phrase
# as a list of integer word ids.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(xtrain)
sequences = tokenizer.texts_to_sequences(xtrain)

In [134]:
# Pad every encoded phrase (zeros appended at the end) to one fixed length.
# NOTE(review): the length is the *character* count of the longest raw phrase,
# not the token count of the longest encoded sequence, so the padded input is
# presumably much wider than needed. Left unchanged here because the layer
# stack defined later is built against this length -- confirm before shrinking.
MAX_SEQUENCE_LENGTH = max(len(phrase) for phrase in xtrain)
xtrain_padded = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="post",
)

In [151]:
# Word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index
# Width of the vectors the loaded spaCy pipeline actually returns. It is
# measured with a throwaway lookup instead of being hard-coded to 300: the
# embedding matrix rows are filled from `nlp(word).vector`, so any mismatch
# between this constant and the real vector width makes that assignment fail
# (not every spaCy pipeline yields 300-dimensional vectors).
EMBEDDING_DIM = int(nlp("the").vector.shape[0])

In [159]:
# Build the weight matrix for the Embedding layer: row `idx` holds the spaCy
# vector of the word whose tokenizer id is `idx`. Rows that are never
# assigned (including row 0) stay all-zero.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for term, idx in word_index.items():
    # Only ids below the vocabulary cap get a row.
    if idx < MAX_NUM_WORDS:
        # Calling the pipeline on a single word yields a Doc; its `.vector`
        # is what gets stored for that word.
        vec = nlp(term).vector
        if vec is not None:
            embedding_matrix[idx] = vec

In [234]:
# LSTM classifier on top of the frozen spaCy embeddings.
# NOTE(review): the next cell rebinds `model` to a CNN before anything is
# compiled, so when the notebook is run top to bottom this network is unused.
K.clear_session()
model = Sequential()
# Pre-computed embedding weights, kept fixed during training.
model.add(Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(
    128,
    recurrent_activation='hard_sigmoid',
    use_bias=True,
    kernel_initializer='glorot_uniform',
    recurrent_initializer='orthogonal',
    bias_initializer='zeros',
    dropout=0.2,
    recurrent_dropout=0.2,
))
# (An intermediate Dense(128, relu) layer was tried here and left disabled.)
# Five-way softmax output, matching the one-hot encoded labels.
model.add(Dense(5, activation="softmax"))


In [238]:
# 1-D convolutional classifier on top of the frozen spaCy embeddings:
# three Conv1D stages (kernel size 5) with max-pooling in between, a global
# max-pool, then a small dense head with a five-way softmax output.
K.clear_session()
model = Sequential()
# Pre-computed embedding weights, kept fixed during training.
model.add(Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
))
# NOTE(review): 265 filters looks like it may have been meant as 256 -- confirm.
model.add(Conv1D(265, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(64, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(32, activation='relu'))
model.add(Dense(5, activation="softmax"))


In [239]:
# Multi-class setup: categorical cross-entropy against the one-hot labels,
# Adam optimizer, accuracy reported during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [240]:
# TensorBoard callback; its event files are written under ./output.
tboard = TensorBoard(
    log_dir='./output',
    histogram_freq=5,
    write_graph=True,
    write_images=True,
)

In [241]:
# Train for two epochs, holding out the last 20% of the samples for validation
# and logging to TensorBoard.
model.fit(
    xtrain_padded,
    ytrain_categorical,
    epochs=2,
    batch_size=1000,
    validation_split=0.2,
    verbose=1,
    callbacks=[tboard],
)

Train on 124848 samples, validate on 31212 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x173eb8cfef0>

In [242]:
# Persist the trained model to 'kaggle01.h5' in the working directory.
model.save('kaggle01.h5')

In [None]:
# Restore the model written by the save cell. Keras models have no `load`
# method; a saved file is read back with `keras.models.load_model`, which
# returns a new model instance that must be bound to a name.
model = load_model('kaggle01.h5')

In [244]:
# Read the tab-separated test file the same way as the training file.
# Note that this rebinds `path` and `data`, replacing the training values.
path = "data/test.tsv"
# newline='' is what the csv module asks for on files passed to csv.reader:
# it lets the reader do its own line-ending handling instead of relying on
# text-mode newline translation.
with open(path, 'rt', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    data = np.array(list(reader))
# Skip the header row and keep column 2, the same column used for the
# training phrases.
xtest = data[1:, 2]

In [246]:
MAX_NUM_WORDS = 16000

# Encode the test phrases with the tokenizer that was fitted on the training
# phrases. Do NOT re-fit it on the test set: the model's inputs must use the
# same word ids it was trained with.
sequences = tokenizer.texts_to_sequences(xtest)

# Pad to the training-time MAX_SEQUENCE_LENGTH (not a length recomputed from
# the test set) so the input width matches what the model was built for.
xtest_padded = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="post",
)

In [247]:
# Run the model on every padded test phrase; the final layer is a five-way
# softmax, so each prediction is a vector of five scores.
preds = model.predict(xtest_padded)

In [248]:
# Inspect the predicted scores for a single test phrase (index 9).
preds[9]

array([0.00128725, 0.03170123, 0.89330286, 0.07119197, 0.00251666],
      dtype=float32)

In [249]:
import pickle

In [250]:
# Serialize the prediction array to the file 'PREDS' in the working directory.
# To read it back later:
#     with open('PREDS', 'rb') as fh:
#         preds = pickle.load(fh)
with open('PREDS', 'wb') as fh:
    pickle.dump(preds, fh)