In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, metrics, linear_model
from nltk import word_tokenize
from nltk.corpus import stopwords
from keras.utils import np_utils
# Stored as a set: the only visible use is membership testing per token,
# which is O(1) on a set instead of a linear scan over the list.
stop_words = set(stopwords.words('english'))

In [2]:
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping

In [31]:
from IPython.display import Markdown, display

def printmd(string):
    """Render ``string`` as Markdown in the notebook's output area."""
    rendered = Markdown(string)
    display(rendered)

In [14]:
from gensim.scripts.glove2word2vec import glove2word2vec

# Rewrite the raw GloVe text file into word2vec text format so it can be read
# back with KeyedVectors.load_word2vec_format. The call is kept as the final
# expression of the cell so that its return value is still displayed.
glove2word2vec(
    word2vec_output_file="gensim_glove_vectors.txt",
    glove_input_file="glove.840B.300d.txt",
)

(2196017, 300)

In [3]:
from gensim.models.keyedvectors import KeyedVectors

# Load the converted vectors; the file is plain text, hence binary=False.
glove_model = KeyedVectors.load_word2vec_format(
    "gensim_glove_vectors.txt",
    binary=False,
)

In [4]:
def sent2vec(s, model=None, tokenizer=None, stopword_set=None):
    """Embed a sentence as the L2-normalised sum of its word vectors.

    The sentence is lower-cased and tokenised; stop words, non-alphabetic
    tokens and tokens missing from the embedding model are dropped. The
    remaining word vectors are summed and scaled to unit length.

    Parameters
    ----------
    s : object
        The sentence; it is converted with ``str(s)`` before processing.
    model : mapping-like, optional
        Word -> vector lookup supporting ``in`` and ``[]``. Defaults to the
        notebook-level ``glove_model``.
    tokenizer : callable, optional
        ``str -> list[str]``. Defaults to ``nltk.word_tokenize``.
    stopword_set : container, optional
        Tokens to ignore. Defaults to the notebook-level ``stop_words``.

    Returns
    -------
    numpy.ndarray
        The unit-length sentence vector, or an all-zero vector when no token
        survives filtering or when the summed vector has zero norm.
    """
    # Defaults are resolved at call time so that the notebook globals may be
    # defined (or re-defined) after this function.
    vectors = glove_model if model is None else model
    tokenize = word_tokenize if tokenizer is None else tokenizer
    ignored = stop_words if stopword_set is None else stopword_set

    tokens = tokenize(str(s).lower())
    # An explicit membership test replaces the former bare ``except:``, which
    # silently hid every error (including KeyboardInterrupt), not only
    # out-of-vocabulary lookups.
    kept = [
        np.asarray(vectors[tok])
        for tok in tokens
        if tok not in ignored and tok.isalpha() and tok in vectors
    ]
    if not kept:
        return np.zeros(getattr(vectors, "vector_size", 300))

    v = np.sum(kept, axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Vectors cancelled out exactly; avoid the 0/0 -> NaN division.
        return np.zeros_like(v)
    return v / norm

In [44]:
# Read the labelled training sentences.
train_data = pd.read_csv("train.csv")
print(f'{train_data.shape} <- train data shape')

# Encode the author names as integer class ids.
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train_data["author"].values)

# Stratified, shuffled 90/10 train/validation split with a fixed seed.
split_kwargs = dict(test_size=0.1, shuffle=True, stratify=y, random_state=42)
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train_data["text"].values, y, **split_kwargs
)

(19579, 3) <- train data shape


In [6]:
# One sentence vector per text, for each split.
xtrain_glove = list(map(sent2vec, xtrain))
xvalid_glove = list(map(sent2vec, xvalid))

In [7]:
# Stack the per-sentence vectors of each split into a single array.
xtrain_glove, xvalid_glove = (
    np.array(vectors) for vectors in (xtrain_glove, xvalid_glove)
)

In [9]:
# basic normalization of subtracting mean & scaling to unit variance
scl = preprocessing.StandardScaler()
# The scaler is fitted on the training split only; the validation split is
# transformed with the training statistics.
xtrain_glove_scl = scl.fit_transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

In [25]:
# One-hot encode the integer labels (the form used with categorical cross-entropy).
ytrain_enc, yvalid_enc = (
    np_utils.to_categorical(labels) for labels in (ytrain, yvalid)
)

# Show a few label -> one-hot examples.
for sample_idx in (0, 3, 1):
    print(f'{ytrain[sample_idx]} binarized as -> {ytrain_enc[sample_idx]}')

2 binarized as -> [ 0.  0.  1.]
1 binarized as -> [ 0.  1.  0.]
0 binarized as -> [ 1.  0.  0.]


In [43]:
# Keras tokenizer with no cap on the vocabulary size.
max_len = 70
token = text.Tokenizer(num_words=None)

# NOTE(review): the vocabulary is fitted on the training AND validation texts.
token.fit_on_texts([*xtrain, *xvalid])
xtrain_seq, xvalid_seq = (
    token.texts_to_sequences(split) for split in (xtrain, xvalid)
)

# Pad every sequence with zeros to exactly max_len entries.
xtrain_pad, xvalid_pad = (
    sequence.pad_sequences(seqs, maxlen=max_len)
    for seqs in (xtrain_seq, xvalid_seq)
)

word_index = token.word_index

# Walk through one example: raw text -> index sequence -> padded sequence.
print(f'{xtrain[0]}')
printmd('<span style="color:brown"> sequenced as = </span>')
print(f'{xtrain_seq[0]}')
printmd('<span style="color:violet"> padded as = </span>')
print(f'{xtrain_pad[0]}\n')

printmd('word_to_ix created for **' + str(len(word_index)) + '** words')

Her hair was the brightest living gold, and despite the poverty of her clothing, seemed to set a crown of distinction on her head.


<span style="color:brown"> sequenced as = </span>

[29, 560, 8, 1, 5924, 459, 714, 3, 987, 1, 1794, 2, 29, 3695, 98, 4, 326, 5, 2545, 2, 3103, 27, 29, 166]


<span style="color:violet"> padded as = </span>

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0   29  560    8    1 5924  459  714    3  987    1 1794    2   29 3695
   98    4  326    5 2545    2 3103   27   29  166]



word_to_ix created for **25943** words

In [53]:
# create an embedding matrix for the words we have in the dataset
# One row per tokenizer index plus one extra row; rows of words that are
# missing from the GloVe model stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    # The membership test is sufficient: a word that is present always yields
    # a vector, so no additional None check is needed.
    if word in glove_model:
        embedding_matrix[i] = glove_model[word]

print(f'embedding matrix shape -> {embedding_matrix.shape}')
print(f'example: word \'brightest\' has index of -> {word_index["brightest"]}')
printmd('<span style="color:#f49542"> stored as = </span>')
print(f'{embedding_matrix[word_index["brightest"]]}')

embedding matrix shape -> (25944, 300)
example: word 'brightest' has index of -> 5924


<span style="color:#f49542"> stored as = </span>

[  3.98229994e-02  -2.04089999e-01   3.91770005e-01  -1.72519997e-01
   3.56979996e-01  -4.32429999e-01  -8.56499970e-02   6.00610018e-01
  -2.54280001e-01   1.33080006e+00   5.33129990e-01  -2.30560005e-01
  -1.89569995e-01  -3.94380018e-02   5.91380009e-03  -2.78990000e-01
   1.84570000e-01   3.42770010e-01   2.64090002e-01  -9.01220024e-01
   1.54640004e-01  -3.89690012e-01  -2.58080006e-01   1.90899998e-01
   9.39050019e-02   3.23870003e-01  -5.19240022e-01   3.02240014e-01
  -2.69179996e-02   3.67590010e-01   8.36490020e-02   2.77469993e-01
  -2.53490001e-01   2.40429994e-02  -4.54030007e-01   2.16340005e-01
  -1.26560003e-01  -5.38709983e-02   4.70470011e-01   1.88040003e-01
  -1.96160004e-01   1.89950004e-01  -9.57890004e-02   4.27300006e-01
  -4.40200008e-02  -5.31979978e-01   6.56029999e-01   1.65560007e-01
   6.88600019e-02  -1.28020000e+00   1.78409994e-01   5.35510004e-01
  -4.75500003e-02  -9.73879993e-02   1.29590005e-01  -4.27089989e-01
   1.23779997e-01   4.21169996e-01

In [54]:
# A simple LSTM with glove embeddings and two dense layers
lstm_layers = [
    # Frozen embedding layer initialised from the pre-computed embedding matrix.
    Embedding(
        len(word_index) + 1,
        300,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    ),
    SpatialDropout1D(0.3),
    LSTM(100, dropout=0.3, recurrent_dropout=0.3),
    # Two dense blocks, each followed by dropout.
    Dense(1024, activation='relu'),
    Dropout(0.8),
    Dense(1024, activation='relu'),
    Dropout(0.8),
    # Three-way softmax output.
    Dense(3),
    Activation('softmax'),
]
model = Sequential(lstm_layers)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [55]:
# Train the LSTM model, evaluating on the held-out validation split each epoch.
model.fit(
    xtrain_pad,
    y=ytrain_enc,
    validation_data=(xvalid_pad, yvalid_enc),
    epochs=40,
    batch_size=128,
    verbose=1,
)

Train on 17621 samples, validate on 1958 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f5637c46fd0>

In [80]:
from keras import Input, Model

# 1D-convolutional classifier on top of the frozen GloVe embeddings.
embedding_layer = Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False)

# The input holds integer token indices, so it is declared as int32 rather
# than float.
sequence_input = Input(shape=(max_len,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(512, 15, activation='relu')(embedded_sequences)
x = MaxPooling1D(1)(x)  # pool size 1: kept to preserve the original architecture
x = Conv1D(512, 15, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(512, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)  # local max pooling with window 3 (not a global pooling)
x = Flatten()(x)
x = Dense(512, activation='relu')(x)
preds = Dense(3, activation='softmax')(x)

# NOTE: this rebinds `model`, replacing the LSTM model built earlier.
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])


model.fit(xtrain_pad, y=ytrain_enc, batch_size=128, epochs=10, verbose=1, validation_data=(xvalid_pad, yvalid_enc))

Train on 17621 samples, validate on 1958 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5635d9bb00>