In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

import sys
sys.path.append('../')
from utils import preprocess, Tokenizer4keras, create_fastText_model

import keras
from keras.layers import Dense, Embedding, LSTM, Bidirectional
import keras.backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential
from keras.optimizers import RMSprop
from sklearn.metrics import log_loss
from keras.utils import to_categorical
from sklearn.model_selection import KFold
# Seed NumPy's global random generator with a fixed value so that
# NumPy-driven randomness in this notebook is repeatable between runs.
np.random.seed(129)

Using TensorFlow backend.


In [2]:
# Read the train/test feature tables and pull out the raw sentences.
df_train = pd.read_csv('./../data/train_feature.csv')
df_test = pd.read_csv('./../data/test_feature.csv')
df_train_texts = df_train['text'].values
df_test_texts = df_test['text'].values

# Author initials <-> integer class id (the list position is the class id).
class2author = ['EAP', 'HPL', 'MWS']
author2class = {author: class_id for class_id, author in enumerate(class2author)}

# One-hot encode the training labels for categorical cross-entropy.
label_ids = np.array([author2class[author] for author in df_train['author']])
y = to_categorical(label_ids)

In [3]:
# Tokenizer settings.
maxlen = 64
min_count = 2
# Layer widths passed to create_model.
embedding_dim = 32
reccurent_dim = 32

tokenizer = Tokenizer4keras(
    maxlen=maxlen,
    min_count=min_count,
    n_gram_max=1,
    lower=True,
    single=False,
    add_ngram_first=True,
)

# Fit on the training sentences only, then encode the test sentences with
# the fitted tokenizer.
x = tokenizer.fit_transform(df_texts=df_train_texts)
# NOTE(review): `transofrm` looks like a misspelling of `transform`; it
# presumably matches the method name defined in utils.Tokenizer4keras --
# confirm there before renaming.
x_test = tokenizer.transofrm(df_test_texts)

# The embedding table needs one row per token id, i.e. the largest id plus one.
input_dim = np.max(x) + 1

In [4]:
def create_model(input_dim, embedding_dim, reccurent_dim):
    """Build and compile the bidirectional-LSTM author classifier.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Width of each token embedding vector.
        reccurent_dim: Number of units in the LSTM wrapped by the
            bidirectional layer.

    Returns:
        A compiled ``Sequential`` model: embedding -> bidirectional LSTM
        (forward and backward outputs summed) -> 3-way softmax, trained with
        categorical cross-entropy and RMSprop, reporting accuracy.
    """
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dim, mask_zero=True))
    # Size the recurrent layer from `reccurent_dim`; previously the argument
    # was ignored and the LSTM silently reused `embedding_dim`.
    model.add(Bidirectional(LSTM(reccurent_dim), merge_mode='sum'))
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model

In [5]:
epochs = 45
num_split = 5
sum_loss = 0.

# Every fold checkpoints to the same file. A fresh ModelCheckpoint is built
# per fold, and the file is reloaded right after that fold's training.
weights_path = './../fasttext_weights/lstm.hdf5'
num_classes = len(class2author)

# Out-of-fold probabilities for the training rows, and the running SUM of
# the per-fold test probabilities (divide by num_split to average).
predict_prob_features = np.zeros((len(x), num_classes))
predict_prob_features_test = np.zeros((len(x_test), num_classes))
ite = 0
kf = KFold(n_splits=num_split, random_state=8, shuffle=True)
for ite, (train_index, val_index) in enumerate(kf.split(x), start=1):
    x_train, x_val = x[train_index], x[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Drop the previous fold's model/graph before building a new one so
    # backend state does not pile up across folds.
    K.clear_session()
    model = create_model(input_dim, embedding_dim, reccurent_dim)

    checkpointer = ModelCheckpoint(filepath=weights_path, verbose=0, save_best_only=True)

    hist = model.fit(x_train, y_train,
                     batch_size=32,
                     validation_data=(x_val, y_val),
                     epochs=epochs,
                     callbacks=[EarlyStopping(patience=1, monitor='val_loss'), checkpointer])

    # Score with the checkpointed weights rather than the last-epoch weights.
    model.load_weights(weights_path)
    # The model ends in a softmax, so `predict` already returns class
    # probabilities; `predict_proba` is a deprecated alias that newer Keras
    # releases no longer provide.
    y_pred = model.predict(x_val)
    # y_val is one-hot; np.nonzero(...)[1] recovers the integer class ids.
    sum_loss += log_loss(y_pred=y_pred, y_true=np.nonzero(y_val)[1])

    # save features
    predict_prob_features[val_index] = y_pred
    predict_prob_features_test += model.predict(x_test)
    # Printed value is the mean validation log-loss over the folds so far.
    print('valLoss: {}'.format(sum_loss/ite))


Train on 15663 samples, validate on 3916 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
valLoss: 0.4113873973604394
Train on 15663 samples, validate on 3916 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Train on 15663 samples, validate on 3916 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Train on 15663 samples, validate on 3916 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
valLoss: 0.41138072294795364
Train on 15664 samples, validate on 3915 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45


In [7]:
# Attach one "<AUTHOR>_lstm" probability column per author to both tables.
# The test array is divided by num_split before being stored.
for author, class_id in author2class.items():
    column = '{}_lstm'.format(author)
    train_probs = predict_prob_features[:, class_id]
    test_probs = predict_prob_features_test[:, class_id] / num_split
    df_train[column] = train_probs
    df_test[column] = test_probs


In [8]:
# Write both feature tables back to the CSV files they were loaded from,
# without the index column.
for frame, csv_path in (
    (df_train, './../data/train_feature.csv'),
    (df_test, './../data/test_feature.csv'),
):
    frame.to_csv(csv_path, index=False)