In [1]:
import sys
sys.path.append('../')
from utils import preprocess, Tokenizer4keras, create_fastText_model

import numpy as np
import h5py

import pandas as pd

from collections import defaultdict
import string

from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

import keras

from keras.callbacks import EarlyStopping, ModelCheckpoint

from keras.utils import to_categorical

# Seed NumPy's global random number generator before any data is loaded or
# any model is built.
# NOTE(review): whether this alone makes training fully deterministic depends
# on how the Keras backend draws its random numbers — confirm if exact
# run-to-run reproducibility matters.
np.random.seed(1234)

Using TensorFlow backend.


In [2]:
# Load the train / test feature tables.
df_train = pd.read_csv('./../data/train_feature.csv')
df_test = pd.read_csv('./../data/test_feature.csv')

# Raw sentences as NumPy arrays, in the same row order as the frames.
df_train_texts = df_train['text'].values
df_test_tests = df_test['text'].values

# Author code <-> integer class id. The list position of each author code is
# its class id, so the dict is derived from the list.
class2author = ['EAP', 'HPL', 'MWS']
author2class = {author: class_id for class_id, author in enumerate(class2author)}

# One-hot encode the training labels (one column per author class).
y = to_categorical(
    np.array([author2class[author] for author in df_train.author])
)

In [3]:
# preprocessing parameters (all forwarded to Tokenizer4keras below)
n_gram_max = 2
min_count = 2
maxlen = 256

tokenizer_options = dict(
    maxlen=maxlen,
    min_count=min_count,
    n_gram_max=n_gram_max,
    lower=False,
    single=False,
    add_ngram_first=True,
)
tokenizer = Tokenizer4keras(**tokenizer_options)

# Fit the tokenizer on the training sentences only, then reuse it on test.
x = tokenizer.fit_transform(df_texts=df_train_texts)
# NOTE(review): `transofrm` (sic) is kept exactly as written — presumably this
# is the method's actual name in utils.Tokenizer4keras; confirm before renaming.
x_test = tokenizer.transofrm(df_test_tests)

In [4]:
# Value handed to create_fastText_model as its first argument: one more than
# the largest token id present in the training matrix.
input_dim = np.max(x) + 1

# Number of target classes, read from the one-hot label matrix instead of
# being hard-coded.
n_classes = y.shape[1]

# for next training
# Out-of-fold predictions for the training rows, and the *sum* over folds of
# the test-set predictions (it is divided by `num_split` where it is consumed).
predict_prob_features = np.zeros((len(df_train), n_classes))
predict_prob_features_test = np.zeros((len(df_test), n_classes))

# training parameters
seed = 7
num_split = 5
epochs = 60

# One checkpoint file shared by all folds. Each fold builds a fresh
# ModelCheckpoint, so the file is rewritten with that fold's best weights
# before it is loaded back.
weights_path = './../fasttext_weights/weights_bi.hdf5'

sum_loss = 0.
losses = []

kf = KFold(n_splits=num_split, random_state=seed, shuffle=True)
for ite, (train_index, val_index) in enumerate(kf.split(x), start=1):
    print('{}/{}: #Trains: {}, #Val: {}'.format(ite, num_split, len(train_index), len(val_index)), end=' ')

    x_train, x_val = x[train_index], x[val_index]
    y_train, y_val = y[train_index], y[val_index]

    print(x_train.shape, x_val.shape, x_test.shape)

    # A new model per fold so no weights leak between folds.
    model = create_fastText_model(input_dim, embedding_dim=10, optimizer='adam')

    checkpointer = ModelCheckpoint(filepath=weights_path, verbose=0, save_best_only=True)

    hist = model.fit(x_train, y_train,
                     batch_size=16,
                     validation_data=(x_val, y_val),
                     epochs=epochs,
                     callbacks=[EarlyStopping(patience=4, monitor='val_loss'), checkpointer]
                    )

    # load best weights (early stopping leaves the model at its last epoch,
    # not necessarily its best one)
    model.load_weights(weights_path)

    # `predict` replaces the deprecated `predict_proba`; the model is compiled
    # and fitted on one-hot targets, so its outputs are used as class
    # probabilities either way.
    y_pred = model.predict(x_val)

    # `y_val` is one-hot, so argmax recovers the integer class id per row.
    # Passing `labels` keeps log_loss well-defined even if a fold's validation
    # split happens to miss one of the classes.
    fold_loss = log_loss(y_true=y_val.argmax(axis=1),
                         y_pred=y_pred,
                         labels=np.arange(n_classes))
    losses.append(fold_loss)
    sum_loss += fold_loss

    # Note: the value printed here is the running mean over the folds so far.
    print('valLoss: {}'.format(sum_loss/ite))

    # save features
    predict_prob_features[val_index] = y_pred
    predict_prob_features_test += model.predict(x_test)

1/5: #Trains: 15663, #Val: 3916 (15663, 256) (3916, 256) (8392, 256)
Train on 15663 samples, validate on 3916 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Train on 15663 samples, validate on 3916 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Train on 15663 samples, validate on 3916 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 1

Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60

In [5]:
# Display the list of per-fold losses together with their mean across folds.
losses, np.mean(losses)

([0.33592844438726438,
  0.33082837027308792,
  0.33538871400276848,
  0.3544567039720749,
  0.34576111468915938],
 0.34047266946487104)

In [6]:
# Add one probability column per author to both frames.
# The test array holds predictions summed over the folds, so dividing by the
# number of splits turns it into the per-fold average.
test_probs_mean = predict_prob_features_test / num_split

for author, class_id in author2class.items():
    column = '{}_fasttext_bigram'.format(author)
    df_train[column] = predict_prob_features[:, class_id]
    df_test[column] = test_probs_mean[:, class_id]

In [8]:
# Persist both frames without the index column.
# NOTE(review): these are the same paths the notebook loads its inputs from,
# so the source CSVs are overwritten in place.
outputs = (
    (df_train, './../data/train_feature.csv'),
    (df_test, './../data/test_feature.csv'),
)
for frame, csv_path in outputs:
    frame.to_csv(csv_path, index=False)