In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

import keras
from keras.layers import Dense, Embedding, LSTM, Bidirectional
import keras.backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential
from sklearn.metrics import log_loss
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import KFold
np.random.seed(129)

Using TensorFlow backend.


In [2]:
# Load the train/test feature tables; both are expected to expose a `text`
# column, and the training table additionally an `author` column.
df = pd.read_csv('./../data/train_feature.csv')
df_test = pd.read_csv('./../data/test_feature.csv')
text, text_test = df.text.values, df_test.text.values

# Author initials <-> integer class id. The position in `class2author`
# defines the id, so the two lookups cannot drift apart.
class2author = ['EAP', 'HPL', 'MWS']
author2class = {author: idx for idx, author in enumerate(class2author)}

# One-hot encoded targets, one row per training sentence.
y = to_categorical(np.array([author2class[author] for author in df.author]))

In [3]:
import re
def preprocess(text):
    """Pad punctuation with spaces so a later ``split()`` yields it as tokens.

    Double quotes are always isolated. Apostrophes, ``;``, ``:``, ``,``,
    runs of dots and question marks are only isolated when they sit at a
    word boundary (next to whitespace or at the start/end of the string),
    so in-word punctuation such as ``it's`` or ``a;b`` is left untouched.

    Args:
        text: the raw sentence.

    Returns:
        The sentence with spaces inserted around boundary punctuation.
        Runs of several spaces may appear; callers are expected to split
        on whitespace afterwards.
    """
    # Ordered (pattern, replacement) pairs. The order matters because each
    # substitution runs on the output of the previous one.
    rules = [
        (r"(')(\s|$)", r" \1 "),
        (r"(^|\s)(')", r" \2 "),
    ]
    for sign in ';:,': #?
        rules.append((r'(\s|^)({})'.format(sign), r' \2 '))
        rules.append((r'({})($|\s)'.format(sign), r' \1 '))
    rules += [
        (r'(\.+)(\s|$)', r' \1 '),
        # Second pass for closing apostrophes: a quote that was followed by
        # a dot (e.g. 'hoge'.) only becomes space-terminated once the dot
        # rule above has run.
        (r"(')(\s|$)", r" \1 "),
        (r"(\?)(\s|$)", r' \1 '),
        (r"(^|\s)(\?+)", r' \2 '),
    ]

    spaced = text.replace('"', ' " ')
    for pattern, replacement in rules:
        spaced = re.sub(pattern, replacement, spaced)
    return spaced

In [4]:
def create_docs(df):
    """Tokenise every entry of ``df.text``.

    Each sentence is passed through ``preprocess``, lower-cased and split
    on whitespace.

    Args:
        df: an object with an iterable ``text`` attribute of strings
            (here, a DataFrame column).

    Returns:
        A list with one list of string tokens per sentence, in input order.
    """
    # `split()` already returns clean, whitespace-free tokens, so no
    # join/re-split round trip is needed to normalise them.
    return [preprocess(sentence).lower().split() for sentence in df.text]

In [5]:
# Words seen fewer than `min_count` times in the training corpus are removed
# from the vocabulary.
min_count = 2
# Every document is cut to at most `maxlen` tokens.
maxlen = 64
# Used both as the embedding output size and as the LSTM unit count.
embedding_dims = 32

In [6]:
# Tokenise the training sentences; `raw_docs` keeps a handle on the
# unfiltered token lists.
docs = raw_docs = create_docs(df)

prev_sum_words = np.sum(np.array([len(tokens) for tokens in raw_docs]))
print('start {}'.format(prev_sum_words))
preprocessed_sum_words = 0

# Repeat "count words -> drop rare words -> truncate" until a full pass leaves
# the total token count unchanged. Several passes can be needed because
# dropping and truncating lowers the counts of the remaining words.
while preprocessed_sum_words != prev_sum_words:
    prev_sum_words = preprocessed_sum_words

    # Word frequencies over the current (already filtered) corpus.
    freq = defaultdict(int)
    for tokens in docs:
        for token in tokens:
            freq[token] += 1
    num_vocab = len(freq)

    # Collect the rare words first so the dict is not mutated while iterating.
    for rare in [token for token, count in freq.items() if count < min_count]:
        del freq[rare]
    print('#vocab: {}'.format(len(freq)))

    # Keep surviving words only, then cap each document at `maxlen` tokens.
    new_docs = [[token for token in tokens if token in freq][:maxlen]
                for tokens in docs]
    docs = new_docs
    preprocessed_sum_words = np.sum(np.array([len(tokens) for tokens in docs]))
    print(preprocessed_sum_words)

# Assign integer ids in order of first appearance, starting at 1;
# 0 is left free for the padding value.
word2int = {}
int_docs = []
for tokens in docs:
    # The default argument is evaluated before insertion, so an unseen word
    # receives the next free id and a known word keeps its existing one.
    int_docs.append([word2int.setdefault(token, len(word2int) + 1)
                     for token in tokens])

docs = pad_sequences(int_docs)
input_dim = np.max(docs) + 1



start 594056
#vocab: 16012
563239
#vocab: 15691
562941
#vocab: 15691
562941


In [7]:
# Tokenise the test sentences with the same pipeline as the training set.
docs_test = create_docs(df_test)

# Keep only words that survived the training-set frequency filter, then cap
# each document at `maxlen` tokens.
new_docs = [[token for token in tokens if token in freq][:maxlen]
            for tokens in docs_test]
docs_test = new_docs

# Translate tokens to their training ids; words without an id are skipped.
int_docs = [[word2int[token] for token in tokens if token in word2int]
            for tokens in docs_test]

x_test = pad_sequences(int_docs)


In [8]:
def create_model():
    """Build and compile a fresh bidirectional-LSTM author classifier.

    Reads the module-level ``input_dim`` (vocabulary size including the
    padding id) and ``embedding_dims`` at call time.

    Returns:
        A compiled ``Sequential`` model: masked embedding -> bidirectional
        LSTM whose two directions are summed -> 3-way softmax.
    """
    layers = [
        # mask_zero=True so that the 0 padding id is masked out downstream.
        Embedding(input_dim=input_dim, output_dim=embedding_dims, mask_zero=True),
        Bidirectional(LSTM(embedding_dims), merge_mode='sum'),
        Dense(3, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(optimizer='rmsprop',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])
    return network

In [9]:
epochs = 45
num_split = 5
sum_loss = 0.
# One checkpoint file shared by all folds: each fold creates a new
# ModelCheckpoint, so the file is overwritten with that fold's best weights.
weights_path = './../fasttext_weights/lstm.hdf5'

# Out-of-fold class probabilities for the training rows, and the running SUM
# of per-fold class probabilities for the test rows (averaged by the consumer).
predict_prob_features = np.zeros((len(df), 3))
predict_prob_features_test = np.zeros((len(df_test), 3))
ite = 0
kf = KFold(n_splits=num_split, random_state=8, shuffle=True)
for train_index, val_index in kf.split(text):
    ite += 1
    x_train, x_val = docs[train_index], docs[val_index]
    y_train, y_val = y[train_index], y[val_index]

    model = create_model()

    checkpointer = ModelCheckpoint(filepath=weights_path, verbose=0, save_best_only=True)

    hist = model.fit(x_train, y_train,
                     batch_size=32,
                     validation_data=(x_val, y_val),
                     epochs=epochs,
                     callbacks=[EarlyStopping(patience=1, monitor='val_loss'), checkpointer])

    # Early stopping leaves the model at its last (worse) epoch; roll back to
    # the best checkpoint before predicting.
    model.load_weights(weights_path)

    # `predict` already returns the softmax probabilities. `predict_proba` was
    # only a deprecated alias for it and no longer exists in newer Keras.
    y_pred = model.predict(x_val)

    # Passing `labels` keeps log_loss well-defined even if a validation fold
    # happens to miss one of the classes.
    sum_loss += log_loss(y_true=y_val.argmax(axis=1),
                         y_pred=y_pred,
                         labels=list(range(y_val.shape[1])))

    # save features
    predict_prob_features[val_index] = y_pred
    predict_prob_features_test += model.predict(x_test)
    # Running mean of the validation log-loss over the folds finished so far.
    print('valLoss: {}'.format(sum_loss/ite))


Train on 15663 samples, validate on 3916 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Train on 15663 samples, validate on 3916 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
valLoss: 0.4027153504348068
Train on 15663 samples, validate on 3916 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Train on 15663 samples, validate on 3916 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
valLoss: 0.41149616068232847
Train on 15664 samples, validate on 3915 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45


In [10]:
# The test predictions were accumulated as a sum over the folds; turn them
# into the per-fold mean once, up front.
mean_test_probs = predict_prob_features_test / num_split

# Add one `<AUTHOR>_lstm` probability column per author to both tables.
for author, class_idx in author2class.items():
    column = '{}_lstm'.format(author)
    df[column] = predict_prob_features[:, class_idx]
    df_test[column] = mean_test_probs[:, class_idx]


In [11]:
# Write the enriched tables back to the CSV files they were loaded from.
# index=False: the files are read without `index_col`, so writing the
# RangeIndex would add another spurious `Unnamed: 0` column on every run.
df.to_csv('./../data/train_feature.csv', index=False)
df_test.to_csv('./../data/test_feature.csv', index=False)