In [1]:
import numpy as np

import pandas as pd
from collections import defaultdict
import string

import keras
from keras.layers import Dense, GlobalAveragePooling1D, Embedding, Lambda, Input, Concatenate
import keras.backend as K
from keras.callbacks import EarlyStopping
from keras.models import Sequential, Model

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from gensim.models import word2vec
from utils import preprocess
np.random.seed(1234)

Using TensorFlow backend.


In [2]:
embedding_dims = 20

In [3]:
df = pd.read_csv('./data/train.csv')

In [4]:
df.text[0]

'This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.'

In [5]:
counter = {name : defaultdict(int) for name in set(df.author)}
for (text, author) in zip(df.text, df.author):
    text = text.replace(' ', '')
    for c in text:
        counter[author][c] += 1
    

In [6]:
chars = set()
for v in counter.values():
    chars |= v.keys()
    
names = [author for author in counter.keys()]
special_latters = set()
print('c ', end='')
for n in names:
    print(n, end='   ')
print()
for c in chars:    
    print(c, end=' ')
    for n in names:
        print(counter[n][c], end=' ')
    print()
    special_latters.add(c)

c EAP   MWS   HPL   
. 8406 5761 5908 
Y 282 234 111 
i 60952 46080 44250 
B 835 395 533 
y 17001 14877 12534 
I 4846 4917 3480 
w 17507 16062 15554 
Æ 1 0 4 
q 1030 677 779 
æ 36 0 10 
Ν 0 0 1 
x 1951 1267 1061 
X 17 4 5 
ê 28 0 2 
g 16088 12601 14951 
' 1334 476 1710 
ä 1 0 6 
l 35371 27819 30273 
F 383 232 269 
N 411 204 345 
Q 21 7 10 
Å 0 0 1 
o 67145 53386 50996 
U 166 46 94 
E 435 445 281 
p 17422 12361 10965 
a 68525 55274 56815 
d 36862 35315 33366 
ï 0 0 7 
: 176 339 47 
L 458 307 249 
é 47 0 15 
Π 0 0 1 
H 864 669 741 
ñ 0 0 7 
ἶ 0 0 2 
G 313 246 318 
ç 1 0 0 
m 22792 20471 17622 
u 26311 21025 19519 
R 258 385 237 
â 6 0 0 
A 1258 943 1167 
ë 0 0 12 
z 634 400 529 
" 2987 1469 513 
n 62636 50291 50879 
à 10 0 0 
O 414 282 503 
ô 8 0 0 
ü 1 0 5 
? 510 419 169 
α 0 0 2 
V 156 57 67 
K 86 35 176 
ö 16 0 3 
b 13245 9611 10636 
Ο 0 0 3 
f 22354 18351 16272 
P 442 365 320 
c 24127 17911 18338 
j 683 682 424 
M 1065 415 645 
; 1354 2662 1143 
r 51221 44042 40590 
S 729 578 841 
î 

## 特徴とか

- MWSはasciiしか系しか使わない（イギリスの作家だから？、それ以外の2人は使い、アメリカ）

In [7]:
special_latters = special_latters - set(string.ascii_uppercase) - set(string.ascii_lowercase) - set(',.:;"\'?')

In [8]:
' '.join(special_latters)

'à ô ü α Æ æ ö Ν ê Ο ä Å î è ï Σ é Π ñ ἶ ç δ â ë Υ'

In [9]:
# for (text, author) in zip(df.text, df.author):
#     if len(set(text) & special_latters):
#         print(set(text) & special_latters, author, text)


In [10]:
def create_docs(df, n_gram_max=1):
    docs = []

    for i, text in enumerate(df.text):    
        def add_ngram(q, n_gram_max):
            ngrams = []
            for n in range(2, n_gram_max+1):
                for w_index in range(len(q)-n+1):
                    ngrams.append('--'.join(q[w_index:w_index+n]))
            return q + ngrams

        doc = preprocess(text).split()
        
        prod = special_latters & set(text)
        special_chars = ''
        if prod:
            for c in prod:
                freq = text.count(c)
                special_chars += (' {} '.format(c) * freq)            
                
        docs.append(' '.join(add_ngram(doc, n_gram_max)) + special_chars)
        
    return docs

In [11]:
docs = create_docs(df, n_gram_max=1)

In [12]:
tokenizer = Tokenizer(num_words=2, filters='', lower=False)
tokenizer.fit_on_texts(docs)

In [13]:
min_count = 1
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

In [14]:
tokenizer = Tokenizer(num_words=num_words, filters='', lower=False)
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)
maxlen = max([len(i) for i in  docs])
docs = pad_sequences(sequences=docs, maxlen=maxlen)

In [15]:
len(tokenizer.word_index)

28285

In [16]:
a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}

In [17]:
y = np.array([a2c[a] for a in df.author])
y = to_categorical(y)

In [18]:
input_dim = np.max(docs)+1
input_dim, maxlen

(28285, 875)

In [19]:
vec = word2vec.KeyedVectors.load_word2vec_format('./fastText/model.vec') # ./fasttext skipgram -input ../data/fasttext-inputs.txt -output model -minCount 1  -neg 15 -ws 10 -epoch 7
fasttext_dim = vec.vector_size
fasttext_emb = np.zeros((len(df), fasttext_dim))

for i, text in enumerate(df.text):
    doc_vec = np.zeros(fasttext_dim)
    words = preprocess(text).lower().split()
    for w in words:
        doc_vec += vec[w]
    doc_vec /= len(words)
    fasttext_emb[i] = doc_vec


In [20]:
inputs = Input(shape=(None, ))
emb = Embedding(input_dim=input_dim, output_dim=embedding_dims)(inputs)
mean = GlobalAveragePooling1D()(emb)

fasttext_input = Input(shape=(fasttext_dim, ))
concat = Concatenate()([mean, fasttext_input])
out = Dense(3, activation='softmax')(concat)

model = Model(inputs=[inputs, fasttext_input], outputs=[out])

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [21]:
fasttext_emb.shape, docs.shape

((19579, 100), (19579, 875))

In [22]:
# # train validation case
# epochs = 45
# x_train, x_test, x_fast_train, x_fast_test, y_train, y_test = train_test_split(docs, fasttext_emb, y, test_size=0.15)

# # n_samples = x_train.shape[0]
# # class_weight = {c : (n_samples/(np.sum(y_train, axis=0)[c] * 3)) for c in range(3)}

# hist = model.fit([x_train, x_fast_train], [y_train],
#                  batch_size=16,
#                  validation_data=([x_test, x_fast_test], [y_test]),
#                  epochs=epochs,
#                  callbacks=[EarlyStopping(patience=4, monitor='val_loss')])

In [23]:
# history = hist.history
# for i in range(len(history['acc'])):
#     print('{0:2d}'.format(i), 
#           '{0:.4f}'.format(history['loss'][i]), 
#           '{0:.4f}'.format(history['acc'][i]),
#           '{0:.4f}'.format(history['val_loss'][i]),
#           '{0:.4f}'.format(history['val_acc'][i]))

In [24]:
model.fit([docs, fasttext_emb], [y],
          batch_size=16,
          epochs=36)

Epoch 1/36
Epoch 2/36
Epoch 3/36
Epoch 4/36
Epoch 5/36
Epoch 6/36
Epoch 7/36
Epoch 8/36
Epoch 9/36
Epoch 10/36
Epoch 11/36
Epoch 12/36
Epoch 13/36
Epoch 14/36
Epoch 15/36
Epoch 16/36
Epoch 17/36
Epoch 18/36
Epoch 19/36
Epoch 20/36
Epoch 21/36
Epoch 22/36
Epoch 23/36
Epoch 24/36
Epoch 25/36
Epoch 26/36
Epoch 27/36
Epoch 28/36
Epoch 29/36
Epoch 30/36
Epoch 31/36
Epoch 32/36
Epoch 33/36
Epoch 34/36
Epoch 35/36
Epoch 36/36


<keras.callbacks.History at 0x1a24fa7518>

In [25]:
test_df = pd.read_csv('./data/test.csv')
docs = create_docs(test_df)
docs = tokenizer.texts_to_sequences(docs)
docs = pad_sequences(sequences=docs, maxlen=maxlen)

# create feature for test

fasttext_emb = np.zeros((len(test_df), fasttext_dim))
for i, text in enumerate(test_df.text):
    doc_vec = np.zeros(fasttext_dim)
    words = preprocess(text).lower().split()
    for w in words:
        if w in vec.vocab:
            doc_vec += vec[w]
    doc_vec /= len(words)
    fasttext_emb[i] = doc_vec
    
model_predict_prob = Model(inputs=[model.layers[0].input, model.layers[3].input],
                                 outputs=[model.layers[-1].output])
prob = model_predict_prob.predict([docs, fasttext_emb])

df = pd.read_csv('./data/sample_submission.csv')
for a, i in a2c.items():
    df[a] = prob[:, i]

In [28]:
with open('./results/fastText_submission_preprocess_min1-ngram1_epoch36-unsuper-concat.csv', 'w') as f:
    f.write('"id","EAP","HPL","MWS"\n')
    for (index, row), t in zip(df.iterrows(), test_df.text):
        f.write(','.join(['\"' + row['id'] + '\"'] +  list(map(str, [row['EAP'], row['HPL'], row['MWS']]))) + '\n')


In [27]:
# min=1, ngram=1, preprocess upper special dim=20 test=0.15

# eppoh 40 since previous result is not good...
#  0 0.9201 0.6252 0.7953 0.7385
#  1 0.7406 0.7405 0.6770 0.7671
#  2 0.6576 0.7613 0.6168 0.7794
#  3 0.6085 0.7769 0.5794 0.7889
#  4 0.5733 0.7877 0.5476 0.7954
#  5 0.5467 0.7980 0.5271 0.7971
#  6 0.5227 0.8035 0.5073 0.8073
#  7 0.5020 0.8114 0.4912 0.8124
#  8 0.4820 0.8197 0.4800 0.8165
#  9 0.4626 0.8279 0.4723 0.8158
# 10 0.4447 0.8354 0.4529 0.8260
# 11 0.4277 0.8427 0.4431 0.8335
# 12 0.4104 0.8491 0.4345 0.8335
# 13 0.3947 0.8562 0.4235 0.8383
# 14 0.3787 0.8622 0.4183 0.8366
# 15 0.3638 0.8674 0.4107 0.8390
# 16 0.3487 0.8757 0.4016 0.8434
# 17 0.3350 0.8799 0.3920 0.8481
# 18 0.3227 0.8864 0.3884 0.8458
# 19 0.3097 0.8917 0.3863 0.8488
# 20 0.2963 0.8974 0.3790 0.8498
# 21 0.2834 0.9027 0.3705 0.8573
# 22 0.2731 0.9069 0.3710 0.8539
# 23 0.2621 0.9116 0.3586 0.8604
# 24 0.2519 0.9162 0.3624 0.8570
# 25 0.2419 0.9195 0.3511 0.8611
# 26 0.2317 0.9245 0.3573 0.8604
# 27 0.2228 0.9277 0.3453 0.8638
# 28 0.2145 0.9294 0.3470 0.8655
# 29 0.2058 0.9339 0.3626 0.8597
# 30 0.1973 0.9362 0.3436 0.8665
# 31 0.1906 0.9404 0.3570 0.8614
# 32 0.1829 0.9415 0.3395 0.8699
# 33 0.1763 0.9445 0.3381 0.8686
# 34 0.1692 0.9463 0.3346 0.8716
# 35 0.1630 0.9493 0.3388 0.8682
# 36 0.1571 0.9512 0.3372 0.8716
# 37 0.1518 0.9528 0.3459 0.8618
# 38 0.1461 0.9549 0.3275 0.8733 xx
# 39 0.1405 0.9582 0.3307 0.8733
# 40 0.1355 0.9584 0.3319 0.8767
# 41 0.1312 0.9590 0.3723 0.8509
# 42 0.1262 0.9622 0.3545 0.8621
# 43 0.1228 0.9617 0.3379 0.8682