In [1]:
import numpy as np

import pandas as pd
from collections import defaultdict
import string

import keras
from keras.layers import Dense, GlobalAveragePooling1D, Embedding, Lambda
import keras.backend as K
from keras.callbacks import EarlyStopping

from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
np.random.seed(1234)

Using TensorFlow backend.


In [2]:
def preprocess(text):
    text = text.replace("' ", " ' ")
    signs = set(',.:;"?')
    prods = set(text) & signs
    if not prods:
        return text

    for sign in prods:
        text = text.replace(sign, ' {} '.format(sign) )
    return text

In [3]:
embedding_dims = 20

In [4]:
df = pd.read_csv('./train.csv')

In [5]:
df.text[0]

'This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.'

In [6]:
counter = {name : defaultdict(int) for name in set(df.author)}
for (text, author) in zip(df.text, df.author):
    text = text.replace(' ', '')
    for c in text:
        counter[author][c] += 1
    

In [7]:
chars = set()
for v in counter.values():
    chars |= v.keys()
    
names = [author for author in counter.keys()]
special_latters = set()
print('c ', end='')
for n in names:
    print(n, end='   ')
print()
for c in chars:    
    print(c, end=' ')
    for n in names:
        print(counter[n][c], end=' ')
    print()
    special_latters.add(c)

c HPL   MWS   EAP   
A 1167 943 1258 
e 88259 97515 114885 
w 15554 16062 17507 
" 513 1469 2987 
p 10965 12361 17422 
G 318 246 313 
R 237 385 258 
t 62235 63142 82426 
K 176 35 86 
Ν 1 0 0 
. 5908 5761 8406 
ê 2 0 28 
i 44250 46080 60952 
B 533 395 835 
Å 1 0 0 
S 841 578 729 
I 3480 4917 4846 
Π 1 0 0 
U 94 46 166 
ä 6 0 1 
T 1583 1230 2217 
x 1061 1267 1951 
W 732 681 739 
ë 12 0 0 
æ 10 0 36 
ô 0 0 8 
N 345 204 411 
ç 0 0 1 
X 5 4 17 
: 47 339 176 
J 210 66 164 
P 320 365 442 
r 40590 44042 51221 
O 503 282 414 
Ο 3 0 0 
F 269 232 383 
â 0 0 6 
H 741 669 864 
ï 7 0 0 
D 334 227 491 
o 50996 53386 67145 
q 779 677 1030 
à 0 0 10 
g 14951 12601 16088 
ñ 7 0 0 
Q 10 7 21 
m 17622 20471 22792 
v 6529 7948 9624 
L 249 307 458 
Υ 1 0 0 
u 19519 21025 26311 
M 645 415 1065 
y 12534 14877 17001 
Z 51 2 23 
? 169 419 510 
c 18338 17911 24127 
Æ 4 0 1 
V 67 57 156 
Y 111 234 282 
ö 3 0 16 
δ 2 0 0 
è 0 0 15 
α 2 0 0 
é 15 0 47 
' 1710 476 1334 
; 1143 2662 1354 
a 56815 55274 68525 
h 42770

## 特徴とか

- MWSはasciiしか系しか使わない（イギリスの作家だから？、それ以外の2人は使い、アメリカ）

In [8]:
special_latters = special_latters - set(string.ascii_uppercase) - set(string.ascii_lowercase) - set(',.:;"\'?')

In [9]:
' '.join(special_latters)

'à ñ Ν ê Υ Å Æ ö δ è Π α é ä ë æ ô ç ü ἶ Ο â î ï Σ'

In [10]:
# for (text, author) in zip(df.text, df.author):
#     if len(set(text) & special_latters):
#         print(set(text) & special_latters, author, text)


In [11]:
def create_docs(df, n_gram_max=1):
    docs = []

    for i, text in enumerate(df.text):    
        def add_ngram(q, n_gram_max):
            ngrams = []
            for n in range(2, n_gram_max+1):
                for w_index in range(len(q)-n+1):
                    ngrams.append('--'.join(q[w_index:w_index+n]))
            return q + ngrams

        doc = preprocess(text).split()
        
        prod = special_latters & set(text)
        special_chars = ''
        if prod:
            for c in prod:
                freq = text.count(c)
                special_chars += (' {} '.format(c) * freq)            
                
        docs.append(' '.join(add_ngram(doc, n_gram_max)) + special_chars)
        
    return docs

In [12]:
docs = create_docs(df, n_gram_max=1)

In [13]:
tokenizer = Tokenizer(num_words=2, filters='', lower=False)
tokenizer.fit_on_texts(docs)

In [14]:
min_count = 1
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

In [15]:
tokenizer = Tokenizer(num_words=num_words, filters='', lower=False)
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)
maxlen = max([len(i) for i in  docs])
docs = pad_sequences(sequences=docs, maxlen=maxlen)

In [16]:
len(tokenizer.word_index)

28285

In [17]:
a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}

In [18]:
y = np.array([a2c[a] for a in df.author])
y = to_categorical(y)

In [19]:
input_dim = np.max(docs)+1
input_dim, maxlen

(28285, 875)

In [20]:
model = Sequential()
model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
model.add(GlobalAveragePooling1D())
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [21]:
# train validation case
# epochs = 45
# x_train, x_test, y_train, y_test = train_test_split(docs, y, test_size=0.15)

# n_samples = x_train.shape[0]
# class_weight = {c : (n_samples/(np.sum(y_train, axis=0)[c] * 3)) for c in range(3)}

# hist = model.fit(x_train, y_train,
#                  batch_size=16,
#                  validation_data=(x_test, y_test),
#                  epochs=epochs,
#                  callbacks=[EarlyStopping(patience=4, monitor='val_loss')])

In [22]:
# history = hist.history
# for i in range(epochs):
#     print('{0:2d}'.format(i), 
#           '{0:.4f}'.format(history['loss'][i]), 
#           '{0:.4f}'.format(history['acc'][i]),
#           '{0:.4f}'.format(history['val_loss'][i]),
#           '{0:.4f}'.format(history['val_acc'][i]))

In [23]:
model.fit(docs, y,
          batch_size=16,
          epochs=38)

Epoch 1/38
Epoch 2/38
Epoch 3/38
Epoch 4/38
Epoch 5/38
Epoch 6/38
Epoch 7/38
Epoch 8/38
Epoch 9/38
Epoch 10/38
Epoch 11/38
Epoch 12/38
Epoch 13/38
Epoch 14/38
Epoch 15/38
Epoch 16/38
Epoch 17/38
Epoch 18/38
Epoch 19/38
Epoch 20/38
Epoch 21/38
Epoch 22/38
Epoch 23/38
Epoch 24/38
Epoch 25/38
Epoch 26/38
Epoch 27/38
Epoch 28/38
Epoch 29/38
Epoch 30/38
Epoch 31/38
Epoch 32/38
Epoch 33/38
Epoch 34/38
Epoch 35/38
Epoch 36/38
Epoch 37/38
Epoch 38/38


<keras.callbacks.History at 0x1a1d416ba8>

In [24]:
test_df = pd.read_csv('test.csv')
docs = create_docs(test_df)
docs = tokenizer.texts_to_sequences(docs)
docs = pad_sequences(sequences=docs, maxlen=maxlen)
y = model.predict_proba(docs)

df = pd.read_csv('./sample_submission.csv')
for a, i in a2c.items():
    df[a] = y[:, i]



In [25]:
with open('./fastText_submission_preprocess_min1-ngram1_epoch38.csv', 'w') as f:
    f.write('"id","EAP","HPL","MWS"\n')
    for (index, row), t in zip(df.iterrows(), test_df.text):
        f.write(','.join(['\"' + row["id"] + '\"'] +  list(map(str, [row['EAP'], row['HPL'], row['MWS']]))) + '\n')


In [25]:
# min=1, ngram=1, preprocess upper special dim=20 test=0.15

# eppoh 40 since previous result is not good...
#  0 1.0851 0.4049 1.0840 0.3956
#  1 1.0721 0.4067 1.0625 0.3997
#  2 1.0372 0.4494 1.0153 0.4760
#  3 0.9778 0.5570 0.9535 0.6919
#  4 0.9042 0.6492 0.8816 0.6779
#  5 0.8326 0.7074 0.8199 0.7283
#  6 0.7666 0.7405 0.7659 0.7293
#  7 0.7096 0.7651 0.7179 0.7320
#  8 0.6606 0.7821 0.6797 0.7477
#  9 0.6165 0.7938 0.6554 0.7508
# 10 0.5786 0.8069 0.6121 0.7797
# 11 0.5444 0.8185 0.5876 0.7865
# 12 0.5126 0.8289 0.5651 0.7916
# 13 0.4851 0.8408 0.5426 0.8066
# 14 0.4590 0.8461 0.5288 0.7947
# 15 0.4352 0.8566 0.5089 0.8035
# 16 0.4118 0.8659 0.4905 0.8138
# 17 0.3919 0.8709 0.4754 0.8189
# 18 0.3738 0.8773 0.4657 0.8240
# 19 0.3549 0.8860 0.4565 0.8338
# 20 0.3378 0.8953 0.4465 0.8223
# 21 0.3215 0.9009 0.4300 0.8352
# 22 0.3075 0.9047 0.4211 0.8437
# 23 0.2948 0.9082 0.4121 0.8403
# 24 0.2816 0.9122 0.4114 0.8458
# 25 0.2692 0.9167 0.3990 0.8444
# 26 0.2576 0.9225 0.4038 0.8369
# 27 0.2471 0.9254 0.3846 0.8485
# 28 0.2377 0.9298 0.3855 0.8509
# 29 0.2275 0.9323 0.4034 0.8366
# 30 0.2174 0.9334 0.3786 0.8492
# 31 0.2096 0.9385 0.3894 0.8413
# 32 0.2019 0.9394 0.3703 0.8597
# 33 0.1940 0.9430 0.3634 0.8594
# 34 0.1862 0.9451 0.3590 0.8553
# 35 0.1792 0.9468 0.3681 0.8550
# 36 0.1730 0.9499 0.3610 0.8533
# 37 0.1666 0.9509 0.3589 0.8662
# 38 0.1605 0.9536 0.3477 0.8655
# 39 0.1550 0.9554 0.3494 0.8621
# 40 0.1497 0.9561 0.3462 0.8601 xxx
# 41 0.1448 0.9572 0.3858 0.8471
# 42 0.1390 0.9600 0.3600 0.8645
# 43 0.1351 0.9620 0.3530 0.8679
# 44 0.1298 0.9632 0.3835 0.8515