In [1]:
import numpy as np

import pandas as pd
from collections import defaultdict
import string

import keras
from keras.layers import Dense, GlobalAveragePooling1D, Embedding, Lambda, Input, Concatenate, Add, Multiply
import keras.backend as K
from keras.callbacks import EarlyStopping
from keras.models import Sequential, Model

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from gensim.models import word2vec
from utils import preprocess
np.random.seed(1234)

Using TensorFlow backend.


In [2]:
embedding_dims = 20

In [3]:
df = pd.read_csv('./data/train.csv')

In [4]:
counter = {name : defaultdict(int) for name in set(df.author)}
for (text, author) in zip(df.text, df.author):
    text = text.replace(' ', '')
    for c in text:
        counter[author][c] += 1
    

In [5]:
chars = set()
for v in counter.values():
    chars |= v.keys()
    
names = [author for author in counter.keys()]
special_latters = set()
print('c ', end='')
for n in names:
    print(n, end='   ')
print()
for c in chars:    
    print(c, end=' ')
    for n in names:
        print(counter[n][c], end=' ')
    print()
    special_latters.add(c)

c HPL   MWS   EAP   
ç 0 0 1 
ë 12 0 0 
s 43915 45962 53841 
G 318 246 313 
Σ 1 0 0 
ñ 7 0 0 
â 0 0 6 
H 741 669 864 
î 0 0 1 
δ 2 0 0 
Å 1 0 0 
F 269 232 383 
α 2 0 0 
v 6529 7948 9624 
æ 10 0 36 
b 10636 9611 13245 
w 15554 16062 17507 
ï 7 0 0 
i 44250 46080 60952 
r 40590 44042 51221 
j 424 682 683 
Ο 3 0 0 
f 16272 18351 22354 
U 94 46 166 
A 1167 943 1258 
d 33366 35315 36862 
K 176 35 86 
; 1143 2662 1354 
e 88259 97515 114885 
x 1061 1267 1951 
Ν 1 0 0 
ô 0 0 8 
S 841 578 729 
D 334 227 491 
: 47 339 176 
E 281 445 435 
. 5908 5761 8406 
l 30273 27819 35371 
? 169 419 510 
Z 51 2 23 
X 5 4 17 
a 56815 55274 68525 
' 1710 476 1334 
M 645 415 1065 
B 533 395 835 
é 15 0 47 
Y 111 234 282 
P 320 365 442 
I 3480 4917 4846 
Π 1 0 0 
T 1583 1230 2217 
" 513 1469 2987 
g 14951 12601 16088 
, 8581 12045 17594 
t 62235 63142 82426 
ü 5 0 1 
q 779 677 1030 
Q 10 7 21 
z 529 400 634 
o 50996 53386 67145 
N 345 204 411 
V 67 57 156 
L 249 307 458 
n 50879 50291 62636 
c 18338 17911 24127 


## 特徴とか

- MWSはasciiしか系しか使わない（イギリスの作家だから？、それ以外の2人は使い、アメリカ）

In [6]:
special_latters = special_latters - set(string.ascii_uppercase) - set(string.ascii_lowercase) - set(',.:;"\'?')

In [7]:
' '.join(special_latters)

'ç ë Σ ñ â é î Π δ Å α ü æ ï Ο Æ Υ Ν ô ö ἶ à è ä ê'

In [8]:
# for (text, author) in zip(df.text, df.author):
#     if len(set(text) & special_latters):
#         print(set(text) & special_latters, author, text)


In [9]:
def create_docs(df, n_gram_max=1):
    docs = []

    for i, text in enumerate(df.text):    
        def add_ngram(q, n_gram_max):
            ngrams = []
            for n in range(2, n_gram_max+1):
                for w_index in range(len(q)-n+1):
                    ngrams.append('--'.join(q[w_index:w_index+n]))
            return q + ngrams

        doc = preprocess(text).split()
        
        prod = special_latters & set(text)
        special_chars = ''
        if prod:
            for c in prod:
                freq = text.count(c)
                special_chars += (' {} '.format(c) * freq)            
                
        docs.append(' '.join(add_ngram(doc, n_gram_max)) + special_chars)
        
    return docs

In [10]:
docs = create_docs(df, n_gram_max=2)

In [11]:
tokenizer = Tokenizer(num_words=2, filters='', lower=False)
tokenizer.fit_on_texts(docs)

In [12]:
min_count = 2
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

In [13]:
tokenizer = Tokenizer(num_words=num_words, filters='', lower=False)
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)
maxlen = max([len(i) for i in docs])
maxlen = 256
# 512 0.3326
# min=1 256 0.3242 # min = 2 0.3236

docs = pad_sequences(sequences=docs, maxlen=maxlen)

In [14]:
embedding_dims = 20

In [15]:
len(tokenizer.word_index)

257110

In [16]:
a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}

In [17]:
y = np.array([a2c[a] for a in df.author])
y = to_categorical(y)

In [18]:
input_dim = np.max(docs)+1
input_dim, maxlen

(76614, 256)

In [19]:
# ./fasttext skipgram -input ../data/fasttext-inputs.txt -output skip20 -minCount 1 -neg 15 -ws 20 -epoch 7 -dim 20
vec = word2vec.KeyedVectors.load_word2vec_format('./fastText/skip20_min2_neg15_epoch_7_ws_20.vec')
fasttext_dim = vec.vector_size
fasttext_emb = np.zeros((len(df), fasttext_dim))

for i, text in enumerate(df.text):
    doc_vec = np.zeros(fasttext_dim)
    words = preprocess(text).lower().split()
    num_words = 0
    for w in words:
        if w in vec.vocab:
            doc_vec += vec[w]
            num_words += 1
    doc_vec /= num_words
    fasttext_emb[i] = doc_vec


In [20]:
from keras.layers import Dropout
inputs = Input(shape=(None, ))
emb = Embedding(input_dim=input_dim, output_dim=embedding_dims)(inputs)
mean = GlobalAveragePooling1D()(emb)

fasttext_input = Input(shape=(fasttext_dim, ))
concat = Concatenate()([mean, fasttext_input])
out = Dense(3, activation='softmax')(concat)

model = Model(inputs=[inputs, fasttext_input], outputs=[out])

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [21]:
fasttext_emb.shape, docs.shape

((19579, 20), (19579, 256))

In [22]:
# # train validation case
# epochs = 45
# x_train, x_test, x_fast_train, x_fast_test, y_train, y_test = train_test_split(docs, fasttext_emb, y, test_size=0.15)

# # n_samples = x_train.shape[0]
# # class_weight = {c : (n_samples/(np.sum(y_train, axis=0)[c] * 3)) for c in range(3)}

# hist = model.fit([x_train, x_fast_train], [y_train],
#                  batch_size=16,
#                  validation_data=([x_test, x_fast_test], [y_test]),
#                  epochs=epochs,
#                  callbacks=[EarlyStopping(patience=4, monitor='val_loss')])

In [23]:
# history = hist.history
# for i in range(len(history['acc'])):
#     print('{0:2d}'.format(i), 
#           '{0:.4f}'.format(history['loss'][i]), 
#           '{0:.4f}'.format(history['acc'][i]),
#           '{0:.4f}'.format(history['val_loss'][i]),
#           '{0:.4f}'.format(history['val_acc'][i]))

In [24]:
model.fit([docs, fasttext_emb], [y],
          batch_size=16,
          epochs=14,
          shuffle=True)

Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14


<keras.callbacks.History at 0x1a273a8e10>

In [25]:
test_df = pd.read_csv('./data/test.csv')
docs = create_docs(test_df)
docs = tokenizer.texts_to_sequences(docs)
docs = pad_sequences(sequences=docs, maxlen=maxlen)

# create feature for test

fasttext_emb = np.zeros((len(test_df), fasttext_dim))
for i, text in enumerate(test_df.text):
    doc_vec = np.zeros(fasttext_dim)
    words = preprocess(text).lower().split()
    num_words = 0
    for w in words:
        if w in vec.vocab:
            doc_vec += vec[w]
            num_words += 1
    doc_vec /= num_words
    fasttext_emb[i] = doc_vec
    
model_predict_prob = Model(inputs=[model.layers[0].input, model.layers[3].input],
                                 outputs=[model.layers[-1].output])
prob = model_predict_prob.predict([docs, fasttext_emb])

df = pd.read_csv('./data/sample_submission.csv')
for a, i in a2c.items():
    df[a] = prob[:, i]

In [26]:
with open('./results/fastText_submission_preprocess_min2-ngram2-maxlength_epoch14-unsuper-concat.csv', 'w') as f:
    f.write('"id","EAP","HPL","MWS"\n')
    for (index, row), t in zip(df.iterrows(), test_df.text):
        f.write(','.join(['\"' + row['id'] + '\"'] +  list(map(str, [row['EAP'], row['HPL'], row['MWS']]))) + '\n')


In [None]:
# min=2, ngram=2, preprocess upper special dim=20 test=0.15

# eppoh 40 since previous result is not good...

#  0 0.9574 0.5803 0.8303 0.7732
#  1 0.7220 0.7836 0.6339 0.8131
#  2 0.5502 0.8333 0.5265 0.8260
#  3 0.4399 0.8664 0.4612 0.8427
#  4 0.3595 0.8909 0.4182 0.8522
#  5 0.2979 0.9105 0.3882 0.8601
#  6 0.2472 0.9305 0.3624 0.8679
#  7 0.2064 0.9421 0.3440 0.8693
#  8 0.1715 0.9552 0.3339 0.8710
#  9 0.1423 0.9653 0.3256 0.8764
# 10 0.1184 0.9722 0.3139 0.8795
# 11 0.0986 0.9788 0.3125 0.8812
# 12 0.0817 0.9823 0.3070 0.8822
# 13 0.0688 0.9856 0.3080 0.8815
# 14 0.0573 0.9888 0.3056 0.8819
# 15 0.0481 0.9902 0.3080 0.8836
# 16 0.0404 0.9922 0.3157 0.8808
# 17 0.0337 0.9933 0.3222 0.8788
# 18 0.0287 0.9947 0.3297 0.8764
# 19 0.0240 0.9956 0.3256 0.8798