In [1]:
import numpy as np

import pandas as pd

from collections import defaultdict

import keras
import keras.backend as K
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

np.random.seed(7)

Using TensorFlow backend.


In [3]:
df = pd.read_csv('./data/train.csv')
a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}
y = np.array([a2c[a] for a in df.author])
y = to_categorical(y)

In [4]:
counter = {name : defaultdict(int) for name in set(df.author)}
for (text, author) in zip(df.text, df.author):
    text = text.replace(' ', '')
    for c in text:
        counter[author][c] += 1

chars = set()
for v in counter.values():
    chars |= v.keys()
    
names = [author for author in counter.keys()]

print('c ', end='')
for n in names:
    print(n, end='   ')
print()
for c in chars:    
    print(c, end=' ')
    for n in names:
        print(counter[n][c], end=' ')
    print()

c MWS   EAP   HPL   
y 14877 17001 12534 
æ 0 36 10 
X 4 17 5 
s 45962 53841 43915 
Å 0 0 1 
q 677 1030 779 
, 12045 17594 8581 
T 1230 2217 1583 
ñ 0 0 7 
' 476 1334 1710 
o 53386 67145 50996 
x 1267 1951 1061 
ë 0 0 12 
ἶ 0 0 2 
à 0 10 0 
c 17911 24127 18338 
ç 0 1 0 
r 44042 51221 40590 
f 18351 22354 16272 
a 55274 68525 56815 
t 63142 82426 62235 
d 35315 36862 33366 
N 204 411 345 
W 681 739 732 
n 50291 62636 50879 
D 227 491 334 
j 682 683 424 
B 395 835 533 
U 46 166 94 
Ο 0 0 3 
C 308 395 439 
v 7948 9624 6529 
A 943 1258 1167 
α 0 0 2 
G 246 313 318 
Æ 0 1 4 
z 400 634 529 
S 578 729 841 
ô 0 8 0 
L 307 458 249 
â 0 6 0 
: 339 176 47 
δ 0 0 2 
ö 0 16 3 
b 9611 13245 10636 
P 365 442 320 
ï 0 0 7 
g 12601 16088 14951 
Ν 0 0 1 
é 0 47 15 
î 0 1 0 
O 282 414 503 
i 46080 60952 44250 
h 43738 51580 42770 
M 415 1065 645 
Σ 0 0 1 
I 4917 4846 3480 
; 2662 1354 1143 
m 20471 22792 17622 
Z 2 23 51 
V 57 156 67 
R 385 258 237 
Υ 0 0 1 
" 1469 2987 513 
e 97515 114885 88259 
E 445 4

In [5]:
def preprocess(text):
    text = text.replace("' ", " ' ")
    signs = set(',.:;"?!')
    prods = set(text) & signs
    if not prods:
        return text

    for sign in prods:
        text = text.replace(sign, ' {} '.format(sign) )
    return text

In [6]:
def create_docs(df, n_gram_max=2):
    def add_ngram(q, n_gram_max):
            ngrams = []
            for n in range(2, n_gram_max+1):
                for w_index in range(len(q)-n+1):
                    ngrams.append('--'.join(q[w_index:w_index+n]))
            return q + ngrams
        
    docs = []
    for doc in df.text:
        doc = preprocess(doc).split()
        docs.append(' '.join(add_ngram(doc, n_gram_max)))
    
    return docs

In [7]:
min_count = 2

docs = create_docs(df)
tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(docs)
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

tokenizer = Tokenizer(num_words=num_words, lower=False, filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)

maxlen = 256

docs = pad_sequences(sequences=docs, maxlen=maxlen)

In [8]:
input_dim = np.max(docs) + 1
embedding_dims = 20

In [9]:
def create_model(embedding_dims=20, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

In [10]:
epochs = 25
x_train, x_test, y_train, y_test = train_test_split(docs, y, test_size=0.2)

model = create_model()
hist = model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=epochs,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])

Train on 15663 samples, validate on 3916 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25


In [11]:
docs = create_docs(df)
tokenizer = Tokenizer(lower=True, filters='')
tokenizer.fit_on_texts(docs)
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

tokenizer = Tokenizer(num_words=num_words, lower=True, filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)

maxlen = 256

docs = pad_sequences(sequences=docs, maxlen=maxlen)

input_dim = np.max(docs) + 1

In [12]:
epochs = 25
x_train, x_test, y_train, y_test = train_test_split(docs, y, test_size=0.2)

model = create_model()
hist = model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=epochs,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])

Train on 15663 samples, validate on 3916 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25


In [17]:
test_df = pd.read_csv('./data/test.csv')
docs = create_docs(test_df)
docs = tokenizer.texts_to_sequences(docs)
docs = pad_sequences(sequences=docs, maxlen=maxlen)
y = model.predict_proba(docs)

result = pd.read_csv('./data/sample_submission.csv')
for a, i in a2c.items():
    result[a] = y[:, i]
result.to_csv('submission.csv', index=False)

In [15]:
!head ./data/sample_submission.csv

"id","EAP","HPL","MWS"
"id02310",0.403493538995863,0.287808366106543,0.308698094897594
"id24541",0.403493538995863,0.287808366106543,0.308698094897594
"id00134",0.403493538995863,0.287808366106543,0.308698094897594
"id27757",0.403493538995863,0.287808366106543,0.308698094897594
"id04081",0.403493538995863,0.287808366106543,0.308698094897594
"id27337",0.403493538995863,0.287808366106543,0.308698094897594
"id24265",0.403493538995863,0.287808366106543,0.308698094897594
"id25917",0.403493538995863,0.287808366106543,0.308698094897594
"id04951",0.403493538995863,0.287808366106543,0.308698094897594


In [18]:
!head submission.csv

id,EAP,HPL,MWS
id02310,0.022793320938944817,0.009503418579697609,0.9677032232284546
id24541,0.9998486042022705,0.00015134006389416754,5.5520096964301047e-08
id00134,0.001943304785527289,0.9928625226020813,0.005194144323468208
id27757,0.969866156578064,0.02821045182645321,0.0019233745988458395
id04081,0.7887112498283386,0.07711251080036163,0.1341763436794281
id27337,0.9975106716156006,0.0011480755638331175,0.0013412671396508813
id24265,0.9875584840774536,0.009479498490691185,0.002961952006444335
id25917,0.0029075471684336662,0.03873061388731003,0.958361804485321
id04951,0.9998929500579834,8.331518620252609e-05,2.3681956008658744e-05
