In [1]:
import numpy as np
import h5py

import pandas as pd
from utils import preprocess
from collections import defaultdict
import string

import sklearn
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import keras
from keras.layers import Dense, GlobalAveragePooling1D, Embedding, Lambda
import keras.backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint

from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

from gensim.models import word2vec

from utils import preprocess
np.random.seed(1234)

Using TensorFlow backend.


In [2]:
# Number of cross-validation folds. The feature cells below divide the test-set
# probabilities, which the fold loops accumulate with `+=`, by this value to
# turn the per-fold sum into an average.
num_split = 5

In [3]:
# Read the train / test tables and keep the raw sentences as NumPy arrays.
df = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')
text, text_test = df.text.values, df_test.text.values

# Label encoding: position in `class2author` is the integer class id.
class2author = ['EAP', 'HPL', 'MWS']
author2class = {author: class_id for class_id, author in enumerate(class2author)}
y = np.array([author2class[name] for name in df.author])

In [4]:
def _mean_word_vectors(docs, vec):
    """Return a (len(docs), vec.vector_size) array of averaged word vectors.

    Each document is passed through ``preprocess``, lower-cased and split on
    whitespace; the vectors of the words found in ``vec.vocab`` are averaged.
    """
    dim = vec.vector_size
    features = np.zeros((len(docs), dim))
    for row, doc in enumerate(docs):
        doc_vec = np.zeros(dim)
        num_words = 0
        for w in preprocess(doc).lower().split():
            if w in vec.vocab:
                doc_vec += vec[w]
                num_words += 1
        # A document without any in-vocabulary word keeps its all-zero row.
        # Dividing the zero sum by a zero count would fill the row with NaN.
        if num_words:
            doc_vec /= num_words
        features[row] = doc_vec
    return features


def create_vector(vec):
    """Embed the global train / test sentences as mean word vectors.

    Parameters
    ----------
    vec : keyed-vector object
        Must expose ``vector_size``, ``vocab`` (membership test) and
        ``vec[word]`` lookup.

    Returns
    -------
    (x, x_test) : tuple of np.ndarray
        One row per entry of the module-level ``text`` and ``text_test``
        arrays (these have the same length as ``df`` / ``df_test``).
    """
    return _mean_word_vectors(text, vec), _mean_word_vectors(text_test, vec)

In [5]:
def logistic(x, x_test, seed=7, num_split=5):
    """Create out-of-fold logistic-regression probabilities for stacking.

    Parameters
    ----------
    x : np.ndarray
        Train feature matrix, one row per label in the module-level ``y``.
    x_test : np.ndarray
        Test feature matrix.
    seed : int
        ``random_state`` of the shuffled ``KFold``.
    num_split : int
        Number of folds (previously fixed to 5 inside the function).

    Returns
    -------
    (predict_prob_features, predict_prob_features_test) : tuple of np.ndarray
        ``predict_prob_features`` holds, for every train row, the probabilities
        predicted by the fold model that did not see the row.
        ``predict_prob_features_test`` is the SUM of the test probabilities over
        all folds; callers divide it by the number of folds.

    Notes
    -----
    Reads the module-level ``y`` and indexes it with the fold indices, so ``y``
    must hold one integer class id per row of ``x`` when this is called.
    Prints the mean validation log loss.
    """
    kf = KFold(n_splits=num_split, random_state=seed, shuffle=True)
    loss = 0.

    # Size the buffers from the inputs themselves rather than the global frames.
    predict_prob_features = np.zeros((len(x), 3))
    predict_prob_features_test = np.zeros((len(x_test), 3))

    for train_index, val_index in kf.split(x):
        x_train, x_val = x[train_index], x[val_index]
        y_train, y_val = y[train_index], y[val_index]

        model = LogisticRegression()
        model.fit(x_train, y_train)

        y_pred = model.predict_proba(x_val)
        predict_prob_features[val_index] = y_pred
        predict_prob_features_test += model.predict_proba(x_test)
        loss += log_loss(y_pred=y_pred, y_true=y_val)

    # Average over the actual number of folds instead of a hard-coded 5.
    print(loss / num_split)
    return predict_prob_features, predict_prob_features_test

# Unsupervised FastText

In [6]:
# Presumably the fastText command that produced ./fastText/model.vec:
#  ./fasttext skipgram -input ../data/fasttext-inputs.txt -output model -minCount 1  -neg 15 -ws 10 -epoch 7
vec = word2vec.KeyedVectors.load_word2vec_format('./fastText/model.vec')
x, x_test = create_vector(vec)
predict_prob_features, predict_prob_features_test = logistic(x, x_test)

# One column per author: out-of-fold probabilities for train, fold average for test.
for author, class_id in author2class.items():
    column = '{}_fasttext_logi'.format(author)
    df[column] = predict_prob_features[:, class_id]
    df_test[column] = predict_prob_features_test[:, class_id] / num_split

0.521292042282


In [7]:
# Reported CV log loss for this embedding file: 0.527408070746
vec = word2vec.KeyedVectors.load_word2vec_format('./fastText/skip20_min2_neg15_epoch_7_ws_20.vec')
x, x_test = create_vector(vec)
predict_prob_features, predict_prob_features_test = logistic(x, x_test, seed=7)

# One column per author: out-of-fold probabilities for train, fold average for test.
for author, class_id in author2class.items():
    column = '{}_fasttext_low_dim_logi'.format(author)
    df[column] = predict_prob_features[:, class_id]
    df_test[column] = predict_prob_features_test[:, class_id] / num_split

0.527408070746


In [8]:
# Same pipeline as the skip-gram cells, using the CBOW embedding file.
vec = word2vec.KeyedVectors.load_word2vec_format('./fastText/cbow100_min1_neg15_epoch_7_ws_10.vec')
x, x_test = create_vector(vec)
predict_prob_features, predict_prob_features_test = logistic(x, x_test, seed=9)

# One column per author: out-of-fold probabilities for train, fold average for test.
for author, class_id in author2class.items():
    column = '{}_fasttext_cbow_logi'.format(author)
    df[column] = predict_prob_features[:, class_id]
    df_test[column] = predict_prob_features_test[:, class_id] / num_split

0.802494276952


In [9]:
def vectorizer_feature(vectorizer, seed=8, num_split=5, alphas=[1.]):
    """Create out-of-fold Multinomial Naive Bayes probabilities on vectorised text.

    For each fold of a shuffled ``KFold`` over the module-level ``text`` array,
    ``vectorizer`` is re-fitted on the training sentences only. A
    ``MultinomialNB`` is then trained on the matching entries of the
    module-level ``y`` and used to predict the held-out sentences and
    ``text_test``.

    Parameters
    ----------
    vectorizer : object
        Must provide ``fit_transform`` and ``transform`` (the cells below pass
        ``TfidfVectorizer`` / ``CountVectorizer`` instances).
    seed : int
        ``random_state`` of the ``KFold``.
    num_split : int
        Number of folds.
    alphas : list of float
        Candidate smoothing values. With more than one value the best is chosen
        per fold by a 10-fold ``GridSearchCV`` on ``neg_log_loss``; a single
        value is used directly; an empty list falls back to ``alpha = 1.0``.
        The default list is only read, never mutated.

    Returns
    -------
    (predict_prob_features, predict_prob_features_test) : tuple of np.ndarray
        Out-of-fold train probabilities, and the SUM of the test probabilities
        over all folds (callers divide by the number of folds).

    Notes
    -----
    Prints the parameter grid, the vectorizer, one progress line per fold and
    the mean validation log loss.
    """
    param_grid = dict(alpha=alphas)
    print(param_grid, vectorizer)

    kf = KFold(n_splits=num_split, random_state=seed, shuffle=True)
    sum_loss = 0.

    predict_prob_features = np.zeros((len(text), 3))
    predict_prob_features_test = np.zeros((len(text_test), 3))

    for ite, (train_index, val_index) in enumerate(kf.split(text), start=1):
        print('{}/{}: #Trains: {}, #Val: {}'.format(ite, num_split, len(train_index), len(val_index)), end=' ')
        text_train, text_val = text[train_index], text[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # The vocabulary / idf statistics come from the training part only.
        x_train = vectorizer.fit_transform(text_train)
        x_val = vectorizer.transform(text_val)

        if len(alphas) > 1:
            clf = GridSearchCV(estimator=MultinomialNB(), param_grid=param_grid,
                               cv=10, scoring='neg_log_loss', n_jobs=-1)
            clf.fit(x_train, y_train)
            model = clf.best_estimator_
        else:
            alpha = alphas[0] if len(alphas) == 1 else 1.
            # Pass alpha by keyword: estimator constructor arguments are
            # keyword-only in current scikit-learn releases.
            model = MultinomialNB(alpha=alpha)
            model.fit(x_train, y_train)

        y_pred = model.predict_proba(x_val)

        # save features
        predict_prob_features[val_index] = y_pred
        predict_prob_features_test += model.predict_proba(vectorizer.transform(text_test))

        best_param = model.alpha
        loss = log_loss(y_pred=y_pred, y_true=y_val)
        sum_loss += loss

        print('valLoss: {}, best_param α= {}'.format(loss, best_param))

    print(sum_loss/num_split)
    return predict_prob_features, predict_prob_features_test


# Naive Bayes

In [10]:
# Word 1-3 gram TF-IDF + Naive Bayes; alpha picked from [0.005, 0.007, 0.01].
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
predict_prob_features, predict_prob_features_test = vectorizer_feature(vectorizer, seed=7, alphas=[0.007])

for author, class_id in author2class.items():
    column = '{}_word_tfidf_NB'.format(author)
    df[column] = predict_prob_features[:, class_id]
    df_test[column] = predict_prob_features_test[:, class_id] / num_split

{'alpha': [0.007]} TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
1/5: #Trains: 15663, #Val: 3916 valLoss: 0.4191531979947033, best_param α= 0.007
2/5: #Trains: 15663, #Val: 3916 valLoss: 0.3979100889218946, best_param α= 0.007
3/5: #Trains: 15663, #Val: 3916 valLoss: 0.403906857121704, best_param α= 0.007
4/5: #Trains: 15663, #Val: 3916 valLoss: 0.4146626933703398, best_param α= 0.007
5/5: #Trains: 15664, #Val: 3915 valLoss: 0.4054966330640256, best_param α= 0.007
0.408225894095


In [11]:
# Character 1-5 gram TF-IDF + Naive Bayes; alpha candidates noted: 0.012, 0.013, 0.014.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 5))
predict_prob_features, predict_prob_features_test = vectorizer_feature(vectorizer, seed=8, alphas=[0.013])

for author, class_id in author2class.items():
    column = '{}_char_tfidf_NB'.format(author)
    df[column] = predict_prob_features[:, class_id]
    df_test[column] = predict_prob_features_test[:, class_id] / num_split

{'alpha': [0.013]} TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 5), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
1/5: #Trains: 15663, #Val: 3916 valLoss: 0.3951854930297089, best_param α= 0.013
2/5: #Trains: 15663, #Val: 3916 valLoss: 0.37123938784736266, best_param α= 0.013
3/5: #Trains: 15663, #Val: 3916 valLoss: 0.4002779893173318, best_param α= 0.013
4/5: #Trains: 15663, #Val: 3916 valLoss: 0.3916746370981128, best_param α= 0.013
5/5: #Trains: 15664, #Val: 3915 valLoss: 0.3654883416610354, best_param α= 0.013
0.384773169791


In [12]:
# Word 1-3 gram raw counts + Naive Bayes with a fixed alpha.
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 3))
predict_prob_features, predict_prob_features_test = vectorizer_feature(vectorizer, seed=9, alphas=[1.2])

for author, class_id in author2class.items():
    column = '{}_word_count_NB'.format(author)
    df[column] = predict_prob_features[:, class_id]
    df_test[column] = predict_prob_features_test[:, class_id] / num_split

{'alpha': [1.2]} CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)
1/5: #Trains: 15663, #Val: 3916 valLoss: 0.7884866155256142, best_param α= 1.2
2/5: #Trains: 15663, #Val: 3916 valLoss: 0.8623959123023731, best_param α= 1.2
3/5: #Trains: 15663, #Val: 3916 valLoss: 0.8252652372000494, best_param α= 1.2
4/5: #Trains: 15663, #Val: 3916 valLoss: 0.776055308824219, best_param α= 1.2
5/5: #Trains: 15664, #Val: 3915 valLoss: 0.8172614666911712, best_param α= 1.2
0.813892908109


In [13]:
# Character 1-4 gram raw counts + Naive Bayes; alpha is grid-searched inside every fold.
vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 4))
predict_prob_features, predict_prob_features_test = vectorizer_feature(
    vectorizer,
    seed=7,
    alphas=[0.15, 0.2, 0.3, 0.4, 0.5],
)

for author, class_id in author2class.items():
    column = '{}_char_count_NB'.format(author)
    df[column] = predict_prob_features[:, class_id]
    df_test[column] = predict_prob_features_test[:, class_id] / num_split

{'alpha': [0.15, 0.2, 0.3, 0.4, 0.5]} CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)
1/5: #Trains: 15663, #Val: 3916 valLoss: 2.5132982409813023, best_param α= 0.2
2/5: #Trains: 15663, #Val: 3916 valLoss: 2.4407679975214327, best_param α= 0.4
3/5: #Trains: 15663, #Val: 3916 valLoss: 2.504654489912232, best_param α= 0.3
4/5: #Trains: 15663, #Val: 3916 valLoss: 2.837462002734221, best_param α= 0.2
5/5: #Trains: 15664, #Val: 3915 valLoss: 2.561142701900116, best_param α= 0.2
2.57146508661


In [14]:
# Word-boundary character 1-5 gram counts + Naive Bayes; alpha is grid-searched per fold.
# Alpha values noted alongside this cell: 0.5, 0.1, 1., 1.5, 2., 2.5, 3.
vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1, 5))
predict_prob_features, predict_prob_features_test = vectorizer_feature(
    vectorizer,
    seed=10,
    alphas=[1.5, 2., 2.5],
)

for author, class_id in author2class.items():
    column = '{}_char_wb_count_NB'.format(author)
    df[column] = predict_prob_features[:, class_id]
    df_test[column] = predict_prob_features_test[:, class_id] / num_split


{'alpha': [1.5, 2.0, 2.5]} CountVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 5), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)
1/5: #Trains: 15663, #Val: 3916 valLoss: 2.911286873284481, best_param α= 2.5
2/5: #Trains: 15663, #Val: 3916 valLoss: 2.693418535058206, best_param α= 1.5
3/5: #Trains: 15663, #Val: 3916 valLoss: 2.600940347993735, best_param α= 2.0
4/5: #Trains: 15663, #Val: 3916 valLoss: 3.018726215529485, best_param α= 2.0
5/5: #Trains: 15664, #Val: 3915 valLoss: 2.699994701500262, best_param α= 2.0
2.78487333467


# Meta features

In [15]:
# ASCII letters (both cases) plus common punctuation marks and the space character.
normal_latters = set(string.ascii_letters + ',.:;"\'?! ')

In [16]:
def _add_meta_features(frame, special):
    """Add length / diversity statistics of ``frame.text`` as new columns, in place.

    Columns written, in this order: ``num_words``, ``num_chars``,
    ``average_num_chars``, ``num_uniq_words``, ``num_uniq_chars``,
    ``rate_uniq_words``, ``rate_uniq_chars`` and one ``num_<c>`` count column
    for every character ``c`` in ``special``.

    Words are whitespace-separated tokens (``str.split()``). For a text with no
    tokens (or no characters) the mean and ratio features are set to 0.0
    instead of producing NaN or raising ``ZeroDivisionError``.
    """
    texts = list(frame.text)
    words = [t.split() for t in texts]

    frame['num_words'] = np.array([len(ws) for ws in words])
    frame['num_chars'] = np.array([len(t) for t in texts])
    frame['average_num_chars'] = np.array(
        [np.mean([len(w) for w in ws]) if ws else 0. for ws in words])
    frame['num_uniq_words'] = np.array([len(set(ws)) for ws in words])
    frame['num_uniq_chars'] = np.array([len(set(t)) for t in texts])
    frame['rate_uniq_words'] = np.array(
        [len(set(ws)) / len(ws) if ws else 0. for ws in words])
    frame['rate_uniq_chars'] = np.array(
        [len(set(t)) / len(t) if t else 0. for t in texts])

    for c in special:
        frame['num_' + c] = np.array([t.count(c) for t in texts])


special = ',!' # ',.:;"\!'?!'

# Identical feature set for the train and the test table.
for frame in (df, df_test):
    _add_meta_features(frame, special)


# Supervised FastText-style classifier (Keras)

In [17]:
# One-hot encode the author labels for the Keras model.
# NOTE: this rebinds the global `y` that logistic() and vectorizer_feature()
# index as integer class ids in the cells above.
y = to_categorical(np.array([author2class[name] for name in df.author]))

In [18]:
# Output dimensionality of the Embedding layer in the Keras fastText-style model.
embedding_dims = 20

In [19]:
def create_docs(df, n_gram_max=1):
    """Turn every sentence of ``df.text`` into a space-separated token string.

    Each sentence is passed through ``preprocess`` and split on whitespace.
    Word n-grams for n = 2 .. ``n_gram_max`` are appended as ``--``-joined
    tokens. Every non-ASCII "special" character found in the raw sentence is
    then appended once per occurrence, surrounded by spaces.

    Parameters
    ----------
    df : object with a ``text`` attribute
        Iterable of raw sentences (the cells below pass the train / test frames).
    n_gram_max : int
        Largest n-gram order to add; 1 adds no n-grams.

    Returns
    -------
    list of str
        One document string per sentence, in input order.
    """
    special_latters = (set('à ñ Ν ê Υ Å Æ ö δ è Π α é ä ë æ ô ç ü ἶ Ο â î ï Σ'.split())
                       - set(string.ascii_uppercase)
                       - set(string.ascii_lowercase)
                       - set(',.:;"\'?'))

    # Defined once instead of being re-created for every sentence.
    def add_ngram(q, n_gram_max):
        ngrams = []
        for n in range(2, n_gram_max + 1):
            for w_index in range(len(q) - n + 1):
                ngrams.append('--'.join(q[w_index:w_index + n]))
        return q + ngrams

    docs = []
    for text in df.text:
        doc = preprocess(text).split()

        # sorted(): iterating a set of str directly yields an order that can
        # differ between interpreter runs, which made the appended token order
        # (and everything derived from it) irreproducible.
        special_chars = ''.join(' {} '.format(c) * text.count(c)
                                for c in sorted(special_latters & set(text)))

        docs.append(' '.join(add_ngram(doc, n_gram_max)) + special_chars)

    return docs

In [21]:
def create_model(input_dim, embeddings_dims=20):
    """Build and compile the fastText-style classifier.

    Architecture: ``Embedding`` -> ``GlobalAveragePooling1D`` -> 3-way softmax
    ``Dense`` layer, compiled with categorical cross-entropy, the Adam
    optimizer and the accuracy metric.

    Parameters
    ----------
    input_dim : int
        Size of the embedding table (largest token index + 1).
    embeddings_dims : int
        Width of the embedding vectors. This argument is now honoured;
        previously it was ignored and the differently spelled global
        ``embedding_dims`` was read instead.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embeddings_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [22]:
raw_docs = create_docs(df, n_gram_max=1)
raw_docs_test = create_docs(df_test, n_gram_max=1)

seed = 7
num_split = 5
epochs = 45
min_count = 1   # minimum corpus count for a token to be included in `num_words`
maxlen = 256    # fixed padded length (alternative considered: longest train/val document)
# Best-epoch weights of the current fold; the file is overwritten by every fold.
# NOTE(review): the ./fasttext_weights/ directory presumably has to exist already.
weights_path = './fasttext_weights/weights.hdf5'

# for next training: out-of-fold train probabilities and fold-summed test probabilities
predict_prob_features = np.zeros((len(df), 3))
predict_prob_features_test = np.zeros((len(df_test), 3))

ite = 0
sum_loss = 0.

kf = KFold(n_splits=num_split, random_state=seed, shuffle=True)
for train_index, val_index in kf.split(text):
    ite += 1
    print('{}/{}: #Trains: {}, #Val: {}'.format(ite, num_split, len(train_index), len(val_index)), end=' ')

    docs_train = [raw_docs[i] for i in train_index]
    docs_val = [raw_docs[i] for i in val_index]

    # get vocab
    # NOTE: the vocabulary is built from the train AND validation documents of
    # the fold (not from the training documents alone).
    tokenizer = Tokenizer(filters='', lower=False)
    tokenizer.fit_on_texts(docs_train + docs_val)

    num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

    # Second tokenizer with the vocabulary size limit derived above.
    tokenizer = Tokenizer(num_words=num_words, filters='', lower=False)
    tokenizer.fit_on_texts(docs_train + docs_val)

    docs_train = tokenizer.texts_to_sequences(docs_train)
    docs_val = tokenizer.texts_to_sequences(docs_val)
    docs_test = tokenizer.texts_to_sequences(raw_docs_test)

    x_train = pad_sequences(sequences=docs_train, maxlen=maxlen)
    x_val = pad_sequences(sequences=docs_val, maxlen=maxlen)
    x_test = pad_sequences(sequences=docs_test, maxlen=maxlen)

    y_train, y_val = y[train_index], y[val_index]

    # Embedding table size: largest token index seen in train/val, plus one.
    input_dim = max(np.max(x_train), np.max(x_val)) + 1
    print('#vocab: {} '.format(num_words), end=' ')
    print(x_train.shape, x_val.shape, x_test.shape)

    # Pass the configured embedding width explicitly instead of relying on
    # create_model() picking it up from a global.
    model = create_model(input_dim, embeddings_dims=embedding_dims)

    checkpointer = ModelCheckpoint(filepath=weights_path, verbose=0, save_best_only=True)

    hist = model.fit(x_train, y_train,
                     batch_size=16,
                     validation_data=(x_val, y_val),
                     epochs=epochs,
                     callbacks=[EarlyStopping(patience=4, monitor='val_loss'),
                                checkpointer])

    # load best weights
    model.load_weights(weights_path)

    # predict() already returns the softmax class probabilities;
    # Sequential.predict_proba() no longer exists in current Keras releases.
    y_pred = model.predict(x_val, verbose=0)
    # y_val is one-hot, so argmax recovers the integer class ids for log_loss.
    sum_loss += log_loss(y_pred=y_pred, y_true=y_val.argmax(axis=1))

    # save features
    predict_prob_features[val_index] = y_pred
    predict_prob_features_test += model.predict(x_test, verbose=0)

    # Running mean of the validation log loss over the folds finished so far.
    print('valLoss: {}'.format(sum_loss/ite))


1/5: #Trains: 15663, #Val: 3916 #vocab: 28285  (15663, 256) (3916, 256) (8392, 256)
Train on 15663 samples, validate on 3916 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
2/5: #Trains: 15663, #Val: 3916 #vocab: 28285  (15663, 256) (3916, 256) (8392, 256)
Train on 15663 samples, validate on 3916 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
3/5: #Trains: 15663, #Val: 3916 #vocab: 28285  (15663, 256) (3916, 256) (8392, 256)
Train on 15663 sampl

Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
4/5: #Trains: 15663, #Val: 3916 #vocab: 28285  (15663, 256) (3916, 256) (8392, 256)
Train on 15663 samples, validate on 3916 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
5/5: #Trains: 15664, #Val: 3915 #vocab: 28285  (15664, 256) (3915, 256) (8392, 256)
Train on 15664 samples, validate on 3915 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45


Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
valLoss: 0.3726591018155969


In [23]:
# Store the neural model's out-of-fold (train) and fold-averaged (test) probabilities.
for author, class_id in author2class.items():
    column = '{}_fasttext_ngram1'.format(author)
    df[column] = predict_prob_features[:, class_id]
    df_test[column] = predict_prob_features_test[:, class_id] / num_split

In [24]:
# Write the train / test tables, including every feature column added above, to disk.
for frame, path in ((df, './data/train_feature.csv'),
                    (df_test, './data/test_feature.csv')):
    frame.to_csv(path)