In [1]:
import numpy as np
import h5py

import pandas as pd

import sys
sys.path.append('../')
from utils import preprocess

from collections import defaultdict
import string

import sklearn
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

from gensim.models import word2vec

# Seed NumPy's global RNG. The KFold splitter in logistic() is seeded
# separately through its own random_state argument.
np.random.seed(1234)

In [2]:
# Number of CV folds. Used further down to average the test predictions that
# logistic() returns summed over folds, so it must stay equal to the fold
# count hard-coded inside logistic() (also 5).
num_split = 5

In [3]:
# Feature tables for train and test; model-prediction columns are added to
# these frames later in the notebook.
data_path = '../data/'
df = pd.read_csv(data_path + 'train_feature.csv')
df_test = pd.read_csv(data_path + 'test_feature.csv')
text, text_test = df['text'].values, df_test['text'].values

# Author label <-> integer class id (the position in the list is the id).
class2author = ['EAP', 'HPL', 'MWS']
author2class = {author: idx for idx, author in enumerate(class2author)}
y = np.array([author2class[name] for name in df['author']])

In [4]:
def _mean_word_vectors(docs, vec):
    """Return one row per document: the mean of its in-vocabulary word vectors.

    Each document is passed through `preprocess`, lower-cased and split on
    whitespace. Words missing from `vec.vocab` are ignored. A document with
    no known words keeps an all-zero row instead of becoming NaN (0 / 0).
    """
    dim = vec.vector_size
    vocab = vec.vocab
    out = np.zeros((len(docs), dim))
    for i, doc in enumerate(docs):
        # Accumulate in float64 regardless of the dtype of the stored vectors.
        doc_vec = np.zeros(dim)
        num_words = 0
        for w in preprocess(doc).lower().split():
            if w in vocab:
                doc_vec += vec[w]
                num_words += 1
        if num_words:
            out[i] = doc_vec / num_words
    return out


def create_vector(vec):
    """Embed the train and test texts as averaged word vectors.

    Reads the notebook-level `text` and `text_test` arrays.

    Args:
        vec: word-vector lookup exposing `vector_size`, `vocab` and
            `vec[word]` (here a gensim KeyedVectors instance).

    Returns:
        (x, x_test): float arrays of shape (len(text), vec.vector_size) and
        (len(text_test), vec.vector_size). Rows for documents without any
        in-vocabulary word are all zeros.
    """
    return _mean_word_vectors(text, vec), _mean_word_vectors(text_test, vec)

In [5]:
def logistic(x, x_test, seed=7):
    """Fit one logistic regression per CV fold and collect class probabilities.

    `x` is split into 5 shuffled folds (KFold seeded with `seed`). For every
    fold a fresh LogisticRegression is fit on the training rows; its
    probabilities for the held-out rows fill the out-of-fold matrix and its
    probabilities for `x_test` are added to a running sum. Each fold's
    validation log loss is printed, followed by the mean over the folds.

    Relies on the notebook-level `y`, `df` and `df_test`.

    Returns:
        (oof_probs, test_prob_sum): arrays of shape (len(df), 3) and
        (len(df_test), 3). The test array is the SUM over the folds, not the
        mean; the caller is expected to divide by the fold count.
    """
    num_split = 5
    splitter = KFold(n_splits=num_split, shuffle=True, random_state=seed)

    oof_probs = np.zeros((len(df), 3))
    test_prob_sum = np.zeros((len(df_test), 3))
    total_loss = 0.

    for fit_idx, holdout_idx in splitter.split(x):
        clf = LogisticRegression()
        clf.fit(x[fit_idx], y[fit_idx])

        holdout_probs = clf.predict_proba(x[holdout_idx])
        test_prob_sum += clf.predict_proba(x_test)
        oof_probs[holdout_idx] = holdout_probs

        fold_loss = log_loss(y_true=y[holdout_idx], y_pred=holdout_probs)
        print(fold_loss)
        total_loss += fold_loss

    print(total_loss/num_split)
    return oof_probs, test_prob_sum

# Unsupervised FastText

In [6]:
vec = word2vec.KeyedVectors.load_word2vec_format(
    './../fastText/cbow20_min2_neg15_ws20_epoch7.vec'
)
x, x_test = create_vector(vec)
predict_prob_features, predict_prob_features_test = logistic(x, x_test, 15)

# logistic() returns test probabilities summed over the folds; divide by the
# fold count to get the per-row average.
test_prob_mean = predict_prob_features_test / num_split

# One new feature column per author, in both the train and the test frame.
for author, class_idx in author2class.items():
    column = '{}_fasttext_cbow_wide_logi'.format(author)
    df[column] = predict_prob_features[:, class_idx]
    df_test[column] = test_prob_mean[:, class_idx]

0.779193831664
0.800083893574
0.775175414186
0.766897550796
0.763102977046
0.776890733453


In [7]:
# Write the frames, including the new feature columns, back to the feature
# files. index=False keeps the row index out of the CSV: these same files are
# loaded with a plain read_csv at the top of the notebook, so a written index
# would come back as an extra "Unnamed: 0" column on every round trip.
df.to_csv('./../data/train_feature.csv', index=False)
df_test.to_csv('./../data/test_feature.csv', index=False)