In [1]:
import numpy as np
import pandas as pd

import sys
sys.path.append('../')
from utils import preprocess, create_vector, logistic, vectorizer2NB

from collections import defaultdict
import string

import sklearn
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from gensim.models import word2vec

# Fix the seed of NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(1234)

In [2]:
# Divisor applied to every test-set probability column created below.
# NOTE(review): presumably the number of CV folds used inside the utils helpers
# (the logged output reports 5 folds) — confirm against utils.
num_split = 5

In [3]:
# Read the train/test CSVs and expose their raw text columns as NumPy arrays.
data_path = '../data/'
df = pd.read_csv(data_path + 'train.csv')
df_test = pd.read_csv(data_path + 'test.csv')
text = df['text'].values
text_test = df_test['text'].values

# Author initials <-> integer class id; an author's id is its position in class2author.
class2author = ['EAP', 'HPL', 'MWS']
author2class = {author: idx for idx, author in enumerate(class2author)}
# Integer target vector, one class id per training row.
y = np.array([author2class[author] for author in df['author']])

In [4]:
# Pre-trained fastText vector files; each one yields three '<author>_<file>_logi' columns.
fnames = [
    'cbow100_min1_neg15_ws20_epoch7.vec',
    'skip100_min1_neg15_ws20_epoch7.vec',
    'cbow100_min1_neg15_ws5_epoch7.vec',
    'skip100_min1_neg15_ws5_epoch7.vec',
]
for offset, fname in enumerate(fnames):
    embeddings = word2vec.KeyedVectors.load_word2vec_format('./../fastText/' + fname)
    x, x_test = create_vector(text, text_test, embeddings, preprocess_single=True)
    # The 4th argument differs per file (7, 8, 9, 10) — presumably a random seed; confirm in utils.
    predict_prob_features, predict_prob_features_test = logistic(x, y, x_test, 7 + offset)
    print(fname)
    for author, label in author2class.items():
        column = '{}_{}_logi'.format(author, fname)
        df[column] = predict_prob_features[:, label]
        # Test-set values are divided by num_split (assumes the helper returns a sum over folds — TODO confirm).
        df_test[column] = predict_prob_features_test[:, label] / num_split

0.781053627128
cbow100_min1_neg15_ws20_epoch7.vec
0.485307596257
skip100_min1_neg15_ws20_epoch7.vec
0.836930458733
cbow100_min1_neg15_ws5_epoch7.vec
0.590803953107
skip100_min1_neg15_ws5_epoch7.vec


# Naive Bayes

In [5]:
# Word-level TF-IDF over 1- to 3-grams, fed to the Naive Bayes helper.
# The token pattern matches runs of one or more word characters, so one-letter tokens are kept.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    token_pattern='(?u)\\b\\w+\\b',
)
predict_prob_features, predict_prob_features_test = vectorizer2NB(
    vectorizer, text, y, text_test, 7,
    alphas=[0.008],  # from 0.007, 0.008, 0.009
)
for author, label in author2class.items():
    column = '{}_word_tfidf_NB'.format(author)
    df[column] = predict_prob_features[:, label]
    # Test-set values are divided by num_split (assumes the helper returns a sum over folds — TODO confirm).
    df_test[column] = predict_prob_features_test[:, label] / num_split

{'alpha': [0.008]} TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
1/5: #Trains: 15663, #Val: 3916 valLoss: 0.4015830474795028, best_param α= 0.008
2/5: #Trains: 15663, #Val: 3916 valLoss: 0.3821852810736139, best_param α= 0.008
3/5: #Trains: 15663, #Val: 3916 valLoss: 0.3911772365943706, best_param α= 0.008
4/5: #Trains: 15663, #Val: 3916 valLoss: 0.40242707162070324, best_param α= 0.008
5/5: #Trains: 15664, #Val: 3915 valLoss: 0.39431323529028456, best_param α= 0.008
0.394337174412


In [6]:
# Character-level TF-IDF over 1- to 5-grams, fed to the Naive Bayes helper.
# token_pattern is deliberately not passed: scikit-learn only applies it when
# analyzer == 'word', so with analyzer='char' it had no effect on the features
# (and recent scikit-learn versions warn about the unused parameter).
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 5))
predict_prob_features, predict_prob_features_test = vectorizer2NB(
    vectorizer, text, y, text_test, 8,
    alphas=[0.013, 0.014],  # searched over 0.012, 0.013, 0.014
)
for author, label in author2class.items():
    column = '{}_char_tfidf_NB'.format(author)
    df[column] = predict_prob_features[:, label]
    # Test-set values are divided by num_split (assumes the helper returns a sum over folds — TODO confirm).
    df_test[column] = predict_prob_features_test[:, label] / num_split

{'alpha': [0.013, 0.014]} TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 5), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
1/5: #Trains: 15663, #Val: 3916 valLoss: 0.3951854930297089, best_param α= 0.013
2/5: #Trains: 15663, #Val: 3916 valLoss: 0.37123938784736266, best_param α= 0.013
3/5: #Trains: 15663, #Val: 3916 valLoss: 0.40039094882541354, best_param α= 0.014
4/5: #Trains: 15663, #Val: 3916 valLoss: 0.3916746370981128, best_param α= 0.013
5/5: #Trains: 15664, #Val: 3915 valLoss: 0.36536618930442233, best_param α= 0.014
0.384771331221


In [7]:
# Raw word counts over 1- to 3-grams, fed to the Naive Bayes helper.
# The token pattern matches runs of one or more word characters, so one-letter tokens are kept.
vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    token_pattern='(?u)\\b\\w+\\b',
)
predict_prob_features, predict_prob_features_test = vectorizer2NB(
    vectorizer, text, y, text_test, 9,
    alphas=[1.1],  # 1.0, 1.1, 1.2, 1.3
)

for author, label in author2class.items():
    column = '{}_word_count_NB'.format(author)
    df[column] = predict_prob_features[:, label]
    # Test-set values are divided by num_split (assumes the helper returns a sum over folds — TODO confirm).
    df_test[column] = predict_prob_features_test[:, label] / num_split

{'alpha': [1.1]} CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w+\\b', tokenizer=None,
        vocabulary=None)
1/5: #Trains: 15663, #Val: 3916 valLoss: 0.8214301928473606, best_param α= 1.1
2/5: #Trains: 15663, #Val: 3916 valLoss: 0.8746266739437928, best_param α= 1.1
3/5: #Trains: 15663, #Val: 3916 valLoss: 0.8440447896828697, best_param α= 1.1
4/5: #Trains: 15663, #Val: 3916 valLoss: 0.8016879710430628, best_param α= 1.1
5/5: #Trains: 15664, #Val: 3915 valLoss: 0.8354528599965615, best_param α= 1.1
0.835448497503


In [8]:
# Raw character counts over 1- to 4-grams, fed to the Naive Bayes helper.
# token_pattern is deliberately not passed: scikit-learn only applies it when
# analyzer == 'word', so with analyzer='char' it had no effect on the features
# (and recent scikit-learn versions warn about the unused parameter).
# NOTE(review): the validation loss logged for this cell (~2.55) is above ln(3) ≈ 1.10,
# the log loss of a uniform three-class guess, so these probabilities look badly
# calibrated — confirm they are still useful as stacking features.
vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 4))
predict_prob_features, predict_prob_features_test = vectorizer2NB(
    vectorizer, text, y, text_test, 10,
    alphas=[0.15, 0.2, 0.3, 0.4, 0.5],
)

for author, label in author2class.items():
    column = '{}_char_count_NB'.format(author)
    df[column] = predict_prob_features[:, label]
    # Test-set values are divided by num_split (assumes the helper returns a sum over folds — TODO confirm).
    df_test[column] = predict_prob_features_test[:, label] / num_split

{'alpha': [0.15, 0.2, 0.3, 0.4, 0.5]} CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w+\\b', tokenizer=None,
        vocabulary=None)
1/5: #Trains: 15663, #Val: 3916 valLoss: 2.6198967115985408, best_param α= 0.5
2/5: #Trains: 15663, #Val: 3916 valLoss: 2.4562692766909535, best_param α= 0.2
3/5: #Trains: 15663, #Val: 3916 valLoss: 2.3908851123573673, best_param α= 0.4
4/5: #Trains: 15663, #Val: 3916 valLoss: 2.8082645875690333, best_param α= 0.2
5/5: #Trains: 15664, #Val: 3915 valLoss: 2.474757660679112, best_param α= 0.2
2.55001466978


In [9]:
# Raw character counts over 1- to 5-grams with the 'char_wb' analyzer,
# fed to the Naive Bayes helper.
# token_pattern is deliberately not passed: scikit-learn only applies it when
# analyzer == 'word', so with analyzer='char_wb' it had no effect on the features
# (and recent scikit-learn versions warn about the unused parameter).
# NOTE(review): the validation loss logged for this cell (~2.79) is above ln(3) ≈ 1.10,
# the log loss of a uniform three-class guess, so these probabilities look badly
# calibrated — confirm they are still useful as stacking features.
vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1, 5))
predict_prob_features, predict_prob_features_test = vectorizer2NB(
    vectorizer, text, y, text_test, 11,
    alphas=[1.5, 2., 2.5],  # 2., 2.5
)
for author, label in author2class.items():
    column = '{}_char_wb_count_NB'.format(author)
    df[column] = predict_prob_features[:, label]
    # Test-set values are divided by num_split (assumes the helper returns a sum over folds — TODO confirm).
    df_test[column] = predict_prob_features_test[:, label] / num_split


{'alpha': [1.5, 2.0, 2.5]} CountVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 5), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w+\\b', tokenizer=None,
        vocabulary=None)
1/5: #Trains: 15663, #Val: 3916 valLoss: 2.8954841178850548, best_param α= 2.0
2/5: #Trains: 15663, #Val: 3916 valLoss: 2.725757021467124, best_param α= 2.5
3/5: #Trains: 15663, #Val: 3916 valLoss: 2.6258308016995384, best_param α= 2.0
4/5: #Trains: 15663, #Val: 3916 valLoss: 2.919685035156428, best_param α= 2.0
5/5: #Trains: 15664, #Val: 3915 valLoss: 2.792691870927538, best_param α= 2.0
2.79188976943


# MetaFeatures

In [10]:
# Characters treated as "normal": ASCII letters, common punctuation and the space.
# (The name's spelling is kept unchanged so existing references keep working.)
normal_latters = set(string.ascii_letters + ',.:;"\'?! ')

In [11]:
# Punctuation marks whose per-text occurrence count becomes a 'num_<mark>' column.
# Only the comma is active; the wider candidate set was ',.:;"\'?!'.
special = ','

# Column names, in the order in which they are added to each frame.
_META_COLUMNS = (
    'num_words',
    'num_chars',
    'average_num_chars',
    'num_uniq_words',
    'num_uniq_chars',
    'rate_uniq_words',
    'rate_uniq_chars',
)


def _meta_features(t):
    """Return the length/vocabulary statistics of one text, ordered like _META_COLUMNS.

    Words are the whitespace-separated tokens produced by ``t.split()``.
    For a text without words (or without characters) the affected mean/ratio
    is 0.0, so an empty or whitespace-only text no longer raises
    ZeroDivisionError or produces NaN.
    """
    words = t.split()  # split once and reuse for every word-based statistic
    uniq_words = set(words)
    uniq_chars = set(t)
    return (
        len(words),
        len(t),
        np.mean([len(word) for word in words]) if words else 0.0,
        len(uniq_words),
        len(uniq_chars),
        len(uniq_words) / len(words) if words else 0.0,
        len(uniq_chars) / len(t) if t else 0.0,
    )


def _add_meta_features(frame):
    """Add the meta-feature and punctuation-count columns to ``frame`` in place.

    ``frame`` must have a ``text`` column of strings.
    """
    texts = list(frame.text)
    rows = [_meta_features(t) for t in texts]
    for pos, name in enumerate(_META_COLUMNS):
        frame[name] = np.array([row[pos] for row in rows])
    for c in special:
        frame['num_' + c] = np.array([t.count(c) for t in texts])


# The train and test frames receive exactly the same set of columns.
for frame in (df, df_test):
    _add_meta_features(frame)


In [12]:
# Persist both feature tables. to_csv is called with its defaults, so the
# DataFrame index is written as the first (unnamed) column of each file.
for frame, out_path in (
    (df, './../data/train_feature.csv'),
    (df_test, './../data/test_feature.csv'),
):
    frame.to_csv(out_path)

# Next: go to the supervised FastText notebook!