In [5]:
import pandas as pd
import numpy as np
#import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPool1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [37]:
import xgboost as xgb

In [15]:
from sklearn.linear_model import LogisticRegression

In [6]:
train = pd.read_csv('train/train.csv')
test = pd.read_csv('test/test.csv')
sample = pd.read_csv('sample_submission/sample_submission.csv')

In [7]:
train.head(2)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL


In [8]:
test.head(2)

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."


In [9]:
sample.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.403494,0.287808,0.308698
1,id24541,0.403494,0.287808,0.308698
2,id00134,0.403494,0.287808,0.308698
3,id27757,0.403494,0.287808,0.308698
4,id04081,0.403494,0.287808,0.308698


In [10]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i,val] = 1
        actual = actual2
    clip = np.clip(predicted, eps, 1-eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0/rows * vsota

In [11]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train.author.values)

In [12]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train.text.values, y,
                                                 stratify=y,
                                                 random_state=42,
                                                 test_size=0.1,
                                                 shuffle=True)

In [13]:
print(xtrain.shape)
print(xvalid.shape)

(17621,)
(1958,)


Basic model

In [14]:
tfv = TfidfVectorizer(min_df=3, max_features=None,
                     strip_accents='unicode', analyzer='word',
                     token_pattern=r'\w{1,}',
                     ngram_range=(1,3),
                     use_idf=1,
                     smooth_idf=1,
                     sublinear_tf=1,
                     stop_words='english')
tfv.fit(list(xtrain)+list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

In [17]:
clf = LogisticRegression(C=1.0, solver='newton-cg')
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)
print("logloss: %0.3f " %multiclass_logloss(yvalid, predictions))

logloss: 0.626 


using count as features insted of tf-idf

In [18]:
ctv = CountVectorizer(analyzer='word', token_pattern=r'w{1,}',
                     ngram_range=(1,3),
                     stop_words='english')
ctv.fit(list(xtrain)+list(xvalid))
xtrain_ctv = ctv.transform(xtrain)
xvalid_ctv = ctv.transform(xvalid)

  'stop_words.' % sorted(inconsistent))


In [23]:
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)
print('logloss: %0.3f ' % multiclass_logloss(yvalid, predictions))



logloss: 1.079 


Naive bayes classifier

In [32]:
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)
print('logloss: %0.3f ' % multiclass_logloss(yvalid, predictions))

logloss: 0.578 


In [33]:
clf = MultinomialNB()
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)
print('logloss: %0.3f' % multiclass_logloss(yvalid, predictions))

logloss: 1.085


reducing number of features with SVD

In [34]:
svd = decomposition.TruncatedSVD(n_components=150)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

#scaling
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

support vector machine classifier

In [35]:
clf = SVC(C=1.0, probability=True)
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)
print('logloss: %0.3f ' % multiclass_logloss(yvalid, predictions))

logloss: 0.720 


XGBoost

In [38]:
clf = xgb.XGBClassifier(max_depth=7, n_estimator=200,
                       colsample_bytree=0.8, 
                       subsample=0.8, 
                       nthread=10,
                       learning_rate=0.1)
clf.fit(xtrain_tfv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_tfv.tocsc())
print('logloss: %0.3f' % multiclass_logloss(yvalid, predictions))

logloss: 0.851


In [40]:
clf = xgb.XGBClassifier(max_depth=7,n_estimators=200,
                       colsample_btree=0.8,
                       subsample=0.8,
                       nthread=10,
                       learning_rate=0.1)
clf.fit(xtrain_ctv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_ctv.tocsc())
print('logloss: %0.3f '% multiclass_logloss(yvalid, predictions))

logloss: 1.081 


In [43]:
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200,
                       colsample_bytree=0.8,
                       subsample=0.8,
                       nthread=10,
                       learning_rate=0.1)
clf.fit(xtrain_svd, ytrain)
predictions = clf.predict_proba(xvalid_svd)
print('loloss: %0.3f ' % multiclass_logloss(yvalid, predictions))

loloss: 0.760 


Grid search

In [48]:
mll_scorer = metrics.make_scorer(multiclass_logloss, 
                                greater_is_better=False,
                                needs_proba=True)

In [44]:
svd = TruncatedSVD()
scl = preprocessing.StandardScaler()
lr_model = LogisticRegression()
clf = pipeline.Pipeline([('svd', svd),
                        ('scl', scl),
                        ('lr', lr_model)])

In [45]:
param_grid = {'svd__n_components': [120,180],
             'lr__C': [0.1,1.0,10],
             'lr__penalty': ['l1', 'l2']}

In [51]:
model = GridSearchCV(estimator=clf, param_grid=param_grid,
                    scoring=mll_scorer,
                    verbose=10,
                    n_jobs=-1,
                    iid=True,
                    refit=True,
                    cv=2)
model.fit(xtrain_tfv,ytrain)

print('Best score: %0.3f' % model.best_score_)
print('Best parameters set:')
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  20 out of  24 | elapsed:  2.4min remaining:   28.5s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.5min finished


Best score: -0.742
Best parameters set:
	lr__C: 10
	lr__penalty: 'l1'
	svd__n_components: 180


In [52]:
nb_model = MultinomialNB()
clf = pipeline.Pipeline([('nb', nb_model)])

param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
model = GridSearchCV(estimator=clf, param_grid=param_grid,
                    scoring=mll_scorer,
                    verbose=10,
                    n_jobs=-1,
                    iid=True,
                    refit=True, cv=2)
model.fit(xtrain_tfv, ytrain)
print('Best score: %0.3f' %model.best_score_)
print('Best parameters set:')
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:    1.6s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:    1.6s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    1.8s finished


Best score: -0.492
Best parameters set:
	nb__alpha: 0.1


Word vectors