In [88]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import nltk

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost.sklearn import XGBRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [2]:
# read in data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
# save shape for splitting later
ntrain = train.shape[0]
ntest = test.shape[0]

In [4]:
# Get subset of data for each author
train_EAP = train[train.author.isin(['EAP'])]
train_MWS = train[train.author.isin(['MWS'])]
train_HPL = train[train.author.isin(['HPL'])]

In [5]:
train.author.value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

Maybe remove punctuation? Is "help, different than help different than "help different than help, ?

This will be done with tokenization, actually.


In [6]:
all_words = train['text'].str.split(expand=True).unstack().value_counts()

In [7]:
all_words_EAP = train_EAP['text'].str.split(expand=True).unstack().value_counts()
all_words_MWS = train_MWS['text'].str.split(expand=True).unstack().value_counts()
all_words_HPL = train_HPL['text'].str.split(expand=True).unstack().value_counts()

In [8]:
all_words_EAP

the               13927
of                 8930
and                5222
to                 4625
a                  4514
in                 3750
I                  3598
was                2109
that               2085
my                 1666
with               1635
it                 1555
is                 1530
at                 1464
as                 1458
which              1377
had                1265
for                1214
not                1184
his                1167
by                 1125
be                 1049
have               1037
this                964
he                  963
The                 954
upon                951
from                938
but                 803
an                  783
                  ...  
jacket                1
monotonously          1
"'We                  1
disorders             1
vegetation,           1
regretted.            1
defying               1
testimony,            1
testimony.            1
"Several              1
silly,          

In [9]:
# All tokens of each author
all_tokens = nltk.word_tokenize(" ".join(train.text.values).decode('utf8'))
all_tokens_EAP = nltk.word_tokenize(" ".join(train_EAP.text.values).decode('utf8'))
all_tokens_MWS = nltk.word_tokenize(" ".join(train_MWS.text.values).decode('utf8'))
all_tokens_HPL = nltk.word_tokenize(" ".join(train_HPL.text.values).decode('utf8'))
print(" ".join(all_tokens_EAP[0:50]))

This process , however , afforded me no means of ascertaining the dimensions of my dungeon ; as I might make its circuit , and return to the point whence I set out , without being aware of the fact ; so perfectly uniform seemed the wall . In his


In [10]:
# All tokens of each author, spearated by rows, appended to the original dataframe
train['tokens'] = [nltk.word_tokenize(i.decode('utf8')) for i in train.text.values]
train_EAP['tokens'] = [nltk.word_tokenize(i.decode('utf8')) for i in train_EAP.text.values]
train_MWS['tokens'] = [nltk.word_tokenize(i.decode('utf8')) for i in train_MWS.text.values]
train_HPL['tokens'] = [nltk.word_tokenize(i.decode('utf8')) for i in train_HPL.text.values]
print(train_EAP.tokens.values[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


[u'This', u'process', u',', u'however', u',', u'afforded', u'me', u'no', u'means', u'of', u'ascertaining', u'the', u'dimensions', u'of', u'my', u'dungeon', u';', u'as', u'I', u'might', u'make', u'its', u'circuit', u',', u'and', u'return', u'to', u'the', u'point', u'whence', u'I', u'set', u'out', u',', u'without', u'being', u'aware', u'of', u'the', u'fact', u';', u'so', u'perfectly', u'uniform', u'seemed', u'the', u'wall', u'.']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [11]:
# Stem/lemm the tokens
stemmer = nltk.stem.PorterStemmer()
lemmer = nltk.stem.WordNetLemmatizer()
train['lemmed'] = [[stemmer.stem(lemmer.lemmatize(j)) for j in i] for i in train.tokens.values ]
train_EAP['lemmed'] = [[stemmer.stem(lemmer.lemmatize(j)) for j in i] for i in train_EAP.tokens.values]
train_MWS['lemmed'] = [[stemmer.stem(lemmer.lemmatize(j)) for i in i] for i in train_MWS.tokens.values]
train_HPL['lemmed'] = [[stemmer.stem(lemmer.lemmatize(j)) for j in i] for i in train_HPL.tokens.values]
print(train_EAP.lemmed.values[0])


                                   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


[u'thi', u'process', u',', u'howev', u',', u'afford', u'me', u'no', u'mean', u'of', u'ascertain', u'the', u'dimens', u'of', u'my', u'dungeon', u';', u'a', u'I', u'might', u'make', u'it', u'circuit', u',', u'and', u'return', u'to', u'the', u'point', u'whenc', u'I', u'set', u'out', u',', u'without', u'be', u'awar', u'of', u'the', u'fact', u';', u'so', u'perfectli', u'uniform', u'seem', u'the', u'wall', u'.']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [12]:
pd.Series([j for i in train_EAP.lemmed.values for j in i]).value_counts()

,             17594
the           14969
of             8938
.              7700
a              5997
and            5733
to             4646
I              3775
in             3773
it             2491
that           2327
wa             2229
with           1695
my             1670
is             1645
``             1628
which          1488
at             1468
''             1359
;              1354
not            1347
for            1343
had            1318
be             1311
thi            1308
hi             1284
have           1242
but            1200
by             1144
upon           1025
              ...  
rôle              1
verbal            1
contr             1
bedizzen          1
unsoci            1
allbreath         1
kickapo           1
widdi             1
wheat             1
sadden            1
bethink           1
engross           1
treacher          1
bra               1
wilkin            1
tibia             1
aurelian          1
milki             1
snow              1


In [13]:
# Get one-hot encoding
author_one_hot = pd.get_dummies(train['author'])
author_one_hot.head()

Unnamed: 0,EAP,HPL,MWS
0,1,0,0
1,0,1,0
2,1,0,0
3,0,0,1
4,0,1,0


In [14]:
# Make 3 train sets from one hot
train_is_EAP = train[['id','text']].join(author_one_hot[['EAP']])
train_is_MWS = train[['id','text']].join(author_one_hot[['MWS']])
train_is_HPL = train[['id','text']].join(author_one_hot[['HPL']])
train_is_EAP.head()


Unnamed: 0,id,text,EAP
0,id26305,"This process, however, afforded me no means of...",1
1,id17569,It never once occurred to me that the fumbling...,0
2,id11008,"In his left hand was a gold snuff box, from wh...",1
3,id27763,How lovely is spring As we looked from Windsor...,0
4,id12958,"Finding nothing else, not even gold, the Super...",0


In [44]:
# -------Random attempt-------------

# Split training data so we have sove test data to validate our model
# Use test size of 30%
X_train_EAP, X_test_EAP, y_train_EAP, y_test_EAP = train_test_split(train_is_EAP.text, 
                                                                    train_is_EAP.EAP,
                                                                    test_size=.3,
                                                                    random_state=0)
# Set up a pipeline to tokenize and then estimate
pipe1 = Pipeline([
    ('cv', CountVectorizer()),
    ('clf', SGDClassifier()),
])

# Set up parameters to test in our grid search
parameters = {
    'cv__stop_words': [None,'english'],
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__loss':['log']
}


In [45]:
# Test all combinations of paramters for our estimator and vectorizer
gs = GridSearchCV(pipe1, parameters, n_jobs=-1, verbose=1)
gs = gs.fit(X_train_EAP, y_train_EAP)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   14.2s finished


In [46]:
gs.best_params_     # The best combination of parameters for our pipeline

{'clf__alpha': 1e-05,
 'clf__loss': 'log',
 'clf__penalty': 'l2',
 'cv__stop_words': None}

In [94]:
# Get a prediction from the test data we split off earlier -- We have the correct labels for this in y_test_EAP
y_pred_EAP = gs.predict_proba(X_test_EAP)

# Reformat results
y_pred_EAP = np.array(y_pred_EAP)
y_pred_EAP = pd.Series(index=y_test_EAP.index,
                       data=y_pred_EAP[:,1]
                      ).round()

print(y_pred_EAP.head(10))
print
print(y_test_EAP.head(10))

16527    0.0
6398     0.0
10604    0.0
17864    0.0
19368    0.0
15369    1.0
3949     0.0
14934    0.0
5469     1.0
6409     1.0
dtype: float64

16527    0
6398     0
10604    0
17864    0
19368    0
15369    0
3949     1
14934    0
5469     1
6409     1
Name: EAP, dtype: uint8


In [92]:
# ----- Results -----
y_test_EAP.eq(y_pred_EAP).describe()

count     5874
unique       2
top       True
freq      4732
dtype: object

In [93]:
4732.0/5874.0

0.8055839291794348

In [95]:
# Log_loss is the specified metric from the competition
log_loss(y_test_EAP,y_pred_EAP)

6.7149907991735525

I don't know why log loss is so high. I may be inputing the results wrong. Or maybe it's because I haven't done the other authors yet. Also, note the `.round()` when formatting `y_pred_EAP`. The actual results had floats that were effectively 0 (e-14,15,16 etc) or very nearly 1 (e.g 9.99807e-1). In rounding, there is inherently some information loss so maybe we can find a math function to map that value to some sort of confidence in the result.... 