In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import nltk
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK resources used further down: the 'punkt' tokenizer models
# and the stopword lists.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Load the training and test data with pandas.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# First-pass frequency table: split each text on whitespace, stack every token
# into a single Series and count occurrences.
# Note: `all_words` is rebound later in the notebook to the cleaned token Series.
all_words = (
    train['text']
    .str.split(expand=True)
    .unstack()
    .value_counts()
)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\matth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\matth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# The plain split leaves wonky punctuation in the tokens, so try again:
# lowercase each text and run it through NLTK's word tokenizer, storing the
# resulting token list per row in a new 'words' column.
train['words'] = [nltk.word_tokenize(str.lower(t)) for t in train['text']]
train['words']


0        [this, process, ,, however, ,, afforded, me, n...
1        [it, never, once, occurred, to, me, that, the,...
2        [in, his, left, hand, was, a, gold, snuff, box...
3        [how, lovely, is, spring, as, we, looked, from...
4        [finding, nothing, else, ,, not, even, gold, ,...
5        [a, youth, passed, in, solitude, ,, my, best, ...
6        [the, astronomer, ,, perhaps, ,, at, this, poi...
7        [the, surcingle, hung, in, ribands, from, my, ...
8        [i, knew, that, you, could, not, say, to, your...
9        [i, confess, that, neither, the, structure, of...
10       [he, shall, find, that, i, can, feel, my, inju...
11       [here, we, barricaded, ourselves, ,, and, ,, f...
12       [herbert, west, needed, fresh, bodies, because...
13       [the, farm, like, grounds, extended, back, ver...
14       [but, a, glance, will, show, the, fallacy, of,...
15       [he, had, escaped, me, ,, and, i, must, commen...
16       [to, these, speeches, they, gave, ,, of, cours.

In [3]:

# Tokens to discard: NLTK's English stopwords plus the punctuation tokens that
# word_tokenize emits as separate "words".
punctuation = [',','.',';','?',':','``',"''","'"]
stopwords = nltk.corpus.stopwords.words('english') + punctuation

# Membership tests against a list scan the whole list for every token; a
# frozenset gives constant-time lookups and produces exactly the same result.
# `stopwords` itself stays a list so any later use of it is unaffected.
stopword_set = frozenset(stopwords)

# Remove stopwords, punctuation and empty tokens from every row's token list.
train['words'] = train['words'].apply(
    lambda tokens: [word for word in tokens if word and word not in stopword_set]
)


In [4]:
# Flatten the per-row token lists into one long list, wrap it in a Series and
# tally how often each word occurs.
slist = [word for tokens in train['words'] for word in tokens]

all_words = pd.Series(slist)
counts = all_words.value_counts()
# NOTE(review): an earlier note here said the most common word was still a blank
# string. The recorded output for this cell does not show one at the top, so
# that note looks stale -- confirm on a fresh run.
counts

# TODO: train for bag ("bucket") of words

# TODO: train for categorization

one              1623
upon             1411
's               1355
could            1330
would            1258
man               777
time              730
yet               715
said              704
even              700
might             629
old               616
like              613
first             602
must              597
us                596
never             570
life              569
night             566
made              565
found             558
seemed            544
eyes              540
every             535
little            531
day               523
still             519
great             511
long              510
saw               502
                 ... 
overcrowded         1
parmly              1
vagary              1
alp                 1
tackle              1
meas                1
scuffling           1
blueness            1
supporter           1
inappreciable       1
haousekeeper        1
quivers             1
'all                1
waiters             1
seedy     

In [5]:
# Keep the row counts of both frames around for splitting later.
ntrain, ntest = len(train), len(test)

# Slice the training data into one sub-frame per author label.
train_EAP = train[train['author'] == 'EAP']
train_MWS = train[train['author'] == 'MWS']
train_HPL = train[train['author'] == 'HPL']



In [6]:
# One-hot encode the author label: one indicator column per distinct author.
author_one_hot = pd.get_dummies(data=train['author'])


In [7]:
# Build the document-term count matrix from the bucket of all words.
#
# The vocabulary is made of NLTK tokens (after stopword removal), so the
# vectorizer has to count those same tokens. Letting CountVectorizer re-tokenize
# the raw text with its default regex tokenizer never produces tokens such as
# "'s", which leaves those vocabulary columns permanently at zero. Feeding the
# already-tokenized 'words' column through a pass-through analyzer keeps the
# vocabulary and the counted tokens consistent.
unique_words = all_words.unique()


def _pretokenized(tokens):
    """Analyzer for CountVectorizer: the document is already a list of tokens."""
    return tokens


cv = CountVectorizer(vocabulary=unique_words, analyzer=_pretokenized)
output = cv.fit_transform(train['words'])
out_arr = output

# Sanity check: print the non-zero entries of one sample row.
print(out_arr[4])

  (0, 28)	1
  (0, 64)	1
  (0, 65)	1
  (0, 66)	1
  (0, 67)	1
  (0, 68)	1
  (0, 69)	1
  (0, 70)	1
  (0, 71)	1
  (0, 72)	1
  (0, 73)	1
  (0, 74)	1
  (0, 75)	1
  (0, 76)	1
  (0, 77)	1
  (0, 78)	1


### Ok, at this point, I have a matrix of counts of each individual word for each row in train['text']; it is stored in the variable `output`.

From here, I can do regular old regression against the various authors (one at a time, I presume)

In [10]:
# Expand the sparse count matrix into a dense DataFrame, one column per
# vocabulary index.
# NOTE(review): densifying a matrix with ~25k columns is memory hungry --
# confirm this fits comfortably before scaling up.
train_matrix = pd.DataFrame(data=output.toarray())
# Attaching the row ids is left disabled for now:
#train_matrix['id'] = train['id']


# The training matrix now holds every word feature, but the author has not been
# encoded as a target yet.
train_matrix.head(5)

# Plan: build three separate models for guessing EAP, MWS and HPL, with three
# separate matrices set up to do so.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25216,25217,25218,25219,25220,25221,25222,25223,25224,25225
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:

    
# Encode the author label as an integer class id for the model target.
# An explicit mapping (instead of a nested if/else whose final branch catches
# everything) means a missing or misspelled label can no longer be silently
# counted as HPL.
author_codes = {'EAP': 0, 'MWS': 1, 'HPL': 2}

y_values = pd.DataFrame()
y_values['author'] = train['author'].map(author_codes)

# Labels outside the mapping come back as NaN; fail loudly rather than train on
# wrong targets.
unmapped = y_values['author'].isna()
assert not unmapped.any(), "unexpected author labels: {}".format(
    train.loc[unmapped, 'author'].unique()
)

# Eyeball the encoded ids next to the original labels, then preview the features.
print(y_values['author'][0:10])
print(train['author'][0:10])
train_matrix[0:5]

0    0
1    2
2    0
3    1
4    2
5    1
6    0
7    0
8    0
9    1
Name: author, dtype: int64
0    EAP
1    HPL
2    EAP
3    MWS
4    HPL
5    MWS
6    EAP
7    EAP
8    EAP
9    MWS
Name: author, dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25216,25217,25218,25219,25220,25221,25222,25223,25224,25225
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Ok! We have a working input matrix that looks right for words and authors. Next, we use xgboost with the `binary:logistic` objective and `predict_proba` to get probabilities for our non-scalar (categorical) author values.



