In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import nltk
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models and the 'stopwords' corpus.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)

# Read the training and test sets with pandas (paths are relative to the
# notebook's working directory).
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# First pass at word frequencies: split each text on whitespace into a wide
# frame, stack the columns into one Series, and count every distinct token.
all_words = (
    train['text']
    .str.split(expand=True)
    .unstack()
    .value_counts()
)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matthewgalligan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matthewgalligan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Plain whitespace splitting leaves punctuation glued to the words, so
# tokenise again with NLTK on the lower-cased text instead.

train['words'] = [nltk.word_tokenize(str.lower(text)) for text in train['text']]
train['words']


0        [this, process, ,, however, ,, afforded, me, n...
1        [it, never, once, occurred, to, me, that, the,...
2        [in, his, left, hand, was, a, gold, snuff, box...
3        [how, lovely, is, spring, as, we, looked, from...
4        [finding, nothing, else, ,, not, even, gold, ,...
5        [a, youth, passed, in, solitude, ,, my, best, ...
6        [the, astronomer, ,, perhaps, ,, at, this, poi...
7        [the, surcingle, hung, in, ribands, from, my, ...
8        [i, knew, that, you, could, not, say, to, your...
9        [i, confess, that, neither, the, structure, of...
10       [he, shall, find, that, i, can, feel, my, inju...
11       [here, we, barricaded, ourselves, ,, and, ,, f...
12       [herbert, west, needed, fresh, bodies, because...
13       [the, farm, like, grounds, extended, back, ver...
14       [but, a, glance, will, show, the, fallacy, of,...
15       [he, had, escaped, me, ,, and, i, must, commen...
16       [to, these, speeches, they, gave, ,, of, cours.

In [4]:

# Build the exclusion list: NLTK's English stop words plus the punctuation
# tokens that show up as stand-alone entries in the tokenised text.
stopwords = nltk.corpus.stopwords.words('english')
punctuation = [',','.',';','?',':','``',"''","'"]
stopwords = stopwords + punctuation

# Membership tests against a list scan the whole list for every token.
# A set gives the same answers in constant time. `stopwords` itself stays a
# list so anything else that uses it keeps working unchanged.
stopword_set = set(stopwords)

# Drop stop words, punctuation tokens and empty strings from every row.
train['words'] = train['words'].apply(
    lambda tokens: [
        word for word in tokens
        if word not in stopword_set and len(word) > 0
    ]
)


In [5]:
# Flatten the per-row token lists into one long list, wrap it in a Series,
# and tally how often each token occurs.
slist = [token for row_tokens in train['words'] for token in row_tokens]

all_words = pd.Series(slist)
counts = all_words.value_counts()
# NOTE(review): an earlier note here said the most common token was still a
# blank string, for unknown reasons - confirm whether that still happens.
counts

# TODO: train for bag of words

# TODO: train for categorization

one               1623
upon              1411
's                1355
could             1330
would             1258
man                777
time               730
yet                715
said               704
even               700
might              629
old                616
like               613
first              602
must               597
us                 596
never              570
life               569
night              566
made               565
found              558
seemed             544
eyes               540
every              535
little             531
day                523
still              519
great              511
long               510
saw                502
                  ... 
godhead              1
bankers              1
fero                 1
'offspring           1
precociousness       1
maguntinae           1
benjamin             1
insistently          1
hecate               1
warming              1
walnut               1
crossbones           1
trammpled  

In [6]:
# Remember the row counts of both frames; they are kept for splitting later.
ntrain, ntest = len(train), len(test)

# One sub-frame per author, selected by the label in the `author` column.
train_EAP = train[train['author'] == 'EAP']
train_MWS = train[train['author'] == 'MWS']
train_HPL = train[train['author'] == 'HPL']



In [18]:
# One-hot encode the author label: one indicator column per distinct author.
author_one_hot = pd.get_dummies(train.author)


0              process
1              however
2             afforded
3                means
4         ascertaining
5           dimensions
6              dungeon
7                might
8                 make
9              circuit
10              return
11               point
12              whence
13                 set
14             without
15               aware
16                fact
17           perfectly
18             uniform
19              seemed
20                wall
21               never
22            occurred
23            fumbling
24               might
25                mere
26             mistake
27                left
28                hand
29                gold
              ...     
257872           built
257873           brush
257874            lids
257875        clenched
257876        together
257877           spasm
257878            mais
257879              il
257880            faut
257881            agir
257882             say
257883       frenchman
257884     

In [43]:
def _pretokenized(tokens):
    """Return the token list unchanged; used as the CountVectorizer analyzer."""
    return tokens


# Fix the vocabulary to the distinct tokens collected in `all_words`.
unique_words = all_words.unique()

# `all_words` was built from the NLTK-tokenised, stop-word-filtered lists in
# train['words'], so the counts are taken over those same lists. The default
# analyzer would re-tokenise the raw text with scikit-learn's own regex
# (runs of two or more word characters, punctuation treated as a separator).
# That regex can never produce vocabulary entries such as "'s", which would
# leave their columns permanently empty.
cv = CountVectorizer(vocabulary=unique_words, analyzer=_pretokenized)
output = cv.fit_transform(train['words'])
out_arr = output  # second name for the same sparse matrix

# Show the non-zero vocabulary counts for the row at position 4.
print(out_arr[4])

  (0, 28)	1
  (0, 64)	1
  (0, 65)	1
  (0, 66)	1
  (0, 67)	1
  (0, 68)	1
  (0, 69)	1
  (0, 70)	1
  (0, 71)	1
  (0, 72)	1
  (0, 73)	1
  (0, 74)	1
  (0, 75)	1
  (0, 76)	1
  (0, 77)	1
  (0, 78)	1
