In [146]:
from datetime import datetime
from pathlib import Path
import os
import itertools

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

import argparse

class Ghostses:

    def __init__(self, filename):
        """ make a ghostses object
            read the corpus into object
        """
        self.filename = filename
        f = open(str(self.filename), 'r')
        self.corpus = f.read() # plaintext of the corpus


    def getSentences(self):
        """ tokenize corpus by sentence """
        self.sentences = sent_tokenize(score.corpus)        

    
    def getWords(self, preserveSpaces = True):
        """ tokenize corpus sentences by word """
        self.words = []
        for sentence in self.sentences:
            if preserveSpaces == True:
                words = [[word_tokenize(w), ' '] for w in sentence.split()]
                wordList = list(itertools.chain(*list(itertools.chain(*words))))
                if wordList[-1] == ' ':
                    # removes trailing whitespace @ end of sentence if there is any
                    wordList.pop()
                self.words.append(wordList)
            if preserveSpaces == False:
                words = word_tokenize(sentence)
                self.words.append(words)
        self.preserveSpaces = preserveSpaces


#     def getPOS(self):
#         """ filter out whitespace (if there is any) from tokens
#             output whitespace, in original location, to self.spaces
#             run parts of speech analysis on non-whitespace tokens
#             converts and stores output as 2d list
#             [ token, pos ] at self.pos """
#         pos_prep = []
#         if self.preserveSpaces == False:
#             pos = pos_tag(self.tokens)
#             self.pos = list(map(list, pos))
#         elif self.preserveSpaces == True:
#             size = len(self.tokens)
#             self.spaces = [None] * size
#             step = 0
#             for i in self.tokens:
#                 if i.isspace() != True:
#                     pos_prep.append(i)
#                 else:
#                     self.spaces[step] = i
#                 step+=1
#             pos = pos_tag(pos_prep)
#             self.pos = list(map(list, pos))


In [147]:
score = Ghostses("corpora/Stein-short.txt")

In [148]:
score.getSentences()

In [151]:
score.getWordsX(preserveSpaces=True)

In [152]:
score.words

[['There',
  ' ',
  'is',
  ' ',
  'no',
  ' ',
  'gratitude',
  ' ',
  'in',
  ' ',
  'mercy',
  ' ',
  'and',
  ' ',
  'in',
  ' ',
  'medicine',
  '.'],
 ['There',
  ' ',
  'can',
  ' ',
  'be',
  ' ',
  'breakages',
  ' ',
  'in',
  ' ',
  'Japanese',
  '.'],
 ['That', ' ', 'is', ' ', 'no', ' ', 'programme', '.']]

In [749]:
score.getWords2(preserveSpaces = False)

In [750]:
score.words

[['Nickel',
  ',',
  'what',
  'is',
  'nickel',
  ',',
  'it',
  'is',
  'originally',
  'rid',
  'of',
  'a',
  'cover',
  '.'],
 ['The',
  'change',
  'in',
  'that',
  'is',
  'that',
  'red',
  'weakens',
  'an',
  'hour',
  '.'],
 ['The', 'change', 'has', 'come', '.'],
 ['There', 'is', 'no', 'search', '.'],
 ['But',
  'there',
  'is',
  ',',
  'there',
  'is',
  'that',
  'hope',
  'and',
  'that',
  'interpretation',
  'and',
  'sometime',
  ',',
  'surely',
  'any',
  'is',
  'unwelcome',
  ',',
  'sometime',
  'there',
  'is',
  'breath',
  'and',
  'there',
  'will',
  'be',
  'a',
  'sinecure',
  'and',
  'charming',
  'very',
  'charming',
  'is',
  'that',
  'clean',
  'and',
  'cleansing',
  '.'],
 ['Certainly', 'glittering', 'is', 'handsome', 'and', 'convincing', '.'],
 ['There',
  'is',
  'no',
  'gratitude',
  'in',
  'mercy',
  'and',
  'in',
  'medicine',
  '.'],
 ['There', 'can', 'be', 'breakages', 'in', 'Japanese', '.'],
 ['That', 'is', 'no', 'programme', '.'],
 ['

In [230]:
for i in score.words:
    print(i)

There
 
is
 
no
 
gratitude
 
in
 
mercy
 
and
 
in
 
medicine
.
 
There
 
can
 
be
 
breakages
 
in
 
Japanese
.
 
That
 
is
 
no
 
programme
.
 


In [112]:
tokenized = sent_tokenize(score.corpus)

for i in tokenized:
	
	# Word tokenizers is used to find the words
	# and punctuation in a string
	wordsList = word_tokenize(i)

	# removing stop words from wordList
#	wordsList = [w for w in wordsList if not w in stop_words]

	# Using a Tagger. Which is part-of-speech
	# tagger or POS-tagger.
	tagged = pos_tag(wordsList)

	print(tagged)


[('But', 'CC'), ('there', 'EX'), ('is', 'VBZ'), (',', ','), ('there', 'EX'), ('is', 'VBZ'), ('that', 'IN'), ('hope', 'NN'), ('and', 'CC'), ('that', 'DT'), ('interpretation', 'NN'), ('and', 'CC'), ('sometime', 'RB'), (',', ','), ('surely', 'RB'), ('any', 'DT'), ('is', 'VBZ'), ('unwelcome', 'JJ'), (',', ','), ('sometime', 'RB'), ('there', 'EX'), ('is', 'VBZ'), ('breath', 'NN'), ('and', 'CC'), ('there', 'EX'), ('will', 'MD'), ('be', 'VB'), ('a', 'DT'), ('sinecure', 'NN'), ('and', 'CC'), ('charming', 'VBG'), ('very', 'RB'), ('charming', 'VBG'), ('is', 'VBZ'), ('that', 'IN'), ('clean', 'JJ'), ('and', 'CC'), ('cleansing', 'NN'), ('.', '.')]


In [113]:
compare = zip(tagged, score.pos)

In [114]:
for x,y in compare:
    x = list(x)
    if x[1] == y[1]:
        check = '='.join([str(x[1]),str(y[1])])
        print(check)
    else:
        print("NOT EQUAL")

CC=CC
EX=EX
VBZ=VBZ
,=,
EX=EX
VBZ=VBZ
IN=IN
NN=NN
CC=CC
DT=DT
NN=NN
CC=CC
RB=RB
,=,
RB=RB
DT=DT
VBZ=VBZ
JJ=JJ
,=,
RB=RB
EX=EX
VBZ=VBZ
NN=NN
CC=CC
EX=EX
MD=MD
VB=VB
DT=DT
NN=NN
CC=CC
VBG=VBG
RB=RB
VBG=VBG
VBZ=VBZ
IN=IN
JJ=JJ
CC=CC
NN=NN
.=.
