In [268]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from autocorrect import spell
from collections import Counter
from nltk.util import ngrams
import itertools, nltk, re, pprint, string, os

In [8]:
sentences = ["VADER is smart, handsome, and funny.", # positive sentence example
    "VADER is smart, handsome, and funny!", # punctuation emphasis handled correctly (sentiment intensity adjusted)
   "VADER is very smart, handsome, and funny.",  # booster words handled correctly (sentiment intensity adjusted)
    "VADER is VERY SMART, handsome, and FUNNY.",  # emphasis for ALLCAPS handled
    "VADER is VERY SMART, handsome, and FUNNY!!!",# combination of signals - VADER appropriately adjusts intensity
    "VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!",# booster words & punctuation make this close to ceiling for score
   "The book was good.",         # positive sentence
   "The book was kind of good.", # qualified positive sentence is handled correctly (intensity adjusted)
  "The plot was good, but the characters are uncompelling and the dialog is not great.", # mixed negation sentence
   "A really bad, horrible book.",       # negative sentence with booster words
   "At least it isn't a horrible book.", # negated negative sentence with contraction
    ":) and :D",     # emoticons handled
    "",              # an empty string is correctly handled
    "Today sux",     #  negative slang handled
    "Today sux!",    #  negative slang with punctuation emphasis handled
    "Today SUX!",    #  negative slang with capitalization emphasis
   "Today kinda sux! But I'll get by, lol" # mixed sentiment example with slang and constrastive conjunction "but"
]

In [9]:
sid = SentimentIntensityAnalyzer()
# for sentence in sentences:
#     print(sentence)
#     ss = sid.polarity_scores(sentence)
#     for k in sorted(ss):
#          print('{0}: {1}, '.format(k, ss[k]))
#     print()

In [11]:
test_sentence = 'OMG!! such a horrible thing.'
sid.polarity_scores(test_sentence)

In [13]:
test_sentence = 'NOT BAD.'
sid.polarity_scores(test_sentence)

{'compound': -0.5423, 'neg': 0.778, 'neu': 0.222, 'pos': 0.0}

In [239]:
text = """Dry/Reheated:  I kid you not I walked in after the lunch rush 
and ordered a few slices of pepperoni pizza, the guy 
(without washing his hands after taking the money) walks 
to some left over lunch cheese pizza sitting on a counter tosses 
some pepperoni on it and chucks it back into the oven.  
This pizza looked like it had been out for a few hours and it tasted like it 
did too (yes I bit into it against my basic instinct...I was hungry!). 
Needless to say I didn\'t finish it, the two men at the front counter were
not at all friendly in any way and I felt like I was interrupting break-time 
when I ordered my slice.  \n\nI didn\'t feel "Italian" anything from this 
restaurant setting or menu."""

In [16]:
class Splitter(object):
    def __init__(self):
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        input format: a paragraph of text
        output format: a list of lists of words.
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        sentences = self.nltk_splitter.tokenize(text)
        tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
        return tokenized_sentences


class POSTagger(object):
    def __init__(self):
        pass
        
    def pos_tag(self, sentences):
        """
        input format: list of lists of words
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged tokens has a
        form, a lemma, and a list of tags
            e.g: [[('this', 'this', ['DT']), ('is', 'be', ['VB']), ('a', 'a', ['DT']), ('sentence', 'sentence', ['NN'])],
                    [('this', 'this', ['DT']), ('is', 'be', ['VB']), ('another', 'another', ['DT']), ('one', 'one', ['CARD'])]]
        """

        pos = [nltk.pos_tag(sentence) for sentence in sentences]
        #adapt format
        pos = [[(word, [postag]) for (word, postag) in sentence] for sentence in pos]
        return pos

In [17]:
splitter = Splitter()
postagger = POSTagger()
splitted_sentences = splitter.split(text)
pos_tagged_sentences = postagger.pos_tag(splitted_sentences)

In [27]:
pos_tagged_sentences

In [29]:
# for r in rules:
#     print "%5.3f %5.3f %s" % (r.support, r.confidence, r)

In [37]:
text2 = "the little yellow dog barked at the cat"

In [48]:
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)
result = cp.parse(pos_sentences[0])

In [53]:
grammar2 = "NP: {<DT>?<JJ.*>*<NN.*>+}"
pos_sentences2 = ie_preprocess(text)
cp = nltk.RegexpParser(grammar2)
# for sent in pos_sentences2:
#     print cp.parse(sent)
   

In [113]:
stop = [s_word.encode('utf-8') for s_word in set(stopwords.words("english"))]

In [218]:
def ie_preprocess(document):
    wordnet_lemmatizer = WordNetLemmatizer()
    sentences = nltk.sent_tokenize(document) 
    preprocessed = []
    for sent in sentences: 
        filtered_words = []
        sent = sent.translate(string.maketrans("",""), string.punctuation)
        sentence = nltk.word_tokenize(sent)
        for word in sentence:
            if word.lower() not in stop and word.lower() not in string.punctuation:
                filtered_words.append(wordnet_lemmatizer.lemmatize(spell(word)).encode('utf-8'))
        preprocessed.append((nltk.pos_tag(filtered_words)))       
    return preprocessed

while using stemmers like porter and Snowball words get stemmed where not necessary as shown in below cases:
hungry -> hungri,
restaurant -> restaur,
anything -> anyth,
break-time -> break-tim and so on

So decided to use a lemmantizer which extracts lemmas using wordnet

wordnet_lemmatizer = WordNetLemmatizer()
wordnet_lemmatizer.lemmatize("dogs")

result: dog

In [228]:
def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
#     stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    word = word.lower()
#     word = stemmer.stem_word(word)
    word = lemmatizer.lemmatize(word)
    return word

u'dog'

In [162]:
import numpy as np

In [219]:
preprocessed_sentences = ie_preprocess(text)

In [229]:
preprocessed_sentences;

In [None]:
f = list(itertools.chain.from_iterable(preprocessed_sentences))

In [209]:
len(f) * 0.01

0.61

In [230]:
Counter(f);

In [266]:
import nltk
from nltk.corpus import stopwords

# text = """The Buddha, the Godhead, resides quite as comfortably in the circuits of a digital
# computer or the gears of a cycle transmission as he does at the top of a mountain
# or in the petals of a flower. To think otherwise is to demean the Buddha...which is
# to demean oneself."""


# Used when tokenizing words
sentence_re = r'(?:(?:[A-Z])(?:.[A-Z])+.?)|(?:\w+(?:-\w+)*)|(?:\$?\d+(?:.\d+)?%?)|(?:...|)(?:[][.,;"\'?():-_`])'

# sentence_re = r'''(?x)      # set flag to allow verbose regexps
#       ([A-Z])(\.[A-Z])+\.?  # abbreviations, e.g. U.S.A.
#     | \w+(-\w+)*            # words with optional internal hyphens
#     | \$?\d+(\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
#     | \.\.\.                # ellipsis
#     | [][.,;"'?():-_`]      # these are separate tokens
# '''

lemmatizer = nltk.WordNetLemmatizer()
# stemmer = nltk.stem.porter.PorterStemmer()

#Taken from Su Nam Kim Paper...
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""
chunker = nltk.RegexpParser(grammar)
stopwords = stopwords.words('english')

def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label()=='NP'):
        yield subtree.leaves()

def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
#     word = stemmer.stem_word(word)
    word = lemmatizer.lemmatize(word)
    return word

def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    accepted = bool(2 <= len(word) <= 40) and (word.lower() not in stop)
    return accepted

def get_terms(tree):
    for leaf in leaves(tree):
        term = [normalise(w) for w,t in leaf if acceptable_word(w)]
        yield term
        
for sent in nltk.sent_tokenize(text):
    toks = nltk.regexp_tokenize(sent, sentence_re)
    postoks = nltk.tag.pos_tag(toks) #nltk.tag.pos_tag(toks)
    print postoks
    tree = chunker.parse(postoks)
    terms = get_terms(tree)
    for term in terms:
        for word in term:
            print word,
        print

[(u'Dry', 'NNP'), (u'Reheated', 'VBD'), (u':', ':'), (u'I', 'PRP'), (u'kid', 'VBP'), (u'you', 'PRP'), (u'not', 'RB'), (u'I', 'PRP'), (u'walked', 'VBD'), (u'in', 'IN'), (u'after', 'IN'), (u'the', 'DT'), (u'lunch', 'NN'), (u'rush', 'NN'), (u'and', 'CC'), (u'ordered', 'VBD'), (u'a', 'DT'), (u'few', 'JJ'), (u'slices', 'NNS'), (u'of', 'IN'), (u'pepperoni', 'NN'), (u'pizzaz', 'VBP'), (u'the', 'DT'), (u'guy', 'NN'), (u'without', 'IN'), (u'washing', 'VBG'), (u'his', 'PRP$'), (u'hands', 'NNS'), (u'after', 'IN'), (u'taking', 'VBG'), (u'the', 'DT'), (u'money', 'NN'), (u'walks', 'NNS'), (u'to', 'TO'), (u'some', 'DT'), (u'left', 'VBN'), (u'over', 'RP'), (u'lunch', 'JJ'), (u'cheese', 'JJ'), (u'pizza', 'NN'), (u'sitting', 'VBG'), (u'on', 'IN'), (u'a', 'DT'), (u'counter', 'NN'), (u'tosses', 'VBZ'), (u'some', 'DT'), (u'pepperoni', 'NN'), (u'on', 'IN'), (u'it', 'PRP'), (u'and', 'CC'), (u'chucks', 'VBZ'), (u'it', 'PRP'), (u'back', 'RB'), (u'into', 'IN'), (u'the', 'DT'), (u'oven', 'NN'), (u'This', 'DT'), 

In [242]:
spell("didn't")

'didnt'

In [123]:
!pip install autocorrect

Collecting autocorrect
  Downloading autocorrect-0.2.0.tar.gz (3.6MB)
[K    100% |████████████████████████████████| 3.6MB 136kB/s 
[?25hBuilding wheels for collected packages: autocorrect
  Running setup.py bdist_wheel for autocorrect ... [?25l- \ | done
[?25h  Stored in directory: /Users/datascientist/Library/Caches/pip/wheels/b2/1b/a1/e7e6980a801dcb6402363df7aceb4605f0d34acc23afd79f90
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-0.2.0


'autofocus'

'restaur'