In [91]:
import nltk
import pprint

In [5]:
my_text = """Cardiovascular disease, including heart disease and stroke, affects tens of millions of people in the 
United States.  Consumers and patients who do not suffer from cardiovascular disease sometimes consider taking aspirin 
to reduce the possibility of having a heart attack or stroke.  Reducing the possibility of having a first heart attack 
or stroke is called primary prevention.  The FDA has reviewed the available data and does not believe the evidence 
supports the general use of aspirin for primary prevention of a heart attack or stroke.  In fact, there are serious 
risks associated with the use of aspirin, including increased risk of bleeding in the stomach and brain, in situations
where the benefit of aspirin for primary prevention has not been established."""

#### Tokenize text and create 'part of speech' tag

In [82]:
tokens = nltk.word_tokenize(my_text)
nltk.pos_tag(tokens)

[('Cardiovascular', 'JJ'),
 ('disease', 'NN'),
 (',', ','),
 ('including', 'VBG'),
 ('heart', 'NN'),
 ('disease', 'NN'),
 ('and', 'CC'),
 ('stroke', 'NN'),
 (',', ','),
 ('affects', 'VBZ'),
 ('tens', 'NNS'),
 ('of', 'IN'),
 ('millions', 'NNS'),
 ('of', 'IN'),
 ('people', 'NNS'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('United', 'NNP'),
 ('States', 'NNPS'),
 ('.', '.'),
 ('Consumers', 'NNS'),
 ('and', 'CC'),
 ('patients', 'NNS'),
 ('who', 'WP'),
 ('do', 'VBP'),
 ('not', 'RB'),
 ('suffer', 'VB'),
 ('from', 'IN'),
 ('cardiovascular', 'JJ'),
 ('disease', 'NN'),
 ('sometimes', 'RB'),
 ('consider', 'VB'),
 ('taking', 'VBG'),
 ('aspirin', 'NN'),
 ('to', 'TO'),
 ('reduce', 'VB'),
 ('the', 'DT'),
 ('possibility', 'NN'),
 ('of', 'IN'),
 ('having', 'VBG'),
 ('a', 'DT'),
 ('heart', 'NN'),
 ('attack', 'NN'),
 ('or', 'CC'),
 ('stroke', 'NN'),
 ('.', '.'),
 ('Reducing', 'VBG'),
 ('the', 'DT'),
 ('possibility', 'NN'),
 ('of', 'IN'),
 ('having', 'VBG'),
 ('a', 'DT'),
 ('first', 'JJ'),
 ('heart', 'NN'),
 ('att

#### Tag named entities

In [83]:
ne_tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(my_text)))
# iob_tagged = tree2conlltags(ne_tree)
print ne_tree

(S
  Cardiovascular/JJ
  disease/NN
  ,/,
  including/VBG
  heart/NN
  disease/NN
  and/CC
  stroke/NN
  ,/,
  affects/VBZ
  tens/NNS
  of/IN
  millions/NNS
  of/IN
  people/NNS
  in/IN
  the/DT
  (GPE United/NNP States/NNPS)
  ./.
  Consumers/NNS
  and/CC
  patients/NNS
  who/WP
  do/VBP
  not/RB
  suffer/VB
  from/IN
  cardiovascular/JJ
  disease/NN
  sometimes/RB
  consider/VB
  taking/VBG
  aspirin/NN
  to/TO
  reduce/VB
  the/DT
  possibility/NN
  of/IN
  having/VBG
  a/DT
  heart/NN
  attack/NN
  or/CC
  stroke/NN
  ./.
  Reducing/VBG
  the/DT
  possibility/NN
  of/IN
  having/VBG
  a/DT
  first/JJ
  heart/NN
  attack/NN
  or/CC
  stroke/NN
  is/VBZ
  called/VBN
  primary/JJ
  prevention/NN
  ./.
  The/DT
  (ORGANIZATION FDA/NNP)
  has/VBZ
  reviewed/VBN
  the/DT
  available/JJ
  data/NNS
  and/CC
  does/VBZ
  not/RB
  believe/VB
  the/DT
  evidence/NN
  supports/VBZ
  the/DT
  general/JJ
  use/NN
  of/IN
  aspirin/NN
  for/IN
  primary/JJ
  prevention/NN
  of/IN
  a/DT
  heart/N

#### Extract NEs in a separate tree

In [84]:
from nltk.tree import Tree
named_entities = []
for i in ne_tree:
    if type(i) == Tree:
        named_entities.append([ i.label(), i.leaves()])
print named_entities

[['GPE', [('United', 'NNP'), ('States', 'NNPS')]], ['ORGANIZATION', [('FDA', 'NNP')]]]


#### Noun Phrase Chunking
In order to create an NP-chunker, we will first define a chunk grammar, consisting of rules that indicate how sentences should be chunked. In this case, we will define a simple grammar with a single regular-expression rule. This rule says that an NP chunk should be formed whenever the chunker finds an optional determiner (DT) followed by any number of adjectives (JJ) and then a noun (NN). Using this grammar, we create a chunk parser, and test it on our example sentence. The result is a tree, which we can either print, or display graphically.

In [9]:
def preprocess(document):
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return sentences

In [85]:
sentence = preprocess(my_text)[0]
print sentence

grammar = "NP: {<DT>?<JJ>*<NN>}";

cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)
result.draw()

[('Cardiovascular', 'JJ'), ('disease', 'NN'), (',', ','), ('including', 'VBG'), ('heart', 'NN'), ('disease', 'NN'), ('and', 'CC'), ('stroke', 'NN'), (',', ','), ('affects', 'VBZ'), ('tens', 'NNS'), ('of', 'IN'), ('millions', 'NNS'), ('of', 'IN'), ('people', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), ('.', '.')]
(S
  (NP Cardiovascular/JJ disease/NN)
  ,/,
  including/VBG
  (NP heart/NN)
  (NP disease/NN)
  and/CC
  (NP stroke/NN)
  ,/,
  affects/VBZ
  tens/NNS
  of/IN
  millions/NNS
  of/IN
  people/NNS
  in/IN
  the/DT
  United/NNP
  States/NNPS
  ./.)


#### Coreference resolution using CoreNLP

In [86]:
from pprint import pprint
from pycorenlp.corenlp import StanfordCoreNLP


host = "http://localhost"
port = "9000"
nlp = StanfordCoreNLP(host + ":" + port)

In [88]:
# my_text_c = "John works with Google. He lives in New York and uses his car for commuting to office. "
my_text_c = "John works with Google. Its the best company to work with and he lives in New York."
output = nlp.annotate(
    my_text_c,
    properties={
        "outputFormat": "json",
        "annotators": "dcoref"
    }
)
pprint(output['corefs'])

{u'1': [{u'animacy': u'ANIMATE',
         u'endIndex': 2,
         u'gender': u'MALE',
         u'headIndex': 1,
         u'id': 1,
         u'isRepresentativeMention': True,
         u'number': u'SINGULAR',
         u'position': [1, 1],
         u'sentNum': 1,
         u'startIndex': 1,
         u'text': u'John',
         u'type': u'PROPER'},
        {u'animacy': u'ANIMATE',
         u'endIndex': 10,
         u'gender': u'MALE',
         u'headIndex': 9,
         u'id': 6,
         u'isRepresentativeMention': False,
         u'number': u'SINGULAR',
         u'position': [2, 3],
         u'sentNum': 2,
         u'startIndex': 9,
         u'text': u'he',
         u'type': u'PRONOMINAL'}],
 u'2': [{u'animacy': u'INANIMATE',
         u'endIndex': 5,
         u'gender': u'NEUTRAL',
         u'headIndex': 4,
         u'id': 2,
         u'isRepresentativeMention': True,
         u'number': u'UNKNOWN',
         u'position': [1, 2],
         u'sentNum': 1,
         u'startIndex': 4,
         u

#### Training the POS tagger

In [89]:
# tagged_sentences = nltk.corpus.brown.tagged_sents()
tagged_sentences = nltk.corpus.treebank.tagged_sents()
 
print tagged_sentences[1]
print "Tagged sentences: ", len(tagged_sentences)
print "Tagged words:", len(nltk.corpus.brown.tagged_words())

[(u'Mr.', u'NNP'), (u'Vinken', u'NNP'), (u'is', u'VBZ'), (u'chairman', u'NN'), (u'of', u'IN'), (u'Elsevier', u'NNP'), (u'N.V.', u'NNP'), (u',', u','), (u'the', u'DT'), (u'Dutch', u'NNP'), (u'publishing', u'VBG'), (u'group', u'NN'), (u'.', u'.')]
Tagged sentences:  3914
Tagged words: 1161192


In [92]:
def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }

pprint.pprint(features(['This', 'is', 'a', 'simple', 'sentence'], 3))

{'capitals_inside': False,
 'has_hyphen': False,
 'is_all_caps': False,
 'is_all_lower': True,
 'is_capitalized': False,
 'is_first': False,
 'is_last': False,
 'is_numeric': False,
 'next_word': 'sentence',
 'prefix-1': 's',
 'prefix-2': 'si',
 'prefix-3': 'sim',
 'prev_word': 'a',
 'suffix-1': 'e',
 'suffix-2': 'le',
 'suffix-3': 'ple',
 'word': 'simple'}


In [93]:
def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

In [94]:
# Split the dataset for training and testing
cutoff = int(.75 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]

print len(training_sentences)
print len(test_sentences)

def transform_to_dataset(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1]) 
    return X, y
 
X, y = transform_to_dataset(training_sentences)
X[0:3]

2935
979


[{'capitals_inside': False,
  'has_hyphen': False,
  'is_all_caps': False,
  'is_all_lower': False,
  'is_capitalized': True,
  'is_first': True,
  'is_last': False,
  'is_numeric': False,
  'next_word': u'Vinken',
  'prefix-1': u'P',
  'prefix-2': u'Pi',
  'prefix-3': u'Pie',
  'prev_word': '',
  'suffix-1': u'e',
  'suffix-2': u're',
  'suffix-3': u'rre',
  'word': u'Pierre'},
 {'capitals_inside': False,
  'has_hyphen': False,
  'is_all_caps': False,
  'is_all_lower': False,
  'is_capitalized': True,
  'is_first': False,
  'is_last': False,
  'is_numeric': False,
  'next_word': u',',
  'prefix-1': u'V',
  'prefix-2': u'Vi',
  'prefix-3': u'Vin',
  'prev_word': u'Pierre',
  'suffix-1': u'n',
  'suffix-2': u'en',
  'suffix-3': u'ken',
  'word': u'Vinken'},
 {'capitals_inside': False,
  'has_hyphen': False,
  'is_all_caps': True,
  'is_all_lower': True,
  'is_capitalized': True,
  'is_first': False,
  'is_last': False,
  'is_numeric': False,
  'next_word': u'61',
  'prefix-1': u',',
  '

In [42]:
### from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
 
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])

clf.fit(X[:4000], y[:4000])   # Use only the first 4K samples. It takes a fair bit of time and memory

print 'Training completed'

X_test, y_test = transform_to_dataset(test_sentences)

print "Accuracy:", clf.score(X_test, y_test)

Training completed
Accuracy: 0.861762815362


In [95]:
def pos_tag(sentence):
    tagged_sentence = []
    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
    return zip(sentence, tags)

print pos_tag(nltk.word_tokenize('This is my friend, John.'))

[('This', u'DT'), ('is', u'VBZ'), ('my', u'NN'), ('friend', u'NN'), (',', u','), ('John', u'NNP'), ('.', u'.')]


#### Custom NER using estnltk - Work in progress!!

In [1]:
from estnltk import Text

text = Text(my_text)

# Extract named entities
pprint(list(zip(text.named_entities, text.named_entity_labels, text.named_entity_spans)))

ImportError: No module named estnltk

#### Training a named entity chunker

In [9]:
# import nltk.tag, nltk.chunk, itertools
import itertools
from nltk.corpus import ieer

In [10]:
def ieertree2conlltags(tree, tag=nltk.tag.pos_tag):
    words, ents = zip(*tree.pos())
    iobs = []
    prev = None
    for ent in ents:
        if ent == tree.node:
            iobs.append('O')
            prev = None
        elif prev == ent:
            iobs.append('I-%s' % ent)
        else:
            iobs.append('B-%s' % ent)
            prev = ent

    words, tags = zip(*tag(words))
    return itertools.izip(words, tags, iobs)

def ieer_chunked_sents(tag=nltk.tag.pos_tag):
    for doc in ieer.parsed_docs():
        tagged = ieertree2conlltags(doc.text, tag)
        yield nltk.chunk.conlltags2tree(tagged)

In [11]:
from chunkers import ieer_chunked_sents, ClassifierChunker
# from nltk.corpus import treebank_chunk
ieer_chunks = list(ieer_chunked_sents())
# >>> len(ieer_chunks)
# 94
# >>> chunker = ClassifierChunker(ieer_chunks[:80])
# >>> chunker.parse(treebank_chunk.tagged_sents()[0])

ValueError: Attempted relative import in non-package

#### Introduction to gensim

In [96]:
from gensim.summarization import summarize
print("Original Text:")
print(my_text)

print ("\nSummary:")
print summarize(my_text, word_count=20)

Original Text:
Cardiovascular disease, including heart disease and stroke, affects tens of millions of people in the 
United States.  Consumers and patients who do not suffer from cardiovascular disease sometimes consider taking aspirin 
to reduce the possibility of having a heart attack or stroke.  Reducing the possibility of having a first heart attack 
or stroke is called primary prevention.  The FDA has reviewed the available data and does not believe the evidence 
supports the general use of aspirin for primary prevention of a heart attack or stroke.  In fact, there are serious 
risks associated with the use of aspirin, including increased risk of bleeding in the stomach and brain, in situations
where the benefit of aspirin for primary prevention has not been established.

Summary:
supports the general use of aspirin for primary prevention of a heart attack or stroke.


## Not for Demo

## 1: Text Preprocessing
#### 1.1 Noise Removal

In [25]:
# Function to convert a raw string of words. The input is a single string (a raw movie review),
# and the output is a single string (a preprocessed movie review)
from nltk.corpus import stopwords
def pre_process(raw_review):
    # 1. Remove HTML
    review_text = bs(raw_review).get_text()
    #
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Remove stop words
    words = letters_only.lower().split()
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops]
    #
    # 6. Join the words back into one string separated by space, and return the result.
    return( " ".join( meaningful_words ))

In [26]:
# let's call the function for a single review:
print train['review'][0]
print pre_process(train['review'][0])

"Watching Time Chasers, it obvious that it was made by a bunch of friends. Maybe they were sitting around one day in film school and said, \"Hey, let's pool our money together and make a really bad movie!\" Or something like that. What ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc. All corners were cut, except the one that would have prevented this film's release. Life's like that."
watching time chasers obvious made bunch friends maybe sitting around one day film school said hey let pool money together make really bad movie something like ever said still ended making really bad movie dull story bad script lame acting poor cinematography bottom barrel stock music etc corners cut except one would prevented film release life like


In [27]:
# Initialize an empty list to hold the clean reviews
clean_train = []

# Loop over each review; create an index i that goes from 0 to the length of the movie review list 
for i in range(train["review"].size):
    # Call our function for each one, and add the result to the list of clean reviews
    clean_train.append(pre_process(train["review"][i]))

print "Cleansed Data Shape: {}".format(clean_train.shape)
print "Sample Cleansed Data: {}".format(clean_train['review'][0])

#### 1.2 Lexicon Normalization

In [None]:
# Create Lemmatizer instance
from nltk.stem.wordnet import WordNetLemmatizer 
lem = WordNetLemmatizer()

# Create Stemmer instance
from nltk.stem.porter import PorterStemmer 
stem = PorterStemmer()

In [None]:
word = "multiplying" 
lem.lemmatize(word, "v")
>> "multiply" 
stem.stem(word)
>> "multipli"
lem = WordNetLemmatizer()

from nltk.stem.porter import PorterStemmer 
stem = PorterStemmer()

word = "multiplying" 
lem.lemmatize(word, "v")
>> "multiply" 
stem.stem(word)
>> "multipli"