In [1]:
from matplotlib import pyplot as plt
import numpy as np

%matplotlib inline

# Lesson 1 - Tokenizing words and sentences

NLTK Notes:
- Tokenizing: word tokenizers and sentence tokenizers
    - groups corpus by words and sentences
- Lexicon and Corporas
     - Lexicon: words and their meanings (like a dictionary)
     - Corporas: body of text, e.g., medical journals, presidential speeches, English language

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
example_text = """Hello Mr. Smith, how are you doing today? 
                    The weather is great and Python is awesome .The sky is pinkish blue. 
                    You should not eat cardboard"""

print(sent_tokenize(example_text))

['Hello Mr. Smith, how are you doing today?', 'The weather is great and Python is awesome .The sky is pinkish blue.', 'You should not eat cardboard']


In [4]:
print(word_tokenize(example_text))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', 'and', 'Python', 'is', 'awesome', '.The', 'sky', 'is', 'pinkish', 'blue', '.', 'You', 'should', 'not', 'eat', 'cardboard']


# Lesson 2 - Stop words

In [5]:
from nltk.corpus import stopwords

In [6]:
example_sentence = "This is an example showing off stop word filtration"

stop_words = set(stopwords.words("english"))
print(stop_words)

set([u'all', u'just', u"don't", u'being', u'over', u'both', u'through', u'yourselves', u'its', u'before', u'o', u'don', u'hadn', u'herself', u'll', u'had', u'should', u'to', u'only', u'won', u'under', u'ours', u'has', u"should've", u"haven't", u'do', u'them', u'his', u'very', u"you've", u'they', u'not', u'during', u'now', u'him', u'nor', u"wasn't", u'd', u'did', u'didn', u'this', u'she', u'each', u'further', u"won't", u'where', u"mustn't", u"isn't", u'few', u'because', u"you'd", u'doing', u'some', u'hasn', u"hasn't", u'are', u'our', u'ourselves', u'out', u'what', u'for', u"needn't", u'below', u're', u'does', u"shouldn't", u'above', u'between', u'mustn', u't', u'be', u'we', u'who', u"mightn't", u"doesn't", u'were', u'here', u'shouldn', u'hers', u"aren't", u'by', u'on', u'about', u'couldn', u'of', u"wouldn't", u'against', u's', u'isn', u'or', u'own', u'into', u'yourself', u'down', u"hadn't", u'mightn', u"couldn't", u'wasn', u'your', u"you're", u'from', u'her', u'their', u'aren', u"it's",

In [7]:
words = word_tokenize(example_sentence)
filtered_sentence = []
for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)

filtered_sentence

['This', 'example', 'showing', 'stop', 'word', 'filtration']

In [8]:
# We can do this with list comprehension also
filtered_sentence2 = [w for w in words if not w in stop_words]
filtered_sentence2

['This', 'example', 'showing', 'stop', 'word', 'filtration']

# Lesson 3 - Stemming

- A form of preprocessing
- Takes the stem of words (written, writing, wrote, write -> stem = writ)
- We do it b/c there are different variations of words based on stems, but the meaning of the word is unchanged.

In [9]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]

for w in example_words:
    print(ps.stem(w))

python
python
python
python
pythonli


In [10]:
new_text = "It is very import to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

words = word_tokenize(new_text)

for w in words:
    print(ps.stem(w))

It
is
veri
import
to
be
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


# Lesson 4 - Part of Speech Tagging


POS tag list:

- CC	coordinating conjunction
- CD	cardinal digit
- DT	determiner
- EX	existential there (like: "there is" ... think of it like "there exists")
- FW	foreign word
- IN	preposition/subordinating conjunction
- JJ	adjective	'big'
- JJR	adjective, comparative	'bigger'
- JJS	adjective, superlative	'biggest'
- LS	list marker	1)
- MD	modal	could, will
- NN	noun, singular 'desk'
- NNS	noun plural	'desks'
- NNP	proper noun, singular	'Harrison'
- NNPS	proper noun, plural	'Americans'
- PDT	predeterminer	'all the kids'
- POS	possessive ending	parent's
- PRP	personal pronoun	I, he, she
- PRP\$	possessive pronoun	my, his, hers
- RB	adverb	very, silently,
- RBR	adverb, comparative	better
- RBS	adverb, superlative	best
- RP	particle	give up
- TO	to	go 'to' the store.
- UH	interjection	errrrrrrrm
- VB	verb, base form	take
- VBD	verb, past tense	took
- VBG	verb, gerund/present participle	taking
- VBN	verb, past participle	taken
- VBP	verb, sing. present, non-3d	take
- VBZ	verb, 3rd person sing. present	takes
- WDT	wh-determiner	which
- WP	wh-pronoun	who, what
- WP\$	possessive wh-pronoun	whose
- WRB	wh-abverb	where, when

In [11]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer # unsupervised ML tokenizer. sent_tokenizer works the same, but it is pretrained

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

tokenized[:10]

[u"PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all.",
 u'Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream.',
 u'Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King.',
 u'(Applause.)',
 u'President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan.',
 u'31, 2006.',
 u"White House photo by Eric DraperEvery time I'm invited to this rostrum, I'm humbled by the privilege, and mindful of the history we've seen together.",
 u'We have gathered under this Capitol dome in moments of national mourning and nati

In [12]:
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))
        
process_content()

[(u'PRESIDENT', 'NNP'), (u'GEORGE', 'NNP'), (u'W.', 'NNP'), (u'BUSH', 'NNP'), (u"'S", 'POS'), (u'ADDRESS', 'NNP'), (u'BEFORE', 'IN'), (u'A', 'NNP'), (u'JOINT', 'NNP'), (u'SESSION', 'NNP'), (u'OF', 'IN'), (u'THE', 'NNP'), (u'CONGRESS', 'NNP'), (u'ON', 'NNP'), (u'THE', 'NNP'), (u'STATE', 'NNP'), (u'OF', 'IN'), (u'THE', 'NNP'), (u'UNION', 'NNP'), (u'January', 'NNP'), (u'31', 'CD'), (u',', ','), (u'2006', 'CD'), (u'THE', 'NNP'), (u'PRESIDENT', 'NNP'), (u':', ':'), (u'Thank', 'NNP'), (u'you', 'PRP'), (u'all', 'DT'), (u'.', '.')]
[(u'Mr.', 'NNP'), (u'Speaker', 'NNP'), (u',', ','), (u'Vice', 'NNP'), (u'President', 'NNP'), (u'Cheney', 'NNP'), (u',', ','), (u'members', 'NNS'), (u'of', 'IN'), (u'Congress', 'NNP'), (u',', ','), (u'members', 'NNS'), (u'of', 'IN'), (u'the', 'DT'), (u'Supreme', 'NNP'), (u'Court', 'NNP'), (u'and', 'CC'), (u'diplomatic', 'JJ'), (u'corps', 'NN'), (u',', ','), (u'distinguished', 'JJ'), (u'guests', 'NNS'), (u',', ','), (u'and', 'CC'), (u'fellow', 'JJ'), (u'citizens', 'NN

[(u'When', 'WRB'), (u'they', 'PRP'), (u'murder', 'VBP'), (u'children', 'NNS'), (u'at', 'IN'), (u'a', 'DT'), (u'school', 'NN'), (u'in', 'IN'), (u'Beslan', 'NNP'), (u',', ','), (u'or', 'CC'), (u'blow', 'VB'), (u'up', 'RP'), (u'commuters', 'NNS'), (u'in', 'IN'), (u'London', 'NNP'), (u',', ','), (u'or', 'CC'), (u'behead', 'VB'), (u'a', 'DT'), (u'bound', 'NN'), (u'captive', 'NN'), (u',', ','), (u'the', 'DT'), (u'terrorists', 'NNS'), (u'hope', 'VBP'), (u'these', 'DT'), (u'horrors', 'NNS'), (u'will', 'MD'), (u'break', 'VB'), (u'our', 'PRP$'), (u'will', 'MD'), (u',', ','), (u'allowing', 'VBG'), (u'the', 'DT'), (u'violent', 'NN'), (u'to', 'TO'), (u'inherit', 'VB'), (u'the', 'DT'), (u'Earth', 'NNP'), (u'.', '.')]
[(u'But', 'CC'), (u'they', 'PRP'), (u'have', 'VBP'), (u'miscalculated', 'VBN'), (u':', ':'), (u'We', 'PRP'), (u'love', 'VBP'), (u'our', 'PRP$'), (u'freedom', 'NN'), (u',', ','), (u'and', 'CC'), (u'we', 'PRP'), (u'will', 'MD'), (u'fight', 'VB'), (u'to', 'TO'), (u'keep', 'VB'), (u'it', 'P

[(u'White', 'NNP'), (u'House', 'NNP'), (u'photo', 'NN'), (u'by', 'IN'), (u'Eric', 'NNP'), (u'Draper', 'NNP'), (u'Our', 'PRP$'), (u'men', 'NNS'), (u'and', 'CC'), (u'women', 'NNS'), (u'in', 'IN'), (u'uniform', 'JJ'), (u'are', 'VBP'), (u'making', 'VBG'), (u'sacrifices', 'NNS'), (u'--', ':'), (u'and', 'CC'), (u'showing', 'VBG'), (u'a', 'DT'), (u'sense', 'NN'), (u'of', 'IN'), (u'duty', 'NN'), (u'stronger', 'JJR'), (u'than', 'IN'), (u'all', 'DT'), (u'fear', 'NN'), (u'.', '.')]
[(u'They', 'PRP'), (u'know', 'VBP'), (u'what', 'WP'), (u'it', 'PRP'), (u"'s", 'VBZ'), (u'like', 'IN'), (u'to', 'TO'), (u'fight', 'VB'), (u'house', 'NN'), (u'to', 'TO'), (u'house', 'NN'), (u'in', 'IN'), (u'a', 'DT'), (u'maze', 'NN'), (u'of', 'IN'), (u'streets', 'NNS'), (u',', ','), (u'to', 'TO'), (u'wear', 'VB'), (u'heavy', 'JJ'), (u'gear', 'NN'), (u'in', 'IN'), (u'the', 'DT'), (u'desert', 'NN'), (u'heat', 'NN'), (u',', ','), (u'to', 'TO'), (u'see', 'VB'), (u'a', 'DT'), (u'comrade', 'NN'), (u'killed', 'VBN'), (u'by', 'I

[(u'It', 'PRP'), (u'is', 'VBZ'), (u'said', 'VBD'), (u'that', 'IN'), (u'prior', 'JJ'), (u'to', 'TO'), (u'the', 'DT'), (u'attacks', 'NNS'), (u'of', 'IN'), (u'September', 'NNP'), (u'the', 'DT'), (u'11th', 'CD'), (u',', ','), (u'our', 'PRP$'), (u'government', 'NN'), (u'failed', 'VBD'), (u'to', 'TO'), (u'connect', 'VB'), (u'the', 'DT'), (u'dots', 'NNS'), (u'of', 'IN'), (u'the', 'DT'), (u'conspiracy', 'NN'), (u'.', '.')]
[(u'We', 'PRP'), (u'now', 'RB'), (u'know', 'VBP'), (u'that', 'IN'), (u'two', 'CD'), (u'of', 'IN'), (u'the', 'DT'), (u'hijackers', 'NNS'), (u'in', 'IN'), (u'the', 'DT'), (u'United', 'NNP'), (u'States', 'NNPS'), (u'placed', 'VBD'), (u'telephone', 'NN'), (u'calls', 'NNS'), (u'to', 'TO'), (u'al', 'VB'), (u'Qaeda', 'NNP'), (u'operatives', 'VBZ'), (u'overseas', 'RB'), (u'.', '.')]
[(u'But', 'CC'), (u'we', 'PRP'), (u'did', 'VBD'), (u'not', 'RB'), (u'know', 'VB'), (u'about', 'IN'), (u'their', 'PRP$'), (u'plans', 'NNS'), (u'until', 'IN'), (u'it', 'PRP'), (u'was', 'VBD'), (u'too', 'RB

[(u'Tonight', 'NNP'), (u'I', 'PRP'), (u'will', 'MD'), (u'set', 'VB'), (u'out', 'RP'), (u'a', 'DT'), (u'better', 'JJR'), (u'path', 'NN'), (u':', ':'), (u'an', 'DT'), (u'agenda', 'NN'), (u'for', 'IN'), (u'a', 'DT'), (u'nation', 'NN'), (u'that', 'WDT'), (u'competes', 'VBZ'), (u'with', 'IN'), (u'confidence', 'NN'), (u';', ':'), (u'an', 'DT'), (u'agenda', 'NN'), (u'that', 'WDT'), (u'will', 'MD'), (u'raise', 'VB'), (u'standards', 'NNS'), (u'of', 'IN'), (u'living', 'NN'), (u'and', 'CC'), (u'generate', 'VB'), (u'new', 'JJ'), (u'jobs', 'NNS'), (u'.', '.')]
[(u'Americans', 'NNPS'), (u'should', 'MD'), (u'not', 'RB'), (u'fear', 'VB'), (u'our', 'PRP$'), (u'economic', 'JJ'), (u'future', 'NN'), (u',', ','), (u'because', 'IN'), (u'we', 'PRP'), (u'intend', 'VBP'), (u'to', 'TO'), (u'shape', 'VB'), (u'it', 'PRP'), (u'.', '.')]
[(u'Keeping', 'VBG'), (u'America', 'NNP'), (u'competitive', 'JJ'), (u'begins', 'NNS'), (u'with', 'IN'), (u'keeping', 'VBG'), (u'our', 'PRP$'), (u'economy', 'NN'), (u'growing', 'VBG

[(u'Since', 'IN'), (u'2001', 'CD'), (u',', ','), (u'we', 'PRP'), (u'have', 'VBP'), (u'spent', 'VBN'), (u'nearly', 'RB'), (u'$', '$'), (u'10', 'CD'), (u'billion', 'CD'), (u'to', 'TO'), (u'develop', 'VB'), (u'cleaner', 'JJR'), (u',', ','), (u'cheaper', 'JJR'), (u',', ','), (u'and', 'CC'), (u'more', 'RBR'), (u'reliable', 'JJ'), (u'alternative', 'JJ'), (u'energy', 'NN'), (u'sources', 'NNS'), (u'--', ':'), (u'and', 'CC'), (u'we', 'PRP'), (u'are', 'VBP'), (u'on', 'IN'), (u'the', 'DT'), (u'threshold', 'NN'), (u'of', 'IN'), (u'incredible', 'JJ'), (u'advances', 'NNS'), (u'.', '.')]
[(u'So', 'RB'), (u'tonight', 'JJ'), (u',', ','), (u'I', 'PRP'), (u'announce', 'VBP'), (u'the', 'DT'), (u'Advanced', 'NNP'), (u'Energy', 'NNP'), (u'Initiative', 'NNP'), (u'--', ':'), (u'a', 'DT'), (u'22-percent', 'JJ'), (u'increase', 'NN'), (u'in', 'IN'), (u'clean-energy', 'JJ'), (u'research', 'NN'), (u'--', ':'), (u'at', 'IN'), (u'the', 'DT'), (u'Department', 'NNP'), (u'of', 'IN'), (u'Energy', 'NNP'), (u',', ','), (u

[(u'We', 'PRP'), (u'will', 'MD'), (u'renew', 'VB'), (u'the', 'DT'), (u'defining', 'VBG'), (u'moral', 'JJ'), (u'commitments', 'NNS'), (u'of', 'IN'), (u'this', 'DT'), (u'land', 'NN'), (u'.', '.')]
[(u'And', 'CC'), (u'so', 'RB'), (u'we', 'PRP'), (u'move', 'VBP'), (u'forward', 'RB'), (u'--', ':'), (u'optimistic', 'JJ'), (u'about', 'IN'), (u'our', 'PRP$'), (u'country', 'NN'), (u',', ','), (u'faithful', 'JJ'), (u'to', 'TO'), (u'its', 'PRP$'), (u'cause', 'NN'), (u',', ','), (u'and', 'CC'), (u'confident', 'NN'), (u'of', 'IN'), (u'the', 'DT'), (u'victories', 'NNS'), (u'to', 'TO'), (u'come', 'VB'), (u'.', '.')]
[(u'May', 'NNP'), (u'God', 'NNP'), (u'bless', 'NN'), (u'America', 'NNP'), (u'.', '.')]
[(u'(', '('), (u'Applause', 'NNP'), (u'.', '.'), (u')', ')')]


# Lesson 5 - Chunking

- Associating the groups of words/phrases that are tied to the respective noun in a sentence.
- Chunking is combo of POS tags and regex.

In [13]:
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunk_gram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}""" # looking for this grouping of POS words
            # RB = adverb
            # VB = verb
            # NNP = proper noun
            # NN = noun
            
            chunk_parser = nltk.RegexpParser(chunk_gram)
            chunked = chunk_parser.parse(tagged)
            
            print(chunked)
            # chunked.draw()
            
    except Exception as e:
        print(str(e))
        
process_content()

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
(S
  (Chunk Mr./NNP Speaker/NNP)
  ,/,
  (Chunk Vice/NNP President/NNP Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (Chunk Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  who/WP
  (Chunk called/VBD America/NNP)
  to/TO
  its/PRP$
  founding/NN
  ideals/NNS
  and/CC
  carried/VBD
  on/IN
  a/DT
  noble/JJ
  dream/NN
  ./.)
(S
  Tonight/NN
  we/PRP
  are/VBP
  comforted

(S 31/CD ,/, 2006/CD ./.)
(S
  (Chunk White/NNP House/NNP photo/NN)
  by/IN
  (Chunk Eric/NNP Draper/NNP No/NNP one/NN)
  can/MD
  deny/VB
  the/DT
  success/NN
  of/IN
  freedom/NN
  ,/,
  but/CC
  some/DT
  men/NNS
  rage/VB
  and/CC
  fight/VB
  against/IN
  it/PRP
  ./.)
(S
  And/CC
  one/CD
  of/IN
  the/DT
  main/JJ
  sources/NNS
  of/IN
  reaction/NN
  and/CC
  opposition/NN
  is/VBZ
  radical/JJ
  (Chunk Islam/NNP)
  --/:
  the/DT
  perversion/NN
  by/IN
  a/DT
  few/JJ
  of/IN
  a/DT
  noble/JJ
  faith/NN
  into/IN
  an/DT
  ideology/NN
  of/IN
  terror/NN
  and/CC
  death/NN
  ./.)
(S
  Terrorists/NNS
  like/IN
  bin/NN
  (Chunk Laden/NNP)
  are/VBP
  serious/JJ
  about/IN
  mass/NN
  murder/NN
  --/:
  and/CC
  all/DT
  of/IN
  us/PRP
  must/MD
  take/VB
  their/PRP$
  declared/JJ
  intentions/NNS
  seriously/RB
  ./.)
(S
  They/PRP
  seek/VBP
  to/TO
  impose/VB
  a/DT
  heartless/NN
  system/NN
  of/IN
  totalitarian/JJ
  control/NN
  throughout/IN
  the/DT
  (Chunk Middle

(S
  The/DT
  regime/NN
  in/IN
  that/DT
  country/NN
  sponsors/NNS
  terrorists/NNS
  in/IN
  the/DT
  Palestinian/JJ
  territories/NNS
  and/CC
  in/IN
  (Chunk Lebanon/NNP)
  --/:
  and/CC
  that/DT
  must/MD
  come/VB
  to/TO
  an/DT
  end/NN
  ./.)
(S (/( (Chunk Applause/NNP) ./. )/))
(S
  The/DT
  Iranian/JJ
  government/NN
  is/VBZ
  defying/VBG
  the/DT
  world/NN
  with/IN
  its/PRP$
  nuclear/JJ
  ambitions/NNS
  ,/,
  and/CC
  the/DT
  nations/NNS
  of/IN
  the/DT
  world/NN
  must/MD
  not/RB
  permit/VB
  the/DT
  Iranian/JJ
  regime/NN
  to/TO
  gain/VB
  nuclear/JJ
  weapons/NNS
  ./.)
(S (/( (Chunk Applause/NNP) ./. )/))
(S
  (Chunk America/NNP)
  will/MD
  continue/VB
  to/TO
  rally/VB
  the/DT
  world/NN
  to/TO
  confront/VB
  these/DT
  threats/NNS
  ./.)
(S
  (Chunk Tonight/NNP)
  ,/,
  let/VB
  me/PRP
  speak/VB
  directly/RB
  to/TO
  the/DT
  citizens/NNS
  of/IN
  (Chunk Iran/NNP)
  :/:
  (Chunk America/NNP)
  respects/VBZ
  you/PRP
  ,/,
  and/CC
  we/PRP
 

(S (/( (Chunk Applause/NNP) ./. )/))
(S
  Yet/RB
  the/DT
  tax/NN
  relief/NN
  is/VBZ
  set/VBN
  to/TO
  expire/VB
  in/IN
  the/DT
  next/JJ
  few/JJ
  years/NNS
  ./.)
(S
  If/IN
  we/PRP
  do/VBP
  nothing/NN
  ,/,
  (Chunk American/NNP)
  families/NNS
  will/MD
  face/VB
  a/DT
  massive/JJ
  tax/NN
  increase/NN
  they/PRP
  do/VBP
  not/RB
  expect/VB
  and/CC
  will/MD
  not/RB
  welcome/VB
  ./.)
(S
  Because/IN
  (Chunk America/NNP)
  needs/VBZ
  more/JJR
  than/IN
  a/DT
  temporary/JJ
  expansion/NN
  ,/,
  we/PRP
  need/VBP
  more/JJR
  than/IN
  temporary/JJ
  tax/NN
  relief/NN
  ./.)
(S
  I/PRP
  urge/VBP
  the/DT
  (Chunk Congress/NNP)
  to/TO
  act/VB
  responsibly/RB
  ,/,
  and/CC
  make/VB
  the/DT
  tax/NN
  cuts/NNS
  permanent/NN
  ./.)
(S (/( (Chunk Applause/NNP) ./. )/))
(S
  (Chunk Keeping/VBG America/NNP)
  competitive/JJ
  requires/VBZ
  us/PRP
  to/TO
  be/VB
  good/JJ
  stewards/NNS
  of/IN
  tax/NN
  dollars/NNS
  ./.)
(S
  Every/DT
  year/NN
  of/IN
 

(S
  By/IN
  applying/VBG
  the/DT
  talent/NN
  and/CC
  technology/NN
  of/IN
  (Chunk America/NNP)
  ,/,
  this/DT
  country/NN
  can/MD
  dramatically/RB
  improve/VB
  our/PRP$
  environment/NN
  ,/,
  move/VB
  beyond/IN
  a/DT
  petroleum-based/JJ
  economy/NN
  ,/,
  and/CC
  make/VB
  our/PRP$
  dependence/NN
  on/IN
  (Chunk Middle/NNP Eastern/NNP oil/NN)
  a/DT
  thing/NN
  of/IN
  the/DT
  past/NN
  ./.)
(S (/( (Chunk Applause/NNP) ./. )/))
(S
  And/CC
  to/TO
  (Chunk keep/VB America/NNP)
  competitive/JJ
  ,/,
  one/CD
  commitment/NN
  is/VBZ
  necessary/JJ
  above/IN
  all/DT
  :/:
  We/PRP
  must/MD
  continue/VB
  to/TO
  lead/VB
  the/DT
  world/NN
  in/IN
  human/JJ
  talent/NN
  and/CC
  creativity/NN
  ./.)
(S
  Our/PRP$
  greatest/JJS
  advantage/NN
  in/IN
  the/DT
  world/NN
  has/VBZ
  always/RB
  been/VBN
  our/PRP$
  educated/VBN
  ,/,
  hardworking/VBG
  ,/,
  ambitious/JJ
  people/NNS
  --/:
  and/CC
  we/PRP
  're/VBP
  going/VBG
  to/TO
  keep/VB
  that/

(S (/( (Chunk Applause/NNP) ./. )/))
(S
  Honorable/JJ
  people/NNS
  in/IN
  both/DT
  parties/NNS
  are/VBP
  working/VBG
  on/IN
  reforms/NNS
  to/TO
  strengthen/VB
  the/DT
  ethical/JJ
  standards/NNS
  of/IN
  (Chunk Washington/NNP)
  --/:
  I/PRP
  support/VBP
  your/PRP$
  efforts/NNS
  ./.)
(S
  Each/DT
  of/IN
  us/PRP
  has/VBZ
  made/VBN
  a/DT
  pledge/NN
  to/TO
  be/VB
  worthy/JJ
  of/IN
  public/JJ
  responsibility/NN
  --/:
  and/CC
  that/DT
  is/VBZ
  a/DT
  pledge/NN
  we/PRP
  must/MD
  never/RB
  forget/VB
  ,/,
  never/RB
  dismiss/NN
  ,/,
  and/CC
  never/RB
  betray/NN
  ./.)
(S (/( (Chunk Applause/NNP) ./. )/))
(S
  As/IN
  we/PRP
  renew/VBP
  the/DT
  promise/NN
  of/IN
  our/PRP$
  institutions/NNS
  ,/,
  let/VB
  us/PRP
  also/RB
  show/VBP
  the/DT
  character/NN
  of/IN
  (Chunk America/NNP)
  in/IN
  our/PRP$
  compassion/NN
  and/CC
  care/NN
  for/IN
  one/CD
  another/DT
  ./.)
(S
  A/DT
  hopeful/JJ
  society/NN
  gives/VBZ
  special/JJ
  atten

# Lesson 6 - Chinking

- You chink something from a chunk - it's the removal of words from a chunk.

In [14]:
def process_content():
    try:
        for i in tokenized[:100]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            # below is chunking all groups of words, and then chinking (getting rid of)
            # verbs, prepositions, & determiners
            chunk_gram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{""" 
            
            chunk_parser = nltk.RegexpParser(chunk_gram)
            chunked = chunk_parser.parse(tagged)
            
            print(chunked)
            # chunked.draw()
            
    except Exception as e:
        print(str(e))
        
process_content()

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP 'S/POS ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk
    THE/NNP
    UNION/NNP
    January/NNP
    31/CD
    ,/,
    2006/CD
    THE/NNP
    PRESIDENT/NNP
    :/:
    Thank/NNP
    you/PRP)
  all/DT
  (Chunk ./.))
(S
  (Chunk
    Mr./NNP
    Speaker/NNP
    ,/,
    Vice/NNP
    President/NNP
    Cheney/NNP
    ,/,
    members/NNS)
  of/IN
  (Chunk Congress/NNP ,/, members/NNS)
  of/IN
  the/DT
  (Chunk
    Supreme/NNP
    Court/NNP
    and/CC
    diplomatic/JJ
    corps/NN
    ,/,
    distinguished/JJ
    guests/NNS
    ,/,
    and/CC
    fellow/JJ
    citizens/NNS
    :/:)
  Today/VB
  (Chunk our/PRP$ nation/NN)
  lost/VBD
  a/DT
  beloved/VBN
  (Chunk ,/, graceful/JJ ,/, courageous/JJ woman/NN who/WP)
  called/VBD
  (Chunk America/NNP)
  to/TO
  (Chunk its/PRP$ founding/NN ideals/NNS and/CC)
  carried/VBD
  on/IN
  a/DT
  (Chunk noble/

(S (Chunk (/( Applause/NNP ./. )/)))
(S
  (Chunk Our/PRP$ work/NN)
  in/IN
  (Chunk Iraq/NNP)
  is/VBZ
  (Chunk difficult/JJ)
  because/IN
  (Chunk our/PRP$ enemy/NN)
  is/VBZ
  (Chunk brutal/JJ ./.))
(S
  (Chunk But/CC)
  that/DT
  (Chunk brutality/NN)
  has/VBZ
  (Chunk not/RB)
  stopped/VBN
  the/DT
  (Chunk dramatic/JJ progress/NN)
  of/IN
  a/DT
  (Chunk new/JJ democracy/NN ./.))
(S
  In/IN
  (Chunk less/JJR)
  than/IN
  (Chunk three/CD years/NNS ,/,)
  the/DT
  (Chunk nation/NN)
  has/VBZ
  gone/VBN
  from/IN
  (Chunk dictatorship/NN)
  to/TO
  (Chunk liberation/NN ,/,)
  to/TO
  sovereignty/VB
  (Chunk ,/,)
  to/TO
  a/DT
  (Chunk constitution/NN ,/,)
  to/TO
  (Chunk national/JJ elections/NNS ./.))
(S
  At/IN
  the/DT
  (Chunk same/JJ time/NN ,/, our/PRP$ coalition/NN)
  has/VBZ
  been/VBN
  relentless/VBN
  in/IN
  shutting/VBG
  (Chunk off/RP terrorist/JJ infiltration/NN ,/,)
  clearing/VBG
  (Chunk out/RP insurgent/JJ strongholds/NNS ,/, and/CC)
  turning/VBG
  (Chunk over/R

# Lesson 7 - Named Entity Recognition



| NE Type | Examples |
| --- | --- |
|ORGANIZATION | Georgia-Pacific Corp., WHO |
|PERSON	| Eddy Bonte, President Obama |
|LOCATION | Murray River, Mount Everest |
|DATE | June, 2008-06-29 |
|TIME | two fifty a m, 1:30 p.m. |
|MONEY | 175 million Canadian Dollars, GBP 10.40 |
|PERCENT | twenty pct, 18.75 % |
|FACILITY | Washington Monument, Stonehenge |
|GPE | South East Asia, Midlothian |

- Note: error rates and false positives are pretty high with named entity recognition.

In [15]:
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            named_ent = nltk.ne_chunk(tagged)
            print(named_ent)
            
    except Exception as e:
        print(str(e))
        
process_content()

(S
  PRESIDENT/NNP
  (PERSON GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (ORGANIZATION ADDRESS/NNP)
  BEFORE/IN
  A/NNP
  (ORGANIZATION JOINT/NNP)
  SESSION/NNP
  OF/IN
  (ORGANIZATION THE/NNP)
  (ORGANIZATION CONGRESS/NNP)
  ON/NNP
  THE/NNP
  (ORGANIZATION STATE/NNP OF/IN)
  (ORGANIZATION THE/NNP)
  (ORGANIZATION UNION/NNP)
  January/NNP
  31/CD
  ,/,
  2006/CD
  (ORGANIZATION THE/NNP)
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP
  all/DT
  ./.)
(S
  (PERSON Mr./NNP Speaker/NNP)
  ,/,
  Vice/NNP
  President/NNP
  (PERSON Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (ORGANIZATION Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (ORGANIZATION Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  who/WP
  called/VBD
  (GPE America/NNP)
  to/TO
  its/PRP$
  founding/NN
  ideals/NNS
  

(S
  And/CC
  one/CD
  of/IN
  the/DT
  main/JJ
  sources/NNS
  of/IN
  reaction/NN
  and/CC
  opposition/NN
  is/VBZ
  radical/JJ
  Islam/NNP
  --/:
  the/DT
  perversion/NN
  by/IN
  a/DT
  few/JJ
  of/IN
  a/DT
  noble/JJ
  faith/NN
  into/IN
  an/DT
  ideology/NN
  of/IN
  terror/NN
  and/CC
  death/NN
  ./.)
(S
  Terrorists/NNS
  like/IN
  bin/NN
  Laden/NNP
  are/VBP
  serious/JJ
  about/IN
  mass/NN
  murder/NN
  --/:
  and/CC
  all/DT
  of/IN
  us/PRP
  must/MD
  take/VB
  their/PRP$
  declared/JJ
  intentions/NNS
  seriously/RB
  ./.)
(S
  They/PRP
  seek/VBP
  to/TO
  impose/VB
  a/DT
  heartless/NN
  system/NN
  of/IN
  totalitarian/JJ
  control/NN
  throughout/IN
  the/DT
  (GPE Middle/NNP East/NNP)
  ,/,
  and/CC
  arm/NN
  themselves/PRP
  with/IN
  weapons/NNS
  of/IN
  mass/NN
  murder/NN
  ./.)
(S
  Their/PRP$
  aim/NN
  is/VBZ
  to/TO
  seize/VB
  power/NN
  in/IN
  (GPE Iraq/NNP)
  ,/,
  and/CC
  use/VB
  it/PRP
  as/IN
  a/DT
  safe/JJ
  haven/NN
  to/TO
  launch/VB

(S
  We/PRP
  've/VBP
  adjusted/VBN
  our/PRP$
  military/JJ
  tactics/NNS
  and/CC
  changed/VBD
  our/PRP$
  approach/NN
  to/TO
  reconstruction/NN
  ./.)
(S
  Along/IN
  the/DT
  way/NN
  ,/,
  we/PRP
  have/VBP
  benefitted/VBN
  from/IN
  responsible/JJ
  criticism/NN
  and/CC
  counsel/NN
  offered/VBN
  by/IN
  members/NNS
  of/IN
  (ORGANIZATION Congress/NNP)
  of/IN
  both/DT
  parties/NNS
  ./.)
(S
  In/IN
  the/DT
  coming/VBG
  year/NN
  ,/,
  I/PRP
  will/MD
  continue/VB
  to/TO
  reach/VB
  out/RP
  and/CC
  seek/VB
  your/PRP$
  good/JJ
  advice/NN
  ./.)
(S
  Yet/RB
  ,/,
  there/EX
  is/VBZ
  a/DT
  difference/NN
  between/IN
  responsible/JJ
  criticism/NN
  that/WDT
  aims/VBZ
  for/IN
  success/NN
  ,/,
  and/CC
  defeatism/NN
  that/WDT
  refuses/VBZ
  to/TO
  acknowledge/VB
  anything/NN
  but/CC
  failure/NN
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  (PERSON Hindsight/NNP)
  alone/RB
  is/VBZ
  not/RB
  wisdom/JJ
  ,/,
  and/CC
  second-guessing/N

(S
  We/PRP
  show/VBP
  compassion/JJ
  abroad/RB
  because/IN
  (GSP Americans/NNPS)
  believe/VBP
  in/IN
  the/DT
  God-given/NNP
  dignity/NN
  and/CC
  worth/NN
  of/IN
  a/DT
  villager/NN
  with/IN
  HIV/AIDS/NNP
  ,/,
  or/CC
  an/DT
  infant/NN
  with/IN
  malaria/NNS
  ,/,
  or/CC
  a/DT
  refugee/JJ
  fleeing/NN
  genocide/NN
  ,/,
  or/CC
  a/DT
  young/JJ
  girl/NN
  sold/VBN
  into/IN
  slavery/NN
  ./.)
(S
  We/PRP
  also/RB
  show/VBP
  compassion/NN
  abroad/RB
  because/IN
  regions/NNS
  overwhelmed/VBN
  by/IN
  poverty/NN
  ,/,
  corruption/NN
  ,/,
  and/CC
  despair/NN
  are/VBP
  sources/NNS
  of/IN
  terrorism/NN
  ,/,
  and/CC
  organized/VBD
  crime/NN
  ,/,
  and/CC
  human/JJ
  trafficking/NN
  ,/,
  and/CC
  the/DT
  drug/NN
  trade/NN
  ./.)
(S
  In/IN
  recent/JJ
  years/NNS
  ,/,
  you/PRP
  and/CC
  I/PRP
  have/VBP
  taken/VBN
  unprecedented/JJ
  action/NN
  to/TO
  fight/VB
  (ORGANIZATION AIDS/NNP)
  and/CC
  malaria/NNS
  ,/,
  expand/VBP
  the/D

(S
  In/IN
  a/DT
  dynamic/JJ
  world/NN
  economy/NN
  ,/,
  we/PRP
  are/VBP
  seeing/VBG
  new/JJ
  competitors/NNS
  ,/,
  like/IN
  (GPE China/NNP)
  and/CC
  (GPE India/NNP)
  ,/,
  and/CC
  this/DT
  creates/VBZ
  uncertainty/NN
  ,/,
  which/WDT
  makes/VBZ
  it/PRP
  easier/JJR
  to/TO
  feed/VB
  people/NNS
  's/POS
  fears/NNS
  ./.)
(S
  So/IN
  we/PRP
  're/VBP
  seeing/VBG
  some/DT
  old/JJ
  temptations/NNS
  return/NN
  ./.)
(S
  Protectionists/NNS
  want/VBP
  to/TO
  escape/VB
  competition/NN
  ,/,
  pretending/VBG
  that/IN
  we/PRP
  can/MD
  keep/VB
  our/PRP$
  high/JJ
  standard/NN
  of/IN
  living/NN
  while/IN
  walling/VBG
  off/RP
  our/PRP$
  economy/NN
  ./.)
(S
  Others/NNS
  say/VBP
  that/IN
  the/DT
  government/NN
  needs/VBZ
  to/TO
  take/VB
  a/DT
  larger/JJR
  role/NN
  in/IN
  directing/VBG
  the/DT
  economy/NN
  ,/,
  centralizing/VBG
  more/JJR
  power/NN
  in/IN
  (GPE Washington/NNP)
  and/CC
  increasing/VBG
  taxes/NNS
  ./.)
(S
  We/PR

(S
  And/CC
  we/PRP
  must/MD
  have/VB
  a/DT
  rational/JJ
  ,/,
  humane/JJ
  guest/JJS
  worker/NN
  program/NN
  that/WDT
  rejects/VBZ
  amnesty/JJ
  ,/,
  allows/VBZ
  temporary/JJ
  jobs/NNS
  for/IN
  people/NNS
  who/WP
  seek/VBP
  them/PRP
  legally/RB
  ,/,
  and/CC
  reduces/NNS
  smuggling/VBG
  and/CC
  crime/NN
  at/IN
  the/DT
  border/NN
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  Keeping/VBG
  (GPE America/NNP)
  competitive/JJ
  requires/VBZ
  affordable/JJ
  health/NN
  care/NN
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  Our/PRP$
  government/NN
  has/VBZ
  a/DT
  responsibility/NN
  to/TO
  provide/VB
  health/NN
  care/NN
  for/IN
  the/DT
  poor/JJ
  and/CC
  the/DT
  elderly/JJ
  ,/,
  and/CC
  we/PRP
  are/VBP
  meeting/VBG
  that/IN
  responsibility/NN
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  For/IN
  all/DT
  Americans/NNPS
  --/:
  for/IN
  all/DT
  Americans/NNPS
  ,/,
  we/PRP
  must/MD
  confront/VB
  the/DT
  ris

  ./.)
(S
  Yet/RB
  our/PRP$
  greatness/NN
  is/VBZ
  not/RB
  measured/VBN
  in/IN
  power/NN
  or/CC
  luxuries/NNS
  ,/,
  but/CC
  by/IN
  who/WP
  we/PRP
  are/VBP
  and/CC
  how/WRB
  we/PRP
  treat/VBP
  one/CD
  another/DT
  ./.)
(S
  So/IN
  we/PRP
  strive/VBP
  to/TO
  be/VB
  a/DT
  compassionate/NN
  ,/,
  decent/NN
  ,/,
  hopeful/JJ
  society/NN
  ./.)
(S
  In/IN
  recent/JJ
  years/NNS
  ,/,
  (GPE America/NNP)
  has/VBZ
  become/VBN
  a/DT
  more/RBR
  hopeful/JJ
  nation/NN
  ./.)
(S
  Violent/JJ
  crime/NN
  rates/NNS
  have/VBP
  fallen/VBN
  to/TO
  their/PRP$
  lowest/JJS
  levels/NNS
  since/IN
  the/DT
  1970s/CD
  ./.)
(S
  (GPE Welfare/NN)
  cases/NNS
  have/VBP
  dropped/VBN
  by/IN
  more/JJR
  than/IN
  half/NN
  over/IN
  the/DT
  past/JJ
  decade/NN
  ./.)
(S
  (GPE Drug/NN)
  use/NN
  among/IN
  youth/NN
  is/VBZ
  down/RB
  19/CD
  percent/NN
  since/IN
  2001/CD
  ./.)
(S
  There/EX
  are/VBP
  fewer/JJR
  abortions/NNS
  in/IN
  (GPE America/NNP)
  

(S
  Yet/RB
  as/IN
  we/PRP
  meet/VBP
  these/DT
  immediate/JJ
  needs/NNS
  ,/,
  we/PRP
  must/MD
  also/RB
  address/VB
  deeper/JJR
  challenges/NNS
  that/WDT
  existed/VBD
  before/IN
  the/DT
  storm/NN
  arrived/VBD
  ./.)
(S
  In/IN
  (GSP New/NNP Orleans/NNP)
  and/CC
  in/IN
  other/JJ
  places/NNS
  ,/,
  many/JJ
  of/IN
  our/PRP$
  fellow/JJ
  citizens/NNS
  have/VBP
  felt/VBN
  excluded/VBN
  from/IN
  the/DT
  promise/NN
  of/IN
  our/PRP$
  country/NN
  ./.)
(S
  The/DT
  answer/NN
  is/VBZ
  not/RB
  only/RB
  temporary/JJ
  relief/NN
  ,/,
  but/CC
  schools/NNS
  that/WDT
  teach/VBP
  every/DT
  child/NN
  ,/,
  and/CC
  job/NN
  skills/NNS
  that/IN
  bring/VBG
  upward/JJ
  mobility/NN
  ,/,
  and/CC
  more/JJR
  opportunities/NNS
  to/TO
  own/VB
  a/DT
  home/NN
  and/CC
  start/VB
  a/DT
  business/NN
  ./.)
(S
  As/IN
  we/PRP
  recover/VBP
  from/IN
  a/DT
  disaster/NN
  ,/,
  let/VB
  us/PRP
  also/RB
  work/NN
  for/IN
  the/DT
  day/NN
  when/WRB
  a

# Lesson 8 - Lemmatization

- Similar to stemming, finding the root word.
- Generally will find the root stem or synonym to the original word

In [16]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))

cat
cat
cactus
goose
rock
python


In [17]:
print(lemmatizer.lemmatize("better", pos="a"))

good


This came back with a completely different word, which is good...literally.

In [18]:
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run", pos="v"))

best
run
run


In [19]:
print(lemmatizer.lemmatize("better"))

better


- The default parameter for lemmatize is pos="n". If you have something that isn't a noun, you have to pass through the POS tag.
- Lemmatizing is better than stemming b/c it gives you an actual word with an actual meaning. Easier for humans to understand

# Lesson 9 - NLTK Corpora

In [20]:
import nltk
print(nltk.__file__) # to see the location of python modules
# we probably want this to default to /usr/local/lib/python3.7/site-packages/nltk

/Users/andrewrubino/anaconda2/lib/python2.7/site-packages/nltk/__init__.pyc


In [21]:
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

sample = gutenberg.raw("bible-kjv.txt")

tok = sent_tokenize(sample)

print(tok[5:15])

[u'1:5 And God called the light Day, and the darkness he called Night.', u'And the evening and the morning were the first day.', u'1:6 And God said, Let there be a firmament in the midst of the waters,\nand let it divide the waters from the waters.', u'1:7 And God made the firmament, and divided the waters which were\nunder the firmament from the waters which were above the firmament:\nand it was so.', u'1:8 And God called the firmament Heaven.', u'And the evening and the\nmorning were the second day.', u'1:9 And God said, Let the waters under the heaven be gathered together\nunto one place, and let the dry land appear: and it was so.', u'1:10 And God called the dry land Earth; and the gathering together of\nthe waters called he Seas: and God saw that it was good.', u'1:11 And God said, Let the earth bring forth grass, the herb yielding\nseed, and the fruit tree yielding fruit after his kind, whose seed is\nin itself, upon the earth: and it was so.', u'1:12 And the earth brought forth 

# Lesson 10 - WordNet

- WordNet let's you look up synonmyms, antonyms, definitions, context, etc\

#### What do ppl use synsets for?
1. To rewrite things, like term papers.
2. To catch ppl who use tools like wordnets.

Basically, people use this to cheat, or to catch people who cheat.

In [22]:
from nltk.corpus import wordnet

syns = wordnet.synsets("program")

print(syns)

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]


In [23]:
print(syns[0])

Synset('plan.n.01')


  _warn_if_not_unicode(string)


In [24]:
# synset
print(syns[0].lemmas())

# just the word
print(syns[0].lemmas()[0].name())

# what if we want the definition
print(syns[0].definition())

# examples
print(syns[0].examples())

[Lemma('plan.n.01.plan'), Lemma('plan.n.01.program'), Lemma('plan.n.01.programme')]
plan
a series of steps to be carried out or goals to be accomplished
[u'they drew up a six-step plan', u'they discussed plans for a new bond issue']


In [25]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        # print("l: ",l)
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
            
print(set(synonyms))
print(set(antonyms))

set([u'beneficial', u'right', u'secure', u'just', u'unspoilt', u'respectable', u'good', u'goodness', u'dear', u'salutary', u'ripe', u'expert', u'skillful', u'in_force', u'proficient', u'unspoiled', u'dependable', u'soundly', u'honorable', u'full', u'undecomposed', u'safe', u'adept', u'upright', u'trade_good', u'sound', u'in_effect', u'practiced', u'effective', u'commodity', u'estimable', u'well', u'honest', u'near', u'skilful', u'thoroughly', u'serious'])
set([u'bad', u'badness', u'ill', u'evil', u'evilness'])


In [26]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")

print(w1.wup_similarity(w2)) # wup = wu & palmer, based on a paper they wrote on semantics

# these words are 91% similar

0.909090909091


In [27]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("car.n.01")

print(w1.wup_similarity(w2)) # wup = wu & palmer, based on a paper they wrote on semantics

w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("cat.n.01")

print(w1.wup_similarity(w2)) # wup = wu & palmer, based on a paper they wrote on semantics

0.695652173913
0.32


# Lesson 11 - Text Classification

For sentiment analysis!! In this, to classify things with two choices (ex: spam or not spam, positive or negative sentiment)

In [28]:
import nltk
import random
from nltk.corpus import movie_reviews

# documents = [(list(movie_reviews.words(fileid)), category)
#             for category in movie_reviews.categories()
#             for fileid in movie_reviews.fileids(category)]

# another way of writing the above
documents = []

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid.decode('utf-8'))), category))

# we're gonna train and test, and want to make sure train & test sets are not the same to avoid extreme bias
random.shuffle(documents)

print(documents[1])

([u'devotees', u'of', u'robert', u'a', u'.', u'heinlein', u',', u'be', u'forewarned', u':', u'paul', u'verhoeven', u"'", u's', u'starship', u'troopers', u'is', u'less', u'an', u'adaptation', u'of', u'heinlein', u"'", u's', u'novel', u'than', u'it', u'is', u'a', u'literary', u'satire', u'.', u'the', u'author', u"'", u's', u'jingo', u'-', u'all', u'-', u'the', u'-', u'way', u'militarism', u'and', u'his', u'tendency', u'to', u'create', u'plastic', u'characters', u'with', u'plastic', u'conflicts', u'had', u'me', u'expecting', u'a', u'soulless', u',', u'faceless', u'parade', u'of', u'carnage', u'from', u'the', u'film', u'version', u'.', u'and', u'i', u'suppose', u'that', u"'", u's', u'exactly', u'what', u'verhoeven', u'delivers', u',', u'with', u'sometimes', u'hilarious', u'results', u'.', u'heinlein', u"'", u's', u'basic', u'motifs', u'are', u'so', u'faithfully', u'rendered', u'that', u'their', u'flaws', u'become', u'a', u'source', u'of', u'amusement', u',', u'while', u'their', u'strengths

#### The above bag of words has a negative sentiment
What we want to do is take every word from every review, compile them, take that list and find the most popular words used. Then, we'll associated those popular words and see which appear in more popular positive or negative texts. Then we'll classify them. This is a naive bayes classification.

In [29]:
all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())
    
# convert all_words to nltk frequency distribution
all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))

[(u',', 77717), (u'the', 76529), (u'.', 65876), (u'a', 38106), (u'and', 35576), (u'of', 34123), (u'to', 31937), (u"'", 30585), (u'is', 25195), (u'in', 21822), (u's', 18513), (u'"', 17612), (u'it', 16107), (u'that', 15924), (u'-', 15595)]


In [30]:
# mostly useless stopwords above
print(all_words["stupid"])
# in 2000 movie reviews, the word stupid shows up 253 times

253


# Lesson 12 - Words as Features for Learning

In [31]:
# let's use the most common words, the rest don't matter so much for modeling
# word_features = list(all_words.keys())[:3000]
# this is actually wrong since list(all_words.keys()) does not have any order inside
# to order the top 3000 words, we can do something like:
word_features = [w[0] for w in all_words.most_common(3000)]
# as we can see, there are a lot of stop words in this list. we should consider
# removing the stop words to get better word features, but for now let's leave them in.

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
# this basically is saying False if the word is not associated with a negative review, and true otherwise

featuresets = [(find_features(rev), category) for (rev, category) in documents]



# Lesson 13 - Naive Bayes

The algorithm of choice, at least at a basic level, for text analysis is often the Naive Bayes classifier. Part of the reason for this is that text data is almost always massive in size. The Naive Bayes algorithm is so simple that it can be used at scale very easily with minimal process requirements.

In [32]:
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

# This is the algorithm calculation:
# posterior = prior occurences x likelihood / evidence
# the above gives us the likelihood of something to be positive (or negative). It's not the best algorithm,
# but it's scalable and easy to use

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes Algo accuracy:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)
# none of the top 15 words contained grammar. Grammar doesn't matter much in a large dataset, or it's not an informative feature

('Naive Bayes Algo accuracy:', 84.0)
Most Informative Features
             outstanding = True              pos : neg    =     10.6 : 1.0
                  finest = True              pos : neg    =      9.0 : 1.0
                   mulan = True              pos : neg    =      8.8 : 1.0
             beautifully = True              pos : neg    =      8.4 : 1.0
             wonderfully = True              pos : neg    =      7.6 : 1.0
                  seagal = True              neg : pos    =      7.1 : 1.0
              schumacher = True              neg : pos    =      6.7 : 1.0
                  prinze = True              neg : pos    =      6.7 : 1.0
                  alicia = True              neg : pos    =      6.7 : 1.0
                  wasted = True              neg : pos    =      6.1 : 1.0
                   damon = True              pos : neg    =      6.0 : 1.0
                   inept = True              neg : pos    =      5.6 : 1.0
                  ripley = True      

# Lesson 14 - Save Classifier with Pickle

As you will likely find with any form of data analysis, there is going to be some sort of processing bottleneck, that you repeat over and over, often yielding the same object in Python memory. 

Examples of this might be loading a massive dataset into memory, some basic pre-processing of a static dataset, or, like in our case, the training of a classifier. 

In our case, we spend much time on training our classifier, and soon we may add more. It is a wise choice to go ahead and pickle the trained classifer. This way, we can load in the trained classifier in a matter of milliseconds, rather than waiting 3-5+ minutes for the classifier to be trained. 

In [45]:
import pickle

save_classifier = open("naive_bayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [46]:
# lets open the pickle file now
classifier_f = open("naive_bayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

# let's see how it does this time

word_features = [w[0] for w in all_words.most_common(3000)]

featuresets = [(find_features(rev), category) for (rev, category) in documents]

training_set = featuresets[:1900]
testing_set = featuresets[1900:]

print("Naive Bayes Algo accuracy:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

('Naive Bayes Algo accuracy:', 60.842273453549424)
Most Informative Features
             outstanding = True              pos : neg    =     10.6 : 1.0
                  finest = True              pos : neg    =      9.0 : 1.0
                   mulan = True              pos : neg    =      8.8 : 1.0
             beautifully = True              pos : neg    =      8.4 : 1.0
             wonderfully = True              pos : neg    =      7.6 : 1.0
                  seagal = True              neg : pos    =      7.1 : 1.0
              schumacher = True              neg : pos    =      6.7 : 1.0
                  prinze = True              neg : pos    =      6.7 : 1.0
                  alicia = True              neg : pos    =      6.7 : 1.0
                  wasted = True              neg : pos    =      6.1 : 1.0
                   damon = True              pos : neg    =      6.0 : 1.0
                   inept = True              neg : pos    =      5.6 : 1.0
                  riple

In [50]:
testing_set[1]

({u'writings': False,
  u'magnetic': False,
  u'saves': False,
  u'foul': False,
  u'sleek': False,
  u'four': False,
  u'woods': False,
  u'asian': False,
  u'hanging': False,
  u'woody': False,
  u'comically': False,
  u'marching': False,
  u'relationships': False,
  u'psychopathic': False,
  u'endings': False,
  u'presents': False,
  u'superficially': False,
  u'bike': False,
  u'xtc': False,
  u'lord': False,
  u'immature': False,
  u'worth': False,
  u'alternating': False,
  u'compassion': False,
  u'blanket': False,
  u'hilarious': False,
  u'sellers': False,
  u'manic': False,
  u'leisurely': False,
  u'bringing': False,
  u'tickets': False,
  u'wednesday': False,
  u'relentlessly': False,
  u'deceptions': False,
  u'stereotypical': False,
  u'eye-popping': False,
  u'undercover': False,
  u'build-up': False,
  u'clothes': False,
  u'wisegirls': False,
  u'chew': False,
  u'sturm': False,
  u'diverting': False,
  u'tired': False,
  u'miller': False,
  u'pulse': False,
  u'elegan

# Lesson 15 - Scikit-Learn Incorporation with NLTK

In [35]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

print("Original Naive Bayes Algo accuracy:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

# Multinomial NB
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

# Gaussian
# GaussianNB_classifier = SklearnClassifier(GaussianNB())
# GaussianNB_classifier.train(training_set)
# print("GaussianNB_classifier accuracy percent:", (nltk.classify.accuracy(GaussianNB_classifier, testing_set))*100)

# Bernoulli
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

('Original Naive Bayes Algo accuracy:', 84.0)
Most Informative Features
             outstanding = True              pos : neg    =     10.6 : 1.0
                  finest = True              pos : neg    =      9.0 : 1.0
                   mulan = True              pos : neg    =      8.8 : 1.0
             beautifully = True              pos : neg    =      8.4 : 1.0
             wonderfully = True              pos : neg    =      7.6 : 1.0
                  seagal = True              neg : pos    =      7.1 : 1.0
                  prinze = True              neg : pos    =      6.7 : 1.0
              schumacher = True              neg : pos    =      6.7 : 1.0
                  alicia = True              neg : pos    =      6.7 : 1.0
                  wasted = True              neg : pos    =      6.1 : 1.0
                   damon = True              pos : neg    =      6.0 : 1.0
                   inept = True              neg : pos    =      5.6 : 1.0
                  ripley = T

In [36]:
# NOTE: all of these models are using default parameters. When you study and understand the parameters,
# then you can tune them to improve the accuracy of the models.
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)



('LogisticRegression_classifier accuracy percent:', 83.0)




('SGDClassifier_classifier accuracy percent:', 80.0)




('SVC_classifier accuracy percent:', 86.0)
('LinearSVC_classifier accuracy percent:', 77.0)
('NuSVC_classifier accuracy percent:', 85.0)


# Lesson 16 - Combining Algos with a Vote

Now that we have many classifiers, what if we created a new classifier, which combined the votes of all of the classifiers, and then classified the text whatever the majority vote was? 

Turns out, doing this is super easy. NLTK has considered this in advance, allowing us to inherit from their ClassifierI class from nltk.classify, which will give us the attributes of a classifier, yet allow us to write our own custom classifier code. 

In [37]:
# Take the models from above, but remove SVC (cuz that's what he does since the accuracy is low)
from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
            
        choice_votes = votes.count(mode(votes))
        conf = float(choice_votes) / len(votes)
        return conf

In [38]:
voted_classifier = VoteClassifier(classifier, MNB_classifier, BernoulliNB_classifier,
                                  LogisticRegression_classifier, SGDClassifier_classifier, 
                                  LinearSVC_classifier, NuSVC_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

print("Classification:", voted_classifier.classify(testing_set[0][0]),
      "Confidence %:", voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]),
      "Confidence %:", voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]),
      "Confidence %:", voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]),
      "Confidence %:", voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]),
      "Confidence %:", voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]),
      "Confidence %:", voted_classifier.confidence(testing_set[5][0])*100)

('voted_classifier accuracy percent:', 85.0)
('Classification:', u'neg', 'Confidence %:', 100.0)
('Classification:', u'neg', 'Confidence %:', 57.14285714285714)
('Classification:', u'pos', 'Confidence %:', 57.14285714285714)
('Classification:', u'neg', 'Confidence %:', 71.42857142857143)
('Classification:', u'pos', 'Confidence %:', 100.0)
('Classification:', u'neg', 'Confidence %:', 100.0)


In [39]:
for i in range(25):
    print("Classification:", voted_classifier.classify(testing_set[i][0]),
      "Confidence %:", voted_classifier.confidence(testing_set[i][0])*100)

('Classification:', u'neg', 'Confidence %:', 100.0)
('Classification:', u'neg', 'Confidence %:', 57.14285714285714)
('Classification:', u'pos', 'Confidence %:', 57.14285714285714)
('Classification:', u'neg', 'Confidence %:', 71.42857142857143)
('Classification:', u'pos', 'Confidence %:', 100.0)
('Classification:', u'neg', 'Confidence %:', 100.0)
('Classification:', u'pos', 'Confidence %:', 100.0)
('Classification:', u'pos', 'Confidence %:', 57.14285714285714)
('Classification:', u'pos', 'Confidence %:', 100.0)
('Classification:', u'neg', 'Confidence %:', 100.0)
('Classification:', u'neg', 'Confidence %:', 100.0)
('Classification:', u'pos', 'Confidence %:', 100.0)
('Classification:', u'pos', 'Confidence %:', 100.0)
('Classification:', u'neg', 'Confidence %:', 100.0)
('Classification:', u'neg', 'Confidence %:', 100.0)
('Classification:', u'neg', 'Confidence %:', 100.0)
('Classification:', u'pos', 'Confidence %:', 100.0)
('Classification:', u'neg', 'Confidence %:', 100.0)
('Classification

When the above is 100%, it means that ALL the models agree that list of features is pos or neg. When it's less than 100% it's saying that x/7 of the models (the number of models we passed into VoteClassifier) agree with the test set

# Lesson 17 - Investigating Bias

At this point in our project, we're interested in moving on to a real dataset, but we're concerned still about our volatility in accuracy.

In this video, we peak into the classifiers to see if we have any bias leans towards positive or negative, and we wind up finding out that not only do we have a bias, we have a bug!

In [40]:
# This one is hard to follow. All's i know is he doesn't shffule the dataset in the video, and the models
# get stuck in a local minima and become biased.


# Lesson 18 - Better Training Data

After some consideration it became clear that a new dataset would solve a lot of problems. This tutorial covers employing a new dataset, and what is involved in this process. 

This time, we're using a movie reviews data set that contains much shorter movie reviews. 

You can get this data set from: http://pythonprogramming.net/static/d...

This one yields us a far more reliable reading across the board, and is far more fitting for the tweets we intend to read from the Twitter API soon. 

In [41]:
import codecs

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
            
        choice_votes = votes.count(mode(votes))
        conf = float(choice_votes) / len(votes)
        return conf
    
# short_pos = open("data/short_reviews/positive.txt", "r").read()
# short_neg = open("data/short_reviews/negative.txt", "r").read()
short_pos = codecs.open("data/short_reviews/positive.txt","r", encoding='latin2').read()
short_neg = codecs.open("data/short_reviews/negative.txt","r",encoding='latin2').read()

documents = []

for r in short_pos.split('\n'):
    documents.append((r, "pos"))
    
for r in short_neg.split('\n'):
    documents.append((r, "neg"))
    
all_words = []

short_pos_words = set(word_tokenize(short_pos))
short_neg_words = set(word_tokenize(short_neg))

for w in short_pos_words:
    all_words.append(w.lower())
    
for w in short_neg_words:
    all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:5000]

def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]

random.shuffle(featuresets)

# positive data example:
training_set = featuresets[:10000]
testing_set = featuresets[10000:]

# lets open the pickle file now
classifier_f = open("naive_bayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()


print("Original Naive Bayes Algo accuracy:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

# Multinomial NB
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

# Bernoulli
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

# we want to pickle every classifier so that we can reuse these models on future test setsþyou

('Original Naive Bayes Algo accuracy:', 45.16616314199396)
Most Informative Features
             outstanding = True              pos : neg    =     10.6 : 1.0
                  finest = True              pos : neg    =      9.0 : 1.0
                   mulan = True              pos : neg    =      8.8 : 1.0
             beautifully = True              pos : neg    =      8.4 : 1.0
             wonderfully = True              pos : neg    =      7.6 : 1.0
                  seagal = True              neg : pos    =      7.1 : 1.0
                  prinze = True              neg : pos    =      6.7 : 1.0
              schumacher = True              neg : pos    =      6.7 : 1.0
                  alicia = True              neg : pos    =      6.7 : 1.0
                  wasted = True              neg : pos    =      6.1 : 1.0
                   damon = True              pos : neg    =      6.0 : 1.0
                   inept = True              neg : pos    =      5.6 : 1.0
               

KeyboardInterrupt: 

In [44]:
documents

[(u'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . ',
  'pos'),
 (u'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . ',
  'pos'),
 (u'effective but too-tepid biopic', 'pos'),
 (u'if you sometimes like to go to the movies to have fun , wasabi is a good place to start . ',
  'pos'),
 (u"emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . ",
  'pos'),
 (u'the film provides some great insight into the neurotic mindset of all comics -- even those who have reached the absolute top of the game . ',
  'pos'),
 (u'offers that rare combination of entertainment and education . ', 'pos'),
 (u'perhaps no picture ever made has more l

# Lesson 19 - Sentiment Analysis Module

Now that we've got a more reliable classifier, we're ready to push forward. Here, we cover how we can convert our classifier training script to an actual sentiment analysis module. 

We pickle everything, and create a new sentiment function, which, with a parameter of "Text" will perform a classification and return the result. 

By pickling everything, we find that we can load this module in seconds, rather than the prior 3-5 minutes. After this, we're ready to apply this module to a live Twitter stream. 

In [None]:
# a lot of pickling - worth going back to

# Lesson 20 - Twitter Sentiment Analysis

Let's do a live test with Twitter!

# Lesson 21 - Graphing Live Twitter Sentiment

Note! There is a section on how to do live updating graphs on pythonprogramming.net