In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [3]:
example_text = "Hello Mr. Smith, how are you doing today? The weather is great and Python is awesome. The sky is blue."

In [4]:
word_tokenize(example_text)

['Hello',
 'Mr.',
 'Smith',
 ',',
 'how',
 'are',
 'you',
 'doing',
 'today',
 '?',
 'The',
 'weather',
 'is',
 'great',
 'and',
 'Python',
 'is',
 'awesome',
 '.',
 'The',
 'sky',
 'is',
 'blue',
 '.']

In [5]:
sent_tokenize(example_text)

['Hello Mr. Smith, how are you doing today?',
 'The weather is great and Python is awesome.',
 'The sky is blue.']

In [6]:
for item in word_tokenize(example_text):
    print(item)

Hello
Mr.
Smith
,
how
are
you
doing
today
?
The
weather
is
great
and
Python
is
awesome
.
The
sky
is
blue
.


## Stop Words

Can be used in two ways.
1. To remove words that occur commonly.
2. To remove words that are used sarcastically or that can mislead the analysis.

In [7]:
from nltk.corpus import stopwords

In [8]:
stop_words = set(stopwords.words("english"))

In [9]:
token_words = word_tokenize(example_text)

In [10]:
token_words

['Hello',
 'Mr.',
 'Smith',
 ',',
 'how',
 'are',
 'you',
 'doing',
 'today',
 '?',
 'The',
 'weather',
 'is',
 'great',
 'and',
 'Python',
 'is',
 'awesome',
 '.',
 'The',
 'sky',
 'is',
 'blue',
 '.']

In [11]:
filtered_sentence = [w for w in token_words if w not in stop_words]

## Stemming

Take stem of the word for variation of the word. For example, use rid for riding and ride

In [12]:
from nltk.stem import PorterStemmer

In [13]:
example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]

In [14]:
ps = PorterStemmer()

In [15]:
[ps.stem(word) for word in example_words]

['python', 'python', 'python', 'python', 'pythonli']

In [16]:
new_text = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly atleast once."

In [17]:
python_words = word_tokenize(new_text)

In [18]:
[ps.stem(w) for w in python_words]

['It',
 'is',
 'veri',
 'import',
 'to',
 'be',
 'pythonli',
 'while',
 'you',
 'are',
 'python',
 'with',
 'python',
 '.',
 'all',
 'python',
 'have',
 'python',
 'poorli',
 'atleast',
 'onc',
 '.']

Use word net for getting synonyms

# Speech tagging

In [19]:
import nltk

tags words to part of the speech.. eg: noun, verb etc..

In [20]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
#Punkt sentence tokenizer is an unsupervised tokenizing algorithm

In [21]:
# Extracting raw text from GW Bush to train and test the Punkt Sentence tokenizer algorithm
train_text = state_union.raw('2005-GWBush.txt')
test_text = state_union.raw('2006-GWBush.txt')

In [22]:
custom_tokenizer = PunktSentenceTokenizer(train_text)

In [23]:
tokenized = custom_tokenizer.tokenize(test_text)

In [24]:
def process_content():
    try:
        tags = [nltk.pos_tag(nltk.word_tokenize(w)) for w in tokenized]
        print(tags)
    except Exception as e:
        print(str(e))

In [25]:
process_content()

[[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')], [('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('n

# Chunking

In [None]:
Next step to figure out the meaning of a sentence.

Who is the sentence talking about? (Noun is generally going to be the subject)

Noun phrases. Noun and a bunch of modifiers around that.

We can only chunck touching words

In [59]:
def process_content_chunk():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            # <> are used for mentioning speech tags
            #Any adverb. ? -> 0 or 1 characters
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP><NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)
        
    except Exception as e:
        print(str(e))
        
process_content_chunk()

(S
  (Chunk PRESIDENT/NNP)
  (Chunk GEORGE/NNP)
  (Chunk W./NNP)
  (Chunk BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP)
  (Chunk JOINT/NNP)
  (Chunk SESSION/NNP)
  OF/IN
  (Chunk THE/NNP)
  (Chunk CONGRESS/NNP)
  (Chunk ON/NNP)
  (Chunk THE/NNP)
  (Chunk STATE/NNP)
  OF/IN
  (Chunk THE/NNP)
  (Chunk UNION/NNP)
  (Chunk January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP)
  (Chunk PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
(S
  (Chunk Mr./NNP)
  (Chunk Speaker/NNP)
  ,/,
  (Chunk Vice/NNP)
  (Chunk President/NNP)
  (Chunk Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (Chunk Supreme/NNP)
  (Chunk Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  who/WP
  (Chunk called/VBD America/NNP)

  ./.)
(S
  (Chunk White/NNP)
  (Chunk House/NNP photo/NN)
  by/IN
  (Chunk Eric/NNP)
  (Chunk Draper/NNP)
  Our/PRP$
  men/NNS
  and/CC
  women/NNS
  in/IN
  uniform/JJ
  are/VBP
  making/VBG
  sacrifices/NNS
  --/:
  and/CC
  showing/VBG
  a/DT
  sense/NN
  of/IN
  duty/NN
  stronger/JJR
  than/IN
  all/DT
  fear/NN
  ./.)
(S
  They/PRP
  know/VBP
  what/WP
  it/PRP
  's/VBZ
  like/IN
  to/TO
  fight/VB
  house/NN
  to/TO
  house/NN
  in/IN
  a/DT
  maze/NN
  of/IN
  streets/NNS
  ,/,
  to/TO
  wear/VB
  heavy/JJ
  gear/NN
  in/IN
  the/DT
  desert/NN
  heat/NN
  ,/,
  to/TO
  see/VB
  a/DT
  comrade/NN
  killed/VBN
  by/IN
  a/DT
  roadside/NN
  bomb/NN
  ./.)
(S
  And/CC
  those/DT
  who/WP
  know/VBP
  the/DT
  costs/NNS
  also/RB
  know/VBP
  the/DT
  stakes/NNS
  ./.)
(S
  Marine/JJ
  (Chunk Staff/NNP)
  (Chunk Sergeant/NNP)
  (Chunk Dan/NNP)
  (Chunk Clay/NNP)
  was/VBD
  killed/VBN
  last/JJ
  month/NN
  fighting/VBG
  in/IN
  (Chunk Fallujah/NNP)
  ./.)
(S
  He/PRP
  left/VBD

Chunk is a noun

In [None]:
def process_content_chunk():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            # <> are used for mentioning speech tags
            #Any adverb. ? -> 0 or 1 characters
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            chunked.draw()
        
    except Exception as e:
        print(str(e))
        
process_content_chunk()

# Chinking


It is the removal of something from a chunk

In [27]:
def process_content_chink():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            # <> are used for mentioning speech tags
            #Any adverb. ? -> 0 or 1 characters
            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT>+{"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)
        
    except Exception as e:
        print(str(e))
        
process_content_chink()

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP 'S/POS ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk
    THE/NNP
    UNION/NNP
    January/NNP
    31/CD
    ,/,
    2006/CD
    THE/NNP
    PRESIDENT/NNP
    :/:
    Thank/NNP
    you/PRP)
  all/DT
  (Chunk ./.))
(S
  (Chunk
    Mr./NNP
    Speaker/NNP
    ,/,
    Vice/NNP
    President/NNP
    Cheney/NNP
    ,/,
    members/NNS)
  of/IN
  (Chunk Congress/NNP ,/, members/NNS)
  of/IN
  the/DT
  (Chunk
    Supreme/NNP
    Court/NNP
    and/CC
    diplomatic/JJ
    corps/NN
    ,/,
    distinguished/JJ
    guests/NNS
    ,/,
    and/CC
    fellow/JJ
    citizens/NNS
    :/:)
  Today/VB
  (Chunk our/PRP$ nation/NN)
  lost/VBD
  a/DT
  beloved/VBN
  (Chunk ,/, graceful/JJ ,/, courageous/JJ woman/NN who/WP)
  called/VBD
  (Chunk America/NNP to/TO its/PRP$ founding/NN ideals/NNS and/CC)
  carried/VBD
  on/IN
  a/DT
  (Chunk noble/JJ dream/NN 

(S
  (Chunk We/PRP)
  're/VBP
  (Chunk grateful/JJ to/TO)
  all/DT
  (Chunk who/WP)
  volunteer/VBP
  (Chunk to/TO)
  wear/VB
  (Chunk our/PRP$ nation/NN 's/POS uniform/NN --/: and/CC)
  as/IN
  (Chunk we/PRP)
  honor/VBP
  (Chunk our/PRP$ brave/NN troops/NNS ,/,)
  let/VB
  (Chunk us/PRP never/RB)
  forget/VBP
  the/DT
  (Chunk sacrifices/NNS)
  of/IN
  (Chunk America/NNP 's/POS military/JJ families/NNS ./.))
(S (Chunk (/( Applause/NNP ./. )/)))
(S
  (Chunk Our/PRP$ offensive/JJ)
  against/IN
  (Chunk terror/NN)
  involves/VBZ
  (Chunk more/JJR)
  than/IN
  (Chunk military/JJ action/NN ./.))
(S
  (Chunk Ultimately/RB ,/,)
  the/DT
  (Chunk only/JJ way/NN to/TO)
  defeat/VB
  the/DT
  (Chunk terrorists/NNS)
  is/VBZ
  (Chunk to/TO)
  defeat/VB
  (Chunk their/PRP$ dark/JJ vision/NN)
  of/IN
  hatred/VBN
  (Chunk and/CC)
  fear/VBN
  by/IN
  offering/VBG
  the/DT
  (Chunk hopeful/JJ alternative/NN)
  of/IN
  (Chunk political/JJ freedom/NN and/CC peaceful/JJ change/NN ./.))
(S
  So/IN
  t

(S
  For/IN
  all/DT
  (Chunk Americans/NNPS --/:)
  for/IN
  all/DT
  (Chunk Americans/NNPS ,/, we/PRP must/MD)
  confront/VB
  the/DT
  rising/VBG
  (Chunk cost/NN)
  of/IN
  (Chunk care/NN ,/,)
  strengthen/VB
  the/DT
  (Chunk
    doctor-patient/JJ
    relationship/NN
    ,/,
    and/CC
    help/NN
    people/NNS)
  afford/VBP
  the/DT
  (Chunk insurance/NN coverage/NN they/PRP)
  need/VBP
  (Chunk ./.))
(S (Chunk (/( Applause/NNP ./. )/)))
(S
  (Chunk We/PRP will/MD)
  make/VB
  (Chunk wider/JJR use/NN)
  of/IN
  (Chunk
    electronic/JJ
    records/NNS
    and/CC
    other/JJ
    health/NN
    information/NN
    technology/NN
    ,/,
    to/TO)
  help/VB
  control/VB
  (Chunk costs/NNS and/CC)
  reduce/VB
  (Chunk dangerous/JJ medical/JJ errors/NNS ./.))
(S
  (Chunk We/PRP will/MD)
  strengthen/VB
  (Chunk health/NN savings/NNS accounts/NNS --/:)
  making/VBG
  (Chunk
    sure/JJ
    individuals/NNS
    and/CC
    small/JJ
    business/NN
    employees/NNS
    can/MD)
  buy/VB
  

# Named Entity

Organization, person, location, date, time, money, percent, facility, gpe

In [32]:
def process_content_ne(binary = False):
    try:
        for i in tokenized[2:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            namedEnt = nltk.ne_chunk(tagged, binary = binary)
            
            print(namedEnt)
        
    except Exception as e:
        print(str(e))

In [33]:
process_content_ne()
#The first item in the tuple is NE.

(S
  Tonight/NN
  we/PRP
  are/VBP
  comforted/VBN
  by/IN
  the/DT
  hope/NN
  of/IN
  a/DT
  glad/JJ
  reunion/NN
  with/IN
  the/DT
  husband/NN
  who/WP
  was/VBD
  taken/VBN
  so/RB
  long/RB
  ago/RB
  ,/,
  and/CC
  we/PRP
  are/VBP
  grateful/JJ
  for/IN
  the/DT
  good/JJ
  life/NN
  of/IN
  (ORGANIZATION Coretta/NNP Scott/NNP King/NNP)
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  President/NNP
  (PERSON George/NNP W./NNP Bush/NNP)
  reacts/VBZ
  to/TO
  applause/VB
  during/IN
  his/PRP$
  State/NNP
  of/IN
  the/DT
  (ORGANIZATION Union/NNP Address/NNP)
  at/IN
  the/DT
  (GPE Capitol/NNP)
  ,/,
  Tuesday/NNP
  ,/,
  (PERSON Jan/NNP)
  ./.)
(S 31/CD ,/, 2006/CD ./.)
(S
  (FACILITY White/NNP)
  (ORGANIZATION House/NNP)
  photo/NN
  by/IN
  (PERSON Eric/NNP)
  DraperEvery/NNP
  time/NN
  I/PRP
  'm/VBP
  invited/JJ
  to/TO
  this/DT
  rostrum/NN
  ,/,
  I/PRP
  'm/VBP
  humbled/VBN
  by/IN
  the/DT
  privilege/NN
  ,/,
  and/CC
  mindful/NN
  of/IN
  the/DT
  histor

(S
  (GPE Second/JJ)
  ,/,
  we/PRP
  're/VBP
  continuing/VBG
  reconstruction/NN
  efforts/NNS
  ,/,
  and/CC
  helping/VBG
  the/DT
  (GPE Iraqi/NNP)
  government/NN
  to/TO
  fight/VB
  corruption/NN
  and/CC
  build/VB
  a/DT
  modern/JJ
  economy/NN
  ,/,
  so/IN
  all/DT
  (GPE Iraqis/NNP)
  can/MD
  experience/VB
  the/DT
  benefits/NNS
  of/IN
  freedom/NN
  ./.)
(S
  And/CC
  ,/,
  third/JJ
  ,/,
  we/PRP
  're/VBP
  striking/VBG
  terrorist/JJ
  targets/NNS
  while/IN
  we/PRP
  train/VBP
  (GPE Iraqi/JJ)
  forces/NNS
  that/WDT
  are/VBP
  increasingly/RB
  capable/JJ
  of/IN
  defeating/VBG
  the/DT
  enemy/NN
  ./.)
(S
  (GPE Iraqis/NNP)
  are/VBP
  showing/VBG
  their/PRP$
  courage/NN
  every/DT
  day/NN
  ,/,
  and/CC
  we/PRP
  are/VBP
  proud/JJ
  to/TO
  be/VB
  their/PRP$
  allies/NNS
  in/IN
  the/DT
  cause/NN
  of/IN
  freedom/NN
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  Our/PRP$
  work/NN
  in/IN
  (GPE Iraq/NNP)
  is/VBZ
  difficult/JJ
  because/

(S
  Our/PRP$
  offensive/JJ
  against/IN
  terror/NN
  involves/VBZ
  more/JJR
  than/IN
  military/JJ
  action/NN
  ./.)
(S
  Ultimately/RB
  ,/,
  the/DT
  only/JJ
  way/NN
  to/TO
  defeat/VB
  the/DT
  terrorists/NNS
  is/VBZ
  to/TO
  defeat/VB
  their/PRP$
  dark/JJ
  vision/NN
  of/IN
  hatred/VBN
  and/CC
  fear/VBN
  by/IN
  offering/VBG
  the/DT
  hopeful/JJ
  alternative/NN
  of/IN
  political/JJ
  freedom/NN
  and/CC
  peaceful/JJ
  change/NN
  ./.)
(S
  So/IN
  the/DT
  (GPE United/NNP States/NNPS)
  of/IN
  (GPE America/NNP)
  supports/NNS
  democratic/JJ
  reform/NN
  across/IN
  the/DT
  broader/JJR
  (GPE Middle/NNP East/NNP)
  ./.)
(S
  Elections/NNS
  are/VBP
  vital/JJ
  ,/,
  but/CC
  they/PRP
  are/VBP
  only/RB
  the/DT
  beginning/NN
  ./.)
(S
  Raising/VBG
  up/RP
  a/DT
  democracy/NN
  requires/VBZ
  the/DT
  rule/NN
  of/IN
  law/NN
  ,/,
  and/CC
  protection/NN
  of/IN
  minorities/NNS
  ,/,
  and/CC
  strong/JJ
  ,/,
  accountable/JJ
  institutions/NNS
 

  ./.)
(S
  But/CC
  we/PRP
  did/VBD
  not/RB
  know/VB
  about/IN
  their/PRP$
  plans/NNS
  until/IN
  it/PRP
  was/VBD
  too/RB
  late/JJ
  ./.)
(S
  So/RB
  to/TO
  prevent/VB
  another/DT
  attack/NN
  --/:
  based/VBN
  on/IN
  authority/NN
  given/VBN
  to/TO
  me/PRP
  by/IN
  the/DT
  (ORGANIZATION Constitution/NNP)
  and/CC
  by/IN
  statute/NN
  --/:
  I/PRP
  have/VBP
  authorized/VBN
  a/DT
  terrorist/JJ
  surveillance/NN
  program/NN
  to/TO
  aggressively/RB
  pursue/VB
  the/DT
  international/JJ
  communications/NNS
  of/IN
  suspected/JJ
  al/JJ
  Qaeda/NNP
  operatives/NNS
  and/CC
  affiliates/NNS
  to/TO
  and/CC
  from/IN
  (GPE America/NNP)
  ./.)
(S
  Previous/JJ
  Presidents/NNS
  have/VBP
  used/VBN
  the/DT
  same/JJ
  constitutional/JJ
  authority/NN
  I/PRP
  have/VBP
  ,/,
  and/CC
  federal/JJ
  courts/NNS
  have/VBP
  approved/VBN
  the/DT
  use/NN
  of/IN
  that/DT
  authority/NN
  ./.)
(S
  Appropriate/JJ
  members/NNS
  of/IN
  (ORGANIZATION Congres

(S
  Keeping/VBG
  (GPE America/NNP)
  competitive/JJ
  requires/VBZ
  us/PRP
  to/TO
  open/VB
  more/JJR
  markets/NNS
  for/IN
  all/DT
  that/DT
  Americans/NNPS
  make/VBP
  and/CC
  grow/VB
  ./.)
(S
  One/CD
  out/NN
  of/IN
  every/DT
  five/CD
  factory/NN
  jobs/NNS
  in/IN
  (GPE America/NNP)
  is/VBZ
  related/VBN
  to/TO
  global/JJ
  trade/NN
  ,/,
  and/CC
  we/PRP
  want/VBP
  people/NNS
  everywhere/RB
  to/TO
  buy/VB
  (GPE American/NNP)
  ./.)
(S
  With/IN
  open/JJ
  markets/NNS
  and/CC
  a/DT
  level/JJ
  playing/NN
  field/NN
  ,/,
  no/DT
  one/NN
  can/MD
  out-produce/VB
  or/CC
  out-compete/VB
  the/DT
  (GPE American/JJ)
  worker/NN
  ./.)
(S (/( (ORGANIZATION Applause/NNP) ./. )/))
(S
  Keeping/VBG
  (GPE America/NNP)
  competitive/JJ
  requires/VBZ
  an/DT
  immigration/NN
  system/NN
  that/WDT
  upholds/VBZ
  our/PRP$
  laws/NNS
  ,/,
  reflects/VBZ
  our/PRP$
  values/NNS
  ,/,
  and/CC
  serves/VBZ
  the/DT
  interests/NNS
  of/IN
  our/PRP$
  econom

In [34]:
process_content_ne(binary=True)

(S
  Tonight/NN
  we/PRP
  are/VBP
  comforted/VBN
  by/IN
  the/DT
  hope/NN
  of/IN
  a/DT
  glad/JJ
  reunion/NN
  with/IN
  the/DT
  husband/NN
  who/WP
  was/VBD
  taken/VBN
  so/RB
  long/RB
  ago/RB
  ,/,
  and/CC
  we/PRP
  are/VBP
  grateful/JJ
  for/IN
  the/DT
  good/JJ
  life/NN
  of/IN
  (NE Coretta/NNP Scott/NNP King/NNP)
  ./.)
(S (/( (NE Applause/NNP) ./. )/))
(S
  President/NNP
  (NE George/NNP)
  W./NNP
  Bush/NNP
  reacts/VBZ
  to/TO
  applause/VB
  during/IN
  his/PRP$
  State/NNP
  of/IN
  the/DT
  (NE Union/NNP Address/NNP)
  at/IN
  the/DT
  (NE Capitol/NNP)
  ,/,
  Tuesday/NNP
  ,/,
  (NE Jan/NNP)
  ./.)
(S 31/CD ,/, 2006/CD ./.)
(S
  (NE White/NNP House/NNP)
  photo/NN
  by/IN
  (NE Eric/NNP)
  DraperEvery/NNP
  time/NN
  I/PRP
  'm/VBP
  invited/JJ
  to/TO
  this/DT
  rostrum/NN
  ,/,
  I/PRP
  'm/VBP
  humbled/VBN
  by/IN
  the/DT
  privilege/NN
  ,/,
  and/CC
  mindful/NN
  of/IN
  the/DT
  history/NN
  we/PRP
  've/VBP
  seen/VBN
  together/RB
  ./.)
(S
  W

(S
  Their/PRP$
  aim/NN
  is/VBZ
  to/TO
  seize/VB
  power/NN
  in/IN
  (NE Iraq/NNP)
  ,/,
  and/CC
  use/VB
  it/PRP
  as/IN
  a/DT
  safe/JJ
  haven/NN
  to/TO
  launch/VB
  attacks/NNS
  against/IN
  (NE America/NNP)
  and/CC
  the/DT
  world/NN
  ./.)
(S
  Lacking/VBG
  the/DT
  military/JJ
  strength/NN
  to/TO
  challenge/VB
  us/PRP
  directly/RB
  ,/,
  the/DT
  terrorists/NNS
  have/VBP
  chosen/VBN
  the/DT
  weapon/NN
  of/IN
  fear/NN
  ./.)
(S
  When/WRB
  they/PRP
  murder/VBP
  children/NNS
  at/IN
  a/DT
  school/NN
  in/IN
  (NE Beslan/NNP)
  ,/,
  or/CC
  blow/VB
  up/RP
  commuters/NNS
  in/IN
  (NE London/NNP)
  ,/,
  or/CC
  behead/VB
  a/DT
  bound/NN
  captive/NN
  ,/,
  the/DT
  terrorists/NNS
  hope/VBP
  these/DT
  horrors/NNS
  will/MD
  break/VB
  our/PRP$
  will/MD
  ,/,
  allowing/VBG
  the/DT
  violent/NN
  to/TO
  inherit/VB
  the/DT
  (NE Earth/NNP)
  ./.)
(S
  But/CC
  they/PRP
  have/VBP
  miscalculated/VBN
  :/:
  We/PRP
  love/VBP
  our/PRP$
  fr

  ./.)
(S
  Raising/VBG
  up/RP
  a/DT
  democracy/NN
  requires/VBZ
  the/DT
  rule/NN
  of/IN
  law/NN
  ,/,
  and/CC
  protection/NN
  of/IN
  minorities/NNS
  ,/,
  and/CC
  strong/JJ
  ,/,
  accountable/JJ
  institutions/NNS
  that/IN
  last/JJ
  longer/JJR
  than/IN
  a/DT
  single/JJ
  vote/NN
  ./.)
(S
  The/DT
  great/JJ
  people/NNS
  of/IN
  (NE Egypt/NNP)
  have/VBP
  voted/VBN
  in/IN
  a/DT
  multi-party/JJ
  presidential/JJ
  election/NN
  --/:
  and/CC
  now/RB
  their/PRP$
  government/NN
  should/MD
  open/VB
  paths/NNS
  of/IN
  peaceful/JJ
  opposition/NN
  that/WDT
  will/MD
  reduce/VB
  the/DT
  appeal/NN
  of/IN
  radicalism/NN
  ./.)
(S
  The/DT
  (NE Palestinian/JJ)
  people/NNS
  have/VBP
  voted/VBN
  in/IN
  elections/NNS
  ./.)
(S
  And/CC
  now/RB
  the/DT
  leaders/NNS
  of/IN
  (NE Hamas/NNP)
  must/MD
  recognize/VB
  (NE Israel/NNP)
  ,/,
  disarm/NN
  ,/,
  reject/JJ
  terrorism/NN
  ,/,
  and/CC
  work/NN
  for/IN
  lasting/VBG
  peace/NN
  ./.)
(S

False positives and error rates are very high

# Lemmatizer

In [35]:
from nltk.stem import WordNetLemmatizer

In [36]:
lemmatizer = WordNetLemmatizer()

In [37]:
lemmatizer.lemmatize("cats")

'cat'

In [38]:
lemmatizer.lemmatize("cacti")

'cactus'

In [41]:
lemmatizer.lemmatize("lovely")

'lovely'

In [42]:
lemmatizer.lemmatize("better")

'better'

In [43]:
lemmatizer.lemmatize("better", pos="a")

'good'

In [44]:
lemmatizer.lemmatize("best", pos="a")

'best'

Position for lemmatizing is set to noun by default.

Could be more useful than stemming.

# Corpora

In [45]:
nltk.__file__

'/Users/avinash/anaconda3/lib/python3.6/site-packages/nltk/__init__.py'

# Wordnet

Using wordnet we can obtain the synonyms and antonyms.

In [46]:
from nltk.corpus import wordnet

In [47]:
syns = wordnet.synsets("program")

In [48]:
syns

[Synset('plan.n.01'),
 Synset('program.n.02'),
 Synset('broadcast.n.02'),
 Synset('platform.n.02'),
 Synset('program.n.05'),
 Synset('course_of_study.n.01'),
 Synset('program.n.07'),
 Synset('program.n.08'),
 Synset('program.v.01'),
 Synset('program.v.02')]

In [55]:
syns[0]

Synset('plan.n.01')

In [53]:
# Just the word
syns[0].lemmas()[0].name()

'plan'

In [56]:
# Definition
syns[0].definition()

'a series of steps to be carried out or goals to be accomplished'

In [57]:
#Examples using  
syns[0].examples()

['they drew up a six-step plan', 'they discussed plans for a new bond issue']

In [59]:
#Synonyms and antonyms
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print(set(antonyms))

{'skilful', 'unspoiled', 'trade_good', 'goodness', 'honest', 'near', 'unspoilt', 'commodity', 'just', 'secure', 'ripe', 'undecomposed', 'adept', 'effective', 'thoroughly', 'honorable', 'expert', 'sound', 'in_force', 'safe', 'serious', 'upright', 'respectable', 'right', 'in_effect', 'soundly', 'proficient', 'beneficial', 'salutary', 'estimable', 'full', 'dear', 'skillful', 'good', 'well', 'dependable', 'practiced'}
{'evil', 'ill', 'bad', 'badness', 'evilness'}


In [62]:
# Setting up Symmantic similarity for two words
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")

# Wu and Palmer similarity
w1.wup_similarity(w2)

0.9090909090909091

In [63]:
# Setting up Symmantic similarity for two words
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("car.n.01")

# Wu and Palmer similarity
w1.wup_similarity(w2)

0.6956521739130435

In [64]:
# Setting up Symmantic similarity for two words
w1 = wordnet.synset("cat.n.01")
w2 = wordnet.synset("dog.n.01")

# Wu and Palmer similarity
w1.wup_similarity(w2)

0.8571428571428571

# Text Classification

In [66]:
import random
from nltk.corpus import movie_reviews

In [68]:
movie_reviews.raw()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [77]:
# Using list comprehension
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]

In [78]:
documents[0]

(['plot',
  ':',
  'two',
  'teen',
  'couples',
  'go',
  'to',
  'a',
  'church',
  'party',
  ',',
  'drink',
  'and',
  'then',
  'drive',
  '.',
  'they',
  'get',
  'into',
  'an',
  'accident',
  '.',
  'one',
  'of',
  'the',
  'guys',
  'dies',
  ',',
  'but',
  'his',
  'girlfriend',
  'continues',
  'to',
  'see',
  'him',
  'in',
  'her',
  'life',
  ',',
  'and',
  'has',
  'nightmares',
  '.',
  'what',
  "'",
  's',
  'the',
  'deal',
  '?',
  'watch',
  'the',
  'movie',
  'and',
  '"',
  'sorta',
  '"',
  'find',
  'out',
  '.',
  '.',
  '.',
  'critique',
  ':',
  'a',
  'mind',
  '-',
  'fuck',
  'movie',
  'for',
  'the',
  'teen',
  'generation',
  'that',
  'touches',
  'on',
  'a',
  'very',
  'cool',
  'idea',
  ',',
  'but',
  'presents',
  'it',
  'in',
  'a',
  'very',
  'bad',
  'package',
  '.',
  'which',
  'is',
  'what',
  'makes',
  'this',
  'review',
  'an',
  'even',
  'harder',
  'one',
  'to',
  'write',
  ',',
  'since',
  'i',
  'generally',
  'a

In [75]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append([list(movie_reviews.words(fileid)), category])

In [76]:
documents[0]

[['plot',
  ':',
  'two',
  'teen',
  'couples',
  'go',
  'to',
  'a',
  'church',
  'party',
  ',',
  'drink',
  'and',
  'then',
  'drive',
  '.',
  'they',
  'get',
  'into',
  'an',
  'accident',
  '.',
  'one',
  'of',
  'the',
  'guys',
  'dies',
  ',',
  'but',
  'his',
  'girlfriend',
  'continues',
  'to',
  'see',
  'him',
  'in',
  'her',
  'life',
  ',',
  'and',
  'has',
  'nightmares',
  '.',
  'what',
  "'",
  's',
  'the',
  'deal',
  '?',
  'watch',
  'the',
  'movie',
  'and',
  '"',
  'sorta',
  '"',
  'find',
  'out',
  '.',
  '.',
  '.',
  'critique',
  ':',
  'a',
  'mind',
  '-',
  'fuck',
  'movie',
  'for',
  'the',
  'teen',
  'generation',
  'that',
  'touches',
  'on',
  'a',
  'very',
  'cool',
  'idea',
  ',',
  'but',
  'presents',
  'it',
  'in',
  'a',
  'very',
  'bad',
  'package',
  '.',
  'which',
  'is',
  'what',
  'makes',
  'this',
  'review',
  'an',
  'even',
  'harder',
  'one',
  'to',
  'write',
  ',',
  'since',
  'i',
  'generally',
  'a

In [81]:
random.shuffle(documents)

In [82]:
documents[0]

(['ex',
  '-',
  'universal',
  'soldier',
  'luc',
  'has',
  'to',
  'battle',
  'a',
  'group',
  'of',
  'newer',
  '-',
  'model',
  'engineered',
  'fighters',
  'gone',
  'bad',
  '.',
  'the',
  'review',
  'jean',
  '-',
  'claude',
  'van',
  'damme',
  'has',
  'a',
  'one',
  '-',
  'liner',
  'early',
  'on',
  'in',
  'universal',
  'soldier',
  ':',
  'the',
  'return',
  ',',
  'his',
  'latest',
  'attempt',
  'to',
  'remain',
  'relevant',
  ',',
  'that',
  'sums',
  'up',
  'this',
  'entire',
  'movie',
  ';',
  'he',
  'says',
  '"',
  'been',
  'there',
  ',',
  'done',
  'that',
  '.',
  '"',
  'no',
  'film',
  'critic',
  'could',
  'possibly',
  'sum',
  'up',
  'van',
  'damme',
  "'",
  's',
  'recent',
  'film',
  'choices',
  'any',
  'better',
  '.',
  'while',
  'other',
  'ageing',
  'action',
  'stars',
  'have',
  'wisely',
  'moved',
  'into',
  'other',
  'film',
  'genres',
  '(',
  'schwarzenegger',
  'makes',
  'as',
  'many',
  'family',
  'co

In [83]:
all_words = [w.lower() for w in movie_reviews.words()]

In [84]:
# Have frequency distribution 
all_words = nltk.FreqDist(all_words)

In [85]:
#To view the most common words
all_words.most_common(5)

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576)]

In [86]:
# To the the number of time the word supid occurred
all_words["stupid"]

253

In [88]:
list(all_words.keys())[:10]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']

In [98]:
all_words.most_common(10)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822)]

In [100]:
[w[0] for w in all_words.most_common(10)]

[',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in']

In [102]:
list(all_words.keys())[:10]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']

In [103]:
word_features = [w[0] for w in all_words.most_common(3000)]

In [112]:
def find_features(document):
    '''
    document = list of all the words in a review
    '''
    # To extract only the unique words in a document
    words = set(document)
    features = {}
    
    #Set true or false based on the if the word is 
    #present in the top 3000 words
    for w in word_features:
        features[w] = (w in words)
        
    return features    

In [113]:
find_features(movie_reviews.words('neg/cv000_29416.txt'))

{',': True,
 'the': True,
 '.': True,
 'a': True,
 'and': True,
 'of': True,
 'to': True,
 "'": True,
 'is': True,
 'in': True,
 's': True,
 '"': True,
 'it': True,
 'that': True,
 '-': True,
 ')': True,
 '(': True,
 'as': True,
 'with': True,
 'for': True,
 'his': True,
 'this': True,
 'film': True,
 'i': True,
 'he': True,
 'but': True,
 'on': True,
 'are': True,
 't': True,
 'by': True,
 'be': True,
 'one': True,
 'movie': True,
 'an': True,
 'who': True,
 'not': True,
 'you': True,
 'from': True,
 'at': False,
 'was': False,
 'have': True,
 'they': True,
 'has': True,
 'her': True,
 'all': True,
 '?': True,
 'there': True,
 'like': True,
 'so': True,
 'out': True,
 'about': True,
 'up': True,
 'more': True,
 'what': True,
 'when': True,
 'which': True,
 'or': True,
 'she': False,
 'their': False,
 ':': True,
 'some': False,
 'just': True,
 'can': False,
 'if': False,
 'we': True,
 'him': True,
 'into': True,
 'even': True,
 'only': True,
 'than': False,
 'no': True,
 'good': True,


In [114]:
feature_sets = [(find_features(rev), category) for (rev, category) in documents]

In [115]:
feature_sets[1000]

({',': True,
  'the': True,
  '.': True,
  'a': True,
  'and': True,
  'of': True,
  'to': True,
  "'": True,
  'is': True,
  'in': True,
  's': True,
  '"': False,
  'it': True,
  'that': True,
  '-': False,
  ')': True,
  '(': True,
  'as': True,
  'with': True,
  'for': True,
  'his': True,
  'this': True,
  'film': False,
  'i': True,
  'he': False,
  'but': True,
  'on': True,
  'are': False,
  't': True,
  'by': False,
  'be': True,
  'one': True,
  'movie': True,
  'an': False,
  'who': False,
  'not': True,
  'you': True,
  'from': False,
  'at': False,
  'was': True,
  'have': True,
  'they': False,
  'has': False,
  'her': False,
  'all': False,
  '?': True,
  'there': True,
  'like': True,
  'so': True,
  'out': False,
  'about': True,
  'up': False,
  'more': False,
  'what': True,
  'when': False,
  'which': False,
  'or': True,
  'she': False,
  'their': False,
  ':': False,
  'some': False,
  'just': False,
  'can': True,
  'if': True,
  'we': False,
  'him': False,
  'i

In [116]:
len(feature_sets)

2000

In [117]:
training_set = feature_sets[:1800]
testing_set = feature_sets[1800:]

In [118]:
#Bayes-> posterior = prior occurances * likelihood / evidence
#Highly scalable and easy to understand

classifier =nltk.NaiveBayesClassifier.train(training_set)

In [119]:
nltk.classify.accuracy(classifier, training_set)

0.8844444444444445

In [120]:
nltk.classify.accuracy(classifier, testing_set)

0.83

In [121]:
classifier.most_informative_features(15)

[('outstanding', True),
 ('damon', True),
 ('seagal', True),
 ('mulan', True),
 ('stiller', True),
 ('idiotic', True),
 ('finest', True),
 ('prinze', True),
 ('awful', True),
 ('wonderfully', True),
 ('schumacher', True),
 ('inept', True),
 ('flynt', True),
 ('ordinary', True),
 ('jolie', True)]

In [122]:
classifier.show_most_informative_features(15)

Most Informative Features
             outstanding = True              pos : neg    =     10.8 : 1.0
                   damon = True              pos : neg    =      7.4 : 1.0
                  seagal = True              neg : pos    =      7.3 : 1.0
                   mulan = True              pos : neg    =      7.1 : 1.0
                 stiller = True              pos : neg    =      7.1 : 1.0
                 idiotic = True              neg : pos    =      6.9 : 1.0
                  finest = True              pos : neg    =      6.9 : 1.0
                  prinze = True              neg : pos    =      6.5 : 1.0
                   awful = True              neg : pos    =      6.5 : 1.0
             wonderfully = True              pos : neg    =      6.4 : 1.0
              schumacher = True              neg : pos    =      6.1 : 1.0
                   inept = True              neg : pos    =      5.9 : 1.0
                   flynt = True              pos : neg    =      5.7 : 1.0

In [124]:
#Pickle is a format in which we can save python objects
import pickle

In [127]:
#wb is write bytes
save_classifier = open("naivebayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [129]:
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

In [130]:
#Sklearn wrapper in nltk
from nltk.classify.scikitlearn import SklearnClassifier

In [131]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [132]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
nltk.classify.accuracy(MNB_classifier, testing_set)

0.83

In [None]:
# Does not work with nltk wrapper
GNB_classifier = SklearnClassifier(GaussianNB())
GNB_classifier.train(training_set)
nltk.classify.accuracy(GNB_classifier, testing_set)

In [135]:
BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
nltk.classify.accuracy(BNB_classifier, testing_set)

0.835

In [137]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [138]:
#LogisticRegression
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
nltk.classify.accuracy(LogisticRegression_classifier, testing_set)

0.85

In [139]:
#SGDClassifier
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
nltk.classify.accuracy(SGDClassifier_classifier, testing_set)



0.82

In [140]:
#SVC
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
nltk.classify.accuracy(SVC_classifier, testing_set)

0.85

In [141]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
nltk.classify.accuracy(LinearSVC_classifier, testing_set)

0.84

In [142]:
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
nltk.classify.accuracy(NuSVC_classifier, testing_set)

0.865

# Ensemble Models

In [143]:
from nltk.classify import ClassifierI
#To decide which outcome got most votes
from statistics import mode

In [149]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
    
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        selection = votes.count(mode(votes))
        conf = selection/len(votes)
        return conf                         

In [152]:
# First one is commented out so that there are odd number of classifiers
voted_classifiers = VoteClassifier(#classifier, 
                                   MNB_classifier, 
                                   BNB_classifier, 
                                   LogisticRegression_classifier, 
                                   SGDClassifier_classifier, 
                                   SVC_classifier, 
                                   LinearSVC_classifier, 
                                   NuSVC_classifier)

In [153]:
nltk.classify.accuracy(voted_classifiers, testing_set)

0.87

In [158]:
for i in range(0,5):
    print("Classification: ", voted_classifiers.classify(testing_set[i][0]), 
      "& Confidence %:", voted_classifiers.confidence(testing_set[i][0]))

Classification:  pos & Confidence %: 1.0
Classification:  pos & Confidence %: 1.0
Classification:  neg & Confidence %: 1.0
Classification:  neg & Confidence %: 0.5714285714285714
Classification:  pos & Confidence %: 0.8571428571428571


In [None]:
# Use confusion matrix for all the methods and also the ensemble classifier.
# Use sklearn accuracy. (Do not use nltk wrapper)