In [1]:
import nltk
import os
import nltk.corpus

In [2]:
#print(os.listdir(nltk.data.find("corpora")))

In [3]:
#nltk.corpus.gutenberg.fileids()

In [4]:
hamlet = nltk.corpus.gutenberg.words('shakespeare-hamlet.txt')
hamlet

['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', ...]

In [5]:
for word in hamlet[:20]:
    print(word, sep = ' ', end = ' ')

[ The Tragedie of Hamlet by William Shakespeare 1599 ] Actus Primus . Scoena Prima . Enter Barnardo and Francisco 

In [6]:
AI = "week ago a friend invited a couple of other couples over for dinner."

In [7]:
from nltk.tokenize import word_tokenize

In [8]:
AI_tokens = word_tokenize(AI)
print(AI_tokens)

['week', 'ago', 'a', 'friend', 'invited', 'a', 'couple', 'of', 'other', 'couples', 'over', 'for', 'dinner', '.']


In [9]:
from nltk.probability import FreqDist
fdist = FreqDist()

In [10]:
for word in AI_tokens:
    fdist[word.lower()]+=1

fdist

FreqDist({'a': 2, 'week': 1, 'ago': 1, 'friend': 1, 'invited': 1, 'couple': 1, 'of': 1, 'other': 1, 'couples': 1, 'over': 1, ...})

In [11]:
from nltk.tokenize import blankline_tokenize
AI_blank = blankline_tokenize(AI)
len(AI_blank)

1

In [12]:
from nltk.util import bigrams, trigrams, ngrams
quotes_tokens = nltk.word_tokenize(AI)
print(quotes_tokens)

['week', 'ago', 'a', 'friend', 'invited', 'a', 'couple', 'of', 'other', 'couples', 'over', 'for', 'dinner', '.']


In [13]:
# bigrams
q_bigrams = list(nltk.bigrams(quotes_tokens))
q_bigrams[0]

('week', 'ago')

In [14]:
# N-grams
q_ngrams = list(nltk.ngrams(quotes_tokens,4))
q_ngrams[0]

('week', 'ago', 'a', 'friend')

In [15]:
# Stemming: Normalize word into base form (may not be proper word)
from nltk.stem import PorterStemmer
pst = PorterStemmer()
pst.stem("having")

'have'

In [16]:
words_to_stem = ["give","giving","given","gave"]
for words in words_to_stem:
    print(words+":"+pst.stem(words))

give:give
giving:give
given:given
gave:gave


In [17]:
from nltk.stem import LancasterStemmer
lst = LancasterStemmer()
for words in words_to_stem:
    print(words+":"+lst.stem(words))

give:giv
giving:giv
given:giv
gave:gav


In [18]:
# Lemmatization: Word to its base form (meaningful word)
from nltk.stem import wordnet
from nltk.stem import WordNetLemmatizer
word_lem = WordNetLemmatizer()
word_lem.lemmatize('corpora')

'corpus'

In [19]:
# stopwords
from nltk.corpus import stopwords

In [23]:
#stopwords.words('english')

In [24]:
# remove stopwords
import re
punctuations = re.compile(r'[-,.?!;()|0-9]')

In [26]:
post_punctuations = []
for words in AI_tokens:
    word = punctuations.sub("",words)
    if len(words)>0:
        post_punctuations.append(word)

In [27]:
print(post_punctuations)

['week', 'ago', 'a', 'friend', 'invited', 'a', 'couple', 'of', 'other', 'couples', 'over', 'for', 'dinner', '']


In [29]:
# pos tagging
for tokens in AI_tokens:
    print(nltk.pos_tag([tokens]))

[('week', 'NN')]
[('ago', 'RB')]
[('a', 'DT')]
[('friend', 'NN')]
[('invited', 'VBN')]
[('a', 'DT')]
[('couple', 'NN')]
[('of', 'IN')]
[('other', 'JJ')]
[('couples', 'NNS')]
[('over', 'IN')]
[('for', 'IN')]
[('dinner', 'NN')]
[('.', '.')]


In [31]:
# Named Entity Recognition
sent = "All political parties but BJP in Punjab support bandh"
from nltk import ne_chunk
NE_tokens = word_tokenize(sent)
NE_tags = nltk.pos_tag(NE_tokens)
NE_NER = ne_chunk(NE_tags)
print(NE_NER)

(S
  All/DT
  political/JJ
  parties/NNS
  but/CC
  (ORGANIZATION BJP/NNP)
  in/IN
  (GPE Punjab/NNP)
  support/NN
  bandh/NN)
