
## Installing and Using NLTK


In [2]:
import nltk
nltk.download() # download all packages/collections in nltk

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## Example of Tokenizing

In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize
example_text = "Hello Mr. Sampath, how are you doing today? The weather is great and NLTK is interesting. It is snowing here and really, it is pretty cold. Though at -5 degrees, actually quite warmer than usual! Okay, bye now. ttyl."
print(sent_tokenize(example_text))

['Hello Mr. Sampath, how are you doing today?', 'The weather is great and NLTK is interesting.', 'It is snowing here and really, it is pretty cold.', 'Though at -5 degrees, actually quite warmer than usual!', 'Okay, bye now.', 'ttyl.']


In [8]:
print(word_tokenize(example_text))

['Hello', 'Mr.', 'Sampath', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', 'and', 'NLTK', 'is', 'interesting', '.', 'It', 'is', 'snowing', 'here', 'and', 'really', ',', 'it', 'is', 'pretty', 'cold', '.', 'Though', 'at', '-5', 'degrees', ',', 'actually', 'quite', 'warmer', 'than', 'usual', '!', 'Okay', ',', 'bye', 'now', '.', 'ttyl', '.']


## Filtering Stop Words
Stop words are words which are filtered out before or after processing of natural language data. Stop words usually refers to the most common words in a language.

(Note: there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list).

In [11]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
example_sentence = "This is an example sentence showing stop word filtration. Hopefully, this will work! Or, will it? Wont it?? Lets find out!!"
stop_words = set(stopwords.words("english"))
words = word_tokenize(example_sentence)
filtered_sentence = []
for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)

# The above for loop as a one-liner
# filtered_sentence = [w for w in words if w not in stop_words]

print(filtered_sentence)

['This', 'example', 'sentence', 'showing', 'stop', 'word', 'filtration', '.', 'Hopefully', ',', 'work', '!', 'Or', ',', '?', 'Wont', '?', '?', 'Lets', 'find', '!', '!']


## Stemming
Reducing different forms of a word to a common base form. For example: 
- “I am a student” = “I be a student”;  
- “My dog’s fur is dark” = “My dog fur be dark”.

Note: Stemming may not be needed aynmore with modern NLP tools.

In [20]:
from nltk.stem import PorterStemmer # Using the Porter Stemming Algorithm from 1979!
from nltk.tokenize import word_tokenize
ps = PorterStemmer()

example_words = ["python","pythoning","pythoner","pythoned","pythonly"]
for w in example_words:
    print(ps.stem(w))

example_sentence = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned badly at least once."
words = word_tokenize(example_sentence)
for w in words:
    print(ps.stem(w))

python
python
python
python
pythonli
It
is
veri
import
to
be
pythonli
while
you
are
python
with
python
.
all
python
have
python
badli
at
least
onc
.


## Part of Speech Tagging

In [29]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer # Unsupervised tokenizer, can also be retrained if needed

train_text = state_union.raw("2005-GWBush.txt") # train on this text
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[1:10]: # restricting to first 10 sentences
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

process_content()

[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nation', 'NN'), ('lost', 'VBD'), ('a', 'DT'), ('beloved', 'VBN'), (',', ','), ('graceful', 'JJ'), (',', ','), ('courageous', 'JJ'), ('woman', 'NN'), ('who', 'WP'), ('called', 'VBD'), ('America', 'NNP'), ('to', 'TO'), ('its', 'PRP$'), ('founding', 'NN'), ('ideals', 'NNS'), ('and', 'CC'), ('carried', 'VBD'), ('on', 'IN'), ('a', 'DT'), ('noble', 'JJ'), ('dream', 'NN'), ('.', '.')]
[('Tonight', 'NN'), ('we', 'PRP'), ('are', 'VBP'), ('comforted', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('hope', 'NN'), ('of

In [None]:
## Chunking
