In [20]:
# For NLP we use a package —> nltk [Natural Language Tool Kit]
import nltk

In [21]:
help(nltk)

Help on package nltk:

NAME
    nltk

DESCRIPTION
    The Natural Language Toolkit (NLTK) is an open source Python library
    for Natural Language Processing.  A free online book is available.
    (If you use the library for academic research, please cite the book.)
    
    Steven Bird, Ewan Klein, and Edward Loper (2009).
    Natural Language Processing with Python.  O'Reilly Media Inc.
    https://www.nltk.org/book/
    
    isort:skip_file
    
    @version: 3.9.1

PACKAGE CONTENTS
    app (package)
    book
    ccg (package)
    chat (package)
    chunk (package)
    classify (package)
    cli
    cluster (package)
    collections
    collocations
    compat
    corpus (package)
    data
    decorators
    downloader
    draw (package)
    featstruct
    grammar
    help
    inference (package)
    internals
    jsontags
    langnames
    lazyimport
    lm (package)
    metrics (package)
    misc (package)
    parse (package)
    probability
    sem (package)
    sentiment (packa

How to identify stop words

In [22]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop=stopwords.words('english')
stop

# for nlp we need to dowload the resourse first otherwise the code will show error like this
# Resource stopwords not found.
# Please use the NLTK Downloader to obtain the resource:
# >>> import nltk
# >>> nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [23]:
# no.of stop words in that list
print(len(stop))

179


## Tokenization

In [24]:
# if there are single line of sentence we can pass it through single quote('').
# But if there are multiple lines then we use triple qoute(''' ''')
nltk.download('punkt_tab')
sentence = 'luminar technolab is IT finishing school located at kakkanad'
from nltk.tokenize import word_tokenize
tok=word_tokenize(sentence)
tok

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


['luminar',
 'technolab',
 'is',
 'IT',
 'finishing',
 'school',
 'located',
 'at',
 'kakkanad']

In [26]:
review='''The Natural Language Toolkit (NLTK) is an open source Python library
    for Natural Language Processing. A free online book is available'''
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
Stop=stopwords.words('english')
review_token=word_tokenize(review)
# stop_removed=[i for i in review_token if i not in Stop]
# stop_removed # some stop words are removed and some are not (eg: The) because it is in capital letter. So we need to convert it into lowercase
final_removed=[i.lower() for i in review_token if i.lower() not in Stop]
final_removed

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['natural',
 'language',
 'toolkit',
 '(',
 'nltk',
 ')',
 'open',
 'source',
 'python',
 'library',
 'natural',
 'language',
 'processing',
 '.',
 'free',
 'online',
 'book',
 'available']

## N-gram

In [27]:
from nltk.util import ngrams
sentence='this is very good book to study'
data=ngrams(sequence=word_tokenize(sentence),n=2)
for i in data:
  print(i)

('this', 'is')
('is', 'very')
('very', 'good')
('good', 'book')
('book', 'to')
('to', 'study')


## Stemming


*   porterStemmer
*   SnowballStemmer



In [28]:
from nltk.stem import PorterStemmer
po=PorterStemmer()
words=['programming','reached','walking','baked','running','sliced']
for i in words:
  print(i,":",po.stem(i))

programming : program
reached : reach
walking : walk
baked : bake
running : run
sliced : slice


In [29]:
from nltk.stem import SnowballStemmer
sn=SnowballStemmer('english')
for i in words:
  print(i,':',sn.stem(i))

programming : program
reached : reach
walking : walk
baked : bake
running : run
sliced : slice


## Lemmatization
Lemmatization is most suitable for words that ends in 's'

Either lemmatization or stemming need to be done

Not both

We use stemming for majority of cases, and it is more effective

In [32]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
lem=WordNetLemmatizer()
print('rocks : ',lem.lemmatize('rocks'))
print('reaches : ',lem.lemmatize('reaches'))
print('walking : ',lem.lemmatize('walking'))

rocks :  rock
reaches :  reach
walking :  walking


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
