# Tokenization example

In [1]:
!pip install nltk



In [9]:
import nltk
# Download NLTK's 'punkt' tokenizer data before tokenizing.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
corpus = """ Hello welcome, to aditya's NLP tutorial.
Please do watch the entire course! to become expert in NLP.
"""

In [3]:
corpus

" Hello welcome, to aditya's NLP tutorial. \nPlease do watch the entire course! to become expert in NLP.\n"

In [11]:
print(corpus)

 Hello welcome, to aditya's NLP tutorial. 
Please do watch the entire course! to become expert in NLP.



In [12]:
# Tokenization
## paragraph --> sentences

from nltk.tokenize import sent_tokenize

In [13]:
sent_tokenize(corpus)

[" Hello welcome, to aditya's NLP tutorial.",
 'Please do watch the entire course!',
 'to become expert in NLP.']

#### The result is a list of sentences; note that `!` is also treated as a sentence boundary.

In [15]:
documents = sent_tokenize(corpus)

In [16]:
type(documents)

list

In [17]:
# Show every detected sentence on its own line.
for sent in documents:
    print(sent)

 Hello welcome, to aditya's NLP tutorial.
Please do watch the entire course!
to become expert in NLP.


In [18]:
## Tokenization 2: paragraph --> words
## (also works sentence --> words)
from nltk.tokenize import word_tokenize

In [20]:
word_tokenize(corpus)  # note that punctuation such as , . ! is returned as separate tokens

['Hello',
 'welcome',
 ',',
 'to',
 'aditya',
 "'s",
 'NLP',
 'tutorial',
 '.',
 'Please',
 'do',
 'watch',
 'the',
 'entire',
 'course',
 '!',
 'to',
 'become',
 'expert',
 'in',
 'NLP',
 '.']

In [21]:
# Tokenize each sentence into words and show one token list per sentence.
for sent in documents:
    tokens = word_tokenize(sent)
    print(tokens)

['Hello', 'welcome', ',', 'to', 'aditya', "'s", 'NLP', 'tutorial', '.']
['Please', 'do', 'watch', 'the', 'entire', 'course', '!']
['to', 'become', 'expert', 'in', 'NLP', '.']


In [22]:
## we can also use
from nltk.tokenize import wordpunct_tokenize

In [24]:
wordpunct_tokenize(corpus)   # here the apostrophe ' and the s are also split into separate tokens

['Hello',
 'welcome',
 ',',
 'to',
 'aditya',
 "'",
 's',
 'NLP',
 'tutorial',
 '.',
 'Please',
 'do',
 'watch',
 'the',
 'entire',
 'course',
 '!',
 'to',
 'become',
 'expert',
 'in',
 'NLP',
 '.']

In [25]:
## another way is
from nltk.tokenize import TreebankWordTokenizer

In [30]:
tokenizer = TreebankWordTokenizer()  # a full stop inside the text stays attached to its word (see 'tutorial.' below); only the final '.' becomes a separate token

In [31]:
tokenizer.tokenize(corpus)

['Hello',
 'welcome',
 ',',
 'to',
 'aditya',
 "'s",
 'NLP',
 'tutorial.',
 'Please',
 'do',
 'watch',
 'the',
 'entire',
 'course',
 '!',
 'to',
 'become',
 'expert',
 'in',
 'NLP',
 '.']

# text preprocessing with stemming using NLTK

#### Stemming reduces a word to its word stem — the part that suffixes and prefixes attach to — or to the root of the word, known as the lemma.

In [37]:
## Classification problem
## Are the comments on a product positive or negative?
## The dataset is comments and reviews containing words like eating, eat, eaten. The root word is "eat"; the focus is on "eat", not its variations, so "eat" is the word stem.


words = ["eating","eats","eaten","writing","writes","programming","programs","history","finally","finalize"]

### stemming technique - 1. PorterStemmer

In [38]:
from nltk.stem import PorterStemmer

In [39]:
stemming = PorterStemmer()

In [41]:
# Show each word next to its Porter stem.
for token in words:
    print(f"{token}---->{stemming.stem(token)}")

eating---->eat
eats---->eat
eaten---->eaten
writing---->write
writes---->write
programming---->program
programs---->program
history---->histori
finally---->final
finalize---->final


##### We can see that some stems are not valid words (e.g. 'histori'), and 'eaten' is not reduced to 'eat'; this is a major disadvantage of stemming.

### 2. RegexpStemmer class -
##### With this class we can easily implement a regular-expression stemmer: it takes a single regular expression and removes any prefix or suffix that matches it.

In [44]:
from nltk.stem import RegexpStemmer

In [45]:
# Strip a trailing 'ing', 's', 'e' or 'able'; per the NLTK docs, min=4 leaves strings shorter than 4 characters unstemmed.
reg_stemmer = RegexpStemmer('ing$|s$|e$|able$', min=4)

In [46]:
reg_stemmer.stem("eating")

'eat'

In [47]:
reg_stemmer.stem("cars")

'car'

In [48]:
# '$' anchors each alternative to the end of the word, so only the trailing 'ing' is removed and the leading 'ing' stays.
reg_stemmer.stem("ingeating")

'ingeat'

### 3. Snowball stemmer
#### better than porter stemmer

In [50]:
from nltk.stem import SnowballStemmer

In [51]:
snowballstemmer = SnowballStemmer('english')

In [52]:
# Show each word next to its Snowball stem.
for token in words:
    print(f"{token}--->{snowballstemmer.stem(token)}")

eating--->eat
eats--->eat
eaten--->eaten
writing--->write
writes--->write
programming--->program
programs--->program
history--->histori
finally--->final
finalize--->final


In [53]:
# compare porter and snowball
stemming.stem("fairly"), stemming.stem("sportingly")

('fairli', 'sportingli')

In [54]:
snowballstemmer.stem("fairly"), snowballstemmer.stem("sportingly")

('fair', 'sport')

###### For these words the Snowball stemmer gives better stems than the Porter stemmer ('fair', 'sport' vs. 'fairli', 'sportingli').

## Lemmatization addresses the problems we faced above because it looks words up in a dictionary instead of only stripping affixes.

# Lemmatization using NLTK

##### Lemmatization is like stemming, but its output, called a 'lemma', is a root word rather than a root stem (the output of stemming). After lemmatization we get a valid word that means the same thing. We will use the WordNetLemmatizer class, which is a thin wrapper around the WordNet corpus. This class uses the morphy() function.

In [59]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [60]:
lemmatizer = WordNetLemmatizer()

In [62]:
# Part-of-speech (POS) codes for the `pos` argument:
#   n - noun
#   v - verb
#   a - adjective
#   r - adverb

lemmatizer.lemmatize("going", pos='n')

'going'

In [63]:
lemmatizer.lemmatize("going", pos='v')

'go'

In [64]:
lemmatizer.lemmatize("going", pos='a')

'going'

In [65]:
lemmatizer.lemmatize("going", pos='r')

'going'

In [66]:
words = ["eating","eats","eaten","writing","writes","programming","programs","history","finally","finalize"]

In [70]:
# Show each word next to its lemma, treating every word as a verb.
for token in words:
    print(f"{token}--->{lemmatizer.lemmatize(token, pos='v')}")

eating--->eat
eats--->eat
eaten--->eat
writing--->write
writes--->write
programming--->program
programs--->program
history--->history
finally--->finally
finalize--->finalize


In [72]:
lemmatizer.lemmatize("fairly", pos='v'), lemmatizer.lemmatize("sportingly",pos='v')

('fairly', 'sportingly')

### WordNetLemmatizer takes more time to execute than a stemmer, but it returns valid words, so it suits use cases such as chatbots, Q/A and text summarization.