# Tokenization

In [1]:
corpus = """
My name is Ahmed and Iam learning NLP.
NLP has basic concept called Tokenization! That's it.
"""
print(corpus)


My name is Ahmed and Iam learning NLP.
NLP has basic concept called Tokenization! That's it.



### using NLTK (Natural Language Processing ToolKit) library
- converting corpus(paragraph) into document(sentence)

In [2]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [3]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to C:\Users\hp/nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

### NLP Tokenization
- corpus -> paragraph
- document -> sentence
- words
- vocabulary -> unique words.

### convert corpus to document

In [4]:
from nltk.tokenize import sent_tokenize
document = sent_tokenize(corpus)
print(f"corpus:{corpus}")
print(f"document:{document}")

corpus:
My name is Ahmed and Iam learning NLP.
NLP has basic concept called Tokenization! That's it.

document:['\nMy name is Ahmed and Iam learning NLP.', 'NLP has basic concept called Tokenization!', "That's it."]


In [5]:
for sentence in document:
    print(sentence)


My name is Ahmed and Iam learning NLP.
NLP has basic concept called Tokenization!
That's it.


### convert document to words

In [6]:
from nltk.tokenize import word_tokenize

words = word_tokenize(corpus)

print(f"corpus: {corpus}")
print(f"words: {words}")


corpus: 
My name is Ahmed and Iam learning NLP.
NLP has basic concept called Tokenization! That's it.

words: ['My', 'name', 'is', 'Ahmed', 'and', 'Iam', 'learning', 'NLP', '.', 'NLP', 'has', 'basic', 'concept', 'called', 'Tokenization', '!', 'That', "'s", 'it', '.']


In [7]:
# in this case punctuation is not spliting like ('s) below
for word in words:
    print(word)

My
name
is
Ahmed
and
Iam
learning
NLP
.
NLP
has
basic
concept
called
Tokenization
!
That
's
it
.


In [8]:
# convert sentences into words.
for sentence in document:
    print(word_tokenize(sentence))

['My', 'name', 'is', 'Ahmed', 'and', 'Iam', 'learning', 'NLP', '.']
['NLP', 'has', 'basic', 'concept', 'called', 'Tokenization', '!']
['That', "'s", 'it', '.']


### we can also seperate punctuation using wordpunct_tokenize()

In [9]:
from nltk.tokenize import wordpunct_tokenize
split_punctuation = wordpunct_tokenize(corpus)
print(split_punctuation)

['My', 'name', 'is', 'Ahmed', 'and', 'Iam', 'learning', 'NLP', '.', 'NLP', 'has', 'basic', 'concept', 'called', 'Tokenization', '!', 'That', "'", 's', 'it', '.']


In [10]:
# here punctuation is splited.
for word in split_punctuation:
    print(word)

My
name
is
Ahmed
and
Iam
learning
NLP
.
NLP
has
basic
concept
called
Tokenization
!
That
'
s
it
.


### we can't split "." using treebankWordTokenizer()
- it only seperate last "."

In [11]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(corpus)


['My',
 'name',
 'is',
 'Ahmed',
 'and',
 'Iam',
 'learning',
 'NLP.',
 'NLP',
 'has',
 'basic',
 'concept',
 'called',
 'Tokenization',
 '!',
 'That',
 "'s",
 'it',
 '.']

# Stemming
- change the word to it's stem word 
- like convert (eating, eates) to it's stem word ('eat')

### 1 - PorterStemmer

In [12]:
words = ['eating', 'eaten', 'ate', 'eats', 'writing', 'writes', 'programming', 'programs', 'history', 'finaly', 'finilized']

In [13]:
from nltk.stem import PorterStemmer

stemming = PorterStemmer()

In [14]:
for word in words:
    print(word, '-->', stemming.stem(word))

eating --> eat
eaten --> eaten
ate --> ate
eats --> eat
writing --> write
writes --> write
programming --> program
programs --> program
history --> histori
finaly --> finali
finilized --> finil


### 2 - RegexpStemmer

In [15]:
from nltk.stem import RegexpStemmer

rege_stemmer = RegexpStemmer('ing| s$| e$| able$', min=4)

In [16]:
rege_stemmer.stem('eating')

'eat'

In [17]:
rege_stemmer.stem('ingeating')

'eat'

### 3 - Snowball Stemmer

In [18]:
from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer('english')

In [19]:
for word in words:
    print(word, "-->" ,snowball_stemmer.stem(word))

eating --> eat
eaten --> eaten
ate --> ate
eats --> eat
writing --> write
writes --> write
programming --> program
programs --> program
history --> histori
finaly --> finali
finilized --> finil


# Lemmatization
### Wordnet Lemmatizer
- Lemmatizer technique is like stemming. 
- Output we get after lemmatization is called lemma, which is root word rather than root stem like in stemming
- wordnet lemmatizer takes more time then stemming

In [20]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\hp/nltk_data...


True

In [21]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


In [22]:
"""
    lemmatize('word', pos='n')
    n use for Noun
    v use for verb
    a - adjective
    r - adverb
"""
print(lemmatizer.lemmatize('going', 'v'))
print(lemmatizer.lemmatize('going', 'n'))

go
going


In [23]:
words = ['eating', 'eaten', 'ate', 'eats', 'writing', 'writes', 'programming', 'programs', 'history', 'finaly', 'finilized']

In [24]:
# using lemmatization every word converts into it's root word.
for word in words:
    print(word,'-->',lemmatizer.lemmatize(word, 'v'))

eating --> eat
eaten --> eat
ate --> eat
eats --> eat
writing --> write
writes --> write
programming --> program
programs --> program
history --> history
finaly --> finaly
finilized --> finilized


In [25]:
lemmatizer.lemmatize('fairly', 'v'), lemmatizer.lemmatize('sportingly', 'v')

('fairly', 'sportingly')

# StopWords with NLTK

In [26]:
paragraph = """
Detecting stop words is a crucial step in natural language processing (NLP) that involves identifying and removing commonly used words that do not carry significant meaning, such as "is," "and," "the," and "in." These words are typically filtered out to improve the efficiency and accuracy of text processing tasks like text classification, sentiment analysis, and information retrieval. By eliminating stop words, the focus is placed on more meaningful words that contribute to the overall context and content of the text. In Python, libraries like NLTK and SpaCy provide pre-defined lists of stop words, making it easier to clean and preprocess textual data for NLP applications.
"""

In [27]:
from nltk.corpus import stopwords

In [28]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\hp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [30]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [31]:
sentences = nltk.sent_tokenize(paragraph)
sentences

['\nDetecting stop words is a crucial step in natural language processing (NLP) that involves identifying and removing commonly used words that do not carry significant meaning, such as "is," "and," "the," and "in."',
 'These words are typically filtered out to improve the efficiency and accuracy of text processing tasks like text classification, sentiment analysis, and information retrieval.',
 'By eliminating stop words, the focus is placed on more meaningful words that contribute to the overall context and content of the text.',
 'In Python, libraries like NLTK and SpaCy provide pre-defined lists of stop words, making it easier to clean and preprocess textual data for NLP applications.']

In [32]:
for sentence in sentences:
    print(sentence)


Detecting stop words is a crucial step in natural language processing (NLP) that involves identifying and removing commonly used words that do not carry significant meaning, such as "is," "and," "the," and "in."
These words are typically filtered out to improve the efficiency and accuracy of text processing tasks like text classification, sentiment analysis, and information retrieval.
By eliminating stop words, the focus is placed on more meaningful words that contribute to the overall context and content of the text.
In Python, libraries like NLTK and SpaCy provide pre-defined lists of stop words, making it easier to clean and preprocess textual data for NLP applications.


In [33]:
print(len(sentences))
nltk.word_tokenize(sentences[-1])

4


['In',
 'Python',
 ',',
 'libraries',
 'like',
 'NLTK',
 'and',
 'SpaCy',
 'provide',
 'pre-defined',
 'lists',
 'of',
 'stop',
 'words',
 ',',
 'making',
 'it',
 'easier',
 'to',
 'clean',
 'and',
 'preprocess',
 'textual',
 'data',
 'for',
 'NLP',
 'applications',
 '.']

In [34]:
stemmer.stem('eating')

'eat'

In [35]:
# 1 - change sentences into words
# 2 - apply stemming on words, if word is not present in stopwords.words('english')
# 3 - then add the stemmed words back into the sentence

for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words_without_stopwords = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    sentences[i] =  " ".join(words_without_stopwords) # converting all words into sentences

In [36]:
print(len(words))

28


In [37]:
print(len(words_without_stopwords))

22


In [38]:
print(sentences)

["detect stop word crucial step natur languag process ( nlp ) involv identifi remov commonli use word carri signific mean , `` , '' `` , '' `` , '' `` . ''", 'these word typic filter improv effici accuraci text process task like text classif , sentiment analysi , inform retriev .', 'by elimin stop word , focu place meaning word contribut overal context content text .', 'in python , librari like nltk spaci provid pre-defin list stop word , make easier clean preprocess textual data nlp applic .']


### 1 - The above porter stemmer is not good so lets try snowball stemmer


In [39]:
from nltk.stem import SnowballStemmer
snow_stemmer = SnowballStemmer('english')

In [40]:
paragraph = """
Detecting stop words is a crucial step in natural language processing (NLP) that involves identifying and removing commonly used words that do not carry significant meaning, such as "is," "and," "the," and "in." These words are typically filtered out to improve the efficiency and accuracy of text processing tasks like text classification, sentiment analysis, and information retrieval. By eliminating stop words, the focus is placed on more meaningful words that contribute to the overall context and content of the text. In Python, libraries like NLTK and SpaCy provide pre-defined lists of stop words, making it easier to clean and preprocess textual data for NLP applications.
"""

In [41]:
# from paragraph to sentences tokenization
sentences = nltk.sent_tokenize(paragraph)
sentences

['\nDetecting stop words is a crucial step in natural language processing (NLP) that involves identifying and removing commonly used words that do not carry significant meaning, such as "is," "and," "the," and "in."',
 'These words are typically filtered out to improve the efficiency and accuracy of text processing tasks like text classification, sentiment analysis, and information retrieval.',
 'By eliminating stop words, the focus is placed on more meaningful words that contribute to the overall context and content of the text.',
 'In Python, libraries like NLTK and SpaCy provide pre-defined lists of stop words, making it easier to clean and preprocess textual data for NLP applications.']

In [42]:
stopwords.words("english")

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [43]:
# 1 - convert sentence into word
# 2 - apply stemming on words other then stopwords

for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i]) # convert sentence into words, so the multiple words pass through snow_stemmeing, so we keep in list
    words_without_stopwords = [snow_stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))] # check if word not in stopwords, then apply stemming
    sentences[i] = ' '.join(words_without_stopwords) # join the all words and append to it's sentence

In [44]:
sentences

["detect stop word crucial step natur languag process ( nlp ) involv identifi remov common use word carri signific mean , `` , '' `` , '' `` , '' `` . ''",
 'these word typic filter improv effici accuraci text process task like text classif , sentiment analysi , inform retriev .',
 'by elimin stop word , focus place meaning word contribut overal context content text .',
 'in python , librari like nltk spaci provid pre-defin list stop word , make easier clean preprocess textual data nlp applic .']

### 2 - lets try with lemmatization method

In [45]:
paragraph = """
Detecting stop words is a crucial step in natural language processing (NLP) that involves identifying and removing commonly used words that do not carry significant meaning, such as "is," "and," "the," and "in." These words are typically filtered out to improve the efficiency and accuracy of text processing tasks like text classification, sentiment analysis, and information retrieval. By eliminating stop words, the focus is placed on more meaningful words that contribute to the overall context and content of the text. In Python, libraries like NLTK and SpaCy provide pre-defined lists of stop words, making it easier to clean and preprocess textual data for NLP applications.
"""

In [46]:
from nltk.tokenize import sent_tokenize
sentences = sent_tokenize(paragraph)

In [47]:
from nltk.stem import WordNetLemmatizer
lemmatization = WordNetLemmatizer()

In [48]:
for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words_without_stopwords = [lemmatization.lemmatize(word.lower(), 'v') for word in words if word not in set(stopwords.words('english'))]
    sentences[i] = ' '.join(words_without_stopwords)


In [49]:
# lemmatization doesn't change words into lowercase, so we have to change words into lowercase using .lower()
sentences

["detect stop word crucial step natural language process ( nlp ) involve identify remove commonly use word carry significant mean , `` , '' `` , '' `` , '' `` . ''",
 'these word typically filter improve efficiency accuracy text process task like text classification , sentiment analysis , information retrieval .',
 'by eliminate stop word , focus place meaningful word contribute overall context content text .',
 'in python , libraries like nltk spacy provide pre-defined list stop word , make easier clean preprocess textual data nlp applications .']

# Parts of speech with nltk

- Part-of-Speech (POS) tags
- It tells you whether a word is a noun, verb, adjective, etc.

In [50]:
import nltk 

In [51]:
paragraph = """
Detecting stop words is a crucial step in natural language processing (NLP) that involves identifying and removing commonly used words that do not carry significant meaning, such as "is," "and," "the," and "in." These words are typically filtered out to improve the efficiency and accuracy of text processing tasks like text classification, sentiment analysis, and information retrieval. By eliminating stop words, the focus is placed on more meaningful words that contribute to the overall context and content of the text. In Python, libraries like NLTK and SpaCy provide pre-defined lists of stop words, making it easier to clean and preprocess textual data for NLP applications.
"""

In [52]:
sentences = nltk.sent_tokenize(paragraph)
sentences

['\nDetecting stop words is a crucial step in natural language processing (NLP) that involves identifying and removing commonly used words that do not carry significant meaning, such as "is," "and," "the," and "in."',
 'These words are typically filtered out to improve the efficiency and accuracy of text processing tasks like text classification, sentiment analysis, and information retrieval.',
 'By eliminating stop words, the focus is placed on more meaningful words that contribute to the overall context and content of the text.',
 'In Python, libraries like NLTK and SpaCy provide pre-defined lists of stop words, making it easier to clean and preprocess textual data for NLP applications.']

In [53]:
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package stopwords to C:\Users\hp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\hp/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [54]:
from nltk.corpus import stopwords
from nltk.tag import pos_tag

for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    # print(f"words: {words}")

    words_without_stopwords = [word for word in words if word not in set(stopwords.words('english'))]
    # print(f'words_without_stopwords:{words_without_stopwords}')
    # print(f"type:{type(words_without_stopwords)}")

    pos_tag = nltk.pos_tag(words_without_stopwords)
    print(pos_tag)

[('Detecting', 'VBG'), ('stop', 'NN'), ('words', 'NNS'), ('crucial', 'JJ'), ('step', 'NN'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('involves', 'VBZ'), ('identifying', 'VBG'), ('removing', 'VBG'), ('commonly', 'RB'), ('used', 'VBN'), ('words', 'NNS'), ('carry', 'VBP'), ('significant', 'JJ'), ('meaning', 'NN'), (',', ','), ('``', '``'), (',', ','), ("''", "''"), ('``', '``'), (',', ','), ("''", "''"), ('``', '``'), (',', ','), ("''", "''"), ('``', '``'), ('.', '.'), ("''", "''")]
[('These', 'DT'), ('words', 'NNS'), ('typically', 'RB'), ('filtered', 'VBD'), ('improve', 'VB'), ('efficiency', 'NN'), ('accuracy', 'NN'), ('text', 'IN'), ('processing', 'VBG'), ('tasks', 'NNS'), ('like', 'IN'), ('text', 'JJ'), ('classification', 'NN'), (',', ','), ('sentiment', 'NN'), ('analysis', 'NN'), (',', ','), ('information', 'NN'), ('retrieval', 'NN'), ('.', '.')]
[('By', 'IN'), ('eliminating', 'VBG'), ('stop', 'JJ'), ('words', 'NNS'), (',', 

In [55]:
import nltk
print(nltk.data.path)  # See where NLTK is looking for data
try:
    nltk.data.find('taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle')
    print("Tagger found successfully!")
except LookupError as e:
    print(f"Error: {e}")

['C:\\Users\\hp/nltk_data', 'c:\\Users\\hp\\Documents\\Python\\NLP_GenAi\\venv\\nltk_data', 'c:\\Users\\hp\\Documents\\Python\\NLP_GenAi\\venv\\share\\nltk_data', 'c:\\Users\\hp\\Documents\\Python\\NLP_GenAi\\venv\\lib\\nltk_data', 'C:\\Users\\hp\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']
Tagger found successfully!


In [80]:
# assignment:
sentence = 'Taj Mahal is a beautiful Monument'
sentence = sentence.split()
for word in sentence:
    # print(word)
    words = nltk.pos_tag([word])
    print(words)

[('Taj', 'NN')]
[('Mahal', 'NN')]
[('is', 'VBZ')]
[('a', 'DT')]
[('beautiful', 'NN')]
[('Monument', 'NN')]


In [85]:
print(nltk.pos_tag('Taj Mahal is a beautiful Monument'.split()))

[('Taj', 'NNP'), ('Mahal', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('beautiful', 'JJ'), ('Monument', 'NN')]


# Name Entity Recognition
- it's technique to indentify or classify entitiy in text into pre-defined categories.
- like person, organiation, location, etc.


In [87]:
sentence = "Elon Musk founded SpaceX in 2002 and Tesla in 2003."

In [109]:
nltk.download('maxent_ne_chunker_tab')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\hp/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\hp/nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to C:\Users\hp/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [104]:
import numpy

In [121]:
# from nltk import draw_tree
words = nltk.word_tokenize(sentence)
print(type(words))
pos_tags = nltk.pos_tag(words)
print(type(pos_tags))
named_entities = nltk.ne_chunk(pos_tags)
print(named_entities)

<class 'list'>
<class 'list'>
(S
  (PERSON Elon/NNP)
  (PERSON Musk/NNP)
  founded/VBD
  (ORGANIZATION SpaceX/NNP)
  in/IN
  2002/CD
  and/CC
  (GPE Tesla/NNP)
  in/IN
  2003/CD
  ./.)
