# Tokenization

In [2]:
corpus = """
My name is Ahmed and Iam learning NLP.
NLP has basic concept called Tokenization! That's it.
"""
print(corpus)


My name is Ahmed and Iam learning NLP.
NLP has basic concept called Tokenization! That's it.



### using NLTK (Natural Language Processing ToolKit) library
- converting corpus(paragraph) into document(sentence)

In [15]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [8]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

### NLP Tokenization
- corpus -> paragraph
- document -> sentence
- words
- vocabulary -> unique words.

### convert corpus to document

In [23]:
from nltk.tokenize import sent_tokenize
document = sent_tokenize(corpus)
print(f"corpus:{corpus}")
print(f"document:{document}")

corpus:
My name is Ahmed and Iam learning NLP.
NLP has basic concept called Tokenization! That's it.

document:['\nMy name is Ahmed and Iam learning NLP.', 'NLP has basic concept called Tokenization!', "That's it."]


In [17]:
for sentence in document:
    print(sentence)


My name is Ahmed and Iam learning NLP.
NLP has basic concept called Tokenization!
That's it.


### convert document to words

In [22]:
from nltk.tokenize import word_tokenize

words = word_tokenize(corpus)

print(f"corpus: {corpus}")
print(f"words: {words}")


corpus: 
My name is Ahmed and Iam learning NLP.
NLP has basic concept called Tokenization! That's it.

words: ['My', 'name', 'is', 'Ahmed', 'and', 'Iam', 'learning', 'NLP', '.', 'NLP', 'has', 'basic', 'concept', 'called', 'Tokenization', '!', 'That', "'s", 'it', '.']


In [24]:
# in this case punctuation is not spliting like ('s) below
for word in words:
    print(word)

My
name
is
Ahmed
and
Iam
learning
NLP
.
NLP
has
basic
concept
called
Tokenization
!
That
's
it
.


In [26]:
# convert sentences into words.
for sentence in document:
    print(word_tokenize(sentence))

['My', 'name', 'is', 'Ahmed', 'and', 'Iam', 'learning', 'NLP', '.']
['NLP', 'has', 'basic', 'concept', 'called', 'Tokenization', '!']
['That', "'s", 'it', '.']


### we can also seperate punctuation using wordpunct_tokenize()

In [29]:
from nltk.tokenize import wordpunct_tokenize
split_punctuation = wordpunct_tokenize(corpus)
print(split_punctuation)

['My', 'name', 'is', 'Ahmed', 'and', 'Iam', 'learning', 'NLP', '.', 'NLP', 'has', 'basic', 'concept', 'called', 'Tokenization', '!', 'That', "'", 's', 'it', '.']


In [30]:
# here punctuation is splited.
for word in split_punctuation:
    print(word)

My
name
is
Ahmed
and
Iam
learning
NLP
.
NLP
has
basic
concept
called
Tokenization
!
That
'
s
it
.


### we can't split "." using treebankWordTokenizer()
- it only seperate last "."

In [35]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(corpus)


['My',
 'name',
 'is',
 'Ahmed',
 'and',
 'Iam',
 'learning',
 'NLP.',
 'NLP',
 'has',
 'basic',
 'concept',
 'called',
 'Tokenization',
 '!',
 'That',
 "'s",
 'it',
 '.']

# Stemming
- change the word to it's stem word 
- like convert (eating, eates) to it's root word ('eat')

### 1 - PorterStemmer

In [1]:
words = ['eating', 'eaten', 'ate', 'eats', 'writing', 'writes', 'programming', 'programs', 'history', 'finaly', 'finilized']

In [3]:
from nltk.stem import PorterStemmer

stemming = PorterStemmer()

In [6]:
for word in words:
    print(word, '-->', stemming.stem(word))

eating --> eat
eaten --> eaten
ate --> ate
eats --> eat
writing --> write
writes --> write
programming --> program
programs --> program
history --> histori
finaly --> finali
finilized --> finil


### 2 - RegexpStemmer

In [26]:
from nltk.stem import RegexpStemmer

rege_stemmer = RegexpStemmer('ing| s$| e$| able$', min=4)

In [27]:
rege_stemmer.stem('eating')

'eat'

In [28]:
rege_stemmer.stem('ingeating')

'eat'

### 3 - Snowball Stemmer

In [31]:
from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer('english')

In [33]:
for word in words:
    print(word, "-->" ,snowball_stemmer.stem(word))

eating --> eat
eaten --> eaten
ate --> ate
eats --> eat
writing --> write
writes --> write
programming --> program
programs --> program
history --> histori
finaly --> finali
finilized --> finil


# Lemmatization
### Wordnet Lemmatizer
- Lemmatizer technique is like stemming. 
- Output we get after lemmatization is called lemma, which is root word rather than root stem like in stemming
- wordnet lemmatizer takes more time then stemming

In [4]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...


True

In [5]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


In [12]:
"""
    lemmatize('word', pos='n')
    n use for Noun
    v use for verb
    a - adjective
    r - adverb
"""
print(lemmatizer.lemmatize('going', 'v'))
print(lemmatizer.lemmatize('going', 'n'))

go
going


In [13]:
words = ['eating', 'eaten', 'ate', 'eats', 'writing', 'writes', 'programming', 'programs', 'history', 'finaly', 'finilized']

In [15]:
# using lemmatization every word converts into it's root word.
for word in words:
    print(word,'-->',lemmatizer.lemmatize(word, 'v'))

eating --> eat
eaten --> eat
ate --> eat
eats --> eat
writing --> write
writes --> write
programming --> program
programs --> program
history --> history
finaly --> finaly
finilized --> finilized


In [23]:
lemmatizer.lemmatize('fairly', 'v'), lemmatizer.lemmatize('sportingly', 'v')

('fairly', 'sportingly')

# StopWords with NLTK

In [55]:
paragraph = """
Detecting stop words is a crucial step in natural language processing (NLP) that involves identifying and removing commonly used words that do not carry significant meaning, such as "is," "and," "the," and "in." These words are typically filtered out to improve the efficiency and accuracy of text processing tasks like text classification, sentiment analysis, and information retrieval. By eliminating stop words, the focus is placed on more meaningful words that contribute to the overall context and content of the text. In Python, libraries like NLTK and SpaCy provide pre-defined lists of stop words, making it easier to clean and preprocess textual data for NLP applications.
"""

In [5]:
from nltk.corpus import stopwords

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [15]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [56]:
sentences = nltk.sent_tokenize(paragraph)
sentences

['\nDetecting stop words is a crucial step in natural language processing (NLP) that involves identifying and removing commonly used words that do not carry significant meaning, such as "is," "and," "the," and "in."',
 'These words are typically filtered out to improve the efficiency and accuracy of text processing tasks like text classification, sentiment analysis, and information retrieval.',
 'By eliminating stop words, the focus is placed on more meaningful words that contribute to the overall context and content of the text.',
 'In Python, libraries like NLTK and SpaCy provide pre-defined lists of stop words, making it easier to clean and preprocess textual data for NLP applications.']

In [57]:
for sentence in sentences:
    print(sentence)


Detecting stop words is a crucial step in natural language processing (NLP) that involves identifying and removing commonly used words that do not carry significant meaning, such as "is," "and," "the," and "in."
These words are typically filtered out to improve the efficiency and accuracy of text processing tasks like text classification, sentiment analysis, and information retrieval.
By eliminating stop words, the focus is placed on more meaningful words that contribute to the overall context and content of the text.
In Python, libraries like NLTK and SpaCy provide pre-defined lists of stop words, making it easier to clean and preprocess textual data for NLP applications.


In [58]:
print(len(sentences))
nltk.word_tokenize(sentences[-1])

4


['In',
 'Python',
 ',',
 'libraries',
 'like',
 'NLTK',
 'and',
 'SpaCy',
 'provide',
 'pre-defined',
 'lists',
 'of',
 'stop',
 'words',
 ',',
 'making',
 'it',
 'easier',
 'to',
 'clean',
 'and',
 'preprocess',
 'textual',
 'data',
 'for',
 'NLP',
 'applications',
 '.']

In [59]:
stemmer.stem('eating')

'eat'

In [60]:
# 1 - change sentences into words
# 2 - apply stemming on words, if word is not present in stopwords.words('english')
# 3 - then add the stemmed words back into the sentence

for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words_without_stopwords = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    sentences[i] =  " ".join(words_without_stopwords) # converting all words into sentences

In [61]:
print(len(words))

28


In [62]:
print(len(words_without_stopwords))

22


In [63]:
print(sentences)

["detect stop word crucial step natur languag process ( nlp ) involv identifi remov commonli use word carri signific mean , `` , '' `` , '' `` , '' `` . ''", 'these word typic filter improv effici accuraci text process task like text classif , sentiment analysi , inform retriev .', 'by elimin stop word , focu place meaning word contribut overal context content text .', 'in python , librari like nltk spaci provid pre-defin list stop word , make easier clean preprocess textual data nlp applic .']
