# WHITE SPACE TOKENIZATION

The simplest way is to use python split function to tokenize text based on whitespace

In [7]:
tweet_text1 = "Just finished my workout 💪 Feeling energized and ready to tackle the day! #fitness #motivation."

In [8]:
tweet_text1.split()

['Just',
 'finished',
 'my',
 'workout',
 '💪',
 'Feeling',
 'energized',
 'and',
 'ready',
 'to',
 'tackle',
 'the',
 'day!',
 '#fitness',
 '#motivation.']

 We can observe in the above output that the punctuations have not been separated from the word, for example:  'day !', '#fitness', '#motivation.'

Splitting the sentences using delimeter

In [13]:
tweet_text2 = "Just finished a great workout! 💪 Feeling energized and ready to take on the day. 🌟 Can't wait for dinner tonight with friends. 🍕 #fitness #friends #goodtimes"

tweet_text2.split('.')

['Just finished a great workout! 💪 Feeling energized and ready to take on the day',
 " 🌟 Can't wait for dinner tonight with friends",
 ' 🍕 #fitness #friends #goodtimes']

### We can clearly see that the split() function has not done a good job splitting the text into sentences

In [14]:
#*****************************************************************************************************************************************************************

# NLTK python library

In [16]:
# !pip install nltk
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#### Word Tokenization


In [20]:
from nltk.tokenize import word_tokenize

print(word_tokenize(tweet_text1))
print()
print(f"Total number of tokens: {len(word_tokenize(tweet_text1))}")


['Just', 'finished', 'my', 'workout', '💪', 'Feeling', 'energized', 'and', 'ready', 'to', 'tackle', 'the', 'day', '!', '#', 'fitness', '#', 'motivation', '.']

Total number of tokens: 19


#### WordPunct Tokenization


In [21]:
from nltk.tokenize import WordPunctTokenizer

word_tokenizer = WordPunctTokenizer()
print(word_tokenizer.tokenize(tweet_text1))
print()
print(f"Total number of tokens: {len(word_tokenizer.tokenize(tweet_text1))}")

['Just', 'finished', 'my', 'workout', '💪', 'Feeling', 'energized', 'and', 'ready', 'to', 'tackle', 'the', 'day', '!', '#', 'fitness', '#', 'motivation', '.']

Total number of tokens: 19


#### Sentence Tokenization

In [22]:
from nltk.tokenize import sent_tokenize

print(sent_tokenize(tweet_text2))
print()
print(f"Total number of tokens: {len(sent_tokenize(tweet_text1))}")

['Just finished a great workout!', '💪 Feeling energized and ready to take on the day.', "🌟 Can't wait for dinner tonight with friends.", '🍕 #fitness #friends #goodtimes']

Total number of tokens: 2


#### RegEx Tokenization

In [24]:
from nltk.tokenize import regexp_tokenize

regexp_tokenize(tweet_text1, "[\w']+") # you can observe the punctuations / special characters are excluded

['Just',
 'finished',
 'my',
 'workout',
 'Feeling',
 'energized',
 'and',
 'ready',
 'to',
 'tackle',
 'the',
 'day',
 'fitness',
 'motivation']

#### TreebankWord tokenization


In [26]:
from nltk.tokenize import TreebankWordTokenizer

tbword_tokenizer = TreebankWordTokenizer()
print(tbword_tokenizer.tokenize(tweet_text2))
print()
print(f"Total number of  tokens: {len(tbword_tokenizer.tokenize(tweet_text2))}")

##Output: 'day.' , "n't" is not tokenized
## It separates phrase-terminating punctuation like (?!.;,) from adjacent tokens and retains decimal numbers as a single token.
## Rules for Treebank : https://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.treebank

['Just', 'finished', 'a', 'great', 'workout', '!', '💪', 'Feeling', 'energized', 'and', 'ready', 'to', 'take', 'on', 'the', 'day.', '🌟', 'Ca', "n't", 'wait', 'for', 'dinner', 'tonight', 'with', 'friends.', '🍕', '#', 'fitness', '#', 'friends', '#', 'goodtimes']
Total number of  tokens: 32


#### Tweet Tokenizer

In [27]:
from nltk.tokenize import TweetTokenizer

tweet_tokenizer = TweetTokenizer()
print(tweet_tokenizer.tokenize(tweet_text2))
print()
print(f"Total number of  tokens: {len(tweet_tokenizer.tokenize(tweet_text2))}")

['Just', 'finished', 'a', 'great', 'workout', '!', '💪', 'Feeling', 'energized', 'and', 'ready', 'to', 'take', 'on', 'the', 'day', '.', '🌟', "Can't", 'wait', 'for', 'dinner', 'tonight', 'with', 'friends', '.', '🍕', '#fitness', '#friends', '#goodtimes']

Total number of  tokens: 30


#### MWET Tokenization

In [29]:
from nltk.tokenize import MWETokenizer

mwet_tokenizer = MWETokenizer()
print(mwet_tokenizer.tokenize(tweet_text1))
print()
print(f"Total number of  tokens: {len(mwet_tokenizer.tokenize(tweet_text1))}")

['J', 'u', 's', 't', ' ', 'f', 'i', 'n', 'i', 's', 'h', 'e', 'd', ' ', 'm', 'y', ' ', 'w', 'o', 'r', 'k', 'o', 'u', 't', ' ', '💪', ' ', 'F', 'e', 'e', 'l', 'i', 'n', 'g', ' ', 'e', 'n', 'e', 'r', 'g', 'i', 'z', 'e', 'd', ' ', 'a', 'n', 'd', ' ', 'r', 'e', 'a', 'd', 'y', ' ', 't', 'o', ' ', 't', 'a', 'c', 'k', 'l', 'e', ' ', 't', 'h', 'e', ' ', 'd', 'a', 'y', '!', ' ', '#', 'f', 'i', 't', 'n', 'e', 's', 's', ' ', '#', 'm', 'o', 't', 'i', 'v', 'a', 't', 'i', 'o', 'n', '.']

Total number of  tokens: 95


In [31]:
# Initialize MWETokenizer
mwetokenizer = MWETokenizer([('Feeling', 'energized'), ('dinner', 'tonight')])

# Text to tokenize
text = "Just finished a great workout! Feeling energized and ready for dinner tonight with friends."
tokens = nltk.word_tokenize(text)

# Apply MWETokenizer
mwet_tokens = mwetokenizer.tokenize(tokens)

print(mwet_tokens)

['Just', 'finished', 'a', 'great', 'workout', '!', 'Feeling_energized', 'and', 'ready', 'for', 'dinner_tonight', 'with', 'friends', '.']


A Multi-Word Expression Tokenizer (MWET) treat certain sequences of words as single tokens rather than individual words. This is particularly useful when dealing with multi-word expressions or phrases that have a specific meaning when they appear together.

The MWET tokenizer allows you to specify which sequences of words should be treated as single tokens. For example, in the above example "Feeling energized" you might want to treat "Feeling_energized" as a single token rather than two separate tokens.

This will preserve the meaning of multi-word expressions during text processing tasks such as part-of-speech tagging, named entity recognition, and machine translation. By treating these expressions as single tokens, the tokenizer ensures that their semantic integrity is maintained.


#### TextBlob

TextBlob is a Python library for processing textual data, primarily designed for tasks in NLP. It provides a simple API for common NLP tasks such as part-of-speech tagging, noun phrase extraction, sentiment analysis, classification, translation, and more.

In [36]:
# ! pip install -U textblob
# ! python3 -m textblob.download_corpora

In [38]:
from textblob import TextBlob

# 1. Sentiment Analysis
text1 = "TextBlob is great for performing sentiment analysis!"
text2 = "Life sucks!"
blob1 = TextBlob(text1)
blob2 = TextBlob(text2)
print("Sentiment Analysis:")
print("Polarity:", blob1.sentiment.polarity)  # Positive sentiment
print()
print("Polarity:", blob2.sentiment.polarity)  # Negative sentiment

Sentiment Analysis:
Polarity: 1.0

Polarity: -0.375


In [39]:
# 2. Part-of-Speech Tagging
print("\nPart-of-Speech Tagging:")
print(blob1.tags)


Part-of-Speech Tagging:
[('TextBlob', 'NNP'), ('is', 'VBZ'), ('great', 'JJ'), ('for', 'IN'), ('performing', 'VBG'), ('sentiment', 'JJ'), ('analysis', 'NN')]


In [40]:
# 3. Noun Phrase Extraction
print("\nNoun Phrase Extraction:")
print(blob1.noun_phrases)


Noun Phrase Extraction:
['textblob', 'sentiment analysis']


In [41]:
# 4. Tokenization
print("\nTokenization:")
print(blob1.words)  # Tokenize into words
print(blob1.sentences)  # Tokenize into sentences


Tokenization:
['TextBlob', 'is', 'great', 'for', 'performing', 'sentiment', 'analysis']
[Sentence("TextBlob is great for performing sentiment analysis!")]


In [None]:
# 5. Word Inflection and Lemmatization
print("\nWord Inflection and Lemmatization:")
print("Plural of 'cat':", blob1.words[1].pluralize())
print("Singular of 'dogs':", blob1.words[-1].singularize())
print("Lemmatization of 'running':", blob1.words[3].lemmatize('v'))

In [54]:
# 6. Language Translation
print("\nLanguage Translation:")
blob = TextBlob("Hello!")
print(blob.translate(from_lang='en', to='fr')) # use both from_lang and to. The code might throw error.


Language Translation:
Bonjour!


In [47]:
# 7. Spelling Correction
print("\nSpelling Correction:")
text_with_typo = "The quick brown fox jumps ovri the lazy dog."
corrected_blob = TextBlob(text_with_typo).correct()
print(corrected_blob)


Spelling Correction:
The quick brown fox jumps or the lazy dog.


In [56]:
# 8. n-grams
blob1.ngrams(n=2)

[WordList(['TextBlob', 'is']),
 WordList(['is', 'great']),
 WordList(['great', 'for']),
 WordList(['for', 'performing']),
 WordList(['performing', 'sentiment']),
 WordList(['sentiment', 'analysis'])]

In [59]:
# 9. Word counts
blob1.words.count('for')

1

In [60]:
# 10. Parse the text
blob1.parse()

'TextBlob/NN/B-NP/O is/VBZ/B-VP/O great/JJ/B-ADJP/O for/IN/B-PP/B-PNP performing/VBG/B-VP/I-PNP sentiment/NN/B-NP/I-PNP analysis/NN/I-NP/I-PNP !/./O/O'

In [62]:
blob1.upper()

TextBlob("TEXTBLOB IS GREAT FOR PERFORMING SENTIMENT ANALYSIS!")

In [63]:
blob2.words.pluralize()

WordList(['Lifes', 'suckss'])

In [66]:
from textblob import Word
from textblob.wordnet import VERB
word = Word("sentiment")
print(word.synsets)
print()
print(word.definitions)

[Synset('sentiment.n.01'), Synset('opinion.n.01')]

['tender, romantic, or nostalgic feeling or emotion', 'a personal belief or judgment that is not founded on proof or certainty']


In [69]:
word2 = Word('Feelings')
word2.synsets[0]

Synset('feelings.n.01')

In [70]:
from textblob.wordnet import Synset
sent = Synset("sentiment.n.01")
feel = Synset("feelings.n.01")
sent.path_similarity(feel)

0.25

#### Spacy Tokenizer

In [72]:
# ! pip install spacy
# ! python -m spacy download en_core_web_sm

In [73]:
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")
text = "SpaCy is a great tool for natural language processing!"
doc = nlp(text)

# Print the tokens
print("Tokenization:")
for token in doc:
    print(token.text)


Tokenization:
SpaCy
is
a
great
tool
for
natural
language
processing
!


In [74]:
text = "SpaCy is a great tool for natural language processing! It can tokenize text and perform various NLP tasks. Give it a try."
doc = nlp(text)
sentences = list(doc.sents)

# Print the segmented sentences
print("Segmented Sentences:")
for i, sentence in enumerate(sentences, 1):
    print(f"Sentence {i}: {sentence}")

Segmented Sentences:
Sentence 1: SpaCy is a great tool for natural language processing!
Sentence 2: It can tokenize text and perform various NLP tasks.
Sentence 3: Give it a try.


#### Gensim Tokenizer

In [75]:
#SYNTAX:  gensim.utils.tokenize(text, lowercase=True, deacc=False, errors='strict', to_lower=False, lower=False)
from gensim.utils import simple_preprocess

text = "Gensim is a Python library for topic modeling and document similarity analysis."
tokens = nltk.word_tokenize(text)
processed_tokens = simple_preprocess(" ".join(tokens))

print("Tokenization using NLTK and preprocessing using Gensim:")
print(processed_tokens)


Tokenization using NLTK and preprocessing using Gensim:
['gensim', 'is', 'python', 'library', 'for', 'topic', 'modeling', 'and', 'document', 'similarity', 'analysis']


#### Tokenization with Keras

In [89]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence

texts = "Keras is a high-level neural networks API running on top of TensorFlow."

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
list_tokens = text_to_word_sequence(texts)
list_tokens


['keras',
 'is',
 'a',
 'high',
 'level',
 'neural',
 'networks',
 'api',
 'running',
 'on',
 'top',
 'of',
 'tensorflow']

In [None]:
#####################################################################################END#####################################################################################