## Tokenization

In [40]:
!pip -q install contractions emoji langdetect

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/981.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m624.6/981.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [41]:
import re
import string

import contractions
import emoji
import nltk
from langdetect import DetectorFactory, detect
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob

# Fetch the NLTK data packages this notebook relies on. Each call logs its
# progress via "[nltk_data]" lines.
# WordNet data, needed by WordNetLemmatizer.
nltk.download('wordnet')
# Punkt tokenizer data, needed by word_tokenize / sent_tokenize.
nltk.download('punkt_tab')
# Stop-word lists, needed by stopwords.words(). This is the last expression
# in the cell, so its return value is what the cell displays as its result.
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Sample input; `text` and `nltk_tokens` are reused by the noise-removal and
# case-normalization cells further down.
text = "Natural Language Processing is fascinating! Let's explore it."

# Tokenize the sample at two granularities with NLTK:
# individual word/punctuation tokens, and whole sentences.
nltk_tokens = word_tokenize(text)
sentences = sent_tokenize(text)

# Report the word-level result first, then the sentence-level result.
print("NLTK Tokens:", nltk_tokens)
print("Sentences:", sentences)

NLTK Tokens: ['Natural', 'Language', 'Processing', 'is', 'fascinating', '!', 'Let', "'s", 'explore', 'it', '.']
Sentences: ['Natural Language Processing is fascinating!', "Let's explore it."]


## Removing Noise

In [7]:
# Remove stopwords and punctuation
# Remove stop words and punctuation from the word tokens produced by the
# tokenization cell (`nltk_tokens`).
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

# A token is treated as punctuation when *every* one of its characters is a
# punctuation mark. A plain `token not in punctuation` test only catches
# single-character tokens and lets multi-character ones such as '...', '--'
# or the '``' / "''" quote tokens emitted by word_tokenize slip through.
# Stop-word matching is case-insensitive; surviving tokens keep their casing.
cleaned_tokens = [
    token for token in nltk_tokens
    if token.lower() not in stop_words
    and not all(char in punctuation for char in token)
]
print("Cleaned Tokens:", cleaned_tokens)

Cleaned Tokens: ['Natural', 'Language', 'Processing', 'fascinating', 'Let', "'s", 'explore']


## Case Normalization

In [8]:
# Lowercase the sample sentence.
# NOTE: `text` is not defined in this cell. It comes from the Tokenization
# cell and is reassigned by later cells (contractions, spell correction,
# language detection), so run the notebook top to bottom to get this output.
text_lower = text.lower()
print("Lowercase Text:", text_lower)

Lowercase Text: natural language processing is fascinating! let's explore it.


## Stemming and Lemmatization

In [22]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Single word: feed the same input to the stemmer and to the lemmatizer so
# the two results can be compared side by side.
word = "historical"
stemmed_word = stemmer.stem(word)
lemmatized_word = lemmatizer.lemmatize(word)
print("Stemmed Word:", stemmed_word)
print("Lemmatized Word:", lemmatized_word)

# Whole sentence: split on whitespace, transform every word, and join the
# results back together with single spaces. The lemmatizer is called without
# a part-of-speech argument.
raw_text = "The striped bats are hanging on their feet for best"
text_stem = " ".join(map(stemmer.stem, raw_text.split()))
text_lemma = " ".join(map(lemmatizer.lemmatize, raw_text.split()))
print("Stemmed Text:", text_stem)
print("Lemmatized Text:", text_lemma)

Stemmed Word: histor
Lemmatized Word: historical
Stemmed Text: the stripe bat are hang on their feet for best
Lemmatized Text: The striped bat are hanging on their foot for best


## Handling Numbers and Special Characters

In [36]:
text_with_numbers = "The price increased by 50% in 2023. What do you think? 😊"

# Compile each pattern once and reuse it for removing, listing and counting.
number_pattern = re.compile(r'\d+')       # a run of one or more digits
special_pattern = re.compile(r'[^\w\s]')  # one char that is neither a word char nor whitespace

# Remove numbers (neighbouring symbols such as '%' and '.' are left in place)
cleaned_text = number_pattern.sub('', text_with_numbers)
print("Cleaned Text:", cleaned_text)

# Count numbers (feature engineering): scan the text once and derive the
# count from the list of matches instead of running the regex twice.
num = number_pattern.findall(text_with_numbers)
num_count = len(num)
print("Number Count:", num_count)
print("Numbers:", num)

# Count special characters such as '%', '.' and '?'. Emoji are neither word
# characters nor whitespace, so they are counted as special characters too.
special = special_pattern.findall(text_with_numbers)
special_count = len(special)
print("Special Character Count:", special_count)
print("Special Characters:", special)

# Handle emoji: demojize() replaces each emoji with a ':name:' text alias
# (here 😊 becomes ':smiling_face_with_smiling_eyes:'); the rest of the
# string, including the numbers and punctuation, is left as it was.
cleaned_text_emoji = emoji.demojize(text_with_numbers)
print("Cleaned Text with Emoji:", cleaned_text_emoji)

Cleaned Text: The price increased by % in . What do you think? 😊
Number Count: 2
Numbers: ['50', '2023']
Special Character Count: 4
Special Characters: ['%', '.', '?', '😊']
Cleaned Text with Emoji: The price increased by 50% in 2023. What do you think? :smiling_face_with_smiling_eyes:


## Handling Contractions and Abbreviations

In [31]:
# Expand contractions into their full forms
# (here "can't" -> "cannot" and "y'all" -> "you all").
# NOTE: this rebinds the notebook-wide `text` variable set in the Tokenization cell.
text = "I can't wait to see y'all in 2023!"
expanded_text = contractions.fix(text)
print("Expanded Text:", expanded_text)

Expanded Text: I cannot wait to see you all in 2023!


## Spell Correction

In [38]:
# Sample sentence with a deliberate typo ("teh").
# NOTE: this rebinds the notebook-wide `text` variable.
text = "NLP is teh best!"

# Wrap the string in a TextBlob and run its spelling correction;
# printing the result shows "teh" corrected to "the".
blob = TextBlob(text)
corrected = blob.correct()
print(corrected)

NLP is the best!


## Language Detection

In [42]:
# Language detection example
text = "Ceci est un exemple en français."
print(detect(text))

fr
