## Tokenization using Python's split function

In [1]:
text = "It was the best of times, it was the worst of times."
# Whitespace tokenization: split() with no separator cuts on runs of
# whitespace only, so punctuation stays glued to words ('times,' / 'times.').
words = text.split()
print(words)

['It', 'was', 'the', 'best', 'of', 'times,', 'it', 'was', 'the', 'worst', 'of', 'times.']


## Using the NLTK library to tokenize a sentence

In [2]:
import nltk
from IPython.display import clear_output

from nltk.tokenize import word_tokenize, TweetTokenizer

# word_tokenize relies on the Punkt sentence-tokenizer data. Newer NLTK
# releases (3.9+) look that data up under the name 'punkt_tab', older ones
# under 'punkt'; fetching both keeps a fresh "Restart & Run All" working on
# either version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Wipe the download log so the cell renders without output.
clear_output()

In [3]:
text = "I'm eating food and drinking milk."

# Run the same sentence through two *word* tokenizers and print each result:
#   1. word_tokenize      - splits the contraction into 'I' and "'m"
#   2. TweetTokenizer     - keeps "I'm" together as a single token
tokenizer = TweetTokenizer()
for tokenize in (word_tokenize, tokenizer.tokenize):
    words = tokenize(text)
    print(words)


['I', "'m", 'eating', 'food', 'and', 'drinking', 'milk', '.']
["I'm", 'eating', 'food', 'and', 'drinking', 'milk', '.']


### Removing Punctuation and Lowercasing


In [4]:
text = "Hello, how are you? I'm good, thank you."

# One pass over the word tokens:
#   * keep a token only if it is purely alphabetic (str.isalpha), which drops
#     punctuation as well as the contraction piece "'m";
#   * lowercase every token that survives.
tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]

print(tokens)

['hello', 'how', 'are', 'you', 'i', 'good', 'thank', 'you']


### NLTK Stopwords removal

In [5]:
from nltk.corpus import stopwords
# Fetch the stop-word corpus, then clear the cell output so the download log
# is not shown.
nltk.download('stopwords')
clear_output()

# Load the English stop-word list; print its length followed by the entries.
english_stopwords = stopwords.words('english')
print(len(english_stopwords), ":", english_stopwords)

179 : ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'th

In [6]:
text = "I'm eating food and drinking milk."

# Tokenize, keep alphabetic tokens only, and lowercase them.
tokens = word_tokenize(text)
tokens = [token.lower() for token in tokens if token.isalpha()]
print(tokens)

# Removing stopwords.
# Tokens are compared after lowercasing, so 'I' is matched as 'i'.
# The list is converted to a set once so each membership test is a hash
# lookup instead of a scan over the whole stop-word list.
stopword_set = set(english_stopwords)
tokens = [token for token in tokens if token not in stopword_set]
print(tokens)


['i', 'eating', 'food', 'and', 'drinking', 'milk']
['eating', 'food', 'drinking', 'milk']


### NLTK Spell Correction Module

In [7]:
# NOTE(review): this import rebinds the notebook-level name `words`, which
# earlier cells assign token lists to. Re-running one of those cells later
# replaces the corpus reader with a list again.
from nltk.corpus import words
from nltk.metrics.distance import edit_distance
# Fetch the word-list corpus, then clear the download log from the output.
nltk.download('words')
clear_output()

# Load the 'en' file of the corpus as the reference vocabulary for spell
# correction and report how many entries it holds.
correct_words = words.words('en')
print(len(correct_words))


235886


In [8]:
incorrect_word = 'interresting'

# Score only the vocabulary entries that share the misspelling's first letter
# (case-sensitive), pairing each with its edit distance. Tuples sort by
# distance first and alphabetically second, so the five closest come first.
first_letter = incorrect_word[0]
editD_word = [
    (edit_distance(incorrect_word, candidate), candidate)
    for candidate in correct_words
    if candidate[0] == first_letter
]
print(sorted(editD_word)[:5])

[(1, 'interesting'), (2, 'intercepting'), (2, 'interrupting'), (3, 'interestingly'), (3, 'interfering')]


## Lemmatization

In [9]:
from nltk.stem import WordNetLemmatizer
# Download the 'wordnet' and 'omw-1.4' resources before lemmatizing.
nltk.download('wordnet')
nltk.download('omw-1.4')
# Lemmatizer instance reused by the following cell.
lemetizer = WordNetLemmatizer()
# Clear the download messages from the cell output.
clear_output()

In [10]:
# The lemmatizer is case-sensitive, so lowercase input is required:
# "Dogs" comes back unchanged, while the lowercase plurals are reduced to
# their singular forms (including the irregular "abaci").
for word in ("dogs", "Dogs", "churches", "abaci"):
    print(lemetizer.lemmatize(word))

dog
Dogs
church
abacus


## Stemming


In [11]:
from nltk.stem.porter import PorterStemmer
# Porter stemmer instance reused by the following cell.
ps = PorterStemmer()
# NOTE(review): nothing in this cell downloads or prints, so this call
# probably has no output to clear.
clear_output()

In [12]:
# Stemming here is case-insensitive: "playing" and "Playing" both give
# "play". Stems need not be dictionary words ("presum", "probabl").
for word in ("playing", "Playing", "presumably", "probably"):
    print(ps.stem(word))

play
play
presum
probabl


## Professor's Code


### Basics


In [13]:
import nltk
from nltk import word_tokenize, TweetTokenizer, MWETokenizer

# word_tokenize relies on the Punkt sentence-tokenizer data. Newer NLTK
# releases (3.9+) look that data up under the name 'punkt_tab', older ones
# under 'punkt'; fetching both keeps this cell runnable on either version.
nltk.download('punkt')
nltk.download('punkt_tab')
# Clear the download log from the cell output.
clear_output()

text = "I ate 8.5 ice-creams in New Delhi 🥶😇"


# Problem with domain-specific text: word_tokenize keeps the two emoji
# together as a single token.
tokens = word_tokenize(text)
print(tokens)

# Processing tweets: TweetTokenizer separates the emoji. One instance is
# enough for both example sentences below.
tokenizer = TweetTokenizer()
print(tokenizer.tokenize(text))

# word_tokenize splits '@' and '#' off the handle / hashtag and leaves
# 'government.' unsplit; TweetTokenizer keeps '@shashitharoor' and the
# hashtag intact and separates the full stop.
more_complex_text = "Congrats @shashitharoor Sir, for winning ‘Chevalier de la Legion d’Honneur’, highest civilian honour by the French government.#FranceinIndiafrancediplo_EN"
print(word_tokenize(more_complex_text))
print(tokenizer.tokenize(more_complex_text))


# Multi-word expressions: after word tokenization, re-join the registered
# sequence ('New', 'Delhi') into the single token 'New_Delhi'.
tokenizer = MWETokenizer()
tokenizer.add_mwe(('New', 'Delhi'))
print(tokenizer.tokenize(word_tokenize(text)))


# Cleaning Text by removing non-alphabetical tokens.
# str.isalpha() is True only when every character is a letter. A bare
# expression in the middle of a cell is evaluated and thrown away, so the
# two facts are asserted rather than left as silent no-ops.
assert not "123".isalpha()
assert "NLP".isalpha()


text = "I'm eating food and drinking milk."
tokens = word_tokenize(text)
# Keep alphabetic tokens only; this drops '.' and the contraction piece "'m".
cleaned_tokens = [token for token in tokens if token.isalpha()]
print("Cleaned tokens:", cleaned_tokens)


# lowercasing
lowercased_tokens = [token.lower() for token in cleaned_tokens]
print("Lowercased tokens:", lowercased_tokens)

['I', 'ate', '8.5', 'ice-creams', 'in', 'New', 'Delhi', '🥶😇']
['I', 'ate', '8.5', 'ice-creams', 'in', 'New', 'Delhi', '🥶', '😇']
['Congrats', '@', 'shashitharoor', 'Sir', ',', 'for', 'winning', '‘', 'Chevalier', 'de', 'la', 'Legion', 'd', '’', 'Honneur', '’', ',', 'highest', 'civilian', 'honour', 'by', 'the', 'French', 'government.', '#', 'FranceinIndiafrancediplo_EN']
['Congrats', '@shashitharoor', 'Sir', ',', 'for', 'winning', '‘', 'Chevalier', 'de', 'la', 'Legion', 'd', '’', 'Honneur', '’', ',', 'highest', 'civilian', 'honour', 'by', 'the', 'French', 'government', '.', '#FranceinIndiafrancediplo_EN']
['I', 'ate', '8.5', 'ice-creams', 'in', 'New_Delhi', '🥶😇']
Cleaned tokens: ['I', 'eating', 'food', 'and', 'drinking', 'milk']
Lowercased tokens: ['i', 'eating', 'food', 'and', 'drinking', 'milk']


### Stop Words and Spell Checks

In [14]:
#Correcting Spelling Mistakes
# Imports and data for the edit-distance spell corrector further down in
# this cell.
from nltk.corpus import words
from nltk.metrics.distance import edit_distance
nltk.download('words')
correct_words = words.words("en") # list of correctly spelled words
# Clear the download log before the prints below.
clear_output()

#Stopword Removal
from nltk.corpus import stopwords
# List every file id in the stop-word corpus, then print the lists for
# three of them.
# NOTE(review): "Fiends" in the first label looks like a typo for "Fileids";
# left unchanged because the string is runtime output.
print("All Fiends:", stopwords.fileids(),'\n')
print("English:", stopwords.words("english") ,'\n')
print("Russian:", stopwords.words("russian") ,'\n')
print("Hinglish:", stopwords.words("hinglish") ,'\n')


misspelled_word = "instituon" # "oblied", "instituon" "scohol"

# Linear scan for the closest vocabulary entry.
# `threshold` holds the best (smallest) edit distance found so far and starts
# at the length of the misspelling, so a candidate is accepted only when it is
# strictly closer than that. On ties the earliest entry wins.
threshold = len(misspelled_word)
correctly_spelled_word = ''
first_letter = misspelled_word[0]
for word in correct_words:
    # Only entries sharing the first letter are scored.
    if word[0] != first_letter:
        continue
    editDis = edit_distance(word, misspelled_word)
    if editDis < threshold:
        correctly_spelled_word, threshold = word, editDis
print("Correct Word:", correctly_spelled_word)

All Fiends: ['arabic', 'azerbaijani', 'basque', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish'] 

English: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'thr

### Lemmatization and Stemming

In [15]:
# Lemmatizer and stemmer side by side
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Fetch the 'wordnet' and 'omw-1.4' resources, then clear the download log.
nltk.download('wordnet')
nltk.download('omw-1.4')
clear_output()

# Lemmatization returns dictionary forms; "introduction" is left as it is.
lemmatizer = WordNetLemmatizer()
for word in ("dogs", "churches", "introduction"):
    print(lemmatizer.lemmatize(word))


# Stemming strips suffixes by rule, so "introduction" becomes "introduct".
# The blank line separates the two groups in the output.
print()
stemmer = PorterStemmer()
for word in ("dogs", "introduction"):
    print(stemmer.stem(word))

dog
church
introduction

dog
introduct
