# Text Preprocessing

In [49]:
# Tokenization

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Fetch the 'punkt' data package for the tokenizers imported above. This call is
# the cell's last expression, so its return value is what the cell displays.
# NOTE(review): newer NLTK releases may look up a 'punkt_tab' resource instead;
# if tokenization raises LookupError, confirm against the installed version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amrit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [50]:

# Sample passage that every later cell in this notebook works from.
text = "My name is Amrith R Naik. I am doing my internshipping dancing at MIT."

# Split the passage into sentences and into word-level tokens in one step;
# punctuation marks come back as tokens of their own (see the printed list).
sentences, words = sent_tokenize(text), word_tokenize(text)

print("Sentences :", sentences)
print("Words :", words)

Sentences : ['My name is Amrith R Naik.', 'I am doing my internshipping dancing at MIT.']
Words : ['My', 'name', 'is', 'Amrith', 'R', 'Naik', '.', 'I', 'am', 'doing', 'my', 'internshipping', 'dancing', 'at', 'MIT', '.']


In [51]:
# Stop word removal

from nltk.corpus import stopwords

# Fetch the 'stopwords' corpus so the stop-word list can be loaded in the next
# cell. As the cell's final expression, the call's return value is displayed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amrit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [52]:
# Load the English stop-word list once into a set for membership checks.
stop_words = set(stopwords.words('english'))

# Keep a token only when its lowercase form is not a stop word. The comparison
# is case-insensitive, but the surviving tokens keep their original casing.
# Punctuation tokens are not stop words, so they pass through unchanged.
filtered_words = list(
    filter(lambda token: token.lower() not in stop_words, words)
)
print("Filtered Words :", filtered_words)

Filtered Words : ['name', 'Amrith', 'R', 'Naik', '.', 'internshipping', 'dancing', 'MIT', '.']


In [53]:
# Stemming

from nltk.stem import PorterStemmer

# One stemmer instance is reused for every token.
ps = PorterStemmer()

# Map each stop-word-filtered token to its Porter stem. In the recorded output
# the stems are lowercased and may be non-words (e.g. 'dancing' -> 'danc').
stemmed_words = list(map(ps.stem, filtered_words))
print("Stemmed words :", stemmed_words)

Stemmed words : ['name', 'amrith', 'r', 'naik', '.', 'internship', 'danc', 'mit', '.']


In [56]:
# Lemmatization
from nltk.stem import WordNetLemmatizer

# Fetch the data packages for the WordNet lemmatizer imported above.
# 'omw-1.4' is presumably the Open Multilingual Wordnet data that some NLTK
# versions require alongside 'wordnet' (verify against the installed version).
# Order matters for display: only the last call's return value is shown.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amrit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\amrit\AppData\Roaming\nltk_data...


True

In [59]:
# One lemmatizer instance is reused for every token.
lemmatizer = WordNetLemmatizer()

# lemmatize() is called with the word only (no part-of-speech argument), so it
# falls back on the method's default. In the recorded output verb forms such as
# 'dancing' and 'internshipping' come back unchanged, and casing is preserved.
# NOTE(review): passing a POS derived from nltk.pos_tag would likely give
# better lemmas for verbs; left as-is to match the recorded output.
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_words))
print(lemmatized_words)

['name', 'Amrith', 'R', 'Naik', '.', 'internshipping', 'dancing', 'MIT', '.']


In [61]:
# Part-of-speech tagging

# Fetch the averaged-perceptron tagger model, presumably the model that
# nltk.pos_tag loads by default (confirm against the installed NLTK version).
# NOTE(review): newer NLTK releases may look for
# 'averaged_perceptron_tagger_eng' instead; check if pos_tag raises LookupError.
nltk.download('averaged_perceptron_tagger')


POS Tags : [('My', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Amrith', 'NNP'), ('R', 'NNP'), ('Naik', 'NNP'), ('.', '.'), ('I', 'PRP'), ('am', 'VBP'), ('doing', 'VBG'), ('my', 'PRP$'), ('internshipping', 'VBG'), ('dancing', 'VBG'), ('at', 'IN'), ('MIT', 'NNP'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\amrit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [63]:
# Tag the full token list from the tokenization cell (not the stop-word-filtered
# one), so stop words and punctuation are tagged too. The result is a list of
# (token, tag) pairs, as the printed output shows.
pos_tags = nltk.pos_tag(words)
print('POS Tags :', pos_tags)

POS Tags : [('My', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Amrith', 'NNP'), ('R', 'NNP'), ('Naik', 'NNP'), ('.', '.'), ('I', 'PRP'), ('am', 'VBP'), ('doing', 'VBG'), ('my', 'PRP$'), ('internshipping', 'VBG'), ('dancing', 'VBG'), ('at', 'IN'), ('MIT', 'NNP'), ('.', '.')]
