In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

[nltk_data] Downloading package punkt to /Users/anthony/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anthony/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/anthony/nltk_data...
[nltk_data] Downloading package omw-1.4 to /Users/anthony/nltk_data...


### Load data

Next, we need to load the data that we want to preprocess. In this example, we will use the following sentence:

In [5]:
text = "The quick brown fox jumped over the lazy dog."

### Text Normalization

Text normalization is the process of converting text into a standard format. This involves converting all characters to lowercase and removing any punctuation.

In [6]:
# Convert to lowercase
text = text.lower()

# Remove punctuation
text = ''.join(c for c in text if c not in '.,;:-')

print(text)

the quick brown fox jumped over the lazy dog


### Tokenization

Tokenization is the process of splitting a sentence into individual words or tokens.

In [8]:
# Tokenize the text
tokens = word_tokenize(text)

print(tokens)

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']


### Stopword Removal

Stopwords are common words that do not carry much meaning and can be removed from the text. We will use NLTK's list of stopwords and remove them from the tokenized text.

In [10]:
# Remove stopwords
stop_words = set(stopwords.words('english'))

print(stop_words)

filtered_tokens = [word for word in tokens if word not in stop_words]

print(filtered_tokens)

{'won', 'their', 'doing', 'all', 'not', 'weren', 'each', 'ourselves', 'herself', 'only', 'but', 'she', "shan't", 'have', 'no', 'your', "weren't", 'than', "shouldn't", 'an', 'over', 'its', 'above', 'both', 'isn', 'he', 'a', "doesn't", 'you', 'of', 'him', 'here', 'm', 'did', "it's", "wouldn't", "mightn't", 'itself', 'more', 're', 'being', 'hers', 'now', 'again', 'before', 'ain', 'further', 'yourself', 'wasn', 'the', 'don', 'to', 'too', 'doesn', "didn't", 'me', "hadn't", 'has', 'most', 'myself', "she's", 'd', 'from', 'so', 'themselves', 'same', 'other', 'when', "aren't", 'does', 'theirs', "haven't", 'mightn', 'wouldn', "mustn't", 'why', 'do', "that'll", 'in', 'our', 'ours', 'her', 'any', 'be', 'on', 'these', 'and', 'my', 'some', "you've", "wasn't", 'with', 'while', 'it', 'hadn', 'once', 'how', "won't", 'then', 's', "should've", "couldn't", 'needn', 'is', 'o', 'shouldn', 'there', 'after', 'whom', 'because', 'out', 'until', 'was', 'this', 'or', 'that', 'having', 'those', 'just', 'as', 'very

### Stemming

Stemming is the process of reducing a word to its base or root form. We will use Porter stemmer from NLTK for stemming.

In [11]:
# Perform stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

print(stemmed_tokens)

['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']


### Lemmatization

Lemmatization is the process of converting a word to its base or dictionary form. We will use WordNet lemmatizer from NLTK for lemmatization.

In [13]:
# Perform lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

print(lemmatized_tokens)

['quick', 'brown', 'fox', 'jumped', 'lazy', 'dog']


### Output the results

Finally, we will output the results of each step of the text preprocessing process.

In [14]:
print("Original text: ", text)
print("Tokenized text: ", tokens)
print("Filtered tokens: ", filtered_tokens)
print("Stemmed tokens: ", stemmed_tokens)
print("Lemmatized tokens: ", lemmatized_tokens)

Original text:  the quick brown fox jumped over the lazy dog
Tokenized text:  ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']
Filtered tokens:  ['quick', 'brown', 'fox', 'jumped', 'lazy', 'dog']
Stemmed tokens:  ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']
Lemmatized tokens:  ['quick', 'brown', 'fox', 'jumped', 'lazy', 'dog']
