# NLP Preprocessing

## Import Libraries

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import re

## Download NLTK Data

In [14]:
# Fetch the NLTK data packages that the cells below rely on:
#   - 'punkt' / 'punkt_tab': tokenizer models behind word_tokenize / sent_tokenize.
#     Recent NLTK releases look up 'punkt_tab' instead of the older pickled 'punkt'
#     and raise LookupError if it is missing, so both are requested to keep the
#     notebook runnable on old and new NLTK versions alike.
#   - 'stopwords': the English stopword list.
#   - 'wordnet' / 'omw-1.4': data used by WordNetLemmatizer.
# nltk.download() skips packages that are already up to date, so re-running is cheap.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\areeb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\areeb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\areeb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\areeb\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Define Example Text

In [17]:
# Two-sentence sample passage that the preprocessing cells below operate on.
# Note: the first line ends with a space before the line break, so the string
# contains "language. \nThe" (space + newline) between the two sentences.
text = """Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence (AI) that focuses on the interaction between humans and computers using natural language. 
The ultimate goal of NLP is to enable computers to understand, interpret, and generate human language in a way that is both valuable and meaningful."""

## Tokenization

In [20]:
# Split the sample text two ways: into word-level tokens (punctuation marks
# come out as their own tokens) and into whole sentences.
word_tokens = word_tokenize(text)
sentence_tokens = sent_tokenize(text)

# Show both results, words first.
print(f"Word Tokens: {word_tokens}")
print(f"Sentence Tokens: {sentence_tokens}")

Word Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'Artificial', 'Intelligence', '(', 'AI', ')', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'humans', 'and', 'computers', 'using', 'natural', 'language', '.', 'The', 'ultimate', 'goal', 'of', 'NLP', 'is', 'to', 'enable', 'computers', 'to', 'understand', ',', 'interpret', ',', 'and', 'generate', 'human', 'language', 'in', 'a', 'way', 'that', 'is', 'both', 'valuable', 'and', 'meaningful', '.']
Sentence Tokens: ['Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence (AI) that focuses on the interaction between humans and computers using natural language.', 'The ultimate goal of NLP is to enable computers to understand, interpret, and generate human language in a way that is both valuable and meaningful.']


## Stopword Removal

In [23]:
# English stopwords as a set, so each membership check below is a hash lookup.
stop_words = set(stopwords.words('english'))

# Keep a token only when its lowercase form is not a stopword; the token itself
# is kept with its original casing. Punctuation tokens pass through unchanged.
filtered_words = list(filter(lambda token: token.lower() not in stop_words, word_tokens))
print(f"Filtered Words: {filtered_words}")

Filtered Words: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'fascinating', 'field', 'Artificial', 'Intelligence', '(', 'AI', ')', 'focuses', 'interaction', 'humans', 'computers', 'using', 'natural', 'language', '.', 'ultimate', 'goal', 'NLP', 'enable', 'computers', 'understand', ',', 'interpret', ',', 'generate', 'human', 'language', 'way', 'valuable', 'meaningful', '.']


## Stemming

In [26]:
# Reduce every stopword-filtered token to its Porter stem
# (e.g. "Processing" -> "process", "computers" -> "comput").
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_words))
print(f"Stemmed Words: {stemmed_words}")

Stemmed Words: ['natur', 'languag', 'process', '(', 'nlp', ')', 'fascin', 'field', 'artifici', 'intellig', '(', 'ai', ')', 'focus', 'interact', 'human', 'comput', 'use', 'natur', 'languag', '.', 'ultim', 'goal', 'nlp', 'enabl', 'comput', 'understand', ',', 'interpret', ',', 'gener', 'human', 'languag', 'way', 'valuabl', 'meaning', '.']


## Lemmatization

In [29]:
# Map each stopword-filtered token to its WordNet lemma.
# No part-of-speech tag is supplied, so lemmatize() uses its default POS for
# every token; plural nouns are reduced ("computers" -> "computer") while verb
# forms such as "using" are left as they are.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_words))
print(f"Lemmatized Words: {lemmatized_words}")

Lemmatized Words: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'fascinating', 'field', 'Artificial', 'Intelligence', '(', 'AI', ')', 'focus', 'interaction', 'human', 'computer', 'using', 'natural', 'language', '.', 'ultimate', 'goal', 'NLP', 'enable', 'computer', 'understand', ',', 'interpret', ',', 'generate', 'human', 'language', 'way', 'valuable', 'meaningful', '.']


## Remove Punctuation

In [32]:
# Strip every character listed in string.punctuation from the raw sample text.
# The translation table maps each punctuation character to None, which
# str.translate interprets as "delete this character".
text_no_punctuation = text.translate(str.maketrans(dict.fromkeys(string.punctuation)))
print(f"Text without Punctuation: {text_no_punctuation}")

Text without Punctuation: Natural Language Processing NLP is a fascinating field of Artificial Intelligence AI that focuses on the interaction between humans and computers using natural language 
The ultimate goal of NLP is to enable computers to understand interpret and generate human language in a way that is both valuable and meaningful


## Lowercasing

In [35]:
# Fold the punctuation-free text to lowercase.
text_lowercase = str.lower(text_no_punctuation)
print(f"Lowercased Text: {text_lowercase}")

Lowercased Text: natural language processing nlp is a fascinating field of artificial intelligence ai that focuses on the interaction between humans and computers using natural language 
the ultimate goal of nlp is to enable computers to understand interpret and generate human language in a way that is both valuable and meaningful


## Remove Special Characters

In [38]:
# Replace everything that is not a word character (letter, digit, underscore)
# with a space.
# The pattern matches whole runs (\W+) rather than single characters, so
# adjacent separators -- such as the "space + newline" between the two
# sentences -- collapse into ONE space instead of leaving a double space.
# strip() then removes any separator left at the very start or end.
text_no_special_chars = re.sub(r'\W+', ' ', text_lowercase).strip()
print(f"Text without Special Characters: {text_no_special_chars}")

Text without Special Characters: natural language processing nlp is a fascinating field of artificial intelligence ai that focuses on the interaction between humans and computers using natural language  the ultimate goal of nlp is to enable computers to understand interpret and generate human language in a way that is both valuable and meaningful
