- This project demonstrates fundamental Natural Language Processing techniques such as tokenization, stemming, lemmatization, Bag of Words, and TF-IDF. 
- These techniques are commonly used in text preprocessing and feature extraction for NLP tasks.

In [2]:
# Import necessary libraries
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.corpus import stopwords

# Download NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

# paragraph
text = """
Natural Language Processing (NLP) is a fascinating field of artificial intelligence 
that deals with the interaction between computers and humans using natural language. 
It includes tasks such as tokenization, stemming, and lemmatization.
"""

# 1. Tokenization
tokens = word_tokenize(text)
print("Tokens:")
print(tokens)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Umakant\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Umakant\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Umakant\AppData\Roaming\nltk_data...


Tokens:
['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'artificial', 'intelligence', 'that', 'deals', 'with', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'using', 'natural', 'language', '.', 'It', 'includes', 'tasks', 'such', 'as', 'tokenization', ',', 'stemming', ',', 'and', 'lemmatization', '.']


[nltk_data]   Package omw-1.4 is already up-to-date!


**2. Stemming**

In [4]:

stemmer = PorterStemmer()
stems = [stemmer.stem(token) for token in tokens]
print("\nStemmed Words:")
print(stems)




Stemmed Words:
['natur', 'languag', 'process', '(', 'nlp', ')', 'is', 'a', 'fascin', 'field', 'of', 'artifici', 'intellig', 'that', 'deal', 'with', 'the', 'interact', 'between', 'comput', 'and', 'human', 'use', 'natur', 'languag', '.', 'it', 'includ', 'task', 'such', 'as', 'token', ',', 'stem', ',', 'and', 'lemmat', '.']


**Lemmatization**

In [6]:

lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(token) for token in tokens]
print("\nLemmatized Words:")
print(lemmas)




Lemmatized Words:
['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'artificial', 'intelligence', 'that', 'deal', 'with', 'the', 'interaction', 'between', 'computer', 'and', 'human', 'using', 'natural', 'language', '.', 'It', 'includes', 'task', 'such', 'a', 'tokenization', ',', 'stemming', ',', 'and', 'lemmatization', '.']


In [8]:
# 4. Bag of Words (BoW)
vectorizer_bow = CountVectorizer()
bow_matrix = vectorizer_bow.fit_transform([text])
print("\nBag of Words (BoW):")
print(bow_matrix.toarray())
print("Feature Names:", vectorizer_bow.get_feature_names_out())




Bag of Words (BoW):
[[2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1]]
Feature Names: ['and' 'artificial' 'as' 'between' 'computers' 'deals' 'fascinating'
 'field' 'humans' 'includes' 'intelligence' 'interaction' 'is' 'it'
 'language' 'lemmatization' 'natural' 'nlp' 'of' 'processing' 'stemming'
 'such' 'tasks' 'that' 'the' 'tokenization' 'using' 'with']


In [10]:
# 5. TF-IDF
vectorizer_tfidf = TfidfVectorizer()
tfidf_matrix = vectorizer_tfidf.fit_transform([text])
print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())
print("Feature Names:", vectorizer_tfidf.get_feature_names_out())



TF-IDF Matrix:
[[0.32879797 0.16439899 0.16439899 0.16439899 0.16439899 0.16439899
  0.16439899 0.16439899 0.16439899 0.16439899 0.16439899 0.16439899
  0.16439899 0.16439899 0.32879797 0.16439899 0.32879797 0.16439899
  0.16439899 0.16439899 0.16439899 0.16439899 0.16439899 0.16439899
  0.16439899 0.16439899 0.16439899 0.16439899]]
Feature Names: ['and' 'artificial' 'as' 'between' 'computers' 'deals' 'fascinating'
 'field' 'humans' 'includes' 'intelligence' 'interaction' 'is' 'it'
 'language' 'lemmatization' 'natural' 'nlp' 'of' 'processing' 'stemming'
 'such' 'tasks' 'that' 'the' 'tokenization' 'using' 'with']


In [16]:
# 6. Stopword Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("\nFiltered Tokens (Stopwords Removed):")
print(filtered_tokens)


Filtered Tokens (Stopwords Removed):
['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'fascinating', 'field', 'artificial', 'intelligence', 'deals', 'interaction', 'computers', 'humans', 'using', 'natural', 'language', '.', 'includes', 'tasks', 'tokenization', ',', 'stemming', ',', 'lemmatization', '.']
