In [2]:
!pip install nltk spacy
!python -m nltk.downloader all
!python -m spacy download en_core_web_sm

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

In [2]:
# Import Libraries
import spacy
import nltk

from nltk.stem import PorterStemmer

# Load spaCy Model
nlp = spacy.load("en_core_web_sm")

# Download NLTK data (only first time)
nltk.download('punkt')

# Initialize Stemmer
stemmer = PorterStemmer()

# Input Text
text = """
Natural Language Processing is a branch of Artificial Intelligence.
It helps machines understand human language. Google and Microsoft
use NLP in their products. India is developing AI rapidly.
"""

print("Original Text:")
print(text)
print("-" * 50)

# Process Text with spaCy
doc = nlp(text)

# --------------------------------
# 1. Tokenization (spaCy)
# --------------------------------

tokens = [token.text for token in doc]

print("Tokenization:")
print(tokens)
print("-" * 50)

# --------------------------------
# 2. Normalization (Lowercase + Remove Punctuation)
# --------------------------------

normalized_tokens = [
    token.text.lower()
    for token in doc
    if not token.is_punct
]

print("After Normalization:")
print(normalized_tokens)
print("-" * 50)

# --------------------------------
# 3. Stop Word Removal (spaCy)
# --------------------------------

filtered_tokens = [
    token.text.lower()
    for token in doc
    if not token.is_stop and not token.is_punct
]

print("After Stop Word Removal:")
print(filtered_tokens)
print("-" * 50)

# --------------------------------
# 4. Stemming (NLTK)
# --------------------------------

stemmed_tokens = [
    stemmer.stem(word)
    for word in filtered_tokens
]

print("After Stemming (NLTK):")
print(stemmed_tokens)
print("-" * 50)

# --------------------------------
# 5. Lemmatization (spaCy)
# --------------------------------

lemmatized_tokens = [
    token.lemma_.lower()
    for token in doc
    if not token.is_stop and not token.is_punct
]

print("After Lemmatization (spaCy):")
print(lemmatized_tokens)
print("-" * 50)

# --------------------------------
# 6. POS Tagging (spaCy)
# --------------------------------

print("POS Tagging:")
for token in doc:
    if not token.is_punct:
        print(token.text, "->", token.pos_)

print("-" * 50)

# --------------------------------
# 7. Named Entity Recognition (spaCy)
# --------------------------------

print("Named Entity Recognition:")
for ent in doc.ents:
    print(ent.text, "->", ent.label_)

print("-" * 50)

# --------------------------------
# 8. Final Output
# --------------------------------

print("Final Processed Text:")
print(" ".join(lemmatized_tokens))


Original Text:

Natural Language Processing is a branch of Artificial Intelligence.
It helps machines understand human language. Google and Microsoft
use NLP in their products. India is developing AI rapidly.

--------------------------------------------------
Tokenization:
['\n', 'Natural', 'Language', 'Processing', 'is', 'a', 'branch', 'of', 'Artificial', 'Intelligence', '.', '\n', 'It', 'helps', 'machines', 'understand', 'human', 'language', '.', 'Google', 'and', 'Microsoft', '\n', 'use', 'NLP', 'in', 'their', 'products', '.', 'India', 'is', 'developing', 'AI', 'rapidly', '.', '\n']
--------------------------------------------------
After Normalization:
['\n', 'natural', 'language', 'processing', 'is', 'a', 'branch', 'of', 'artificial', 'intelligence', '\n', 'it', 'helps', 'machines', 'understand', 'human', 'language', 'google', 'and', 'microsoft', '\n', 'use', 'nlp', 'in', 'their', 'products', 'india', 'is', 'developing', 'ai', 'rapidly', '\n']
-------------------------------------

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
