In [12]:
# ================================
# NLP Preprocessing in Python
# Using both NLTK and spaCy
# ================================

!pip install nltk spacy

# Import & download resources
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')   # ✅ Fix for new NLTK versions
nltk.download('stopwords')
nltk.download('wordnet')

import spacy
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# ================================
# Sample Dataset
# ================================
# Five short, unrelated sentences used as the demo input for both pipelines.
corpus = [
    "The stock market crashed due to global uncertainty.",
    "Natural Language Processing is a key part of Artificial Intelligence.",
    "Google releases a new AI model to improve search results.",
    "The weather today is sunny and pleasant in New York.",
    "Sports events are being postponed because of heavy rains.",
]

# Show the raw input as a 1-based numbered list, one sentence per line.
print("📌 Original Corpus:")
numbered_lines = [
    f"{position}. {sentence}"
    for position, sentence in enumerate(corpus, start=1)
]
print(*numbered_lines, sep="\n")

# ================================
# 🔹 NLTK Preprocessing
# ================================
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Shared, reusable NLTK helpers: built once, used for every sentence.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

print("\n================ NLTK Preprocessing ================")

for idx, sentence in enumerate(corpus, start=1):
    # Step 1: lowercase the sentence and split it into tokens.
    tokens = word_tokenize(sentence.lower())

    # Step 2: keep purely alphabetic tokens that are not stopwords.
    no_stop = []
    for word in tokens:
        if not word.isalpha() or word in stop_words:
            continue
        no_stop.append(word)

    # Steps 3 and 4: stem and lemmatize the surviving tokens independently.
    stemmed = list(map(stemmer.stem, no_stop))
    lemmatized = list(map(lemmatizer.lemmatize, no_stop))

    # Report every stage for this sentence.
    print(f"\nSentence {idx}: {sentence}")
    stages = (
        ("Tokens", tokens),
        ("After Stopword Removal", no_stop),
        ("After Stemming", stemmed),
        ("After Lemmatization", lemmatized),
    )
    for label, values in stages:
        print(f"👉 {label}: {values}")

# ================================
# 🔹 spaCy Preprocessing
# ================================
print("\n================ spaCy Preprocessing ================")

for i, doc in enumerate(corpus, 1):
    # Parse the sentence with its original casing. Lowercasing *before* the
    # pipeline runs throws away capitalization (e.g. "Google", "New York")
    # that spaCy's tagger and lemmatizer can use, so case is normalized only
    # on the way out instead.
    spacy_doc = nlp(doc)

    # Tokenization (lowercased for display, matching the NLTK section)
    tokens = [token.lower_ for token in spacy_doc]

    # Select the alphabetic, non-stopword tokens once and reuse the selection
    # for both of the following views.
    content_tokens = [
        token for token in spacy_doc
        if token.is_alpha and not token.is_stop
    ]
    no_stop = [token.lower_ for token in content_tokens]  # Stopword removal
    lemmatized = [token.lemma_.lower() for token in content_tokens]  # Lemmatization

    print(f"\nSentence {i}: {doc}")
    print(f"👉 Tokens: {tokens}")
    print(f"👉 After Stopword Removal: {no_stop}")
    print(f"👉 After Lemmatization: {lemmatized}")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m123.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
📌 Original Corpus:
1. The stock market crashed due to global uncertainty.
2. Natural Language Processing is a key part of Artificial Intelligence.
3. Google releases a new AI model to improve search results.
4. The weather today is sunny and pleasant in New York.
5. Sports events are being postponed because of heavy rains.


Sentence