<a href="https://colab.research.google.com/github/bonginkosi1988/bonginkosi1988/blob/main/NLP%20Preprocessing%20tenchniques%20in%20applied%20settings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install neccessary libraries
!pip install nltk spacy
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# For sentence and word tokenization
from nltk.tokenize import word_tokenize, sent_tokenize
import spacy

# Load the spaCy English model, downloading it only when it is not installed.
# spacy.load raises OSError when the model package cannot be found; in that
# case the model is fetched in-process (same interpreter as the kernel) and
# the load is retried. This avoids re-downloading the model on every run.
SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    from spacy.cli import download as download_spacy_model

    download_spacy_model(SPACY_MODEL)
    # NOTE: if this load still fails right after the download, restart the
    # runtime and run the notebook again.
    nlp = spacy.load(SPACY_MODEL)

# Input text
text = "spaCy is an amazing NLP library! It's fast and efficient."

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# --- NLTK word tokenization: heading on one line, token list on the next
print("NLTK Word Tokens:", word_tokenize(text), sep="\n")

# --- NLTK sentence tokenization: same layout, preceded by a blank line
print("\nNLTK Sentence Tokens:", sent_tokenize(text), sep="\n")

NLTK Word Tokens:
['spaCy', 'is', 'an', 'amazing', 'NLP', 'library', '!', 'It', "'s", 'fast', 'and', 'efficient', '.']

NLTK Sentence Tokens:
['spaCy is an amazing NLP library!', "It's fast and efficient."]


In [None]:
# --- spaCy tokenization: run the pipeline once, then read tokens and sentences
doc = nlp(text)
print("\nspaCy Word Tokens:", [tok.text for tok in doc], sep="\n")

# --- spaCy sentence segmentation, taken from the same parsed document
print("\nspaCy Sentence Tokens:", [span.text for span in doc.sents], sep="\n")


spaCy Word Tokens:
['spaCy', 'is', 'an', 'amazing', 'NLP', 'library', '!', 'It', "'s", 'fast', 'and', 'efficient', '.']

spaCy Sentence Tokens:
['spaCy is an amazing NLP library!', "It's fast and efficient."]


In [None]:
# 2. Stopword Removal & Normalization

In [None]:
# NLTK setup: fetch the stopword corpus, then build the English stopword set
from nltk.corpus import stopwords
import string

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# spaCy setup: nothing to do here (already downloaded earlier)

# Sample review text used by the cleaning cells
text = "Oh wow! This product is actually the BEST I've used so far. No kidding!!"


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# --- NLTK approach: lowercase, tokenize, then keep alphabetic non-stopword tokens
words = word_tokenize(text.lower())
cleaned_nltk = [
    candidate
    for candidate in words
    if candidate.isalpha() and candidate not in stop_words
]
print("Cleaned Tokens using NLTK:", cleaned_nltk, sep="\n")


Cleaned Tokens using NLTK:
['oh', 'wow', 'product', 'actually', 'best', 'used', 'far', 'kidding']


In [None]:
# --- spaCy approach: the same filter, expressed through token attributes
doc = nlp(text.lower())
cleaned_spacy = [tok.text for tok in doc if tok.is_alpha and not tok.is_stop]
print("\nCleaned Tokens using spaCy:", cleaned_spacy, sep="\n")


Cleaned Tokens using spaCy:
['oh', 'wow', 'product', 'actually', 'best', 'far', 'kidding']


3. Stemming vs. Lemmatization

In [None]:
from nltk.stem import PorterStemmer

# Sample sentence, lowercased and split into NLTK word tokens
text = "Users are loving the new updates. The system runs faster and better."
tokens = word_tokenize(text.lower())

# Porter stemmer instance used by the stemming and comparison cells
stemmer = PorterStemmer()

In [None]:
# --- Stemming: apply the Porter stemmer to every alphabetic token
stemmed = [stemmer.stem(word) for word in tokens if word.isalpha()]
print("Stemmed Tokens (NLTK):", stemmed, sep="\n")


Stemmed Tokens (NLTK):
['user', 'are', 'love', 'the', 'new', 'updat', 'the', 'system', 'run', 'faster', 'and', 'better']


In [None]:
# --- Lemmatization using spaCy: collect the lemma of every alphabetic token
doc = nlp(text.lower())
lemmatized = [tok.lemma_ for tok in doc if tok.is_alpha]
print("\nLemmatized Tokens (spaCy):", lemmatized, sep="\n")


Lemmatized Tokens (spaCy):
['user', 'be', 'love', 'the', 'new', 'update', 'the', 'system', 'run', 'fast', 'and', 'well']


In [None]:
# Compare stem and lemma side by side, one row per distinct alphabetic word.
#
# Rows are listed in order of first appearance in the sentence; iterating a
# set of strings directly gives an order that can change between kernel
# restarts. Lemmas are read from a single spaCy pass over the whole sentence,
# so each word is lemmatized in its sentence context and the pipeline runs
# once instead of once per word.
print("\nComparison:")

comparison_doc = nlp(text.lower())
already_shown = set()
for tok in comparison_doc:
    word = tok.text
    # Skip punctuation/non-alphabetic tokens and words that already have a row.
    if not tok.is_alpha or word in already_shown:
        continue
    already_shown.add(word)
    print(f"{word:12s} | Stemmed: {stemmer.stem(word):12s} | Lemma: {tok.lemma_}")



Comparison:
are          | Stemmed: are          | Lemma: be
and          | Stemmed: and          | Lemma: and
runs         | Stemmed: run          | Lemma: run
the          | Stemmed: the          | Lemma: the
better       | Stemmed: better       | Lemma: well
updates      | Stemmed: updat        | Lemma: update
faster       | Stemmed: faster       | Lemma: fast
users        | Stemmed: user         | Lemma: user
new          | Stemmed: new          | Lemma: new
loving       | Stemmed: love         | Lemma: love
system       | Stemmed: system       | Lemma: system
