<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/main/Exercises/day-8/NLP-Concepts-Token/Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Exploring Core NLP Concepts
Welcome to this hands-on NLP Colab lab! You will work through key tasks—tokenization, POS tagging, stemming, stop-word filtering, vocabulary matching, lemmatization, dependency parsing, NER, and intent classification—using Python libraries. Follow the instructions and complete the exercises.

In [None]:
# Install required packages
!pip install --quiet nltk spacy textblob sklearn

# Download NLTK data and spaCy model
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

!python -m spacy download en_core_web_sm -q


In [None]:
# 1. Tokenization
# Goal: break raw text into sentences and into word/punctuation tokens.
from nltk.tokenize import sent_tokenize, word_tokenize

text = "Natural Language Processing enables machines to understand human language."

# Sentence-level split first, then word-level split of the same text.
sentences = sent_tokenize(text)
word_tokens = word_tokenize(text)
print("Sentences:", sentences)
print("Tokens:", word_tokens)

In [None]:
# Exercise 1.1: Tokenize the following paragraph into words and sentences:
# Hint: call sent_tokenize(paragraph) and word_tokenize(paragraph), as in the
# tokenization cell above, and keep the results in variables so that the later
# exercises (2.1, 3.1, 4.1, ...) can reuse them.

paragraph = "Machine learning models power many NLP tasks. They learn patterns from data!"


In [None]:
# 2. Part-of-Speech Tagging
# Goal: label every token with its grammatical category.
import nltk
from nltk import pos_tag

# Tag the word-level tokens of the sample text.
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)
print(pos_tags)


In [None]:
# Exercise 2.1: Tag POS for tokens from your Exercise 1.1.

In [None]:
# 3. Stemming
# Goal: cut words down to a root form (the result need not be a dictionary word).
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
words = ["running", "runs", "ran", "easily", "fairly"]

# Pair each original word with its stem, preserving the order of `words`.
print(dict(zip(words, map(stemmer.stem, words))))


In [None]:
# Exercise 3.1: Stem the tokens from your Exercise 1.1.



In [None]:
# 4. Stop-Word Filtering
# Goal: drop common words that add little value.
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
tokens = word_tokenize(text.lower())

filtered = []
for candidate in tokens:
    # Skip anything that is not purely alphabetic, and anything in the stop list.
    if not candidate.isalpha() or candidate in stop_words:
        continue
    filtered.append(candidate)
print(filtered)

In [None]:
# Exercise 4.1: Filter stop words from your Exercise 1.1 tokens.



In [None]:
# 5. Vocabulary Matching
# Goal: sort tokens into those found in a fixed vocabulary and those that are not.

vocab = {"natural", "language", "machine", "data", "processing"}
tokens = list(map(str.lower, word_tokenize(text)))

in_vocab, out_of_vocab = [], []
for tok in tokens:
    if not tok.isalpha():
        continue  # tokens that are not purely alphabetic belong to neither list
    (in_vocab if tok in vocab else out_of_vocab).append(tok)

print("In-vocab tokens:", in_vocab)
print("OOV tokens:", out_of_vocab)


In [None]:
# Exercise 5.1: Define your own small vocabulary and classify tokens from Exercise 1.1 into in-vocab vs. out-of-vocab.

In [None]:
# 6. Lemmatization
# Goal: map each word to its dictionary (lemma) form.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
words = ["running", "better", "wolves"]

# First pass: no part of speech given, so the lemmatizer's default is used.
lemmas = {}
for word in words:
    lemmas[word] = lemmatizer.lemmatize(word)
print(lemmas)

# Second pass: request the verb lemma explicitly with pos='v'.
verb_lemma = lemmatizer.lemmatize("running", pos='v')
print("run (verb):", verb_lemma)


In [None]:
# Exercise 6.1: Lemmatize tokens from Exercise 1.1 (both default and verb POS).



In [None]:
# 7. Dependency Parsing
# Goal: show, for each token, its syntactic relation and the token it attaches to.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# One line per token: token text, dependency label, head token text.
for tok in doc:
    print(f"{tok.text} {tok.dep_} {tok.head.text}")


In [None]:
# Exercise 7.1: Parse the sentence “They learn patterns from data” and list each token’s dependency label and head.

In [None]:
# 8. Named-Entity Recognition (NER)
# Goal: pull real-world entities out of text and show each one's label.
doc = nlp("Google was founded in 1998 by Larry Page and Sergey Brin in California.")

# One line per detected entity: entity text followed by its label.
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

In [None]:
# Exercise 8.1: Run NER on the example sentence from the cell above, then on at least two more sentences of your own.

