## Exploring Core NLP Concepts
Welcome to this hands-on NLP Colab lab! You will work through key tasks—tokenization, POS tagging, stemming, stop-word filtering, vocabulary matching, lemmatization, dependency parsing, NER, and intent classification—using Python libraries. Follow the instructions and complete the exercises.

In [1]:
# Install required packages
!pip install --upgrade pip setuptools wheel -q
!pip install --quiet nltk spacy textblob sklearn

# Download NLTK data and spaCy model
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')

!python -m spacy download en_core_web_sm -q


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.8 MB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m24.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.2/1.2 MB[0m [31m40.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of th

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
# 1. Tokenization
# Goal: Split text into tokens (words and punctuation).
from nltk.tokenize import word_tokenize, sent_tokenize

text = "Natural Language Processing enables machines to understand human language."
print("Sentences:", sent_tokenize(text))
print("Tokens:", word_tokenize(text))

Sentences: ['Natural Language Processing enables machines to understand human language.']
Tokens: ['Natural', 'Language', 'Processing', 'enables', 'machines', 'to', 'understand', 'human', 'language', '.']


In [4]:
# Exercise 1.1: Tokenize the following paragraph into words and sentences:

paragraph = "Machine learning models power many NLP tasks. They learn patterns from data!"
def tokenize_paragraph(paragraph):
    sentences = sent_tokenize(paragraph)
    tokens = [word_tokenize(sentence) for sentence in sentences]
    return sentences, tokens

# Test the function and print results
sentences, tokens = tokenize_paragraph(paragraph)

print("=== Sentences ===")
for i, sentence in enumerate(sentences, 1):
    print(f"{i}: {sentence}")

print("\n=== Tokens ===")
for i, token_list in enumerate(tokens, 1):
    print(f"Sentence {i} tokens: {token_list}")

=== Sentences ===
1: Machine learning models power many NLP tasks.
2: They learn patterns from data!

=== Tokens ===
Sentence 1 tokens: ['Machine', 'learning', 'models', 'power', 'many', 'NLP', 'tasks', '.']
Sentence 2 tokens: ['They', 'learn', 'patterns', 'from', 'data', '!']


In [5]:
# 2. Part-of-Speech Tagging
# Goal: Assign grammatical tags to each token.
import nltk
tokens = word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)
print(pos_tags)


[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('enables', 'VBZ'), ('machines', 'NNS'), ('to', 'TO'), ('understand', 'VB'), ('human', 'JJ'), ('language', 'NN'), ('.', '.')]


In [7]:
# Exercise 2.1: Tag POS for tokens from your Exercise 1.1.
def tag_pos(tokens):
    return nltk.pos_tag(tokens)

def tag_pos(tokens):
    return nltk.pos_tag(tokens)

pos_result = tag_pos(tokens)

print("=== POS Tags ===")
for word, pos in pos_result:
    print(f"{word}: {pos}")

=== POS Tags ===
Natural: JJ
Language: NNP
Processing: NNP
enables: VBZ
machines: NNS
to: TO
understand: VB
human: JJ
language: NN
.: .


In [8]:
# 3. Stemming
# Goal: Reduce words to their root forms (may be non-dictionary).
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
words = ["running", "runs", "ran", "easily", "fairly"]
print({w: stemmer.stem(w) for w in words})


{'running': 'run', 'runs': 'run', 'ran': 'ran', 'easily': 'easili', 'fairly': 'fairli'}


In [9]:
# Exercise 3.1: Stem the tokens from your Exercise 1.1.
def stem_tokens(tokens):
    return [stemmer.stem(token) for token in tokens]

stemmed_tokens = stem_tokens(tokens)

print("=== Stemmed Tokens ===")
for token, stemmed_token in zip(tokens, stemmed_tokens):
    print(f"{token} -> {stemmed_token}")

=== Stemmed Tokens ===
Natural -> natur
Language -> languag
Processing -> process
enables -> enabl
machines -> machin
to -> to
understand -> understand
human -> human
language -> languag
. -> .


In [10]:
# 4. Stop-Word Filtering
# Goal: Remove common, low-value words.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
tokens = word_tokenize(text.lower())
filtered = [w for w in tokens if w.isalpha() and w not in stop_words]
print(filtered)

['natural', 'language', 'processing', 'enables', 'machines', 'understand', 'human', 'language']


In [11]:
# Exercise 4.1: Filter stop words from your Exercise 1.1 tokens.
def filter_stop_words(tokens):
    return [w for w in tokens if w.isalpha() and w.lower() not in stop_words]

filtered_tokens = filter_stop_words(tokens)
print("=== Filtered Tokens ===")
print(filtered_tokens)


=== Filtered Tokens ===
['natural', 'language', 'processing', 'enables', 'machines', 'understand', 'human', 'language']


In [12]:
# 5. Vocabulary Matching
# Goal: Check tokens against a predefined vocabulary.

vocab = {"natural", "language", "machine", "data", "processing"}
tokens = [w.lower() for w in word_tokenize(text)]
in_vocab = [w for w in tokens if w.isalpha() and w in vocab]
print("In-vocab tokens:", in_vocab)
print("OOV tokens:", [w for w in tokens if w.isalpha() and w not in vocab])


In-vocab tokens: ['natural', 'language', 'processing', 'language']
OOV tokens: ['enables', 'machines', 'to', 'understand', 'human']


In [14]:
# Exercise 5.1: Define your own small vocabulary and classify tokens from Exercise 1.1 into in-vocab vs. out-of-vocab.
vocab = {"machine", "artificial" "learning", "data", "language"}

in_vocab = [w for w in filtered_tokens if w in vocab]
out_of_vocab = [w for w in filtered_tokens if w not in vocab]

print("=== In-Vocab Tokens ===")
print(in_vocab)

print("\n=== Out-of-Vocab Tokens ===")
print(out_of_vocab)


=== In-Vocab Tokens ===
['language', 'language']

=== Out-of-Vocab Tokens ===
['natural', 'processing', 'enables', 'machines', 'understand', 'human']


In [15]:
# 6. Lemmatization
# Goal: Convert words to their dictionary form.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
words = ["running", "better", "wolves"]
print({w: lemmatizer.lemmatize(w) for w in words})
# For verbs:
print("run (verb):", lemmatizer.lemmatize("running", pos='v'))


{'running': 'running', 'better': 'better', 'wolves': 'wolf'}
run (verb): run


In [16]:
# Exercise 6.1: Lemmatize tokens from Exercise 1.1 (both default and verb POS).
def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

def lemmatize_tokens_verbs(tokens):
    return [lemmatizer.lemmatize(token, pos='v') for token in tokens]

lemmatized_tokens = lemmatize_tokens(tokens)
lemmatized_tokens_verbs = lemmatize_tokens_verbs(tokens)
print("=== Lemmatized Tokens ===")
print(lemmatized_tokens)
print("=== Lemmatized Verbs ===")
print(lemmatized_tokens_verbs)



=== Lemmatized Tokens ===
['natural', 'language', 'processing', 'enables', 'machine', 'to', 'understand', 'human', 'language', '.']
=== Lemmatized Verbs ===
['natural', 'language', 'process', 'enable', 'machine', 'to', 'understand', 'human', 'language', '.']


In [17]:
# 7. Dependency Parsing
# Goal: Identify syntactic relationships between tokens.
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
for token in doc:
    print(token.text, token.dep_, token.head.text)


Natural compound Language
Language compound Processing
Processing nsubj enables
enables ROOT enables
machines nsubj understand
to aux understand
understand ccomp enables
human amod language
language dobj understand
. punct enables


In [18]:
# Exercise 7.1: Parse the sentence “They learn patterns from data” and list each token’s dependency label and head.
doc = nlp("They learn patterns from data")
for token in doc:
    print(token.text, token.dep_, token.head.text)

They nsubj learn
learn ROOT learn
patterns dobj learn
from prep patterns
data pobj from


In [19]:
# 8. Named-Entity Recognition (NER)
# Goal: Extract real-world entities from text.
doc = nlp("Google was founded in 1998 by Larry Page and Sergey Brin in California.")
for ent in doc.ents:
    print(ent.text, ent.label_)

Google ORG
1998 DATE
Larry Page PERSON
Sergey Brin PERSON
California GPE


In [20]:
#Exercise 8.1: Run NER on this sentence and add at least two more sentences of your own.
pap_sentence = "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife."
pap_sentence += "However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered as the rightful property of some one or other of their daughters."
pap_sentence += "My dear Mr. Bennet,” said his lady to him one day, “have you heard that Netherfield Park is let at last?"
pap_sentence += "The novel was published in 1813 and remains a classic of English literature."

doc = nlp(pap_sentence)
for ent in doc.ents:
    print(ent.text, ent.label_)
    print(f"{ent.text} ({ent.label_})")

first ORDINAL
first (ORDINAL)
some one CARDINAL
some one (CARDINAL)
Bennet PERSON
Bennet (PERSON)
one day DATE
one day (DATE)
Netherfield Park FAC
Netherfield Park (FAC)
1813 DATE
1813 (DATE)
English LANGUAGE
English (LANGUAGE)
