# 1. Tokenization
## Split text into sentences and words.


In [None]:
# 1. prefix -- character(s) at the begining = $(" .)
# 2. suffix  -- character at the end 
# 3. infix -- character at the middel

In [None]:
import nltk  # Imports the main nltk library

nltk.download('punkt')  # Downloads the 'punkt' tokenizer models used for sentence and word tokenization
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize, sent_tokenize  # Imports functions for word and sentence tokenization

text = "Hello! Welcome to NLP using NLTK. Let's explore it."  # Input text string

sentences = sent_tokenize(text)  # Splits text into sentences
print("Sentences:", sentences)  # Prints list of sentences

words = word_tokenize(text)  # Splits text into individual words and punctuation
print("Words:", words)  # Prints list of words


Sentences: ['Hello!', 'Welcome to NLP using NLTK.', "Let's explore it."]
Words: ['Hello', '!', 'Welcome', 'to', 'NLP', 'using', 'NLTK', '.', 'Let', "'s", 'explore', 'it', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# 2. Stop Words Removal
## Remove common useless words like is, am, are, the.

In [None]:
from nltk.corpus import stopwords  # Imports a list of common stop words

nltk.download('stopwords')  # Downloads the stopwords dataset

words = word_tokenize("This is a basic example to remove the stop words.")  # Tokenizes a sentence into words

stop_words = set(stopwords.words('english'))  # Gets English stop words and converts them into a set (faster for lookup)

filtered_words = [w for w in words if w.lower() not in stop_words]  # List comprehension to remove stop words
print("Filtered Words:", filtered_words)  # Output words that are not stopwords


Filtered Words: ['basic', 'example', 'remove', 'stop', 'words', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#  3. Stemming
## Reduce a word to its root form.

In [None]:
# inflection ==> "ingrammer, inflection is the modification of a word to express different grametically categories
# such as tense, case, voice, aspect, person, number, gender and mood

# 2. stemming ==> stemming is the process of reducing inflaction in words
# to their root forms such as mapping a group of words to the same stemming even if the itself is not valid word in the language



In [None]:
from nltk.stem import PorterStemmer  # Imports the stemming class

stemmer = PorterStemmer()  # Creates stemmer object

words = ["playing", "played", "plays", "playful"]  # Input list of words

for word in words:
    print(word, "->", stemmer.stem(word))  # Applies stemming to each word


playing -> play
played -> play
plays -> play
playful -> play


#  4. Lemmatization
## Reduce word to root using dictionary lookup. (More accurate than stemming)

In [None]:
from nltk.stem import WordNetLemmatizer  # Imports WordNet-based lemmatizer

nltk.download('wordnet')  # Downloads WordNet dictionary
nltk.download('omw-1.4')  # Downloads WordNet's multilingual data

lemmatizer = WordNetLemmatizer()  # Creates lemmatizer object

print("running ->", lemmatizer.lemmatize("running", pos='v'))  # 'v' = verb
print("better ->", lemmatizer.lemmatize("better", pos='a'))  # 'a' = adjective


running -> run
better -> good


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


#  5. POS Tagging (Part of Speech)
## Identify grammar of each word (noun, verb, etc.)

### Explanation:

pos_tag(words) assigns a Part-of-Speech to each word like:

NN = Noun

VB = Verb

JJ = Adjective

RB = Adverb

In [2]:
import nltk
from nltk import pos_tag, word_tokenize

# Download required NLTK models
nltk.download('punkt', quiet=True)                             # For tokenization
nltk.download('averaged_perceptron_tagger', quiet=True)        # For POS tagging
nltk.download('all', quiet=True)                               # # downloads everything needed

# Sample text
sentence = "The quick brown fox jumps over the lazy dog"

# Tokenize the text into words
words = word_tokenize(sentence)

# Perform Part-of-Speech tagging
pos_tags = pos_tag(words)

# Display the result
print("POS Tags:", pos_tags)

POS Tags: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


# NER (Named Entity Recognition)?
## NER is a subtask of NLP (Natural Language Processing) that identifies and categorizes named entities in a text into predefined classes:

### ✅ NER Labels You Might See
PER: Person (e.g., Elon Musk)

ORG: Organization (e.g., Google)

LOC: Location (e.g., India, Paris)

MISC: Miscellaneous (e.g., events, works, or other)



In [7]:
# pip install transformers torch

In [6]:
from transformers import pipeline

# Load the NER pipeline
ner = pipeline("ner", grouped_entities=True)

# Input text
text = "Barack Obama was born in Hawaii. He was the 44th President of the United States."

# Perform Named Entity Recognition
results = ner(text)

# Display the output
print("Named Entities Found:")
for entity in results:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {round(entity['score'], 3)}")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Named Entities Found:
Entity: Barack Obama, Label: PER, Score: 0.9990000128746033
Entity: Hawaii, Label: LOC, Score: 0.9990000128746033
Entity: United States, Label: LOC, Score: 0.9950000047683716


In [8]:
import nltk
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('words', quiet=True)

from nltk import word_tokenize, pos_tag, ne_chunk

sentence = "Elon Musk founded SpaceX in California."
words = word_tokenize(sentence)          # Step 1: Tokenize
pos_tags = pos_tag(words)                # Step 2: POS tagging
print(pos_tags)

[('Elon', 'NNP'), ('Musk', 'NNP'), ('founded', 'VBD'), ('SpaceX', 'NNP'), ('in', 'IN'), ('California', 'NNP'), ('.', '.')]


In [9]:
tree = ne_chunk(pos_tags)
print(tree)

(S
  (PERSON Elon/NNP)
  (PERSON Musk/NNP)
  founded/VBD
  (ORGANIZATION SpaceX/NNP)
  in/IN
  (GPE California/NNP)
  ./.)


In [10]:
entities = []

for subtree in tree.subtrees():
    if subtree.label() in ['PERSON', 'ORGANIZATION', 'GPE']:
        entity = " ".join(word for word, tag in subtree.leaves())
        entities.append((entity, subtree.label()))

print(entities)

[('Elon', 'PERSON'), ('Musk', 'PERSON'), ('SpaceX', 'ORGANIZATION'), ('California', 'GPE')]


# NER using Hugging Face (Transformers)

## What is Hugging Face?
Hugging Face is an open-source AI company and platform that provides powerful tools, pre-trained models, and datasets for building Natural Language Processing (NLP) applications using Machine Learning and Deep Learning — especially Transformers.

Their most popular Python library is called transformers, which makes it super easy to use state-of-the-art NLP models like BERT, GPT, T5, etc.

In [13]:
# !pip install transformers
# !pip install torch

In [14]:
from transformers import pipeline

# Load the pre-trained NER pipeline
ner_pipeline = pipeline("ner", grouped_entities=True)

# Sample sentence
text = "Elon Musk founded SpaceX and lives in Texas."

# Perform NER
entities = ner_pipeline(text)

# Print results
for entity in entities:
    print(f"{entity['word']} → {entity['entity_group']} (score={entity['score']:.2f})")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Elon Musk → PER (score=1.00)
SpaceX → ORG (score=1.00)
Texas → LOC (score=1.00)


In [2]:
from transformers import pipeline

# Load the pre-trained NER pipeline
# ner_pipeline = pipeline("ner", grouped_entities=True)
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", grouped_entities=True)


# Sample sentence
text = "Elon Musk founded SpaceX and lives in Texas."

# Perform NER
entities = ner_pipeline(text)

# Print results
for entity in entities:
    print(f"{entity['word']} → {entity['entity_group']} (score={entity['score']:.2f})")


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
  return forward_call(*args, **kwargs)


El → PER (score=0.95)
##on Musk → PER (score=0.84)
SpaceX → ORG (score=1.00)
Texas → LOC (score=1.00)


In [3]:
# Here is the complete NER (Named Entity Recognition) code
# using Hugging Face Transformers with output shown via code only

from transformers import pipeline

# Load the NER pipeline
ner = pipeline("ner", grouped_entities=True)

# Input text
text = "Barack Obama was born in Hawaii. He was the 44th President of the United States."

# Perform Named Entity Recognition
results = ner(text)

# Display the output
print("Named Entities Found:")
for entity in results:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {round(entity['score'], 3)}")


No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


Named Entities Found:
Entity: Barack Obama, Label: PER, Score: 0.9990000128746033
Entity: Hawaii, Label: LOC, Score: 0.9990000128746033
Entity: United States, Label: LOC, Score: 0.9950000047683716
