In [1]:
# pipeline in NLTK (manual work , no direct support)
import nltk 
from nltk import word_tokenize, pos_tag, sent_tokenize, ne_chunk
from nltk.tree import Tree

In [2]:
# Download required resources (run only once)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('maxent_ne_chunker_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dai.STUDENTSDC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dai.STUDENTSDC\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\dai.STUDENTSDC\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\dai.STUDENTSDC\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\dai.STUDENTSDC\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!


True

In [3]:
# Sample text 
text = "Apple was founded by Steve Jobs in California. He emailed support@apple.com from L.A."

# 1. Sentence Tokenization
sentences = sent_tokenize(text)

# 2. Word Tokenization
tokens = word_tokenize(text)

# 3. POS tagging
pos_tags = pos_tag(tokens)

# 4. Named Entity Recognition
ne_tree = ne_chunk(pos_tags)

# Extract Named Entities in (entity, type) form
named_entities = []
for subtree in ne_tree:
    if isinstance(subtree, Tree):
        entity = " ".join(token for token, pos in subtree.leaves())
        label = subtree.label()
        named_entities.append((entity, label))

# Output Summary
print("====== NLTK NLP Pipeline ========")
print(f"Sentences ( {len(sentences)}):", sentences)
print(f"\n Tokens ({len(tokens)}): ", tokens)
print(f"\n POS Tags :", pos_tags)
print(f"\n Named Entities: ", named_entities)
print(f"\nPipeline Stages : ['Tokenizer', 'POS Tagger', 'Chunker/NER'] ")

Sentences ( 2): ['Apple was founded by Steve Jobs in California.', 'He emailed support@apple.com from L.A.']

 Tokens (17):  ['Apple', 'was', 'founded', 'by', 'Steve', 'Jobs', 'in', 'California', '.', 'He', 'emailed', 'support', '@', 'apple.com', 'from', 'L.A', '.']

 POS Tags : [('Apple', 'NNP'), ('was', 'VBD'), ('founded', 'VBN'), ('by', 'IN'), ('Steve', 'NNP'), ('Jobs', 'NNP'), ('in', 'IN'), ('California', 'NNP'), ('.', '.'), ('He', 'PRP'), ('emailed', 'VBD'), ('support', 'NN'), ('@', 'NNP'), ('apple.com', 'NN'), ('from', 'IN'), ('L.A', 'NNP'), ('.', '.')]

 Named Entities:  [('Apple', 'PERSON'), ('Steve Jobs', 'PERSON'), ('California', 'GPE')]

Pipeline Stages : ['Tokenizer', 'POS Tagger', 'Chunker/NER'] 


In [8]:
#### spaCy Pipeline
import spacy
from spacy import displacy, explain

nlp = spacy.load('en_core_web_sm')
print("Pipeline components  : ", nlp.pipeline)
print("Component names : ", nlp.pipe_names)

# Add sentencizer if not present
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe('sentencizer', before="parser")
    print("Sentencizer added.")

# Process data
doc = nlp(text)    

# Print tokens 
print("\n Tokens : ")
print([token.text for token in doc])

# Print Sentences :
print("\n Sentences : ")
print([sent.text for sent in doc.sents])

# Explain some POS/DEP/NER tags :
print("\nExplanations:")
print("VBZ:", explain("VBZ"))
print("nsubj:", explain("nsubj"))
print("ORG:", explain("ORG"))

# Visualise named entities (works best in Jupyter or Colab)
# displacy.serve(doc, style="ent", host="127.0.0.1", port = 5000)   # For non- Jupyter/Colab

displacy.render(doc, style="ent")   # For Jupyter/Colab

Pipeline components  :  [('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x000001FEEA9D1250>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x000001FEEA9D1370>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x000001FEE4B40F90>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x000001FEE6D86D10>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x000001FEE6D71290>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x000001FEE4B417E0>)]
Component names :  ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
Sentencizer added.

 Tokens : 
['Apple', 'was', 'founded', 'by', 'Steve', 'Jobs', 'in', 'California', '.', 'He', 'emailed', 'support@apple.com', 'from', 'L.A.']

 Sentences : 
['Apple was founded by Steve Jobs in California.', 'He emailed support@apple.com from L.A.']

Explanations:
VBZ: verb, 3rd person singular present
nsubj: nominal subject
ORG: Companies, agencies, institutio

ImportError: cannot import name 'display' from 'IPython.core.display' (d:\workspace\NLP_CV\.venv\Lib\site-packages\IPython\core\display.py)

In [None]:
# Final Summary
print("\nFinal Summary : ")
print(f"Total components : {len(nlp.pipe_names)}")
print(f"NER present : {'ner' in nlp.pipe_names}")


Final Summary : 
Total components : 7
NER present : True
