In [1]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("I love natural language processing")
doc

I love natural language processing

In [2]:
displacy.render(doc,style='dep')

# Name entity recognition

In [3]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Barack Obama was born on August 4, 1961, in Hawaii. He was the 44th President of the United States."
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)


Barack Obama PERSON
August 4, 1961 DATE
Hawaii GPE
44th ORDINAL
the United States GPE


In [4]:
displacy.render(doc,style='ent')

# Word vectors and sacy

# Word Vectors

In [5]:
# nlp  = spacy.load("en_core_web_md")
# with open ("data/wiki_us.txt",'r') as f:
#     text = f.read()

# 

In [6]:
doc

Barack Obama was born on August 4, 1961, in Hawaii. He was the 44th President of the United States.

In [7]:
doc

Barack Obama was born on August 4, 1961, in Hawaii. He was the 44th President of the United States.

In [8]:
sentence1 = list(doc.sents)[0]
sentence1

Barack Obama was born on August 4, 1961, in Hawaii.

In [9]:
import spacy
import numpy as np

nlp = spacy.load("en_core_web_md")  # Ensure medium/large model
your_word = "sports"

if nlp.vocab[your_word].has_vector:
    word_id = nlp.vocab.strings[your_word]
    word_vector = nlp.vocab.vectors[word_id]
    ms = nlp.vocab.vectors.most_similar(np.asarray([word_vector]), n=3)
    words = [nlp.vocab.strings[w] for w in ms[0][0]]
    print("Similar words to '{}':".format(your_word), words)
else:
    print("Word vector for '{}' not found.".format(your_word))


Similar words to 'sports': ['GYMNASIUM', 'tennis', 'RINK']


# Similarity

In [10]:
doc1 = nlp('I like saalty fries and hamburgers')
doc2 = nlp('Fast food tastes very good. ')

print(doc1, "<->" , doc2, doc1.similarity(doc2))

I like saalty fries and hamburgers <-> Fast food tastes very good.  0.7530552128924118


# pipeline

# Attirbute Rulers

# 1. Dependency Parser

In [11]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog.")
for token in doc:
    print(token.text, "→", token.dep_, "→", token.head.text)


The → det → fox
quick → amod → fox
brown → amod → fox
fox → nsubj → jumps
jumps → ROOT → jumps
over → prep → jumps
the → det → dog
lazy → amod → dog
dog → pobj → over
. → punct → jumps


In [12]:
displacy.render(doc,style='dep')

In [13]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
    print(ent.text, ent.label_)


Apple ORG
U.K. GPE
$1 billion MONEY


In [14]:
displacy.render(doc,style='ent')

# EntityRuler

In [15]:
import spacy
from spacy.pipeline import EntityRuler  # ✅ ঠিক নাম

nlp = spacy.load("en_core_web_sm")

# EntityRuler যোগ করো
ruler = nlp.add_pipe("entity_ruler", before='ner')

# কাস্টম প্যাটার্ন যোগ করো
ruler.add_patterns([
    {"label": "GADGET", "pattern": "iphone"}
])

# টেস্ট টেক্সট
doc = nlp("I just bought a new iphone")

# চিহ্নিত Entity প্রিন্ট করো
for ent in doc.ents:
    print(ent.text, ent.label_)


iphone GADGET


# Lemmatizer

In [16]:
for token in doc:
    print(token.text, token.lemma_)

I I
just just
bought buy
a a
new new
iphone iphone


# Morpholog

In [17]:
#grammatical features for example tense number, person etc
for token in doc:
    print(token.text, token.morph)

I Case=Nom|Number=Sing|Person=1|PronType=Prs
just 
bought Tense=Past|VerbForm=Fin
a Definite=Ind|PronType=Art
new Degree=Pos
iphone Number=Sing


# SentenceRecognizer / SentenceSegmenter

In [18]:
doc3 = nlp('hello world! How are you?')
for sent in doc3.sents:
    print(sent.text)

hello world!
How are you?


# Sentencizer

In [19]:
#just punctuation pele sents divied kore

npl = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc4 = nlp("This is sentence one. This is sentence two")
for sent in doc4.sents:
    print(sent.text)

This is sentence one.
This is sentence two


# SpanCategorizer

# Tagger

In [20]:
 give pos (part of speech) tag 

SyntaxError: invalid syntax (1687734250.py, line 1)

# TrainablePipe

# Transformer

# Matchers

In [None]:
npl = spacy.blank("en")
npl.add_pipe("sentencizer")

In [None]:
nlp.analyze_pipes()

In [None]:
nlp2 = spacy.load("en_core_web_sm")
nlp2.analyze_pipes()

# Rules_based_spacy

# EntityRuler

In [None]:
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
ruler = nlp.add_pipe("entity_ruler")

In [21]:
ruler

<spacy.pipeline.entityruler.EntityRuler at 0x7b1b112c3cd0>

In [22]:
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc',
    'pos_acc',
    'tag_micro_p',
    'tag_micro_r',
    'tag_micro_f'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_

In [23]:
patterns = [
    {'label':'GPE', "pattern":"west Chestertenfieldvile"}
]

In [24]:
ruler.add_patterns(patterns)

In [27]:

for ent in doc.ents:
    print(ent.text, ent.label_)
    

iphone GADGET


In [28]:
nlp5 = spacy.load("en_core_web_sm")
ruler = nlp5.add_pipe("entity_ruler", before='ner')
ruler.add_patterns(patterns)

In [29]:
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

Barack Obama PERSON
August 4, 1961 DATE
Hawaii GPE
44th ORDINAL
the United States GPE


In [30]:
nlp6 = spacy.load("en_core_web_sm")
ruler = nlp6.add_pipe("entity_ruler", before="ner")

In [33]:
patterns = [
    {"label":"GPE", "pattern":"West Chestertenfieldville"},
    {"label":"FILM", "pattern":"Mr. Deeds"}
]

In [34]:
ruler.add_patterns(patterns)

In [35]:
doc = nlp6(text)
for ent in doc.ents:
    print( ent.text, ent.label_)

Barack Obama PERSON
August 4, 1961 DATE
Hawaii GPE
44th ORDINAL
the United States GPE


# matcher

In [42]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# ✅ ঠিক করা pattern
pattern = [{"LIKE_EMAIL": True}]
matcher.add("EMAIL_ADDRESS", [pattern])

doc = nlp("This is an email address: wmattingly@aol.com")
matches = matcher(doc)

for match_id, start, end in matches:
    span = doc[start:end]
    print("Matched:", span.text)


Matched: wmattingly@aol.com


In [50]:

text = "hi i am atik hasan , my home distict shirajgong"

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [{"POS":"PROPN"}]
matcher.add("PROPER_NOUN", [pattern])

doc = nlp(text)
matches = matcher(doc)
print(len(matches))

for match in matches[:10]:
    print(match, doc[match[1]:match[2]])
    
    

3
(451313080118390996, 3, 4) atik
(451313080118390996, 4, 5) hasan
(451313080118390996, 9, 10) shirajgong


In [51]:

text = "hi i am atik hasan , my home distict shirajgong"

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [{"POS":"PROPN","OP":"+"}]
matcher.add("PROPER_NOUN", [pattern])

doc = nlp(text)
matches = matcher(doc)
print(len(matches))

for match in matches[:10]:
    print(match, doc[match[1]:match[2]])
    

4
(451313080118390996, 3, 4) atik
(451313080118390996, 3, 5) atik hasan
(451313080118390996, 4, 5) hasan
(451313080118390996, 9, 10) shirajgong


In [54]:

text = "hi i am atik hasan , my home distict shirajgong"

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [{"POS":"PROPN", "OP":"+"}]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

doc = nlp(text)
matches = matcher(doc)
print(len(matches))

for match in matches[:10]:
    print(match, doc[match[1]:match[2]])
    

2
(451313080118390996, 3, 5) atik hasan
(451313080118390996, 9, 10) shirajgong


In [55]:

text = "hi i am atik hasan , my home distict shirajgong"

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [{"POS":"PROPN", "OP":"+"}]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x: x[1])
print(len(matches))

for match in matches[:10]:
    print(match, doc[match[1]:match[2]])
    

2
(451313080118390996, 3, 5) atik hasan
(451313080118390996, 9, 10) shirajgong


In [59]:
text = """Hi, I am Atik Hasan.
I live in a beautiful place.
My home district is Shirajganj.
It's full of rivers and natural beauty.
I love talking about my roots and culture."""

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [{"POS":"PROPN", "OP":"+"},{"POS":"VERB"}]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x: x[1])
print(len(matches))

for match in matches[:10]:
    print(match, doc[match[1]:match[2]])
    

0


In [60]:
text = """Hi, I am.Atik Hasan.
I live in a beautiful place.
My home district is Shirajganj.
It's full of rivers and natural beauty.
I love talking about.my roots and culture."""

In [61]:
text = text.replace("."," ")
print(text)

Hi, I am Atik Hasan 
I live in a beautiful place 
My home district is Shirajganj 
It's full of rivers and natural beauty 
I love talking about my roots and culture 


In [64]:

text = """Hi, I am Atik Hasan.
My friend said, 'Dream big, work hard!'
I replied, 'Sure, I believe in that.'
Later, someone asked me, 'What's your plan?'
I just smiled and said, 'To become great one day.'"""


nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [
    {"ORTH":"'"},
    {"IS_ALPHA": True, "OP":"+"},
    {"IS_PUNCT": True, "OP":"*"},
    {"ORTH":"'"}
]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x: x[1])
print(len(matches))

for match in matches[:10]:
    print(match, doc[match[1]:match[2]])
    

1
(451313080118390996, 54, 62) 'To become great one day.'


In [66]:

text = """Hi, I am Atik Hasan.
My friend said, 'Dream big, work hard!'
I replied, 'Sure, I believe in that.'
Later, someone asked me, 'What's your plan?'
I just smiled and said, 'To become great one day.'"""


speak_lemmas = ["think", "say"]

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [
    {"ORTH":"'"},
    {"IS_ALPHA": True, "OP":"+"},
    {"IS_PUNCT": True, "OP":"*"},
    {"ORTH":"'"},
    {"POS":"VERB","LEMMA":{"IN":speak_lemmas}}
]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x: x[1])
print(len(matches))

for match in matches[:10]:
    print(match, doc[match[1]:match[2]])
    

0


In [68]:

text = """Hi, I am Atik Hasan.
My friend said, 'Dream big, work hard!'
I replied, 'Sure, I believe in that.'
Later, someone asked me, 'What's your plan?'
I just smiled and said, 'To become great one day.'"""


speak_lemmas = ["think", "say"]

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [
    {"ORTH":"'"},
    {"IS_ALPHA": True, "OP":"+"},
    {"IS_PUNCT": True, "OP":"*"},
    {"ORTH":"'"},
    {"POS":"VERB","LEMMA":{"IN":speak_lemmas}},
    {"POS":"PROPN","OP":"+"},
    {"ORTH":"'"}
]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x: x[1])
print(len(matches))

for match in matches[:10]:
    print(match, doc[match[1]:match[2]])
    

0


# Custom Components

In [69]:
import spacy 
from spacy.language import Language


In [70]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Britain is a place. Mary is a doctor.")



In [71]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Britain GPE
Mary PERSON


In [72]:
from spacy.language import Language


In [73]:
@Language.component("remove_gpe")
def remove_gpe(doc):
    original_ents = list(doc.ents)
    for ent in doc.ents:
        if ent.label_ == "GPE":
            original_ents.remove(ent)
    doc.ents = original_ents
    return (doc)
    
    

In [74]:
nlp.add_pipe("remove_gpe")

<function __main__.remove_gpe(doc)>

In [75]:
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc',
    'pos_acc',
    'tag_micro_p',
    'tag_micro_r',
    'tag_micro_f'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'remove_gpe': {'assigns': [],
   'requires': [],
   

In [76]:
doc = nlp("Britain is a place. Mary is a doctor.")
for ent in doc.ents:
    print(ent.text, ent.label_)

Mary PERSON


In [80]:
from pathlib import Path

# সব parent ফোল্ডার সহ তৈরি করবে
Path("data/new_en_core_web_sm").mkdir(parents=True, exist_ok=True)
nlp.to_disk("data/new_en_core_web_sm")

# RegEx (basics)

In [84]:
# Import the requisite Library
import spacy

# Sample text
text = "This is a sample number (555) 555-5555."

# Build a blank English model
nlp = spacy.blank("en")

# Add the EntityRuler pipeline
ruler = nlp.add_pipe("entity_ruler")

# Patterns (✅ Correct REGEX structure)
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [
            {"TEXT": {"REGEX": r"\(?\d{3}\)?[\s-]?\d{3}-\d{4}"}}
        ]
    }
]

# Add patterns to the ruler
ruler.add_patterns(patterns)

# Process the text
doc = nlp(text)

# Print the matches
for ent in doc.ents:
    print(ent.text, ent.label_)


In [86]:
import re

In [90]:
text = """Paul Newman was an American actor, but
Paul Hollywood is a British TV Host. The name
Paul is quit common.
""" 

In [91]:
pattern = r"Paul [A-Z]\w+"


In [95]:
matches = re.finditer(pattern, text)
for match in matches:
    print(match)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [98]:
import spacy
from spacy.tokens import Span

In [102]:
import spacy
import re

# Text
text = "Paul Walker is a great actor. Paul Newman was a racer."

# Create a blank NLP pipeline
nlp = spacy.blank("en")
doc = nlp(text)

# Save original entities (though spaCy.blank("en") won't have any by default)
original_ents = list(doc.ents)

# Define the regex pattern
pattern = r"Paul [A-Z]\w+"

# Find matches using regex and convert to spans
mwt_ents = []
for match in re.finditer(pattern, doc.text):
    start, end = match.span()
    span = doc.char_span(start, end)
    if span:  # span could be None
        print(span.text)
        mwt_ents.append(span)


Paul Walker
Paul Newman
