In [3]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [5]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_md')

doc = nlp('1823 Villa Linda Ave')

displacy.render(doc, style='dep')

In [7]:
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_md')

doc = nlp('Bill Gates is the CEO of Microsft.')

displacy.render(doc, style='dep')

In [11]:
import spacy
nlp = spacy.load("en_core_web_md")
doc = nlp("1823 Villa Linda Ave, TX")

In [23]:
doc = nlp("1823 Villa Linda Ave, TX")

In [24]:
doc.ents

(1823, Villa, Linda Ave, TX)

In [25]:
doc[1].ent_iob

3

In [26]:
nlp.vocab

<spacy.vocab.Vocab at 0x7f57c2a38790>

In [27]:
print([token.text for token in doc])

['1823', 'Villa', 'Linda', 'Ave', ',', 'TX']


In [21]:

doc1 = nlp("I own a ginger cat.")
print ([token.text for token in doc1])

['I', 'own', 'a', 'ginger', 'cat', '.']


In [22]:

doc2 = nlp("It's been a crazy week!!!")
print ([token.text for token in doc2])

['It', "'s", 'been', 'a', 'crazy', 'week', '!', '!', '!']


In [28]:
# Customizing the Tokenizer and Sentence Segmentation

In [29]:
# adding the special case
import spacy
from spacy.symbols import ORTH
nlp = spacy.load("en_core_web_md")
doc = nlp("lemme that")
print([w.text for w in doc])

special_case = [{ORTH: "lem"}, {"ORTH": "me"}]
nlp.tokenizer.add_special_case("lemme", special_case)
print([w.text for w in nlp("lemme that")])

['lemme', 'that']
['lem', 'me', 'that']


In [33]:
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")  # Make sure to load the correct model

# Your input text
text = "1823 Villa Linda Ave, TX"

# Tokenize the text
doc = nlp(text)

# Custom post-processing to replace "TX" with "Texas"
processed_tokens = [token.text if token.text != "TX" else "Texas" for token in doc]

print(processed_tokens)

['1823', 'Villa', 'Linda', 'Ave', ',', 'Texas']


In [39]:
# debugging
text = "1823 Villa Linda Ave, TX Can you send me package here or five hundred University drive"
doc = nlp(text)
tok_exp = nlp.tokenizer.explain(text)
for t in tok_exp:
  print(t[1], "\t", t[0])

1823 	 TOKEN
Villa 	 TOKEN
Linda 	 TOKEN
Ave 	 TOKEN
, 	 SUFFIX
TX 	 TOKEN
Can 	 TOKEN
you 	 TOKEN
send 	 TOKEN
me 	 TOKEN
package 	 TOKEN
here 	 TOKEN
or 	 TOKEN
five 	 TOKEN
hundred 	 TOKEN
University 	 TOKEN
drive 	 TOKEN


In [40]:
# sentence segmentation
doc = nlp(text)

for sent in doc.sents:
    print(sent.text)

1823 Villa Linda Ave, TX Can you send me package here or five hundred University drive


In [44]:
# Lemmatization

doc = nlp("I went for working and worked for 3 years.")
for token in doc:
    print(token.text, token.lemma_)

I I
went go
for for
working work
and and
worked work
for for
3 3
years year
. .


In [43]:
text = "1823 Villa Linda Ave, TX"
doc = nlp(text)
for token in doc:
    print(token.text, token.lemma_)

1823 1823
Villa Villa
Linda Linda
Ave Ave
, ,
TX TX


In [45]:
nlp.get_pipe("attribute_ruler").add([[{"TEXT": "Angeltown"}]],
{"LEMMA": "Los Angeles"})
doc = nlp("I am flying to Angeltown")
for token in doc:
  print(token.text, token.lemma_)

I I
am be
flying fly
to to
Angeltown Los Angeles


In [46]:
# define mappings
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

# Define your mappings here
phrase_to_alias = {
    "Lone Star": "Texas",
    "Sunshine": "California",
    "Concrete Jungle": "New York",
    "Sin City": "Los Angeles",  # Add new phrases and their aliases as needed
}

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)

# Register a custom attribute; Span._.alias for example
Span.set_extension("alias", default=None, force=True)


In [47]:
# add phraser to the matcher
# Convert each phrase to a Doc object and add it to the matcher
for phrase in phrase_to_alias.keys():
    matcher.add(phrase, [nlp.make_doc(phrase)])


In [48]:
#process text and apply custom logic
# Process a sample text
doc = nlp("I'm planning a trip to Sin City and the Concrete Jungle.")

# Find matches in the doc
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]  # The matched span
    # Use the match_id to get the string name of the matched phrase
    phrase = nlp.vocab.strings[match_id]
    # Set the custom attribute based on the mapping
    span._.alias = phrase_to_alias[phrase]

# Example output
for span in doc.ents:
    print(f"Original: {span.text}, Alias: {span._.alias if span._.alias else 'No Alias'}")


Original: Sin City, Alias: Los Angeles


In [49]:
nlp.get_pipe("attribute_ruler").add([[{"TEXT": "Angeltown"}]],
{"LEMMA": "Los Angeles"})
doc = nlp("I am flying to Angeltown")
for token in doc:
  print(token.text, token.lemma_)

I I
am be
flying fly
to to
Angeltown Los Angeles


In [50]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")

# Define custom extension to store the alias
Span.set_extension("alias", default=None, force=True)

# Initialize PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

# Updated dictionary mapping phrases to their correct 'aliases'
phrase_to_alias = {
    "Lone Star": "Texas",
    "Sunshine State": "California",
    "Concrete Jungle": "New York",
    "Sin City": "Las Vegas"  # Corrected to Las Vegas
}

# Function to add phrase patterns to the matcher
def add_phrase_patterns(matcher, phrase_dict):
    for phrase, alias in phrase_dict.items():
        pattern = nlp.make_doc(phrase)  # Create a doc for each phrase to create a pattern
        matcher.add(alias, [pattern])   # Add pattern to the matcher

add_phrase_patterns(matcher, phrase_to_alias)

# Processing function to apply aliases to matched phrases
def process_doc(text):
    doc = nlp(text)
    matches = matcher(doc)
    spans = []  # To store the spans with aliases
    for match_id, start, end in matches:
        span = Span(doc, start, end, label=match_id)
        alias = nlp.vocab.strings[match_id]  # Get alias from match ID
        span._.alias = alias
        spans.append(span)
    doc.spans["aliases"] = spans  # Store matched spans in the doc
    return doc

# Example usage
doc = process_doc("I am visiting the Concrete Jungle and Sin City soon.")
for span in doc.spans["aliases"]:  # Access the matched spans with aliases
    print(f"Original: {span.text}, Alias: {span._.alias}")


Original: Concrete Jungle, Alias: New York
Original: Sin City, Alias: Las Vegas


In [51]:
# spacy Container Objects
# Doc, Token, Span

In [54]:
  doc = nlp("I like cats.")
  print(len(doc))
  print(doc.text)
  for token in doc:
    print(token.text)

4
I like cats.
I
like
cats
.


In [56]:
doc = nlp("This is a sentence. This is the second sentence.")
sentences = list(doc.sents) #The doc.sents method returns an iterator to the list of sentences. Each sentence is a Span object
print(sentences)

[This is a sentence., This is the second sentence.]


In [57]:
doc = nlp("I flied to New York with Ashley")
print(doc.ents) # The doc.ents method gives named entities of the text. The result is a list of Span objects.

(New York, Ashley)


In [58]:
doc = nlp("Sweet brown fox jumped over the fence.")
print(list(doc.noun_chunks)) #doc.noun_chunks. It yields the noun phrases found in the text

[Sweet brown fox, the fence]


In [59]:
doc = nlp("Hi")
json_doc = doc.to_json() # serialize
print(json_doc)

{'text': 'Hi', 'ents': [], 'sents': [{'start': 0, 'end': 2}], 'tokens': [{'id': 0, 'start': 0, 'end': 2, 'tag': 'UH', 'pos': 'INTJ', 'morph': '', 'lemma': 'hi', 'dep': 'ROOT', 'head': 0}]}


In [61]:
# Token Object
doc = nlp("Hello Madam!")
print(doc[0])
print(doc[0].text)
print(doc[0].text_with_ws)
print(doc[2].text_with_ws)
token = doc[2]
print(token.i)

Hello
Hello
Hello 
!
2


In [63]:
#token.idx provides the token's character offset (the character position) in doc:
print(doc[0].idx)
print(doc[1].idx)

0
6


In [64]:
# getting the sentence tokens belong to
token = doc[1]
print(token.sent)

Hello Madam!


In [65]:
# token.is_sent_start is another useful property; it returns a Boolean indicating whether the token starts a sentence
doc = nlp("He entered the room. Then he nodded.")
print(doc[0].is_sent_start)
print(doc[5].is_sent_start)
print(doc[6].is_sent_start)

True
True
False


In [66]:
doc = nlp("I went there.")
print(doc[1].lemma_) # how to calculate the token lemma

go


In [68]:
doc = nlp("President Trump visited Mexico City.")
print(doc.ents)
#what sort of entity the token is, use token.ent_type_:
print(doc[1].ent_type_)
print(doc[3].ent_type_)
print(doc[4].ent_type_)
print(doc[0].ent_type_)

(Trump, Mexico City)
PERSON
GPE
GPE



In [70]:
#dir(token) or dir(doc). Calling dir() will print all the features and methods available on the object.

dir(doc[0])

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [71]:
# Span object

In [72]:
doc = nlp("I know that you have been to USA.")
print(doc[2:4])

that you


In [73]:
doc = nlp("You love Atlanta since you're 20.")
print(doc.char_span(4, 16))

love Atlanta


In [74]:
doc = nlp("You went there after you saw me.")
span = doc[2:4]
for token in span:
    print(token)

there
after


In [75]:
doc = nlp("You went there after you saw me.")
span = doc[2:6]
print(span.doc)
print(span.sent)

You went there after you saw me.
You went there after you saw me.


In [76]:
doc = nlp("You went there after you saw me.")
span = doc[2:6] # span.start is the index of the first token of the Span
# and span.start_char is the start offset of the Span at the character level.
print(span.start)
print(span.end)
print(span.start_char)
print(span.end_char)

2
6
9
28


In [77]:
doc = nlp("You went there after you saw me")
span = doc[2:6]
print(type(span))
small_doc = span.as_doc()
print(type(small_doc))

<class 'spacy.tokens.span.Span'>
<class 'spacy.tokens.doc.Doc'>


In [79]:
# more features of spaCy
doc = nlp("HELLO, Hello, hello, hEllO")
print(doc[0].is_upper)
print(doc[0].is_lower)
print(doc[1].is_upper)
print(doc[1].is_lower)

True
False
False
False


In [80]:
doc = nlp("( [ He said yes. ] )")
print(doc[0])
print(doc[0].is_left_punct)

print(doc[1])

print(doc[1].is_left_punct)

print(doc[-1])

print(doc[-1].is_right_punct)
print(doc[-2])
print(doc[-2].is_right_punct)

(
True
[
True
)
True
]
True


In [81]:
doc = nlp("( You said [1] and {2} is not applicable.)")
print(doc[0].is_bracket, doc[-1].is_bracket)

print(doc[3].is_bracket, doc[5].is_bracket)

print(doc[7].is_bracket, doc[9].is_bracket)

True True
True True
True True


In [82]:
doc = nlp("( You said '1\" is not applicable.)")
print(doc[3])

print(doc[3].is_quote)
print(doc[5])
print(doc[5].is_quote)

'
True
"
True


In [83]:
doc = nlp("I emailed you at least hundred times")
print(doc[-2])
print(doc[-2].like_num)

doc = nlp("My email is duygu@packt.com and you can visit me under https://duygua.github.io any time you want.")
print(doc[3])
print(doc[3].like_email)
print(doc[10])
print(doc[10].like_url)

hundred
True
duygu@packt.com
True
https://duygua.github.io
True


In [84]:
doc11 = nlp("Girl called Kathy has a nickname Cat123.")
for token in doc11:
    print(token.text, token.shape_)

Girl Xxxx
called xxxx
Kathy Xxxxx
has xxx
a x
nickname xxxx
Cat123 Xxxddd
. .


In [85]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Define a simple pattern for an address (e.g., "123 Main St.")
# This is a basic example and might not cover all address formats.
pattern = [{"SHAPE": "ddd"}, {"IS_ALPHA": True, "LENGTH": {">=": 2}}, {"LOWER": "st.", "OP": "?"}]
matcher.add("ADDRESS", [pattern])

doc = nlp("She lives at 123 Main St. in New York.")
matches = matcher(doc)

for match_id, start, end in matches:
    span = doc[start:end]
    print("Found address:", span.text)


Found address: 123 Main
Found address: 123 Main St.


In [92]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Initialize the Matcher
matcher = Matcher(nlp.vocab)

# Define patterns for street names and street numbers
patterns = {
    "STREET_NAME": [
        [{"TEXT": {"REGEX": "^(?i)(street|st|road|rd|avenue|ave|lane|ln|boulevard|blvd)$"}}],
        [{"TEXT": {"REGEX": "^(?i)(street|st|road|rd|avenue|ave|lane|ln|boulevard|blvd)$"}}, {"IS_PUNCT": True}],
        [{"LOWER": {"IN": ["main", "elm", "pine"]}}, {"TEXT": {"REGEX": "^(?i)(street|st|road|rd|avenue|ave|lane|ln|boulevard|blvd)$"}}]
    ],
    "STREET_NUMBER": [
        [{"LIKE_NUM": True}]
    ]
}

# Function to add patterns to the matcher
def add_patterns(matcher, patterns):
    for label, pattern in patterns.items():
        matcher.add(label, pattern)

# Add initial patterns to the matcher
add_patterns(matcher, patterns)

# Process a text and print matched entities
def process_text(text):
    doc = nlp(text)
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = Span(doc, start, end, label=nlp.vocab.strings[match_id])
        print(f"{span.label_}: {span.text}")

text = "I moved to 1234 Elm St last year. My office is located at 5678 Pine Street. There's a bakery on Main Road near the intersection with Avenue. At the corner of avenue look for WallMart"

process_text(text)


STREET_NUMBER: 1234
STREET_NAME: Elm St
STREET_NAME: St
STREET_NUMBER: 5678
STREET_NAME: Pine Street
STREET_NAME: Street
STREET_NAME: Street.
STREET_NAME: Main Road
STREET_NAME: Road
STREET_NAME: Avenue
STREET_NAME: Avenue.
STREET_NAME: avenue
