In [143]:
import spacy
nlp = spacy.load("en_core_web_sm",disable=["ner"])

In [144]:
doc=nlp("This is a sentance.")
[i for i in doc]

[This, is, a, sentance, .]

In [145]:
from spacy.lang.en import English

In [146]:
nlp=English()

In [147]:
doc=nlp("Hello World!")

In [148]:
for i in doc:
    print(i.text)

Hello
World
!


In [149]:
token=doc[1]

In [150]:
print(token.text)

World


In [151]:
span=doc[1:3] #one or more tokens

In [152]:
span.text

'World!'

In [153]:
#Lexical Attributes

In [154]:
doc=nlp("It costs $5.")

In [155]:
print("Index:    ",[token.i for token in doc])
print("Text:     ",[token.text for token in doc])
print("Is_Alpha:     ",[token.is_alpha for token in doc])
print("Is-Punct:     ",[token.is_punct for token in doc])
print("like_num:     ",[token.like_num for token in doc])



Index:     [0, 1, 2, 3, 4]
Text:      ['It', 'costs', '$', '5', '.']
Is_Alpha:      [True, True, False, False, False]
Is-Punct:      [False, False, False, False, True]
like_num:      [False, False, False, True, False]


In [156]:
#Excercise

In [157]:
#Statistical Models

In [158]:
nlp=spacy.load("en_core_web_sm")

In [159]:
#part_of_speechTagging

In [160]:
doc=nlp("She ate the pizza.")

In [161]:
for token in doc:
    print(token.text,token.pos_,token.dep_,token.head.text)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate
. PUNCT punct ate


In [162]:
from spacy import displacy

In [163]:
displacy.render(doc)

In [164]:
doc=nlp("Apple is lookong at buying U.K. startup for $1 billion.")

In [165]:
#predicting named entities
for ent in doc.ents:
    print(ent.text,ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [166]:
spacy.explain("GPE"), spacy.explain("NNP"),spacy.explain("dobj")

('Countries, cities, states', 'noun, proper singular', 'direct object')

In [167]:
#Rule based Matcher and we do not use regular expression(not only strings but also docs and objects)

In [168]:
import spacy
from spacy.matcher import Matcher

nlp=spacy.load("en_core_web_sm")

matcher=Matcher(nlp.vocab)

pattern=[{"TEXT":"iPhone"},{"TEXT":"X"}]

matcher.add("IPHONE_PATTERN",None,pattern)

doc=nlp("Upcoming iPhone X relsease date leaked")

matches=matcher(doc)



In [169]:
for match_id, start, end in matches:
    matched_span=doc[start:end]
    print(matched_span.text)

iPhone X


In [170]:
#Matching Lexical attributes
pattern=[
    {"IS_DIGIT":True},
    {"LOWER":"fifa"},
    {"LOWER":"world"},
    {"LOWER":"cup"},
    {"IS_PUNCT":True}
]

In [171]:
doc=nlp("2018 FIFA World Cup: France won")

In [172]:
mather=Matcher(nlp.vocab)
mather.add("FIFA_PATTERN",None, pattern)
mathes=mather(doc)
for match_id,start,end in mathes:
    matched_span=doc[start:end]
    print(matched_span.text)

2018 FIFA World Cup:


In [173]:
pattern=[{"LEMMA":"love","POS":"VERB"},{"POS":"NOUN"}]

In [174]:
doc=nlp("I loved dogs but now I love cats more.")
mather=Matcher(nlp.vocab)
mather.add("LOVE_PATTERN",None, pattern)
mathes=mather(doc)
for match_id,start,end in mathes:
    matched_span=doc[start:end]
    print(matched_span.text)

loved dogs
love cats


In [175]:
#using operators and quantifiers(+,*,?,!)

pattern=[{"LEMMA":"buy"},
        {"POS":"DET","OP":"?"},{"POS":"NOUN"}]
doc=nlp("I bought a samrtphone. Now I'm buying apps.")
mather=Matcher(nlp.vocab)
mather.add("LOVE_PATTERN",None, pattern)
mathes=mather(doc)
for match_id,start,end in mathes:
    matched_span=doc[start:end]
    print(matched_span.text)

bought a samrtphone
buying apps


In [176]:
#Large_Scale_Data_Analyisis with spacy

In [177]:
#shared_vocab and string store
#Hash_Values
coffee_hash=nlp.vocab.strings["coffee"]
coffee_hash

3197928453018144401

In [178]:
coffee_string=nlp.vocab.strings[3197928453018144401]

KeyError: "[E018] Can't retrieve string for hash '3197928453018144401'. This usually refers to an issue with the `Vocab` or `StringStore`."

In [187]:
doc=nlp("I love coffee")
print("Hash Value:",nlp.vocab.strings["coffee"])

Hash Value: 3197928453018144401


In [188]:
print("String Value:",nlp.vocab.strings[3197928453018144401])

String Value: coffee


In [189]:
doc=nlp("I love coffee")
print("Hash Value:",doc.vocab.strings["coffee"])

Hash Value: 3197928453018144401


In [190]:
doc=nlp("I love coffee")
lexeme=nlp.vocab["coffee"]

In [191]:
print(lexeme.text,lexeme.orth,lexeme.is_alpha)

coffee 3197928453018144401 True


In [192]:
#DOC/SPAN/Token

In [193]:
from spacy.tokens import Doc, Span

In [194]:
#word vectors not included in en_core_web_sm but only in medium and large models

In [195]:
import en_core_web_md

In [196]:
import spacy

In [197]:
nlp=en_core_web_md.load()

In [198]:
doc1=nlp("I like fast food")
doc2=nlp("I like pizza")
print(doc1.similarity(doc2))

0.8627204117787385


In [199]:
doc=nlp("I like pizza and pasta")
token1=doc[2]
token2=doc[4]
print(token1.similarity(token2))

0.73695457


In [200]:
doc=nlp("I like pizza")
token=nlp("soap")[0]

In [201]:
print(doc.similarity(token))

0.32531983166759537


In [202]:
span=nlp("I like pizza and pasta")[2:5]
doc=nlp("McDoalds sells burgers")

print(span.similarity(doc))

0.6423544447974996


In [203]:
doc=nlp("I have a banana")

In [204]:
print(doc[3].vector)

[ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.5564e-01  6.9793e-02 -2

In [205]:
doc1=nlp("I like cats")
doc2=nlp("I hate cats")
doc1.similarity(doc2)

0.9501446702124066

In [206]:
#combining statistical models with rule based sysytems

In [207]:
matcher=Matcher(nlp.vocab)
matcher.add("DOG",None,[{"LOWER":"golden"},{"LOWER":"retriever"}])
doc=nlp("I have a Golden Retriever")

In [208]:
for match_id, start, end in matcher(doc):
    span=doc[start:end]
    print("MAtched Span:",span.text)
    print("Root token:",span.root.text)
    print("Root head toekn:",span.root.head.text)
    print("Previous Token:",doc[start-1].text,doc[start-1].pos_)
    

MAtched Span: Golden Retriever
Root token: Retriever
Root head toekn: have
Previous Token: a DET


In [209]:
#PhraseMatcher

In [210]:
from spacy.matcher import PhraseMatcher
matcher=PhraseMatcher(nlp.vocab)

pattern=nlp("Golden Retriever")
matcher.add("DOG",None,pattern)
doc=nlp("I have a Golden Retriever")

for match_id, start, end in matcher(doc):
    span=doc[start:end]
    print("Matched Span:",span.text)

Matched Span: Golden Retriever


In [211]:
#processing pipelines

In [212]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [213]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x1965a946f60>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1962e5006a8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1962e500708>)]

In [214]:
#custome pipeline components

In [215]:
nlp=spacy.load("en_core_web_sm")
def custome_component(doc):
    print("Doc Length:",len(doc))
    return doc

nlp.add_pipe(custome_component,first=True)

print("Pipeline:",nlp.pipe_names)

Pipeline: ['custome_component', 'tagger', 'parser', 'ner']


In [216]:
doc= nlp("Hello World:")

Doc Length: 3


In [217]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)


def animal_component(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label "ANIMAL"
    spans = [Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
    # Overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc



nlp.add_pipe(animal_component, after="ner")
print(nlp.pipe_names)


doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

animal_patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]
['tagger', 'parser', 'ner', 'animal_component']
[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


In [218]:
#extension attributes

In [219]:
from spacy.tokens import Doc, Token, Span

In [220]:
Doc.set_extension("title",default=None)
Token.set_extension("is_color",default=False)
Span.set_extension("has_color",default=False)


ValueError: [E090] Extension 'title' already exists on Doc. To overwrite the existing extension, set `force=True` on `Doc.set_extension`.

In [221]:
#1> Attribute extensions
Token.set_extension("is_color",default=False,force=True)

In [222]:
doc=nlp("The sky is blue.")

doc[3]._.is_color=True

In [223]:
#2> Property Extension
def get_is_color (token):
    colors=["red","yello","blue"]
    return token.text in colors

Token.set_extension("is_color",getter=get_is_color,force=True)

doc=nlp("The sky is blue.")

print(doc[3]._.is_color,"-",doc[3].text)

True - blue


In [224]:
#2> Property Extension
def get_is_color (token):
    colors=["red","yello","blue"]
    return token.text in colors

Token.set_extension("is_color",getter=get_is_color,force=True)

doc=nlp("The sky is pink.")

print(doc[3]._.is_color,"-",doc[3].text)

False - pink


In [225]:
#2> Property Extension(span)
def get_has_color (span):
    colors=["red","yello","blue"]
    return any(token.text in colors for token in span)

Span.set_extension("has_color",getter=get_has_color,force=True)

doc=nlp("The sky is blue.")

print(doc[1:4]._.has_color,"-",doc[1:4].text)
print(doc[0:2]._.has_color,"-",doc[0:2].text)

True - sky is blue
False - The sky


In [226]:
#3> Method Extension(span)
def has_token (doc,token_text):
    in_doc=token_text in [token.text for token in doc]    
    return in_doc

Doc.set_extension("has_token",method=has_token,force=True)

doc=nlp("The sky is blue.")

print(doc._.has_token("blue"),"-blue")
print(doc._.has_token("cloud"),"-","-cloud")

True -blue
False - -cloud


In [227]:
from spacy.lang.en import English
from spacy.tokens import Token

nlp = English()


def get_reversed(token):
    return token.text[::-1]


Token.set_extension("reversed", getter=get_reversed,force=True)

# Process the text and print the reversed attribute for each token
doc = nlp("All generalizations are false, including this one.")
for token in doc:
    print("reversed:", token._.reversed)

reversed: llA
reversed: snoitazilareneg
reversed: era
reversed: eslaf
reversed: ,
reversed: gnidulcni
reversed: siht
reversed: eno
reversed: .


In [242]:
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()

# Define the getter function
def get_has_number(doc):
    # Return if any of the tokens in the doc return True for token.like_num
    return any(token.like_num for token in doc)


# Register the Doc property extension "has_number" with the getter get_has_number
Doc.set_extension("has_number", getter=get_has_number,force=True)

# Process the text and check the custom has_number attribute
doc = nlp("The museum closed for five years in 2012.")
print("has_number:", doc._.has_number)

has_number: True


In [243]:
import spacy
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")


def get_wikipedia_url(span):
    if span.label_ in ("PERSON", "ORG", "GPE", "LOCATION"):
        entity_text = span.text.replace(" ", "_")
        return "https://en.wikipedia.org/w/index.php?search=" + entity_text


Span.set_extension("wikipedia_url", getter=get_wikipedia_url,force=True)

doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)
for ent in doc.ents:
    print(ent.text, ent._.wikipedia_url)

fifty years None
first None
David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie


In [244]:
doc.ents

(fifty years, first, David Bowie)

In [245]:
# import json
# from spacy.lang.en import English
# from spacy.tokens import Span
# from spacy.matcher import PhraseMatcher

# with open("exercises/en/countries.json", encoding="utf8") as f:
#     COUNTRIES = json.loads(f.read())

# with open("exercises/en/capitals.json", encoding="utf8") as f:
#     CAPITALS = json.loads(f.read())

# nlp = English()
# matcher = PhraseMatcher(nlp.vocab)
# matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


# def countries_component(doc):
#     # Create an entity Span with the label "GPE" for all matches
#     matches = matcher(doc)
#     doc.ents = [Span(doc, start, end, label="GPE") for match_id, start, end in matches]
#     return doc


# # Add the component to the pipeline
# nlp.add_pipe(countries_component)
# print(nlp.pipe_names)

# # Getter that looks up the span text in the dictionary of country capitals
# get_capital = lambda span: CAPITALS.get(span.text)

# # Register the Span extension attribute "capital" with the getter get_capital
# Span.set_extension("capital", getter=get_capital)

# # Process the text and print the entity text, label and capital attributes
# doc = nlp("Czech Republic may help Slovakia protect its airspace")
# print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])

In [246]:
#Scaling and performance

In [247]:
#docs=list(nlp.pipe(lots_of_text))

In [248]:
data=[
    ("This is a text",{"id":1,"page_number":15}),
    ("And another text",{"id":2,"page_number":16})
]

In [249]:
for doc, context in nlp.pipe(data,as_tuples=True):
    print(doc.text,context["page_number"])

This is a text 15
And another text 16


In [251]:
Doc.set_extension("id",default=None,force=True)
Doc.set_extension("page_number",default=None,force=True)
data=[
    ("This is a text",{"id":1,"page_number":15}),
    ("And another text",{"id":2,"page_number":16})
]

In [252]:
for doc, context in nlp.pipe(data,as_tuples=True):
    doc._.id=context["id"]
    doc._.page_number=context["page_number"]

In [253]:
#using only the tokenizer

In [254]:
doc=nlp.make_doc("Hello World !")

In [255]:
#disable pipeline componants

In [256]:
text="Hello World !"
with nlp.disable_pipes("tagger","parser"):
    doc=nlp(text)
    print(doc.ents)
    
#outside this with block all the pipeline componants autometically gets abled

()
