
Date: 2023/12/2

Reference: https://course.spacy.io/en/

## Chapter 1

### 1

In [1]:
import spacy

# Start from a blank English pipeline (tokenizer only, no trained components).
nlp = spacy.blank("en")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Calling the pipeline on a string returns a Doc; the bare last expression
# displays the Doc's text.
doc = nlp("Hello world!")
doc.text

'Hello world!'

In [3]:
# Iterating over a Doc yields its tokens in order.
for tok in doc:
    print(tok.text)

Hello
world
!


In [4]:
# Indexing a Doc with an integer returns the token at that position.
second_token = doc[1]
print(second_token.text)

world


In [5]:
# Slicing a Doc returns a span of tokens; the end index is exclusive,
# so [1:3] covers tokens 1 and 2.
world_span = doc[1:3]
print(world_span.text)

world!


In [6]:
# New example text containing a currency symbol, a number and punctuation.
doc = nlp("It costs $5.")

In [7]:
# Print several per-token attributes, one row per attribute. Each row is
# labelled with the attribute it actually shows (previously every row was
# labelled 'Index:').
print('Index:', [token.i for token in doc])
print('Text:', [token.text for token in doc])
print('is_alpha:', [token.is_alpha for token in doc])
print('is_punct:', [token.is_punct for token in doc])
print('like_num:', [token.like_num for token in doc])

Index: [0, 1, 2, 3, 4]
Text: ['It', 'costs', '$', '5', '.']
is_alpha: [True, True, False, False, False]
is_punct: [False, False, False, False, True]
like_num: [False, False, False, True, False]


### 5

In [8]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 2.6 MB/s eta 0:00:00

[notice] A new release of pip is available: 23.1.2 -> 23.3.1
[notice] To update, run: pip3 install --upgrade pip
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [9]:
import spacy

# Replace the blank pipeline with the trained small English pipeline.
nlp = spacy.load("en_core_web_sm")

In [10]:
# Process a short sentence with the trained pipeline and display its text.
doc = nlp("She ate the pizza")
doc.text

'She ate the pizza'

In [11]:
for token in doc:
    print(token.text, token.pos_, token.dep_, token.text)

She PRON nsubj She
ate VERB ROOT ate
the DET det the
pizza NOUN dobj pizza


In [12]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# doc.ents holds the entity spans found in the text; print each one's
# text together with its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [13]:
# Look up the description of the entity label "GPE".
spacy.explain("GPE")

'Countries, cities, states'

In [14]:
# Look up the description of the tag "NNP".
spacy.explain("NNP")

'noun, proper singular'

In [15]:
# Look up the description of the dependency label "dobj".
spacy.explain("dobj")

'direct object'

## Chapter 2

In [16]:
import json
import spacy
from spacy.matcher import PhraseMatcher

# Country names that serve as the match patterns.
with open("data/countries.json", encoding="utf8") as countries_file:
    COUNTRIES = json.load(countries_file)

nlp = spacy.blank("en")
doc1 = nlp("Czech Republic may help Slovakia protect its airspace")
doc2 = nlp("CZECH republic may help Slovakia protect its airspace")

# matcher1 uses the default matching attribute; matcher2 matches on the
# lowercase form (attr="LOWER"), so it also finds "CZECH republic".
matcher1 = PhraseMatcher(nlp.vocab)
matcher2 = PhraseMatcher(nlp.vocab, attr="LOWER")

# Create the pattern Docs once and register them with both matchers.
# nlp.pipe() is the faster version of [nlp(country) for country in COUNTRIES].
patterns = list(nlp.pipe(COUNTRIES))
for phrase_matcher in (matcher1, matcher2):
    phrase_matcher.add("COUNTRY", patterns)

# Run each matcher/document pairing and print the matched spans.
for phrase_matcher, text_doc in ((matcher1, doc1), (matcher1, doc2), (matcher2, doc2)):
    matches = phrase_matcher(text_doc)
    print([text_doc[start:end] for _, start, end in matches])

[Czech Republic, Slovakia]
[Slovakia]
[CZECH republic, Slovakia]


## Chapter 3

In [17]:
# spacy.blank("en") is an empty pipeline that contains only the Tokenizer; countries_component is added to it.
# This code is quite useful in practice.

import json
import spacy
from spacy.language import Language
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

# Country names (used as match patterns) and the data used to look up capitals.
with open("data/countries.json", encoding="utf8") as countries_file:
    COUNTRIES = json.load(countries_file)

with open("data/capitals.json", encoding="utf8") as capitals_file:
    CAPITALS = json.load(capitals_file)

# Blank English pipeline plus a phrase matcher holding one pattern per country.
nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
country_patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", country_patterns)


@Language.component("countries_component")
def countries_component_function(doc):
    """Label every country-name match in ``doc`` as a "GPE" entity.

    Runs the module-level ``matcher`` over ``doc``, wraps each match in a
    ``Span`` labelled "GPE" and stores the result in ``doc.ents``.

    Matches can overlap when one pattern is contained in another, and
    assigning overlapping spans to ``doc.ents`` raises a ``ValueError``.
    The spans are therefore passed through ``spacy.util.filter_spans`` first,
    which keeps the longest span of each overlapping group.

    Args:
        doc: The ``Doc`` being processed by the pipeline.

    Returns:
        The same ``doc``, with ``doc.ents`` overwritten by the country spans.
    """
    country_spans = [
        Span(doc, start, end, label="GPE") for _match_id, start, end in matcher(doc)
    ]
    # Drop overlapping/duplicate spans so the doc.ents assignment cannot fail.
    doc.ents = spacy.util.filter_spans(country_spans)
    return doc


# Add the component to the pipeline
nlp.add_pipe("countries_component")
print(nlp.pipe_names)


def get_capital(span):
    """Getter for ``Span._.capital``: look up the span text in CAPITALS.

    Args:
        span: The ``Span`` whose text is used as the lookup key.

    Returns:
        ``CAPITALS.get(span.text)`` - the stored value, or ``None`` when the
        span text is not a key of ``CAPITALS``.
    """
    return CAPITALS.get(span.text)


# Register the Span extension attribute "capital" with the getter get_capital.
# force=True overwrites an existing registration, so re-running this cell in
# the same kernel does not raise "extension already exists".
Span.set_extension("capital", getter=get_capital, force=True)

# Process the text and print the entity text, label and capital attributes
doc = nlp("Czech Republic may help Slovakia protect its airspace")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])

['countries_component']
[('Czech Republic', 'GPE', 'Prague'), ('Slovakia', 'GPE', 'Bratislava')]
