In [13]:
import spacy
from spacy import displacy

# Spacy Overview

In [9]:
nlp_en = spacy.load('en_core_web_md')

In [11]:
nlp_de = spacy.load('de_core_news_lg')

In [58]:
#doc = nlp_de('Ich habe ein schönes Auto')
# doc = nlp_de('Die Firma Xing hat dich zum data insights Event eingeladen')
doc = nlp_de("Er arebitet im Bereich Softwareentwicklung")

In [59]:
displacy.render(doc, style='dep')

In [22]:
displacy.render(doc, style='ent')

## Tokenizer

In [29]:
doc = nlp_en("The pig ate all the food.")

In [30]:
print([token for token in doc])

[The, pig, ate, all, the, food, .]


In [36]:
doc = nlp_en("I'd rather not eat that!!!, it's rat poison.\n and I prefer mango juice")
print([token for token in doc])

[I, 'd, rather, not, eat, that, !, !, !, ,, it, 's, rat, poison, ., 
 , and, I, prefer, mango, juice]


In [38]:
# sentences
print([token for token in doc.sents])

[I'd rather not eat that!!!, , it's rat poison., 
 and I prefer mango juice]


## Lemmatization

In [39]:
nlp = spacy.load("en_core_web_md")
doc = nlp("I went there for working and worked for 3 years.")
for token in doc:
    print(token.text, token.lemma_)

I I
went go
there there
for for
working work
and and
worked work
for for
3 3
years year
. .


In [40]:
# adding custom lemmatization. useful for nicknames

nlp.get_pipe("attribute_ruler").add([[{"TEXT": "Angeltown"}]], {"LEMMA": "Los Angeles"})
doc = nlp("I am flying to Angeltown")
for token in doc:
    print(token.text, token.lemma_)

I I
am be
flying fly
to to
Angeltown Los Angeles


## Containers

In [43]:
nlp_de = spacy.load('de_core_news_lg')
doc = nlp_de("""
Hallo Herr Alfermann,
im Rahmen unserer Personalrecherche bin ich auf Ihr Profil gestoßen. Ich bin bei Robert Half für das Interne Recruiting in Hamburg und Berlin zuständig. Robert Half ist ein internationales Unternehmen und einer der ersten und größten Personaldienstleitungsunternehmen weltweit. Aufgrund unseres deutschlandweiten Expansionskurses sind wir auf der Suche nach Consultants und Resource Manager für Freelancer in Hamburg.
Wenn Sie offen für einen Wechsel sind und sich mit mir über einen Einstieg bei Robert Half austauschen möchten, freue ich mich über Ihren Lebenslauf.
Gerne können wir auch einen Termin für ein telefonische Gespräch vereinbaren.
Ich freue mich auf Ihr Feedback!
Viele Grüße
Ragna Paulsen
""")

In [46]:
# list(doc.sents) # sentences 
# list(doc.ents) # entities
# list(doc.noun_chunks) # noun phrases

## Core Operations

In [50]:
# token.is_alpha
# token.is_currency
# token.is_punct
# token.like_email
# token.like_url
# token.like_num
# for token in doc:
# 	if token.is_oov:
# 		print(token)

# Spacy Features

## Linguistic features

In [51]:
nlp = spacy.load('en_core_web_md')
doc = nlp("I saw flowers.")
token = doc[2]
token.text, token.tag_, spacy.explain(token.tag_)
('flowers', 'NNS', 'noun, plural')

('flowers', 'NNS', 'noun, plural')

### POS tagging

In [57]:
doc = nlp("Alicia and me went to the school by bus.")
for token in doc:
	print(f"token: {token.text}, pos: {token.pos_}, tag: {token.tag_}")
	print(spacy.explain(token.pos_))
	print(spacy.explain(token.tag_))

token: Alicia, pos: PROPN, tag: NNP
proper noun
noun, proper singular
token: and, pos: CCONJ, tag: CC
coordinating conjunction
conjunction, coordinating
token: me, pos: PRON, tag: PRP
pronoun
pronoun, personal
token: went, pos: VERB, tag: VBD
verb
verb, past tense
token: to, pos: ADP, tag: IN
adposition
conjunction, subordinating or preposition
token: the, pos: DET, tag: DT
determiner
determiner
token: school, pos: NOUN, tag: NN
noun
noun, singular or mass
token: by, pos: ADP, tag: IN
adposition
conjunction, subordinating or preposition
token: bus, pos: NOUN, tag: NN
noun
noun, singular or mass
token: ., pos: PUNCT, tag: .
punctuation
punctuation mark, sentence closer


### Dependency Parsing

In [61]:
spacy.explain("nsubj")

'nominal subject'

In [62]:
# cat is a direct object. The word that the verb applies on.
doc = nlp("I own a ginger cat.")
token = doc[4]
token.text, token.dep_, spacy.explain(token.dep_)


('cat', 'dobj', 'direct object')

In [65]:
doc = nlp("I counted white sheep")

for token in doc:
      print(token.text, token.tag_, token.dep_, token.head)

displacy.render(doc, style='dep')

I PRP nsubj counted
counted VBD ROOT counted
white JJ amod sheep
sheep NN dobj counted


### NER

In [68]:
spacy.explain("ORG")
doc = nlp("He worked for XING.")
token = doc[3]
token.ent_type_, spacy.explain(token.ent_type_)

('ORG', 'Companies, agencies, institutions, etc.')

In [69]:
doc = nlp("Albert Einstein was born in Ulm on 1879. He studied electronical engineering at ETH Zurich.")
doc.ents

for token in doc:
    print(token.text, token.ent_type_, spacy.explain(token.ent_type_))

Albert PERSON People, including fictional
Einstein PERSON People, including fictional
was  None
born  None
in  None
Ulm GPE Countries, cities, states
on  None
1879 DATE Absolute or relative dates or periods
.  None
He  None
studied  None
electronical  None
engineering  None
at  None
ETH ORG Companies, agencies, institutions, etc.
Zurich ORG Companies, agencies, institutions, etc.
.  None




In [None]:
# TODO: load recruiter messages and perform an entity counting like in the code below.
# check chapter 3 of the book Mastering Spacy.
# from collections import Counter
# labels = [ent.label_ for ent in doc.ents]
# Counter(labels)

# Rules Based Matching

## Token Based matching

In [72]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_md")
doc = nlp("Good morning, I want to reserve a ticket.")
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "good"}, {"LOWER": "morning"},{"IS_PUNCT": True}]

matcher.add("morningGreeting", [pattern])

matches = matcher(doc)

for match_id, start, end in matches:
     m_span = doc[start:end]  
     print(start, end, m_span.text)

0 3 Good morning,


In [84]:
# A more complex matching example

doc1 = nlp("Hello hello hello, how are you?")
doc2 = nlp("Hello, how are you?")
doc3 = nlp("How are you?")

# check both * and + to see the difference
pattern = [{"LOWER": {"IN": ["hello", "hi", "hallo"]}, "OP":"+"}, {"IS_PUNCT": True}]
# pattern = [{"LOWER": {"IN": ["hello", "hi", "hallo"]}, "OP":"*"}, {"IS_PUNCT": True}]

matcher = Matcher(nlp.vocab)
matcher.add("greetings",  [pattern])

for mid, start, end in matcher(doc1):
     print(start, end, doc1[start:end])

2 4 hello,
1 4 hello hello,
0 4 Hello hello hello,


In [78]:
# wildcards
doc = nlp("My name is Alice and his name was Elliot.")
pattern = [{"LOWER": "name"},{"LEMMA": "be"},{}]
matcher.add("pickName", [pattern])

for mid, start, end in matcher(doc):
     print(start, end, doc[start:end])

1 4 name is Alice
6 9 name was Elliot
9 10 .


In [90]:
doc1 = nlp_de("Ich bin bei der Firma X. Er war in Köln")

# {} wildcard token means any token
pattern = [{"POS": "PRON"},{"LEMMA": "sein"},{}]

matcher = Matcher(nlp_de.vocab)
matcher.add("pickName", [pattern])

for mid, start, end in matcher(doc1):
     print(start, end, doc1[start:end])

0 3 Ich bin bei
6 9 Er war in


In [92]:
# working wirh regex
doc1 = nlp("I travelled by bus.")
doc2 = nlp("She traveled by bike.")
pattern = [{"POS": "PRON"},{"TEXT": {"REGEX": "[Tt]ravell?ed"}}]

matcher = Matcher(nlp.vocab)
matcher.add("langVariatons", [pattern])
for mid, start, end in matcher(doc1):
     print(start, end, doc1[start:end])

for mid, start, end in matcher(doc2):
     print(start, end, doc2[start:end])

0 2 I travelled
0 2 She traveled


In [94]:
# using regex with POS tags

doc = nlp("I went to Italy; he has been there too. His mother also has told me she wants to visit Rome.")
pattern = [{"TAG": {"REGEX": "^V"}}]
matcher.add("verbs",  [pattern])

for mid, start, end in matcher(doc):
    print(start, end, doc[start:end])

1 2 went
6 7 has
7 8 been
14 15 has
15 16 told
18 19 wants
20 21 visit


In [95]:
# Phrase matcher
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_md")

matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
terms = ["Asset", "Investment", "Derivatives", "Demand",  "Market"]

patterns = [nlp.make_doc(term) for term in terms]
matcher.add("financeTerms", patterns)

doc = nlp("During the last decade, derivatives market became an asset class of their own and influenced the financial landscape strongly.")
matches = matcher(doc)

for mid, start, end in matches:
    print(start, end, doc[start:end])

5 6 derivatives
6 7 market
9 10 asset


## Entity Ruler

In [97]:
pattern = [{"ENT_TYPE": "PERSON"}]

doc = nlp("Bill Gates visited Berlin.")

matcher = Matcher(nlp.vocab)
matcher.add("personEnt",  [pattern])
matches = matcher(doc)

for mid, start, end in matches:
    print(start, end, doc[start:end])

0 1 Bill
1 2 Gates


In [100]:
# Matching two conseutive tokens,  one is an entity type and the other is a POS tag
pattern = [{"ENT_TYPE": "PERSON", "OP": "+"}, {"POS" : "VERB"}]

matcher = Matcher(nlp.vocab)
matcher.add("personEntAction",  [pattern])
doc = nlp("Today German chancellor Angela Merkel met with the US president.")

matches = matcher(doc)

for mid, start, end in matches:
    print(start, end, doc[start:end])

4 6 Merkel met
3 6 Angela Merkel met


In [103]:
# add custom entities
nlp = spacy.load('en_core_web_md')
doc = nlp("I have an acccount with chime since 2017")
print(doc.ents)
patterns = [{"label": "ORG", "pattern": [{"LOWER": "chime"}]}]

ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
doc = nlp("I have an acccount with chime since 2017")
print(doc.ents)

(2017,)
(chime, 2017)


## Combining Spacy Models and Matchers

In [112]:
# finding iban numbers
nlp = spacy.load('en_core_web_md')
doc = nlp("My IBAN number is BE71 0961 2345 6769, please send the money there.")
doc1 = nlp("My IBAN number is FR76 3000 6000 0112 3456 7890 189, please send the money there.")

pattern = [{"SHAPE": "XXdd"},{"TEXT": {"REGEX": "\d{1,4}"}, "OP":"+"}]

matcher = Matcher(nlp.vocab)
matcher.add("ibanNum",  [pattern])

for mid, start, end in matcher(doc1):
    print(start, end, doc1[start:end])

4 6 FR76 3000
4 7 FR76 3000 6000
4 8 FR76 3000 6000 0112
4 9 FR76 3000 6000 0112 3456
4 10 FR76 3000 6000 0112 3456 7890
4 11 FR76 3000 6000 0112 3456 7890 189


In [115]:
# hash tags matching
doc = nlp("Start working out now #WeekendShred")
pattern = [{"TEXT": "#"}, {"IS_ASCII": True}]
matcher = Matcher(nlp.vocab)
matcher.add("hashTag",  [pattern])
matches = matcher(doc)

for mid, start, end in matches:
	print(start, end, doc[start:end])

4 6 #WeekendShred


In [120]:
# emojis matching
pos_emoji = ["😀", "🦝", "", "", "", ""] 
neg_emoji = ["", "", "", "", "", ""]

pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]

matcher = Matcher(nlp.vocab)
matcher.add("posEmoji", pos_patterns)
matcher.add("negEmoji", neg_patterns)

doc = nlp(" I love Zara 😀")

for mid, start, end in matcher(doc):
    print(start, end, doc[start:end])

4 5 😀


In [122]:
doc = nlp("Einstein lived in Zurich.")
[(ent.text, ent.label_) for ent in doc.ents]

[('Einstein', 'PERSON'), ('Zurich', 'GPE')]

In [123]:
person_ents = [ent for ent in doc.ents if ent.label_ == "PERSON"]

for person_ent in person_ents:

	#We use head of the entity's last token
	head = person_ent[-1].head  

	if head.lemma_ == "live":
		preps = [token for token in head.children if token.dep_ == "prep"]

		for prep in preps:
			places = [token for token in prep.children if token.ent_type_ == "GPE"]   
			
			# Verb is in past or present tense
			print({'person': person_ent, 'city': places,'past': head.tag_ == "VBD"})

{'person': Einstein, 'city': [Zurich], 'past': True}


# Word Vectors and Semantic Similarities

## Categorizing text with semantic similarity

# Semantic Parsing with spaCy: Use case

In [124]:
import pandas as pd

dataset = pd.read_csv("data/atis_intents.csv", header=None)

In [125]:
dataset.head()

Unnamed: 0,0,1
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...


In [126]:
for text in dataset[1].head():
	print(text)

 i want to fly from boston at 838 am and arrive in denver at 1110 in the morning
 what flights are available from pittsburgh to baltimore on thursday morning
 what is the arrival time in san francisco for the 755 am flight leaving washington
 cheapest airfare from tacoma to orlando
 round trip fares from pittsburgh to philadelphia under 1000 dollars


In [128]:
grouped = dataset.groupby(0).size().head()
print(grouped)

0
atis_abbreviation                           147
atis_aircraft                                81
atis_aircraft#atis_flight#atis_flight_no      1
atis_airfare                                423
atis_airfare#atis_flight_time                 1
dtype: int64


In [130]:
# calculating the frequency of the entities in all documents

from collections import Counter
import spacy

nlp = spacy.load("en_core_web_md")

corpus = open("data/atis_utterances.txt", "r").read().split("\n")

all_ent_labels = []

for sentence in corpus:
	doc = nlp(sentence.strip())
	ents = doc.ents
	all_ent_labels += [ent.label_ for ent in ents]

c = Counter(all_ent_labels)
print(c)

Counter({'GPE': 9124, 'DATE': 1474, 'TIME': 994, 'ORG': 428, 'CARDINAL': 317, 'ORDINAL': 218, 'NORP': 74, 'QUANTITY': 44, 'MONEY': 42, 'LOC': 16, 'PRODUCT': 12, 'FAC': 9, 'PERSON': 6, 'EVENT': 1})


In [131]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "ADP"}, {"ENT_TYPE": "GPE"}]
matcher.add("prepositionLocation", [pattern])

doc = nlp("show me flights from denver to boston on tuesday")
matches = matcher(doc)

for mid, start, end in matches:
    print(doc[start:end])

from denver
to boston


In [133]:
pattern = [{"ENT_TYPE": "ORG", "OP": "+"}]
matcher = Matcher(nlp.vocab)
matcher.add("AirlineName", [pattern])
doc = nlp("what is the earliest united airlines flight flying from denver")
matches = matcher(doc)
for mid,start,end in matches:
	print(doc[start:end])

united
united airlines
airlines


In [134]:
# Finding abbreviations

pattern1 = [{"TEXT": {"REGEX": "\w{1,2}\d{1,2}"}}]

pattern2 = [{"SHAPE": { "IN": ["x", "xx"]}}, {"SHAPE": { "IN": ["d", "dd"]}}]

pattern3 = [{"TEXT": {"IN": ["class", "code", "abbrev", "abbreviation"]}}, {"SHAPE": { "IN": ["x", "xx"]}}]

pattern4 =   [{"POS": "NOUN", "SHAPE": { "IN": ["x", "xx"]}}]

matcher = Matcher(nlp.vocab)
matcher.add("abbrevEntities", [pattern1, pattern2, pattern3, pattern4])

In [135]:
sentences = [
'what does restriction ap 57 mean',
'what does the abbreviation co mean',
'what does fare code qo mean',
'what is the abbreviation d10',
'what does code y mean',
'what does the fare code f and fn mean',
'what is booking class c']

for sent in sentences:

   doc = nlp(sent)
   matches = matcher(doc)
   for mid, start, end in matches:
     print(doc[start:end])

ap 57
57
abbreviation co
co
code qo
d10
code y
code f
class c
c


In [141]:
import spacy

nlp = spacy.load("en_core_web_md")


def reach_parent(source_token, dest_token):
	"""Checks if source and desrination token are relarted"""
	source_token = source_token.head
	while source_token != dest_token:
		if source_token.head == source_token:
			return None

		source_token = source_token.head

	return source_token

doc = nlp("I'm going to a conference in Munich.")

In [142]:
print("source token: ", doc[-2])
print("destination token: ",doc[3])
reach_parent(doc[-2], doc[3])

source token:  Munich
destination token:  to


to

In [143]:
# find direct object of a sentence. dobj.

import spacy

nlp = spacy.load("en_core_web_md")
doc = nlp("find a flight from washington to sf")

for token in doc:
  if token.dep_ == "dobj":
    print(token.head.text + token.text.capitalize())

findFlight
