# spaCy: Practice Sheet
#### For reference visit: https://spacy.io/usage/linguistic-features

In [1]:
import spacy
from spacy import displacy

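Load the small English pipeline (tokenizer, tagger, parser and NER). Note that `en_core_web_sm` does not ship with word vectors; see the Semantic Similarity section below.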

In [2]:
# Name of the spaCy pipeline package used throughout this notebook.
MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(MODEL_NAME)

In [3]:
# Sample passage analysed by the cells below.
# NOTE(review): "hot" is presumably a typo for "hut"; it is left as-is because
# the recorded outputs in this notebook were produced from this exact string.
text = (
    "She saw a secret little clearing, and a secret little hot made of rustic poles. "
    "And she had never been here before!"
)

# Run the pipeline once; the resulting Doc is reused by every cell in this section.
doc = nlp(text)

### Part-of-speech recognition
- Text: The original word text.
- Lemma: The base form of the word.
- POS: The simple part-of-speech tag.
- Tag: The detailed part-of-speech tag.
- Dep: Syntactic dependency, i.e. the relation between tokens.
- Shape: The word shape – capitalization, punctuation, digits.
- is alpha: Is the token an alpha character?
- is stop: Is the token part of a stop list, i.e. the most common words of the language?

In [4]:
# Print every sentence found by the pipeline, prefixed with its zero-based index.
for idx, sent in enumerate(doc.sents):
    print(f"{idx}: {sent}")

0: She saw a secret little clearing, and a secret little hot made of rustic poles.
1: And she had never been here before!


In [5]:
# Coarse-grained part-of-speech tag (pos_) for each token in the document.
for token in doc:
    print(token.text, token.pos_)

She PRON
saw VERB
a DET
secret ADJ
little ADJ
clearing NOUN
, PUNCT
and CCONJ
a DET
secret ADJ
little ADJ
hot ADJ
made NOUN
of ADP
rustic ADJ
poles NOUN
. PUNCT
And CCONJ
she PRON
had VERB
never ADV
been VERB
here ADV
before ADV
! PUNCT


In [6]:
# Coarse tag (pos_) next to the fine-grained tag (tag_) for each token.
for token in doc:
    print(token.text, token.pos_, token.tag_)

She PRON PRP
saw VERB VBD
a DET DT
secret ADJ JJ
little ADJ JJ
clearing NOUN NN
, PUNCT ,
and CCONJ CC
a DET DT
secret ADJ JJ
little ADJ JJ
hot ADJ JJ
made NOUN NN
of ADP IN
rustic ADJ JJ
poles NOUN NNS
. PUNCT .
And CCONJ CC
she PRON PRP
had VERB VBD
never ADV RB
been VERB VBN
here ADV RB
before ADV RB
! PUNCT .


In [7]:
# Look up the human-readable description of a fine-grained tag.
spacy.explain("VBP")

'verb, non-3rd person singular present'

### Dependency Recognition

In [8]:
# Syntactic dependency label (dep_) for each token, alongside its POS tags.
# The token object itself is printed as the fourth column; in the recorded
# output it renders the same as token.text.
for token in doc:
    print(token.text, token.pos_, token.tag_, token, token.dep_)

She PRON PRP She nsubj
saw VERB VBD saw ROOT
a DET DT a det
secret ADJ JJ secret amod
little ADJ JJ little amod
clearing NOUN NN clearing dobj
, PUNCT , , punct
and CCONJ CC and cc
a DET DT a det
secret ADJ JJ secret amod
little ADJ JJ little amod
hot ADJ JJ hot amod
made NOUN NN made conj
of ADP IN of prep
rustic ADJ JJ rustic amod
poles NOUN NNS poles pobj
. PUNCT . . punct
And CCONJ CC And cc
she PRON PRP she nsubj
had VERB VBD had aux
never ADV RB never neg
been VERB VBN been ROOT
here ADV RB here advmod
before ADV RB before advmod
! PUNCT . ! punct


In [9]:
# Dependency visualisation: draw the parse tree of `doc` inline in the notebook.
# `jupyter` is a boolean flag. The previous value was the string 'true', which
# only worked because any non-empty string is truthy (so 'false' would have
# enabled it too). Pass a real boolean instead.
displacy.render(doc, style="dep", jupyter=True)

In [10]:
# Lemmatizing: show each token next to its base form (lemma_) and coarse POS tag.
for token in doc:
    print(token.text, token.lemma_, token.pos_)

She -PRON- PRON
saw see VERB
a a DET
secret secret ADJ
little little ADJ
clearing clearing NOUN
, , PUNCT
and and CCONJ
a a DET
secret secret ADJ
little little ADJ
hot hot ADJ
made made NOUN
of of ADP
rustic rustic ADJ
poles pole NOUN
. . PUNCT
And and CCONJ
she -PRON- PRON
had have VERB
never never ADV
been be VERB
here here ADV
before before ADV
! ! PUNCT


### Named Entity Recognition or Detection:
>Takes a string of text as input and identifies the relevant nouns (people, places, and organisations) that are mentioned in that string.

Uses:
- tagging
- improving search
- content recommendation

In [11]:
# Example sentence containing an organisation, a place and a money amount.
doc2 = nlp("Apple is looking at buying UK startup for $1 billion")

# For each detected entity: its text, character span in the sentence, and label.
for entity in doc2.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

Apple 0 5 ORG
UK 27 29 GPE
$1 billion 42 52 MONEY


In [12]:
# Look up the human-readable description of an entity label.
spacy.explain("NORP")

'Nationalities or religious or political groups'

In [13]:
# Entity visualisation: highlight the named entities of `doc2` inline in the notebook.
# `jupyter` is a boolean flag; the string 'True' only worked because non-empty
# strings are truthy, so pass the boolean itself.
displacy.render(doc2, style="ent", jupyter=True)

### Semantic Similarity
>The model you're using has no word vectors loaded, so the result of the Doc.similarity method will be based on the tagger, parser and NER, which may not give useful similarity judgements. This may happen if you're using one of the small models, e.g. `en_core_web_sm`, which don't ship with word vectors and only use context-sensitive tensors. You can always add your own word vectors, or use one of the larger models instead if available.
- To get real word vectors, install a larger model: `python -m spacy download en_core_web_lg`

Usage: `object1.similarity(object2)`

In [14]:
# Two single-word documents to compare.
doc3 = nlp("smart")
doc4 = nlp("clever")

# Similarity score between the two documents. As the note above explains, the
# small model has no word vectors, so this score may not be a useful judgement.
doc3.similarity(doc4)

  "__main__", mod_spec)


0.5818587817189651

In [15]:
# Four animal words processed as one document, so each becomes a token.
ex1 = nlp("wolf dog cat fish")

# Compare every token against every token (including itself), row by row.
for left in ex1:
    for right in ex1:
        pair = (left.text, right.text)
        print(pair, 'Similarity: ', left.similarity(right))

('wolf', 'wolf') Similarity:  1.0


  "__main__", mod_spec)


('wolf', 'dog') Similarity:  0.53469515


  "__main__", mod_spec)


('wolf', 'cat') Similarity:  0.48574424


  "__main__", mod_spec)


('wolf', 'fish') Similarity:  0.359923


  "__main__", mod_spec)


('dog', 'wolf') Similarity:  0.53469515
('dog', 'dog') Similarity:  1.0


  "__main__", mod_spec)


('dog', 'cat') Similarity:  0.70062745


  "__main__", mod_spec)


('dog', 'fish') Similarity:  0.3323872


  "__main__", mod_spec)


('cat', 'wolf') Similarity:  0.48574424


  "__main__", mod_spec)


('cat', 'dog') Similarity:  0.70062745
('cat', 'cat') Similarity:  1.0


  "__main__", mod_spec)


('cat', 'fish') Similarity:  0.4402516


  "__main__", mod_spec)


('fish', 'wolf') Similarity:  0.359923


  "__main__", mod_spec)


('fish', 'dog') Similarity:  0.3323872


  "__main__", mod_spec)


('fish', 'cat') Similarity:  0.4402516
('fish', 'fish') Similarity:  1.0


In [16]:
# Same pairwise comparison collected as (word, other_word, score) tuples.
# The outer loop supplies the second tuple element, so the list is grouped
# by the second word.
mylist = [
    (inner.text, outer.text, inner.similarity(outer))
    for outer in ex1
    for inner in ex1
]
mylist

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


[('wolf', 'wolf', 1.0),
 ('dog', 'wolf', 0.53469515),
 ('cat', 'wolf', 0.48574424),
 ('fish', 'wolf', 0.359923),
 ('wolf', 'dog', 0.53469515),
 ('dog', 'dog', 1.0),
 ('cat', 'dog', 0.70062745),
 ('fish', 'dog', 0.3323872),
 ('wolf', 'cat', 0.48574424),
 ('dog', 'cat', 0.70062745),
 ('cat', 'cat', 1.0),
 ('fish', 'cat', 0.4402516),
 ('wolf', 'fish', 0.359923),
 ('dog', 'fish', 0.3323872),
 ('cat', 'fish', 0.4402516),
 ('fish', 'fish', 1.0)]

#### Using Pandas to analyse similarity data

In [17]:
import pandas as pd

In [18]:
# Tabulate the similarity tuples: one row per pair; columns keep the default
# integer labels 0, 1, 2 (word, other word, score).
df = pd.DataFrame(data=mylist)
df

Unnamed: 0,0,1,2
0,wolf,wolf,1.0
1,dog,wolf,0.534695
2,cat,wolf,0.485744
3,fish,wolf,0.359923
4,wolf,dog,0.534695
5,dog,dog,1.0
6,cat,dog,0.700627
7,fish,dog,0.332387
8,wolf,cat,0.485744
9,dog,cat,0.700627


In [19]:
# Correlation matrix over the numeric columns only.
# Columns 0 and 1 hold the word strings. Older pandas silently dropped them
# from corr(), while recent pandas raises on non-numeric data, so the numeric
# columns are selected explicitly. This works on both and gives the same table.
# With a single numeric column (the score), the result is just its
# self-correlation of 1.0.
df.select_dtypes(include="number").corr()

Unnamed: 0,2
2,1.0


### Noun Chunks

In [25]:
# Sentence used to demonstrate noun chunks.
doc5 = nlp("The dirty little fox jumped over the hill")

# Print the full text of each noun chunk found in the sentence.
for chunk in doc5.noun_chunks:
    print(chunk.text)

The dirty little fox
the hill


In [26]:
# Print only the root word of each noun chunk.
for chunk in doc5.noun_chunks:
    print(chunk.root.text)

fox
hill
