# SpaCy

Solving typical Natural Language Processing tasks with SpaCy

Made by [Artem Konevskikh](https://aiculedssul.net/)

In [1]:
#@title Load libraries
import os
import spacy
from spacy import displacy
import pandas as pd
from IPython.display import clear_output

# google.colab only exists inside Colab. The interactive table formatter is a
# display nicety, so fall back to plain pandas rendering elsewhere instead of
# failing the whole notebook on import.
try:
    from google.colab import data_table
except ImportError:
    data_table = None

if data_table is not None:
    data_table.enable_dataframe_formatter()

In [7]:
#@title Select language model
# Name of the spaCy pipeline package to load; the bracketed list feeds the Colab form dropdown.
lang = "en_core_web_sm"  #@param ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "ru_core_news_sm", "es_core_news_sm", "fr_core_news_sm" ]
#@markdown All supported languages are here https://spacy.io/usage/models#languages
# Download step, left disabled here; presumably needs uncommenting when the
# selected package is not installed in the current environment.
# !python -m spacy download $lang
# Load the selected pipeline and bind it to `nlp`, which the later cells call to parse text.
nlp = spacy.load(lang)

In [18]:
# Switch to the large English pipeline, which ships real word vectors (the
# small "sm" packages do not). NOTE: this replaces whatever model was chosen
# in the "Select language model" cell; skip this cell to keep that choice.
# Loading by name through spacy.load keeps every import in the first cell.
nlp = spacy.load("en_core_web_lg")

In [19]:
#@title Linguistic Features
#@markdown Read more here https://spacy.io/usage/linguistic-features
text = "Timothy Bloxam Morton (born 19 June 1968) is a professor and Rita Shea Guffey Chair in English at Rice University."  #@param {type:"string"}
doc = nlp(text)

# Column headers, in the same order as the per-token values collected below.
cols = ("text", "lemma", "POS", "explain", "tag", "dep", "shape", "alpha", "stopword")

# One record per token of the parsed document.
rows = [
    [
        token.text,
        token.lemma_,
        token.pos_,
        spacy.explain(token.pos_),  # human-readable gloss of the POS tag
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    ]
    for token in doc
]

df = pd.DataFrame(rows, columns=cols)

df

Unnamed: 0,text,lemma,POS,explain,tag,dep,shape,alpha,stopword
0,Timothy,Timothy,PROPN,proper noun,NNP,compound,Xxxxx,True,False
1,Bloxam,Bloxam,PROPN,proper noun,NNP,compound,Xxxxx,True,False
2,Morton,Morton,PROPN,proper noun,NNP,nsubj,Xxxxx,True,False
3,(,(,PUNCT,punctuation,-LRB-,punct,(,False,False
4,born,bear,VERB,verb,VBN,acl,xxxx,True,False
5,19,19,NUM,numeral,CD,nummod,dd,False,False
6,June,June,PROPN,proper noun,NNP,npadvmod,Xxxx,True,False
7,1968,1968,NUM,numeral,CD,nummod,dddd,False,False
8,),),PUNCT,punctuation,-RRB-,punct,),False,False
9,is,be,AUX,auxiliary,VBZ,ROOT,xx,True,True


In [9]:
#@title Dependency parsing
#@markdown Noun chunks are “base noun phrases” – flat phrases that have a noun as their head. You can think of noun chunks as a noun plus the words describing the noun

#@markdown **Text**: The original noun chunk text.<br/>
#@markdown **Root text**: The original text of the word connecting the noun chunk to the rest of the parse.<br/>
#@markdown **Root dep**: Dependency relation connecting the root to its head.<br/>
#@markdown **Root head text**: The text of the root token’s head.

#@markdown More on dependency parsing https://spacy.io/usage/linguistic-features#dependency-parse
text = "Timothy Bloxam Morton (born 19 June 1968) is a professor and Rita Shea Guffey Chair in English at Rice University."  #@param {type:"string"}
doc = nlp(text)

# Column headers, in the same order as the per-chunk values collected below.
cols = ('Text', 'Root text', 'Root dependency', 'Root head text')

# One record per noun chunk: the chunk itself plus how its root token
# attaches to the rest of the parse.
rows = [
    [
        chunk.text,
        chunk.root.text,
        chunk.root.dep_,
        chunk.root.head.text,
    ]
    for chunk in doc.noun_chunks
]

df = pd.DataFrame(rows, columns=cols)

df

Unnamed: 0,Text,Root text,Root dependency,Root head text
0,Timothy Bloxam Morton,Morton,nsubj,born
1,a professor,professor,attr,is
2,Rita Shea Guffey Chair,Chair,conj,professor
3,English,English,pobj,in
4,Rice University,University,pobj,at


In [10]:
#@title Visualize dependencies

# Render the dependency tree of the current `doc` (set by the preceding
# parsing cell) inline in the notebook.
displacy.render(
    doc,
    style='dep',
    jupyter=True,
)

In [11]:
#@title Named Entity Recognition
#@markdown Named Entity Recognition (NER) is the task of classifying tokens according to a class, for example, identifying a token as a person, an organisation or a location.

#@markdown Read more https://spacy.io/usage/linguistic-features#named-entities

text = "Timothy Bloxam Morton (born 19 June 1968) is a professor and Rita Shea Guffey Chair in English at Rice University."  #@param {type:"string"}
doc = nlp(text)

# Column headers, in the same order as the per-entity values collected below.
cols = ('Text', 'Start', 'End', 'Label')

# One record per recognised entity, with its character offsets into `text`.
rows = [
    [entity.text, entity.start_char, entity.end_char, entity.label_]
    for entity in doc.ents
]

df = pd.DataFrame(rows, columns=cols)

df

Unnamed: 0,Text,Start,End,Label
0,Timothy Bloxam Morton,0,21,PERSON
1,June 1968,31,40,DATE
2,Rita Shea Guffey Chair,61,83,PERSON
3,English,87,94,LANGUAGE
4,Rice University,98,113,ORG


In [12]:
#@title Visualize NER
# Highlight the entities of the current `doc` inline in the notebook.
displacy.render(
    doc,
    style="ent",
    jupyter=True,
)

In [14]:
#@title All entities
# `nlp.entity` was a spaCy v2 shortcut that no longer exists in v3; fetching
# the component by name works on both versions. A pipeline without an NER
# component simply has no labels to list.
ner_labels = nlp.get_pipe("ner").labels if "ner" in nlp.pipe_names else ()

# Print every entity label the loaded model can predict, with spaCy's
# built-in description of it.
for ent in ner_labels:
  print(f"{ent} - {spacy.explain(ent)}")

CARDINAL - Numerals that do not fall under another type
DATE - Absolute or relative dates or periods
EVENT - Named hurricanes, battles, wars, sports events, etc.
FAC - Buildings, airports, highways, bridges, etc.
GPE - Countries, cities, states
LANGUAGE - Any named language
LAW - Named documents made into laws.
LOC - Non-GPE locations, mountain ranges, bodies of water
MONEY - Monetary values, including unit
NORP - Nationalities or religious or political groups
ORDINAL - "first", "second", etc.
ORG - Companies, agencies, institutions, etc.
PERCENT - Percentage, including "%"
PERSON - People, including fictional
PRODUCT - Objects, vehicles, foods, etc. (not services)
QUANTITY - Measurements, as of weight or distance
TIME - Times smaller than a day
WORK_OF_ART - Titles of books, songs, etc.


In [25]:
#@title Semantic similarity
#@markdown Similarity is determined by comparing word vectors or “word embeddings”, multi-dimensional meaning representations of a word.

#@markdown **NOTE:** spaCy’s small pipeline packages (all packages that end in sm) don’t ship with word vectors, and only include context-sensitive tensors. This means you can still use the similarity() methods to compare documents, spans and tokens – but the result won’t be as good, and individual tokens won’t have any vectors assigned. So in order to use real word vectors, you need to download a larger pipeline package

#@markdown Read more https://spacy.io/usage/linguistic-features#vectors-similarity

text1 = "spaCy features an extremely fast statistical entity recognition system, that assigns labels to contiguous spans of tokens."  #@param {type:"string"}
text2 = "The Onyx was designed as an affordable performance benchmark" #@param {type:"string"}

# Parse both inputs with the current pipeline, then compare the two documents.
doc1, doc2 = nlp(text1), nlp(text2)
similarity = doc1.similarity(doc2)

print(f"Similarity: {similarity}")

Similarity: 0.7655950295149547


In [None]:
# word analogy example
# king is to man as what is to woman?
# Look up the vectors of the three words in the pipeline's vocabulary.
king_vec, man_vec, woman_vec = (
    nlp.vocab[word].vector for word in ("king", "man", "woman")
)

# resulting vector: king - man + woman
result = king_vec - man_vec + woman_vec
result

## Read more

SpaCy has a really great documentation site with lots of examples and explanations of how these algorithms can be used in real life. If you are interested in NLP, check it out here: https://spacy.io/usage