# SpaCy

Solving typical Natural Language Processing tasks with SpaCy

Made by [Artem Konevskikh](https://aiculedssul.net/)

In [None]:
#@title Load libraries
import os
import spacy
from spacy import displacy
import pandas as pd
from IPython.display import clear_output 

# google.colab is only importable inside Google Colab. Treat it as optional so
# this cell (and therefore the whole notebook) also runs in a plain Jupyter
# environment; `data_table` is set to None when the package is unavailable.
try:
    from google.colab import data_table
except ImportError:
    data_table = None

# Enable Colab's DataFrame formatter when it is available; otherwise the
# notebook falls back to pandas' regular DataFrame rendering.
if data_table is not None:
    data_table.enable_dataframe_formatter()

In [None]:
#@title Select language model
# Name of the spaCy pipeline package to download and load; the Colab form
# annotation on the same line offers the listed package names as choices.
lang = "en_core_web_lg"  #@param ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "ru_core_news_sm", "es_core_news_sm", "fr_core_news_sm" ]
#@markdown All supported languages are here https://spacy.io/usage/models#languages
# Shell command (IPython `!` syntax): runs spaCy's download command for the
# selected package; `$lang` is expanded to the value of the Python variable.
!python -m spacy download $lang
# Load the selected pipeline; the cells below call `nlp` to process text.
nlp = spacy.load(lang)

In [None]:
#@title Linguistic Features
#@markdown Read more here https://spacy.io/usage/linguistic-features
text = "Timothy Bloxam Morton (born 19 June 1968) is a professor and Rita Shea Guffey Chair in English at Rice University."  #@param {type:"string"}

# Run the loaded pipeline over the input text.
doc = nlp(text)

# Column headers, in the same order as the per-token values collected below.
cols = ("text", "lemma", "POS", "explain", "tag", "dep", "shape", "alpha", "stopword")

# One row per token: its text, lemma, part-of-speech tag plus spaCy's
# explanation of that tag, fine-grained tag, dependency label, shape, and the
# alphabetic / stop-word flags.
rows = [
    [
        token.text,
        token.lemma_,
        token.pos_,
        spacy.explain(token.pos_),
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    ]
    for token in doc
]

df = pd.DataFrame(rows, columns=cols)

# Bare last expression so the notebook displays the table.
df

In [None]:
#@title Dependency parsing
#@markdown Noun chunks are “base noun phrases” – flat phrases that have a noun as their head. You can think of noun chunks as a noun plus the words describing the noun 

#@markdown **Text**: The original noun chunk text.<br/>
#@markdown **Root text**: The original text of the word connecting the noun chunk to the rest of the parse.<br/>
#@markdown **Root dep**: Dependency relation connecting the root to its head.<br/>
#@markdown **Root head text**: The text of the root token’s head.

#@markdown More on dependency parsing https://spacy.io/usage/linguistic-features#dependency-parse
text = "Timothy Bloxam Morton (born 19 June 1968) is a professor and Rita Shea Guffey Chair in English at Rice University."  #@param {type:"string"}
doc = nlp(text)
cols = ('Text', 'Root text', 'Root dependency', 'Root head text')

# One row per noun chunk: the chunk text, its root token, the root's dependency
# label, and the text of the root's head token.
try:
    rows = [
        [chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]
        for chunk in doc.noun_chunks
    ]
except NotImplementedError:
    # spaCy raises NotImplementedError from `Doc.noun_chunks` when the
    # pipeline's language has no noun-chunk iterator. The model selector offers
    # several languages, so report the problem and show an empty table instead
    # of aborting the cell with a traceback.
    print(f"Noun chunks are not implemented for language '{doc.lang_}'; showing an empty table.")
    rows = []

df = pd.DataFrame(rows, columns=cols)

# Bare last expression so the notebook displays the table.
df

In [None]:
#@title Visualize dependencies

# Render `doc` with displaCy's dependency ("dep") style. `doc` is not created
# in this cell: it is whatever Doc an earlier-run cell last assigned, so one of
# those cells must be run first.
displacy.render(doc, style='dep', jupyter=True)

In [None]:
#@title Named Entity Recognition
#@markdown Named Entity Recognition (NER) is the task of classifying tokens according to a class, for example, identifying a token as a person, an organisation or a location.

#@markdown Read more https://spacy.io/usage/linguistic-features#named-entities

text = "Timothy Bloxam Morton (born 19 June 1968) is a professor and Rita Shea Guffey Chair in English at Rice University."  #@param {type:"string"}

# Run the loaded pipeline over the input text.
doc = nlp(text)

# Table headers: entity text, start/end character offsets, and entity label.
cols = ('Text', 'Start', 'End', 'Label')

# One row per entity span reported in `doc.ents`.
rows = [
    [span.text, span.start_char, span.end_char, span.label_]
    for span in doc.ents
]

df = pd.DataFrame(rows, columns=cols)

# Bare last expression so the notebook displays the table.
df

In [None]:
#@title Visualize NER
# Render `doc` with displaCy's entity ("ent") style. `doc` is not created in
# this cell: it is whatever Doc an earlier-run cell last assigned, so one of
# those cells must be run first.
displacy.render(doc, style="ent", jupyter=True)

In [None]:
#@title All entities
# Print each label listed for the pipeline's 'ner' component, followed by the
# text `spacy.explain` returns for that label.
for label in nlp.pipe_labels['ner']:
    print(f"{label} - {spacy.explain(label)}")

In [None]:
#@title Semantic similarity
#@markdown Similarity is determined by comparing word vectors or “word embeddings”, multi-dimensional meaning representations of a word. 

#@markdown **NOTE:** spaCy’s small pipeline packages (all packages that end in sm) don’t ship with word vectors, and only include context-sensitive tensors. This means you can still use the similarity() methods to compare documents, spans and tokens – but the result won’t be as good, and individual tokens won’t have any vectors assigned. So in order to use real word vectors, you need to download a larger pipeline package

#@markdown Read more https://spacy.io/usage/linguistic-features#vectors-similarity

text1 = "spaCy features an extremely fast statistical entity recognition system, that assigns labels to contiguous spans of tokens."  #@param {type:"string"}
text2 = "The Onyx was designed as an affordable performance benchmark" #@param {type:"string"}

# Process both texts with the loaded pipeline (first text first), then report
# the score returned by comparing the two resulting Docs.
doc1, doc2 = nlp(text1), nlp(text2)

print(f"Similarity: {doc1.similarity(doc2)}")

In [None]:
#@title Vector Algebra
#@markdown Is `Queen = King - Man + Woman`?
from scipy import spatial

TOP_N = 10          # how many nearest words to print
N_CANDIDATES = 200  # neighbours fetched before the alphabetic/lowercase filter


def cosine_similarity(x, y):
    """Return the cosine similarity of vectors x and y (1 - SciPy's cosine distance)."""
    return 1 - spatial.distance.cosine(x, y)


vectors = nlp.vocab.vectors

if vectors.n_keys < 1:
    # Without a keyed vector table there is nothing to search, and the
    # arithmetic below would only produce empty or NaN results.
    print("The loaded pipeline has no searchable word-vector table; "
          "select a pipeline that ships word vectors (e.g. en_core_web_md or en_core_web_lg).")
else:
    man = nlp.vocab['man'].vector
    woman = nlp.vocab['woman'].vector
    queen = nlp.vocab['queen'].vector
    king = nlp.vocab['king'].vector

    new_vector = king - man + woman

    # Direct answer to the question in the cell title.
    print(f"Cosine similarity between king - man + woman and queen: "
          f"{cosine_similarity(new_vector, queen):.3f}")

    # Search the vector table itself rather than iterating `nlp.vocab`: the
    # vocab is populated lazily, so iterating it only visits lexemes that have
    # already been looked up, not every word that has a vector. `most_similar`
    # also does the comparison in one vectorised call instead of a Python loop.
    # `n` may not exceed the number of rows in the table.
    n_candidates = min(N_CANDIDATES, vectors.shape[0])
    keys, _, scores = vectors.most_similar(new_vector.reshape(1, -1), n=n_candidates)

    # Keep the original filter: alphabetic, lower-case words only.
    similarities = []
    for key, score in zip(keys[0], scores[0]):
        word = nlp.vocab[nlp.vocab.strings[int(key)]]
        if word.is_alpha and word.is_lower:
            similarities.append((score, word.text))

    for similarity, word in sorted(similarities, reverse=True)[:TOP_N]:
        print(word)

## Read more

SpaCy has a really great documentation site with lots of examples and explanations of how these algorithms can be used in real life. If you are interested in NLP, check it out here: https://spacy.io/usage