In [96]:
import spacy
import numpy as np
from sklearn.decomposition import PCA
import re
from spacy.matcher import Matcher, PhraseMatcher
from spacy.training import Example
import random

Tokenization with spaCy

In [2]:
text = 'NLP is becoming increasingly popular for providing business solutions.'

In [3]:
# Load en_core_web_sm and create an nlp object
nlp = spacy.load('en_core_web_sm')
nlp

<spacy.lang.en.English at 0x247522ffa60>

In [4]:
# Create a Doc container for the text object
doc = nlp(text)
doc

NLP is becoming increasingly popular for providing business solutions.

In [5]:
# Create a list containing the text of each token in the Doc container
print([token.text for token in doc])

['NLP', 'is', 'becoming', 'increasingly', 'popular', 'for', 'providing', 'business', 'solutions', '.']


In [6]:
text = 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'

# Create a Doc container of the given text
document = nlp(text)
    
# Store and review the token text values of tokens for the Doc container
first_text_tokens = [token.text for token in document]
print("First text tokens:\n", first_text_tokens, "\n")

First text tokens:
 ['I', 'have', 'bought', 'several', 'of', 'the', 'Vitality', 'canned', 'dog', 'food', 'products', 'and', 'have', 'found', 'them', 'all', 'to', 'be', 'of', 'good', 'quality', '.', 'The', 'product', 'looks', 'more', 'like', 'a', 'stew', 'than', 'a', 'processed', 'meat', 'and', 'it', 'smells', 'better', '.', 'My', 'Labrador', 'is', 'finicky', 'and', 'she', 'appreciates', 'this', 'product', 'better', 'than', ' ', 'most', '.'] 



Running a spaCy pipeline

In [7]:
texts = ['A loaded spaCy model can be used to compile documents list!',
 'Tokenization is the first step in any spacy pipeline.']

In [8]:
# Run an nlp model on each item of texts and append the Doc container to documents
documents = []
for text in texts:
    documents.append(nlp(text))
documents

[A loaded spaCy model can be used to compile documents list!,
 Tokenization is the first step in any spacy pipeline.]

In [9]:
# Print the token texts for each Doc container
for doc in documents:
    print([token.text for token in doc])

['A', 'loaded', 'spaCy', 'model', 'can', 'be', 'used', 'to', 'compile', 'documents', 'list', '!']
['Tokenization', 'is', 'the', 'first', 'step', 'in', 'any', 'spacy', 'pipeline', '.']


Lemmatization with spaCy

In [10]:
text = 'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'

In [11]:
document = nlp(text)

tokens = [token.text for token in document]
print("Tokens:\n", tokens)

lemmas = [token.lemma_ for token in document]
print("Lemmas:\n", lemmas, "\n")

Tokens:
 ['I', 'have', 'bought', 'several', 'of', 'the', 'Vitality', 'canned', 'dog', 'food', 'products', 'and', 'have', 'found', 'them', 'all', 'to', 'be', 'of', 'good', 'quality', '.', 'The', 'product', 'looks', 'more', 'like', 'a', 'stew', 'than', 'a', 'processed', 'meat', 'and', 'it', 'smells', 'better', '.', 'My', 'Labrador', 'is', 'finicky', 'and', 'she', 'appreciates', 'this', 'product', 'better', 'than', ' ', 'most', '.']
Lemmas:
 ['I', 'have', 'buy', 'several', 'of', 'the', 'Vitality', 'can', 'dog', 'food', 'product', 'and', 'have', 'find', 'they', 'all', 'to', 'be', 'of', 'good', 'quality', '.', 'the', 'product', 'look', 'more', 'like', 'a', 'stew', 'than', 'a', 'process', 'meat', 'and', 'it', 'smell', 'well', '.', 'my', 'Labrador', 'be', 'finicky', 'and', 'she', 'appreciate', 'this', 'product', 'well', 'than', ' ', 'most', '.'] 



Sentence segmentation with spaCy

In [12]:
texts = ['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
 'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".',
 'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.',
 'If you are looking for the secret ingredient in Robitussin I believe I have found it.  I got this in addition to the Root Beer Extract I ordered (which was good) and made some cherry soda.  The flavor is very medicinal.',
 'Great taffy at a great price.  There was a wide assortment of yummy taffy.  Delivery was very quick.  If your a taffy lover, this is a deal.',
 'I got a wild hair for taffy and ordered this five pound bag. The taffy was all very enjoyable with many flavors: watermelon, root beer, melon, peppermint, grape, etc. My only complaint is there was a bit too much red/black licorice-flavored pieces (just not my particular favorites). Between me, my kids, and my husband, this lasted only two weeks! I would recommend this brand of taffy -- it was a delightful treat.',
 "This saltwater taffy had great flavors and was very soft and chewy.  Each candy was individually wrapped well.  None of the candies were stuck together, which did happen in the expensive version, Fralinger's.  Would highly recommend this candy!  I served it at a beach-themed party and everyone loved it!",
 'This taffy is so good.  It is very soft and chewy.  The flavors are amazing.  I would definitely recommend you buying it.  Very satisfying!!',
 "Right now I'm mostly just sprouting this so my cats can eat the grass. They love it. I rotate it around with Wheatgrass and Rye too",
 'This is a very healthy dog food. Good for their digestion. Also good for small puppies. My dog eats her required amount at every feeding.']

In [13]:
# Generating a documents list of all Doc containers
documents = [nlp(text) for text in texts]

# Iterate through documents and append sentences in each doc to the sentences list
sentences = []
for doc in documents:
    sentences.append([s for s in doc.sents])
    
# Find number of sentences per each doc container
print([len(s) for s in sentences])

[3, 2, 7, 3, 4, 5, 5, 5, 3, 4]


POS tagging with spaCy

In [14]:
texts = ['What is the arrival time in San francisco for the 7:55 AM flight leaving Washington?',
 'Cheapest airfare from Tacoma to Orlando is 650 dollars.',
 'Round trip fares from Pittsburgh to Philadelphia are under 1000 dollars!']

In [15]:
# Compile a list of all Doc containers of texts
documents = [nlp(text) for text in texts]

# Print token texts and POS tags for each Doc container
for doc in documents:
    for token in doc:
        print("Text: ", token.text, "| POS tag: ", token.pos_)
    print("\n")

Text:  What | POS tag:  PRON
Text:  is | POS tag:  AUX
Text:  the | POS tag:  DET
Text:  arrival | POS tag:  NOUN
Text:  time | POS tag:  NOUN
Text:  in | POS tag:  ADP
Text:  San | POS tag:  PROPN
Text:  francisco | POS tag:  PROPN
Text:  for | POS tag:  ADP
Text:  the | POS tag:  DET
Text:  7:55 | POS tag:  NUM
Text:  AM | POS tag:  PROPN
Text:  flight | POS tag:  NOUN
Text:  leaving | POS tag:  VERB
Text:  Washington | POS tag:  PROPN
Text:  ? | POS tag:  PUNCT


Text:  Cheapest | POS tag:  ADJ
Text:  airfare | POS tag:  NOUN
Text:  from | POS tag:  ADP
Text:  Tacoma | POS tag:  PROPN
Text:  to | POS tag:  ADP
Text:  Orlando | POS tag:  PROPN
Text:  is | POS tag:  AUX
Text:  650 | POS tag:  NUM
Text:  dollars | POS tag:  NOUN
Text:  . | POS tag:  PUNCT


Text:  Round | POS tag:  ADJ
Text:  trip | POS tag:  NOUN
Text:  fares | POS tag:  NOUN
Text:  from | POS tag:  ADP
Text:  Pittsburgh | POS tag:  PROPN
Text:  to | POS tag:  ADP
Text:  Philadelphia | POS tag:  PROPN
Text:  are | POS

NER with spaCy

In [16]:
texts = ['I want to fly from Boston at 8:38 am and arrive in Denver at 11:10 in the morning',
 'What flights are available from Pittsburgh to Baltimore on Thursday morning?',
 'What is the arrival time in San francisco for the 7:55 AM flight leaving Washington?']

In [17]:
# Compile a list of all Doc containers of texts
documents = [nlp(text) for text in texts]

# Print the entity text and label for the entities in each document
for doc in documents:
    print([(ent.text, ent.label_) for ent in doc.ents])

[('Boston', 'GPE'), ('8:38 am', 'TIME'), ('Denver', 'GPE'), ('11:10 in the morning', 'TIME')]
[('Pittsburgh', 'GPE'), ('Baltimore', 'GPE'), ('Thursday', 'DATE'), ('morning', 'TIME')]
[('San francisco', 'GPE'), ('7:55 AM', 'TIME'), ('Washington', 'GPE')]


In [18]:
# Print the 6th token's text and entity type of the second document
print("\nText:", documents[1][5].text, "| Entity type: ", documents[1][5].ent_type_)


Text: Pittsburgh | Entity type:  GPE


Text processing with spaCy

In [19]:
texts = ['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
 'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".',
 'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.',
 'If you are looking for the secret ingredient in Robitussin I believe I have found it.  I got this in addition to the Root Beer Extract I ordered (which was good) and made some cherry soda.  The flavor is very medicinal.',
 'Great taffy at a great price.  There was a wide assortment of yummy taffy.  Delivery was very quick.  If your a taffy lover, this is a deal.',
 'I got a wild hair for taffy and ordered this five pound bag. The taffy was all very enjoyable with many flavors: watermelon, root beer, melon, peppermint, grape, etc. My only complaint is there was a bit too much red/black licorice-flavored pieces (just not my particular favorites). Between me, my kids, and my husband, this lasted only two weeks! I would recommend this brand of taffy -- it was a delightful treat.',
 "This saltwater taffy had great flavors and was very soft and chewy.  Each candy was individually wrapped well.  None of the candies were stuck together, which did happen in the expensive version, Fralinger's.  Would highly recommend this candy!  I served it at a beach-themed party and everyone loved it!",
 'This taffy is so good.  It is very soft and chewy.  The flavors are amazing.  I would definitely recommend you buying it.  Very satisfying!!',
 "Right now I'm mostly just sprouting this so my cats can eat the grass. They love it. I rotate it around with Wheatgrass and Rye too",
 'This is a very healthy dog food. Good for their digestion. Also good for small puppies. My dog eats her required amount at every feeding.']

In [20]:
documents = [nlp(text) for text in texts]

In [21]:
# Create a list to store sentences of each Doc container in documents
sentences = [[sent for sent in doc.sents] for doc in documents]
sentences[0]

[I have bought several of the Vitality canned dog food products and have found them all to be of good quality.,
 The product looks more like a stew than a processed meat and it smells better.,
 My Labrador is finicky and she appreciates this product better than  most.]

In [22]:
# Print number of sentences in each Doc container in documents
num_sentences = [len(s) for s in sentences]
print("Number of sentences in documents:\n", num_sentences)

Number of sentences in documents:
 [3, 2, 7, 3, 4, 5, 5, 5, 3, 4]


In [23]:
# Record entities text and corresponding label of the third Doc container
third_text_entities = [(ent.text, ent.label_) for ent in documents[2].ents]
print("Third text entities:\n", third_text_entities, "\n")

Third text entities:
 [('citrus gelatin', 'PERSON'), ('Filberts', 'PERSON'), ("C.S. Lewis'", 'ORG'), ('The Lion, The Witch', 'WORK_OF_ART'), ('The Wardrobe', 'WORK_OF_ART'), ('Edmund', 'GPE'), ('Sisters', 'PERSON'), ('Witch', 'LOC')] 



In [24]:
# Record first ten tokens and corresponding POS tag for the third Doc container
third_text_10_pos = [(token.text, token.pos_) for token in documents[2]][:10]
print("First ten tokens of third text:\n", third_text_10_pos)

First ten tokens of third text:
 [('This', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('confection', 'NOUN'), ('that', 'PRON'), ('has', 'AUX'), ('been', 'AUX'), ('around', 'ADP'), ('a', 'DET'), ('few', 'ADJ')]


Word-sense disambiguation with spaCy

In [25]:
texts = ["This device is used to jam the signal.",
         "I am stuck in a traffic jam"]

# Create a list of Doc containers in the texts list
documents = [nlp(t) for t in texts]

# Print a token's text and POS tag if the word jam is in the token's text
for i, doc in enumerate(documents):
    print(f"Sentence {i+1}: ", [(token.text, token.pos_) for token in doc if "jam" in token.text], "\n")

Sentence 1:  [('jam', 'VERB')] 

Sentence 2:  [('jam', 'NOUN')] 



Dependency parsing with spaCy

In [26]:
texts = ['I want to fly from Boston at 8:38 am and arrive in Denver at 11:10 in the morning',
 'What flights are available from Pittsburgh to Baltimore on Thursday morning?',
 'What is the arrival time in San francisco for the 7:55 AM flight leaving Washington?']

In [27]:
# Create a list of Doc containts of texts list
documents = [nlp(t) for t in texts]

# Print each token's text, dependency label and its explanation
for doc in documents:
    print([(token.text, token.dep_, spacy.explain(token.dep_)) for token in doc], "\n")

[('I', 'nsubj', 'nominal subject'), ('want', 'ROOT', 'root'), ('to', 'aux', 'auxiliary'), ('fly', 'xcomp', 'open clausal complement'), ('from', 'prep', 'prepositional modifier'), ('Boston', 'pobj', 'object of preposition'), ('at', 'prep', 'prepositional modifier'), ('8:38', 'nummod', 'numeric modifier'), ('am', 'pobj', 'object of preposition'), ('and', 'cc', 'coordinating conjunction'), ('arrive', 'conj', 'conjunct'), ('in', 'prep', 'prepositional modifier'), ('Denver', 'pobj', 'object of preposition'), ('at', 'prep', 'prepositional modifier'), ('11:10', 'pobj', 'object of preposition'), ('in', 'prep', 'prepositional modifier'), ('the', 'det', 'determiner'), ('morning', 'pobj', 'object of preposition')] 

[('What', 'det', 'determiner'), ('flights', 'nsubj', 'nominal subject'), ('are', 'ROOT', 'root'), ('available', 'acomp', 'adjectival complement'), ('from', 'prep', 'prepositional modifier'), ('Pittsburgh', 'pobj', 'object of preposition'), ('to', 'prep', 'prepositional modifier'), ('B

spaCy vocabulary

In [28]:
# Load the en_core_web_md model
lg_nlp = spacy.load("en_core_web_lg")

lg_nlp.meta["vectors"]

{'width': 300,
 'vectors': 514157,
 'keys': 514157,
 'name': 'en_vectors',
 'mode': 'default'}

In [29]:
# Print the number of words in the model's vocabulary
print("Number of words: ", lg_nlp.meta["vectors"]["vectors"], "\n")

# Print the dimensions of word vectors in en_core_web_md model
print("Dimension of word vectors: ", lg_nlp.meta["vectors"]["width"])

Number of words:  514157 

Dimension of word vectors:  300


Word vectors in spaCy vocabulary

In [30]:
words = ["like", "love"]

# IDs of all the given words
ids = [lg_nlp.vocab.strings[w] for w in words]
ids

[18194338103975822726, 3702023516439754181]

In [31]:
# Store the first ten elements of the word vectors for each word
word_vectors = [lg_nlp.vocab.vectors[i][:10] for i in ids]
word_vectors

[array([-2.3334 , -1.3695 , -1.133  , -0.68461, -1.8482 , -0.63712,
         2.6791 ,  4.1433 , -2.5616 , -1.8061 ], dtype=float32),
 array([ 2.0565 , -3.2259 , -5.7364 , -6.146  ,  0.15748, -2.4284 ,
         7.658  ,  2.7064 , -2.211  , -0.8999 ], dtype=float32)]

Word vectors projection

In [32]:
words = ["tiger", "bird"]

# Extract word IDs of given words
word_ids = [lg_nlp.vocab.strings[w] for w in words]
word_ids

[5423999730010037932, 10103162543233135282]

In [33]:
# Extract word vectors and stack the first five (or ten) elements vertically
word_vectors = np.vstack([lg_nlp.vocab.vectors[i][:10] for i in word_ids])
word_vectors

array([[ 0.032863,  1.8007  , -1.3854  , -3.5269  , -0.24236 ,  0.41086 ,
         0.26883 ,  0.26619 ,  2.2089  ,  0.5561  ],
       [ 4.8752  , -1.9177  , -1.3281  , -5.278   ,  2.2977  , -0.40337 ,
        -2.4936  ,  0.63511 , -2.1338  ,  2.1657  ]], dtype=float32)

In [34]:
# Calculate the transformed word vectors using the pca object
pca = PCA(n_components=2)
word_vectors_transformed = pca.fit_transform(word_vectors)
word_vectors_transformed

array([[ 4.3782477e+00,  4.1429936e-07],
       [-4.3782473e+00,  4.1429939e-07]], dtype=float32)

In [35]:
# Print the first component of the transformed word vectors
print(word_vectors_transformed[:, 0])

[ 4.3782477 -4.3782473]


Similar words in a vocabulary

In [36]:
id_comp = lg_nlp.vocab.strings['computer']
id_comp

4912942957612137283

In [37]:
word_vector = lg_nlp.vocab.vectors[id_comp]
word_vector

array([ 0.67285 , -0.86102 ,  1.6945  , -0.60859 ,  0.13298 ,  1.5135  ,
        3.8222  ,  5.4456  , -2.8585  , -1.2448  ,  6.2764  ,  4.2784  ,
       -4.589   ,  5.011   ,  1.015   ,  2.0452  ,  3.8958  ,  0.33248 ,
       -3.322   ,  1.1575  ,  2.4343  ,  0.31949 , -0.33847 , -1.6331  ,
       -3.269   , -4.7326  , -2.4177  , -4.63    ,  1.3962  ,  3.7998  ,
       -0.787   , -0.85938 , -3.0182  ,  0.28905 ,  0.64035 , -0.45379 ,
        3.4345  , -0.22517 ,  3.8027  ,  2.6739  ,  0.97571 , -0.76596 ,
        0.53999 ,  2.2714  ,  0.87652 , -0.84191 ,  0.24501 , -3.1094  ,
        1.3224  , -2.7244  , -0.30636 ,  2.9063  ,  0.26466 , -3.5191  ,
       -1.0453  ,  2.3392  , -1.3542  ,  3.7928  , -1.4989  , -0.82555 ,
        3.2697  ,  4.2062  , -5.0726  , -1.458   ,  0.82807 ,  2.1958  ,
       -1.1805  , -4.4558  , -0.26838 ,  3.1355  , -0.60561 ,  1.8562  ,
       -0.35967 , -0.53501 ,  2.6208  , -2.8202  , -2.2845  , -1.1683  ,
       -4.325   , -2.5027  , -2.8288  , -0.52345 , 

In [38]:
# Find the most similar word to the word computer
most_similar_words = lg_nlp.vocab.vectors.most_similar(np.asarray([word_vector]), n = 10)
most_similar_words

(array([[ 4912942957612137283,  6749792856493933245,  7262238544365424156,
          3596863613112108743, 18005194085716360258,  2048265920074810992,
         17343804685088620554, 14132390025522728554, 14099521314249290004,
         17778691834923080873]], dtype=uint64),
 array([[  1306, 344354, 439921, 429316,   5438, 159166,  95645, 476453,
         138676, 314483]], dtype=int32),
 array([[1.    , 0.9592, 0.9515, 0.9069, 0.8993, 0.8867, 0.8786, 0.862 ,
         0.8492, 0.8385]], dtype=float32))

In [39]:
# Find the list of similar words given the word IDs
words = [lg_nlp.vocab.strings[w] for w in most_similar_words[0][0]]
print(words)

['computer', 'computer-', 'computer--', 'Batcomputer', 'computers', 'minicomputer', 'microcomputer', 'Komputer', 'microcomputers', 'computerize']


Doc similarity with spaCy

In [40]:
texts = ['I like the Vitality canned dog food products.',
 'The peanuts were actually small sized unsalted. Not sure if this was an error.',
 'It is a light, pillowy citrus gelatin with nuts - in this case Filberts.',
 'the Root Beer Extract I ordered is very medicinal.',
 'Great taffy at a great price.']

In [41]:
# Create a documents list containing Doc containers
documents = [lg_nlp(t) for t in texts]
documents

[I like the Vitality canned dog food products.,
 The peanuts were actually small sized unsalted. Not sure if this was an error.,
 It is a light, pillowy citrus gelatin with nuts - in this case Filberts.,
 the Root Beer Extract I ordered is very medicinal.,
 Great taffy at a great price.]

In [42]:
# Create a Doc container of the category
category = "canned dog food"
category_document = lg_nlp(category)
category_document

canned dog food

In [43]:
# Print similarity scores of each Doc container and the category_document
for i, doc in enumerate(documents):
    print(f"Semantic similarity with document {i+1}:", round(doc.similarity(category_document), 3))

Semantic similarity with document 1: 0.761
Semantic similarity with document 2: 0.365
Semantic similarity with document 3: 0.336
Semantic similarity with document 4: 0.382
Semantic similarity with document 5: 0.29


Span similarity with spaCy

In [44]:
# Print similarity score of a given Span and category_document
document_span = documents[0][0:3]
print(f"Semantic similarity with", document_span.text, ":", round(document_span.similarity(category_document), 3))

Semantic similarity with I like the : 0.32


Semantic similarity for categorizing text

In [45]:
texts = 'This hot sauce is amazing! We picked up a bottle on a trip! '

In [46]:
# Populate Doc containers for the word "sauce" and for "texts" string
key = lg_nlp('sauce')
sentences = lg_nlp(texts)

In [47]:
# Calculate similarity score of each sentence and a Doc container for the word sauce
semantic_scores = []
for sent in sentences.sents:
    semantic_scores.append({"score": round(sent.similarity(key), 2)})
    print(sent, sent.similarity(key))
semantic_scores

This hot sauce is amazing! 0.49996989411361104
We picked up a bottle on a trip! 0.16954893952092578


[{'score': 0.5}, {'score': 0.17}]

Adding pipes in spaCy

In [48]:
texts ='I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most. Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo". This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch. If you are looking for the secret ingredient in Robitussin I believe I have found it.  I got this in addition to the Root Beer Extract I ordered (which was good) and made some cherry soda.  The flavor is very medicinal. Great taffy at a great price.  There was a wide assortment of yummy taffy.  Delivery was very quick.  If your a taffy lover, this is a deal.'

In [49]:
# Load a blank spaCy English model and add a sentencizer component
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

# Create Doc containers, store sentences and print its number of sentences
doc = nlp(texts)
sentences = [s for s in doc.sents]
print("Number of sentences: ", len(sentences), "\n")
sentences

Number of sentences:  19 



[I have bought several of the Vitality canned dog food products and have found them all to be of good quality.,
 The product looks more like a stew than a processed meat and it smells better.,
 My Labrador is finicky and she appreciates this product better than  most.,
 Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted.,
 Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".,
 This is a confection that has been around a few centuries.,
  It is a light, pillowy citrus gelatin with nuts - in this case Filberts.,
 And it is cut into tiny squares and then liberally coated with powdered sugar.,
  And it is a tiny mouthful of heaven.,
  Not too chewy, and very flavorful.,
  I highly recommend this yummy treat.,
  If you are familiar with the story of C.S. Lewis' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.,
 If you are

In [50]:
# Print the list of tokens in the second sentence
print("Second sentence tokens: ", [token for token in sentences[1]])

Second sentence tokens:  [The, product, looks, more, like, a, stew, than, a, processed, meat, and, it, smells, better, .]


Analyzing pipelines in spaCy

In [51]:
# Load a blank spaCy English model
nlp = spacy.blank("en")

# Add tagger and entity_linker pipeline components
nlp.add_pipe("tagger")
nlp.add_pipe("entity_linker")

# Analyze the pipeline
analysis = nlp.analyze_pipes(pretty=True)

# In this instance, the pipeline is missing sentence segmentation and named entity recognition components before entity_linker component.

[1m

#   Component       Assigns           Requires         Scores        Retokenizes
-   -------------   ---------------   --------------   -----------   -----------
0   tagger          token.tag                          tag_acc       False      
                                                                                
1   entity_linker   token.ent_kb_id   doc.ents         nel_micro_f   False      
                                      doc.sents        nel_micro_r              
                                      token.ent_iob    nel_micro_p              
                                      token.ent_type                            

[1m
[38;5;3m⚠ 'entity_linker' requirements not met: doc.ents, doc.sents,
token.ent_iob, token.ent_type[0m


EntityRuler with blank spaCy model

In [52]:
nlp = spacy.blank("en")
patterns = [{"label": "ORG", "pattern": [{"LOWER": "openai"}]},
            {"label": "ORG", "pattern": [{"LOWER": "microsoft"}]}]
text = "OpenAI has joined forces with Microsoft."

# Add EntityRuler component to the model
entity_ruler = nlp.add_pipe("entity_ruler")

# Add given patterns to the EntityRuler component
entity_ruler.add_patterns(patterns)

# Run the model on a given text
doc = nlp(text)

# Print entities text and type for all entities in the Doc container
print([(ent.text, ent.label_) for ent in doc.ents])

[('OpenAI', 'ORG'), ('Microsoft', 'ORG')]


EntityRuler for NER

In [53]:
nlp = spacy.load("en_core_web_sm")
text = "New York Group was built in 1987."

# Add an EntityRuler to the nlp before NER component
ruler = nlp.add_pipe("entity_ruler", before="ner")

# Define a pattern to classify lower cased new york group as ORG
patterns = [{"label": "ORG", "pattern": [{"lower": "new york group"}]}]

# Add the patterns to the EntityRuler component
ruler.add_patterns(patterns)

# Run the model and print entities text and type for all the entities
doc = nlp(text)
print([(ent.text, ent.label_) for ent in doc.ents])

[('New York Group', 'ORG'), ('1987', 'DATE')]


EntityRuler with multi-patterns in spaCy

In [54]:
example_text = 'This is a confection. In this case Filberts. And it is cut into tiny squares. This is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.'

In [55]:
nlp = spacy.load("en_core_web_md")

# Print a list of tuples of entities text and types in the example_text
print("Before EntityRuler: ", [(ent.text, ent.label_) for ent in nlp(example_text).ents], "\n")

# Define pattern to add a label PERSON for lower cased sisters and brother entities
patterns = [{"label": "PERSON", "pattern": [{"lower": "brother"}]},
            {"label": "PERSON", "pattern": [{"lower": "sisters"}]}]

# Add an EntityRuler component and add the patterns to the ruler
ruler = nlp.add_pipe("entity_ruler", before = "ner")
ruler.add_patterns(patterns)

# Print a list of tuples of entities text and types
print("After EntityRuler: ", [(ent.text, ent.label_) for ent in nlp(example_text).ents])

Before EntityRuler:  [('Filberts', 'PERSON'), ('Edmund', 'PERSON'), ('Sisters', 'ORG')] 

After EntityRuler:  [('Filberts', 'PERSON'), ('Edmund', 'PERSON'), ('Brother', 'PERSON'), ('Sisters', 'PERSON')]


RegEx in Python

In [56]:
text = "Our phone number is (425)-123-4567."

# Define a pattern to match phone numbers
pattern = r"\((\d){3}\)-(\d){3}-(\d){4}"

# Find all the matching patterns in the text
phones = re.finditer(pattern, text)
phones

<callable_iterator at 0x2475eff39a0>

In [57]:
# Print start and end characters and matching section of the text
for match in phones:
    start_char = match.start()
    end_char = match.end()
    print("Start character: ", start_char, "| End character: ", end_char, "| Matching text: ", text[start_char:end_char])

Start character:  20 | End character:  34 | Matching text:  (425)-123-4567


RegEx with EntityRuler in spaCy

In [58]:
text = "Our phone number is 4251234567."

# Define a pattern to match phone numbers
patterns = [{"label": "PHONE_NUMBERS", "pattern": [{"TEXT": {"REGEX": "(\d){10}"}}]}]

# Load a blank model and add an EntityRuler
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")

# Add the compiled patterns to the EntityRuler
ruler.add_patterns(patterns)

# Print the tuple of entities texts and types for the given text
doc = nlp(text)
print([(ent.text, ent.label_) for ent in doc.ents])

[('4251234567', 'PHONE_NUMBERS')]


Matching a single term in spaCy

In [59]:
example_text = 'I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.'

In [60]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(example_text)

# Initialize a Matcher object
matcher = Matcher(nlp.vocab)

# Define a pattern to match lower cased word witch
pattern = [{"lower" : "witch"}]

# Add the pattern to matcher object and find matches
matcher.add("CustomMatcher", [pattern])
matches = matcher(doc)

# Print start and end token indices and span of the matched text
for match_id, start, end in matches:
    print("Start token: ", start, " | End token: ", end, "| Matched text: ", doc[start:end].text)

Start token:  24  | End token:  25 | Matched text:  Witch
Start token:  47  | End token:  48 | Matched text:  Witch


PhraseMatcher in spaCy

In [61]:
text = "There are only a few acceptable IP addresse: (1) 127.100.0.1, (2) 123.4.1.0."
terms = ["110.0.0.0", "101.243.0.7"]

# Initialize a PhraseMatcher class to match to shapes of given terms
matcher = PhraseMatcher(nlp.vocab, attr = "SHAPE")

# Create patterns to add to the PhraseMatcher object
patterns = [nlp.make_doc(term) for term in terms]
matcher.add("IPAddresses", patterns)
# Find matches to the given patterns and print start and end characters and matches texts
doc = nlp(text)
doc


There are only a few acceptable IP addresse: (1) 127.100.0.1, (2) 123.4.1.0.

In [62]:
matches = matcher(doc)
matches

[(17762269709679412013, 12, 13), (17762269709679412013, 17, 18)]

In [63]:
for match_id, start, end in matches:
    print("Start token: ", start, " | End token: ", end, "| Matched text: ", doc[start:end].text)

Start token:  12  | End token:  13 | Matched text:  127.100.0.1
Start token:  17  | End token:  18 | Matched text:  123.4.1.0


Matching with extended syntax in spaCy

In [64]:
example_text = 'It is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.'

In [65]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(example_text)

# Define a matcher object
matcher = Matcher(nlp.vocab)
# Define a pattern to match tiny squares and tiny mouthful
pattern = [{"lower": "tiny"}, {"lower": {"IN": ["squares", "mouthful"]}}]

# Add the pattern to matcher object and find matches
matcher.add("CustomMatcher", [pattern])
matches = matcher(doc)
matches

[(13760853105470257182, 4, 6), (13760853105470257182, 19, 21)]

In [66]:
# Print out start and end token indices and the matched text span per match
for match_id, start, end in matches:
    print("Start token: ", start, " | End token: ", end, "| Matched text: ", doc[start:end].text)

Start token:  4  | End token:  6 | Matched text:  tiny squares
Start token:  19  | End token:  21 | Matched text:  tiny mouthful


Model performance on your data

In [None]:
# Load en_core_web_sm and create an nlp object
nlp = spacy.load('en_core_web_sm')

In [70]:
documents = ["Product arrived labeled as Jumbo Salted Peanuts.",
 "Not sure if the product was labeled as Jumbo."]

documents = [nlp(doc) for doc in documents]
documents

[Product arrived labeled as Jumbo Salted Peanuts.,
 Not sure if the product was labeled as Jumbo.]

In [72]:
texts = ['Product arrived labeled as Jumbo Salted Peanuts.',
 'Not sure if the product was labeled as Jumbo.']

texts = [nlp(text) for text in texts]
texts

[Product arrived labeled as Jumbo Salted Peanuts.,
 Not sure if the product was labeled as Jumbo.]

In [73]:
# Append a tuple of (entities text, entities label) if Jumbo is in the entity
target_entities = []
for doc in documents:
    target_entities.extend([(ent.text, ent.label_) for ent in doc.ents if "Jumbo" in ent.text])
print(target_entities)

[('Jumbo Salted Peanuts', 'PERSON'), ('Jumbo', 'PERSON')]


In [74]:
# Append True to the correct_labels list if the entity label is `PRODUCT`
correct_labels = []
for ent in target_entities:
    if ent[1] == "PRODUCT":
        correct_labels.append(True)
    else:
        correct_labels.append(False)
print(correct_labels)

[False, False]


Annotation and preparing training data

In [75]:
text = "A patient with chest pain had hyperthyroidism."
entity_1 = "chest pain"
entity_2 = "hyperthyroidism"

# Store annotated data information in the correct format
annotated_data = {"sentence": text, "entities": [{"label": "SYMPTOM", "value": entity_1}, {"label": "DISEASE", "value": entity_2}]}

# Extract start and end characters of each entity
entity_1_start_char = text.find(entity_1)
entity_1_end_char = entity_1_start_char + len(entity_1)
entity_2_start_char = text.find(entity_2)
entity_2_end_char = entity_2_start_char + len(entity_2)

# Store the same input information in the proper format for training
training_data = [(text, {"entities": [(entity_1_start_char,entity_1_end_char,"SYMPTOM"), 
                                      (entity_2_start_char,entity_2_end_char,"DISEASE")]})]
print(training_data)

[('A patient with chest pain had hyperthyroidism.', {'entities': [(15, 25, 'SYMPTOM'), (30, 45, 'DISEASE')]})]


Compatible training data

In [84]:
example_text = 'A patient with chest pain had hyperthyroidism.'
training_data = [(example_text, {'entities': [(15, 25, 'SYMPTOM'), (30, 45, 'DISEASE')]})]

all_examples = []
# Iterate through text and annotations and convert text to a Doc container
for text, annotations in training_data:
    doc = nlp(text)

    # Create an Example object from the doc contianer and annotations
    example_sentence = Example.from_dict(doc, annotations)
    print(example_sentence.to_dict(), "\n")

    # Append the Example object to the list of all examples
    all_examples.append(example_sentence)

print("Number of formatted training data: ", len(all_examples))

{'doc_annotation': {'cats': {}, 'entities': ['O', 'O', 'O', 'B-SYMPTOM', 'L-SYMPTOM', 'O', 'U-DISEASE', 'O'], 'spans': {}, 'links': {}}, 'token_annotation': {'ORTH': ['A', 'patient', 'with', 'chest', 'pain', 'had', 'hyperthyroidism', '.'], 'SPACY': [True, True, True, True, True, True, False, False], 'TAG': ['', '', '', '', '', '', '', ''], 'LEMMA': ['', '', '', '', '', '', '', ''], 'POS': ['', '', '', '', '', '', '', ''], 'MORPH': ['', '', '', '', '', '', '', ''], 'HEAD': [0, 1, 2, 3, 4, 5, 6, 7], 'DEP': ['', '', '', '', '', '', '', ''], 'SENT_START': [1, 0, 0, 0, 0, 0, 0, 0]}} 

Number of formatted training data:  1


Training preparation steps

In [88]:
nlp = spacy.load("en_core_web_sm")

# Disable all pipeline components of  except `ner`
all_pipes = [pipeline for pipeline in nlp.pipe_names]
other_pipes = [pipeline for pipeline in nlp.pipe_names if pipeline != 'ner']
nlp.disable_pipes(*other_pipes)

all_pipes

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [90]:
# Convert a text and its annotations to the correct format usable for training
doc = nlp.make_doc(example_text)
example = Example.from_dict(doc, annotations)
print("Example object for training: \n", example.to_dict())

Example object for training: 
 {'doc_annotation': {'cats': {}, 'entities': ['O', 'O', 'O', 'B-SYMPTOM', 'L-SYMPTOM', 'O', 'U-DISEASE', 'O'], 'spans': {}, 'links': {}}, 'token_annotation': {'ORTH': ['A', 'patient', 'with', 'chest', 'pain', 'had', 'hyperthyroidism', '.'], 'SPACY': [True, True, True, True, True, True, False, False], 'TAG': ['', '', '', '', '', '', '', ''], 'LEMMA': ['', '', '', '', '', '', '', ''], 'POS': ['', '', '', '', '', '', '', ''], 'MORPH': ['', '', '', '', '', '', '', ''], 'HEAD': [0, 1, 2, 3, 4, 5, 6, 7], 'DEP': ['', '', '', '', '', '', '', ''], 'SENT_START': [1, 0, 0, 0, 0, 0, 0, 0]}}


Train an existing NER model

In [91]:
training_data = [('I will visit you in Austin.', {'entities': [(20, 26, 'GPE')]}),
 ("I'm going to Sam's house.",
  {'entities': [(13, 16, 'PERSON'), (19, 24, 'GPE')]}),
 ('I will go.', {'entities': []})]

In [92]:
test = "I'm going to Sam's house."

nlp = spacy.load("en_core_web_sm") # A spaCy en_core_web_sm model that is accessible as nlp, which is not able to correctly predict house as an entity in a test string.
print("Before training: ", [(ent.text, ent.label_) for ent in nlp(test).ents])

Before training:  [('Sam', 'PERSON')]


In [93]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
nlp.disable_pipes(*other_pipes)
optimizer = nlp.create_optimizer()

other_pipes

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer']

In [97]:
epochs = 2

# Shuffle training data and the dataset using random package per epoch
for i in range(epochs):
    random.shuffle(training_data)
    for text, annotations in training_data:
        doc = nlp.make_doc(text)
        # Update nlp model after setting sgd argument to optimizer
        example = Example.from_dict(doc, annotations)
        nlp.update([example], sgd = optimizer)
print("After training: ", [(ent.text, ent.label_) for ent in nlp(test).ents])

After training:  [('Sam', 'PERSON'), ('house', 'GPE')]


Training a spaCy model from scratch

In [98]:
training_data = [('Diarrhea, also spelled diarrhoea, is the condition of having at least three loose, liquid, or watery bowel movements each day.[2] It often lasts for a few days and can result in dehydration due to fluid loss.[2] Signs of dehydration often begin with loss of the normal stretchiness of the skin and irritable behaviour.[2] This can progress to decreased urination, loss of skin color, a fast heart rate, and a decrease in responsiveness as it becomes more severe.[2] Loose but non-watery stools in babies who are exclusively breastfed, however, are normal.[2]',
  {'entities': [(364, 382, 'MedicalCondition'),
    (0, 8, 'MedicalCondition'),
    (94, 116, 'MedicalCondition'),
    (178, 189, 'MedicalCondition'),
    (221, 232, 'MedicalCondition'),
    (23, 32, 'MedicalCondition'),
    (409, 435, 'MedicalCondition'),
    (386, 401, 'MedicalCondition')]}),
 ('Antiretroviral therapy (ART) is recommended for all HIV-infected individuals to reduce the risk of disease progression.\nART also is recommended for HIV-infected individuals for the prevention of transmission of HIV.\nPatients starting ART should be willing and able to commit to treatment and understand the benefits and risks of therapy and the importance of adherence. Patients may choose to postpone therapy, and providers, on a case-by-case basis, may elect to defer therapy on the basis of clinical and/or psychosocial factors.',
  {'entities': [(0, 22, 'Medicine'),
    (24, 27, 'Medicine'),
    (120, 123, 'Medicine'),
    (211, 214, 'Pathogen'),
    (52, 55, 'Pathogen'),
    (234, 237, 'Medicine'),
    (148, 151, 'Pathogen')]}),
 ("The goals of treatment are to reduce pain, decrease inflammation, and improve a person's overall functioning.[5] This may be helped by balancing rest and exercise, the use of splints and braces, or the use of assistive devices.[1][6][7] Pain medications, steroids, and NSAIDs are frequently used to help with symptoms.[1] Disease-modifying antirheumatic drugs (DMARDs), such as hydroxychloroquine and methotrexate, may be used to try to slow the progression of disease.[1] Biological DMARDs may be used when disease does not respond to other treatments.[8] However, they may have a greater rate of adverse effects.[9] Surgery to repair, replace, or fuse joints may help in certain situations.[1] Most alternative medicine treatments are not supported by evidence.[10][11]",
  {'entities': [(401, 413, 'Medicine'),
    (378, 396, 'Medicine'),
    (473, 490, 'Medicine'),
    (255, 263, 'Medicine')]}),
 ("Hantaviruses, usually found in rodents and shrews, were discovered in two species of bats. The Mouyassué virus (MOUV) was isolated from banana pipistrelle bats captured near Mouyassué village in Cote d'Ivoire, West Africa. The Magboi virus was isolated from hairy slit-faced bats found near the Magboi River in Sierra Leone in 2011. They are single-stranded, negative sense, RNA viruses in the Bunyaviridae family.[29][30][31][32]",
  {'entities': [(0, 12, 'Pathogen'),
    (394, 406, 'Pathogen'),
    (227, 239, 'Pathogen'),
    (95, 110, 'Pathogen')]}),
 ('Bats are the most common source of rabies in humans in North and South America, Western Europe, and Australia. In the United States, there were 19 cases of human rabies from 1997–2006, 17 of which were attributed to bats.[27] In North America, about half of human rabies instances are cryptic, meaning that the patient has no known bite history.[24] While it has been speculated that rabies virus could be transmitted through aerosols, studies of the rabies virus have concluded that this is only feasible in limited conditions. These conditions include a very large colony of bats in a hot and humid cave with poor ventilation. While two human deaths in 1956 and 1959 had been tentatively attributed to aerosolization of the rabies virus after entering a cave with bats, "investigations of the 2 reported human cases revealed that both infections could be explained by means other than aerosol transmission".[28] It is instead generally thought that most instances of cryptic rabies are the result of an unknown bat bite.[24] Bites from a bat can be so small that they are not visible without magnification equipment, for example. Outside of bites, rabies virus exposure can also occur if infected fluids come in contact with a mucous membrane or a break in the skin. Rabies virus has also been transmitted when an infected human unknowingly dies of rabies, and their organs are transplanted to others.[28]',
  {'entities': [(35, 41, 'MedicalCondition'),
    (162, 168, 'MedicalCondition'),
    (384, 396, 'Pathogen'),
    (1269, 1281, 'Pathogen'),
    (1343, 1347, 'MedicalCondition'),
    (977, 983, 'MedicalCondition'),
    (1027, 1032, 'MedicalCondition')]}),
 ('Other groups of intracellular bacterial pathogens include Salmonella, Neisseria, Brucella, Mycobacterium, Nocardia, Listeria, Francisella, Legionella, and Yersinia pestis. These can exist intracellularly, but can exist outside of host cells.',
  {'entities': [(116, 124, 'Pathogen'),
    (155, 170, 'Pathogen'),
    (126, 137, 'Pathogen'),
    (70, 79, 'Pathogen'),
    (139, 149, 'Pathogen'),
    (106, 114, 'Pathogen'),
    (91, 104, 'Pathogen'),
    (81, 89, 'Pathogen'),
    (58, 68, 'Pathogen')]}),
 ('One of the bacterial diseases with the highest disease burden is tuberculosis, caused by Mycobacterium tuberculosis bacteria, which kills about 2 million people a year, mostly in sub-Saharan Africa. Pathogenic bacteria contribute to other globally important diseases, such as pneumonia, which can be caused by bacteria such as Streptococcus and Pseudomonas, and foodborne illnesses, which can be caused by bacteria such as Shigella, Campylobacter, and Salmonella. Pathogenic bacteria also cause infections such as tetanus, typhoid fever, diphtheria, syphilis, and leprosy. Pathogenic bacteria are also the cause of high infant mortality rates in developing countries.[3]',
  {'entities': [(327, 340, 'Pathogen'),
    (514, 521, 'MedicalCondition'),
    (452, 462, 'Pathogen'),
    (276, 285, 'MedicalCondition'),
    (523, 536, 'MedicalCondition'),
    (564, 571, 'MedicalCondition'),
    (433, 446, 'Pathogen'),
    (538, 548, 'MedicalCondition'),
    (345, 356, 'Pathogen'),
    (65, 77, 'MedicalCondition'),
    (550, 558, 'MedicalCondition'),
    (89, 115, 'Pathogen'),
    (423, 431, 'Pathogen')]}),
 ("Although the vast majority of bacteria are harmless or beneficial to one's body, a few pathogenic bacteria can cause infectious diseases. The most common bacterial disease is tuberculosis, caused by the bacterium Mycobacterium tuberculosis, which affects about 2 million people mostly in sub-Saharan Africa. Pathogenic bacteria contribute to other globally important diseases, such as pneumonia, which can be caused by bacteria such as Streptococcus and Pseudomonas, and foodborne illnesses, which can be caused by bacteria such as Shigella, Campylobacter, and Salmonella. Pathogenic bacteria also cause infections such as tetanus, typhoid fever, diphtheria, syphilis, and Hansen's disease. They typically range between 1 and 5 micrometers in length.",
  {'entities': [(659, 667, 'MedicalCondition'),
    (436, 449, 'Pathogen'),
    (673, 689, 'MedicalCondition'),
    (30, 38, 'Pathogen'),
    (454, 465, 'Pathogen'),
    (647, 657, 'MedicalCondition'),
    (87, 106, 'Pathogen'),
    (532, 540, 'Pathogen'),
    (561, 571, 'Pathogen'),
    (623, 630, 'MedicalCondition'),
    (471, 490, 'MedicalCondition'),
    (632, 645, 'MedicalCondition'),
    (542, 555, 'Pathogen')]}),
 ('Much like viral pathogens, infection by certain bacterial pathogens can be prevented via vaccines.[30] Vaccines against bacterial pathogens include the anthrax vaccine and the pneumococcal vaccine. Many other bacterial pathogens lack vaccines as a preventive measure, but infection by these bacteria can often be treated or prevented with antibiotics. Common antibiotics include amoxicillin, ciprofloxacin, and doxycycline. Each antibiotic has different bacteria that it is effective against and has different mechanisms to kill that bacteria. For example, doxycycline inhibits the synthesis of new proteins in both gram-negative and gram-positive bacteria which leads to the death of the affected bacteria.[35]',
  {'entities': [(379, 390, 'Medicine'),
    (152, 167, 'Medicine'),
    (411, 422, 'Medicine'),
    (392, 405, 'Medicine'),
    (176, 196, 'Medicine')]}),
 ('The term pathogen came into use in the 1880s.[1][2] Typically, the term is used to describe an infectious microorganism or agent, such as a virus, bacterium, protozoan, prion, viroid, or fungus.[',
  {'entities': [(158, 167, 'Pathogen'),
    (95, 119, 'Pathogen'),
    (187, 193, 'Pathogen'),
    (147, 156, 'Pathogen'),
    (140, 145, 'Pathogen')]}),
 ("Some antidepressants are used as a treatment for social anxiety disorder, but their efficacy is not entirely convincing, as only a small proportion of antidepressants showed some efficacy for this condition. Paroxetine was the first drug to be FDA-approved for this disorder. Its efficacy is considered beneficial, although not everyone responds favorably to the drug. Sertraline and fluvoxamine extended release were later approved for it as well, while escitalopram is used off-label with acceptable efficacy. However, there isn't enough evidence to support citalopram for treating social phobia, and fluoxetine was no better than placebo in clinical trials. SSRIs are used as a first-line treatment for social anxiety, but they don't work for everyone. One alternative would be venlafaxine, which is a SNRI. It showed benefits for social phobia in five clinical trials against placebo, while the other SNRIs are not considered particularly useful for this disorder as many of them didn't undergo testing for it. As of now, it is unclear if duloxetine and desvenlafaxine can provide benefits for social anxiety sufferers. However, another class of antidepressants called MAOIs are considered effective for social anxiety, but they come with many unwanted side effects and are rarely used. Phenelzine was shown to be a good treatment option, but its use is limited by dietary restrictions. Moclobemide is a RIMA and showed mixed results but still got approval in some European countries for social anxiety disorder. TCA antidepressants, such as clomipramine and imipramine, are not considered effective for this anxiety disorder in particular. This leaves out SSRIs such as paroxetine, sertraline and fluvoxamine CR as acceptable and tolerated treatment options for this disorder.[19][20]",
  {'entities': [(384, 395, 'Medicine'),
    (1098, 1112, 'MedicalCondition'),
    (1687, 1697, 'Medicine'),
    (49, 72, 'MedicalCondition'),
    (1173, 1178, 'Medicine'),
    (1702, 1713, 'Medicine'),
    (781, 792, 'Medicine'),
    (1563, 1573, 'Medicine'),
    (603, 613, 'Medicine'),
    (1675, 1685, 'MedicalCondition'),
    (1613, 1629, 'MedicalCondition'),
    (369, 379, 'Medicine'),
    (1291, 1301, 'Medicine'),
    (1546, 1558, 'Medicine'),
    (455, 467, 'Medicine'),
    (1391, 1402, 'Medicine'),
    (584, 597, 'MedicalCondition')]}),
 ("However, existing data suggest that patients taking bedaquiline in addition to standard TB therapy are five times more likely to die than those without the new drug,[184] which has resulted in medical journal articles raising health policy questions about why the FDA approved the drug and whether financial ties to the company making bedaquiline influenced physicians' support for its use.[183][185]",
  {'entities': [(88, 98, 'Medicine'),
    (335, 346, 'Medicine'),
    (52, 63, 'Medicine')]}),
 ('Tuberculosis may infect any part of the body, but most commonly occurs in the lungs (known as pulmonary tuberculosis).[9] Extrapulmonary TB occurs when tuberculosis develops outside of the lungs, although extrapulmonary TB may coexist with pulmonary TB.[9]\n\nGeneral signs and symptoms include fever, chills, night sweats, loss of appetite, weight loss, and fatigue.[9] Significant nail clubbing may also occur.[16]',
  {'entities': []}),
 ('A number of factors make people more susceptible to TB infections. The most important risk factor globally is HIV; 13% of all people with TB are infected by the virus.[39] This is a particular problem in sub-Saharan Africa, where rates of HIV are high.[40][41] Of people without HIV who are infected with tuberculosis, about 5–10% develop active disease during their lifetimes;[16] in contrast, 30% of those coinfected with HIV develop the active disease.[16]',
  {'entities': [(279, 282, 'Pathogen')]}),
 ('Examples of common human diseases caused by viruses include the common cold, influenza, chickenpox, and cold sores. Many serious diseases such as rabies, Ebola virus disease, AIDS (HIV), avian influenza, and SARS are caused by viruses. The relative ability of viruses to cause disease is described in terms of virulence. Other diseases are under investigation to discover if they have a virus as the causative agent, such as the possible connection between human herpesvirus 6 (HHV6) and neurological diseases such as multiple sclerosis and chronic fatigue syndrome.[151] There is controversy over whether the bornavirus, previously thought to cause neurological diseases in horses, could be responsible for psychiatric illnesses in humans.[152]',
  {'entities': [(518, 536, 'MedicalCondition'),
    (154, 165, 'Pathogen'),
    (708, 729, 'MedicalCondition'),
    (463, 476, 'Pathogen'),
    (77, 86, 'MedicalCondition'),
    (88, 98, 'MedicalCondition'),
    (187, 202, 'MedicalCondition'),
    (610, 620, 'Pathogen')]}),
 ('Buprenorphine has been shown experimentally (1982–1995) to be effective against severe, refractory depression',
  {'entities': [(0, 13, 'Medicine'), (88, 109, 'MedicalCondition')]}),
 ('Bupropion (Wellbutrin), an anti-depressant, is also used as a smoking cessation aid; this indication was later approved, and the name of the smoking cessation product is Zyban. In Ontario, Canada, smoking cessation drugs are not covered by provincial drug plans; elsewhere, Zyban is priced higher than Wellbutrin, despite being the same drug. Therefore, some physicians prescribe Wellbutrin for both indications.[',
  {'entities': [(274, 279, 'Medicine'),
    (11, 21, 'Medicine'),
    (302, 312, 'Medicine'),
    (380, 390, 'Medicine'),
    (170, 175, 'Medicine'),
    (0, 9, 'Medicine')]}),
 ('Carbamazepine is an approved treatment for bipolar disorder and epileptic seizures, but it has side effects useful in treating attention-deficit hyperactivity disorder (ADHD), schizophrenia, phantom limb syndrome, paroxysmal extreme pain disorder, neuromyotonia, and post-traumatic stress disorder.[8]',
  {'entities': [(267, 288, 'MedicalCondition'),
    (248, 261, 'MedicalCondition'),
    (0, 13, 'Medicine'),
    (43, 59, 'MedicalCondition'),
    (145, 167, 'MedicalCondition'),
    (176, 189, 'MedicalCondition'),
    (64, 82, 'MedicalCondition'),
    (191, 212, 'MedicalCondition')]}),
 ('The antiviral drugs amantadine and rimantadine inhibit a viral ion channel (M2 protein), thus inhibiting replication of the influenza A virus.[86] These drugs are sometimes effective against influenza A if given early in the infection but are ineffective against influenza B viruses, which lack the M2 drug target.[160] Measured resistance to amantadine and rimantadine in American isolates of H3N2 has increased to 91% in 2005.[161] This high level of resistance may be due to the easy availability of amantadines as part of over-the-counter cold remedies in countries such as China and Russia,[162] and their use to prevent outbreaks of influenza in farmed poultry.[163][164] The CDC recommended against using M2 inhibitors during the 2005–06 influenza season due to high levels of drug resistance.[165]',
  {'entities': [(639, 648, 'MedicalCondition'),
    (35, 46, 'Medicine'),
    (712, 725, 'Medicine'),
    (20, 30, 'Medicine')]}),
 ('The two classes of antiviral drugs used against influenza are neuraminidase inhibitors (oseltamivir, zanamivir, laninamivir and peramivir) and M2 protein inhibitors (adamantane derivatives)',
  {'entities': [(128, 137, 'Medicine'),
    (101, 110, 'Medicine'),
    (112, 123, 'Medicine'),
    (48, 57, 'MedicalCondition'),
    (88, 99, 'Medicine')]})]

In [99]:
labels = ['Pathogen', 'MedicalCondition', 'Medicine']

In [100]:
# Load a blank English model, add NER component, add given labels to the ner pipeline
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
for ent in labels:
    ner.add_label(ent)

In [101]:
# Disable other pipeline components, complete training loop and run training loop
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
nlp.disable_pipes(*other_pipes)
losses = {}
optimizer = nlp.begin_training()
other_pipes

[]

In [102]:
for text, annotation in training_data:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotation)
    nlp.update([example], sgd=optimizer, losses=losses)
    print(losses)

{'ner': 98.42856240272522}
{'ner': 188.5091609954834}
{'ner': 309.517851293087}
{'ner': 375.8999624848366}
{'ner': 575.185431599617}
{'ner': 605.2146937847137}
{'ner': 669.3962617814541}
{'ner': 717.6792084276676}
{'ner': 736.8563084732741}
{'ner': 746.4217485651607}
{'ner': 779.9957114736317}
{'ner': 785.9423416967093}
{'ner': 785.9639989202074}
{'ner': 787.9575986306503}
{'ner': 803.9006502354792}
{'ner': 807.8630042147869}
{'ner': 819.7619179444417}
{'ner': 835.6793623805029}
{'ner': 843.6444945651712}
{'ner': 853.4423006288563}
