In [1]:
from collections import Counter
from qaam_nlp import QAAM

In [4]:
# Create the QAAM helper backed by the small English spaCy model.
qaam = QAAM(model='en_core_web_sm')
# Register the remote service URL with the helper.
# NOTE(review): presumably this is the question-answering model server that
# `qaam.answer` queries — confirm. The ngrok host is hardcoded here.
qaam.add_url("http://25665f7a.ngrok.io")

In [5]:
# Blog post used as the source text for the questions below.
BLOG_URL = ("https://www.freecodecamp.org/news/lessons-learned-"
            "from-deploying-my-first-full-stack-web-application-34f94ec0a286/")

# Extract all the text from the URL.
qaam.texts_from_url(BLOG_URL)
# Now we can ask questions about that text; the result is a dict with
# 'answer' and 'context' keys.
qaam.answer("What was the hardest lesson learned?")

{'answer': 'deployment',
 'context': 'I didn’t understand what was involved in deployment. This was the most frustrating point for me.'}

In [4]:
def get_blog_entities(spacy_doc: object, top_k=10):
    """Return the most frequent named-entity strings found in a document.

    Args:
        spacy_doc: Object exposing an ``ents`` iterable whose items have a
            ``text`` attribute (e.g. a parsed spaCy document).
        top_k: Maximum number of (entity, count) pairs to return.

    Returns:
        A list of ``(entity_text, count)`` tuples ordered from most to least
        frequent; entities with equal counts keep first-seen order.
    """
    # Read from the argument, not a notebook-level `doc` variable: the
    # original body referenced an undefined global and only worked with
    # leftover kernel state.
    entity_counts = Counter(ent.text for ent in spacy_doc.ents)
    return entity_counts.most_common(top_k)

get_blog_entities(qaam.doc, top_k=10)

[('Backend', 7),
 ('Frontend', 6),
 ('two', 5),
 ('Nginx', 4),
 ('JavaScript', 4),
 ('first', 3),
 ('one', 3),
 ('Google', 2),
 ('Amazon', 2),
 ('thousands', 2)]

In [29]:
from typing import List
from similarity.metric_lcs import MetricLCS

def compute_distance(target: str, tokens: List[str], distance_threshold: float = 0.4) -> dict:
    """Find the tokens whose LCS-metric distance to ``target`` is small enough.

    Args:
        target: Reference string every token is compared against.
        tokens: Candidate strings to score.
        distance_threshold: Inclusive upper bound on the distance for a token
            to be kept.

    Returns:
        A dict mapping each kept token to its distance from ``target``. A
        token that occurs more than once appears once in the result.
    """
    metric_lcs = MetricLCS()
    similar = {}
    for word in tokens:
        dist = metric_lcs.distance(target, word)
        # Lower values mean the token is closer to the target.
        if dist <= distance_threshold:
            similar[word] = dist
    return similar

# Ask the question to get the answer and its context from the MAX model.
question = "What was the hardest lesson learned?"
prediction = qaam.answer(question)
answer = prediction['answer']
context = prediction['context']

# Use a higher threshold so the distance given to nearly every token is visible.
# Splitting on whitespace keeps punctuation attached to the tokens.
# NOTE: the name `distance` is rebound later by `from scipy.spatial import distance`.
distance = compute_distance(answer, context.split(), 0.9)

# Sorted from highest to lowest distance (the closest match comes last).
sorted(distance.items(), key=lambda i: i[1], reverse=True)

[('what', 0.9),
 ('in', 0.9),
 ('the', 0.9),
 ('for', 0.9),
 ('involved', 0.8),
 ('most', 0.8),
 ('me.', 0.8),
 ('didn’t', 0.7),
 ('understand', 0.7),
 ('point', 0.6),
 ('deployment.', 0.09090909090909094)]

> Instead of using sklearn and TF-IDF, I could use BERT for similarity.

In [31]:
# Rebind `qaam` to a new helper that uses the BERT-based spaCy model
# (this replaces the earlier `en_core_web_sm` instance).
qaam = QAAM(model='en_trf_bertbaseuncased_lg')
qaam.add_url("http://25665f7a.ngrok.io")

blog_url = "https://explosion.ai/blog/spacy-transformers"
question = "How can I fine tune a transformer model for my task?"
# Extract the blog text, then ask the question and print the predicted answer.
qaam.texts_from_url(blog_url)
predicted_answer = qaam.answer(question)
print(predicted_answer)

{'answer': "via spaCy's standard nlp", 'context': "Support is provided for fine-tuning the transformer models via spaCy's standard nlp.update training API."}


In [32]:
qaam.nlp.pipe_names

['sentencizer', 'trf_wordpiecer', 'trf_tok2vec']

In [33]:
# similarity metric from the question to the whole transformers document.
qaam.doc.similarity(qaam.nlp(question))

0.7558835699095946

In [34]:
# hmm this is too high and not related to the context..
qaam.doc.similarity(qaam.nlp("How can I know if the sky is blue in my area?"))

0.6286082380288756

In [35]:
from spacy.matcher import PhraseMatcher
# Build a phrase matcher over the pipeline's vocabulary and register the
# phrase "nlp.update" under the match ID "CODE".
# NOTE(review): the `None` second argument looks like the spaCy v2
# `add(key, on_match, *docs)` call form — confirm against the installed version.
matcher = PhraseMatcher(qaam.nlp.vocab)
matcher.add("CODE", None, qaam.nlp("nlp.update"))

In [36]:
# Parse the context sentence of the predicted answer and run the matcher on it.
context_doc = qaam.nlp(predicted_answer['context'])
doc_matches = matcher(context_doc)
# Each match is a (matcher_id, start, end) tuple.
print(doc_matches)

[(8672180710615960695, 14, 15)]


In [37]:
# PhraseMatcher is also available as a batch pipe.
# Unpack the first match and slice the matched tokens out of the document.
# This indexes `doc_matches[0]`, so it assumes at least one match was found.
matcher_id, start, end = doc_matches[0]
context_doc[start:end]

nlp.update

#### PhraseMatcher.remove

> To remove a rule from the matcher by match ID, see the documentation on [removing matcher rules](https://spacy.io/api/phrasematcher#remove).

In [38]:
# checking whether the matcher contains rules for a match ID
"CODE" in matcher

True

> One important detail is that BERT uses wordpieces (e.g. playing -> play + ##ing) instead of words. This is effective in reducing the size of the vocabulary and increasing the amount of data that is available for each word.

In [39]:
print(context_doc._.trf_word_pieces_)

['[CLS]', 'support', 'is', 'provided', 'for', 'fine', '-', 'tuning', 'the', 'transform', '##er', 'models', 'via', 'spa', '##cy', "'", 's', 'standard', 'nl', '##p', '.', 'update', 'training', 'api', '.', '[SEP]']


In [40]:
import cupy
import numpy
import numpy as np
from scipy.spatial import distance

In [41]:
# Manual cosine similarity between the question and the answer context,
# using the summed token tensors as embeddings.
# NOTE(review): presumably equivalent to `_context.similarity(_question)` with
# this pipeline — TODO confirm.
_question = qaam.nlp(question)
_context = qaam.nlp(predicted_answer['context'])
# Sum the tensor over axis 0 and convert the result to a NumPy array.
a1_embedding = cupy.asnumpy(_question.tensor.sum(axis=0))
a2_embedding = cupy.asnumpy(_context.tensor.sum(axis=0))

# Similarity is defined as 1 - cosine distance between two arrays.
cosine_similarity = (1 - distance.cosine(a1_embedding, a2_embedding))
print(f'similarity between _question and _context is: {cosine_similarity}')

similarity between _question and _context is: 0.6397733092308044


In [42]:
def embed_sequence(sequence: str):
    """Embed a string as a single vector.

    The text is run through the notebook-level ``qaam.nlp`` pipeline, the
    resulting document tensor is summed over axis 0, and the sum is
    converted to a NumPy array with ``cupy.asnumpy``.
    """
    parsed = qaam.nlp(sequence)
    return cupy.asnumpy(parsed.tensor.sum(axis=0))

In [43]:
# Embed two sentences and compare them with cosine similarity
# (1 - cosine distance).
embed1 = embed_sequence("Transformer models to improve NLP tasks")
embed2 = embed_sequence("How transformer models work with textual data")
cossim = 1 - distance.cosine(embed1, embed2)
cossim

0.8727603554725647