In [None]:
import sys
import os
# Put the parent of the current working directory on the import path
# (presumably where the bertft package imported below lives - confirm).
nb_dir = os.path.split(os.getcwd())[0]
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [None]:
import logging
# Root logger: timestamped messages at INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(message)s',
)
# The "bertft" logger is more verbose: it also lets DEBUG records through.
logging.getLogger("bertft").setLevel(logging.DEBUG)

In [None]:
import bertft
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Important: enable auto-reload so that edits to the bertft module are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
def mk_graph(x1):
    """Draw and show a histogram of up to 40 scores taken from ``x1``.

    Only values strictly between -2 and 0.99 are kept; of those, the first
    40 (in input order) are plotted with 20 bins on the current matplotlib
    axes, and the figure is shown immediately.
    """
    in_range = [score for score in x1 if -2 < score < 0.99]
    shown = in_range[:40]

    plt.hist(shown, alpha=0.3, bins=20, color='g', label='FastText score')
    axes = plt.gca()
    axes.set(title='Top 40 masks histogram of embeddings score', ylabel='Count')

    plt.legend()
    plt.show()


def mk_graph2(x1):
    """Draw and show a 50-bin, fully opaque red histogram of ``x1``.

    The values are passed to matplotlib unchanged (no filtering or
    truncation); the figure is drawn on the current axes and shown
    immediately.
    """
    plt.hist(x1, alpha=1, bins=50, color='r', label='Weighted score')

    axes = plt.gca()
    axes.set(
        ylabel='Count',
        title='Distribution of weighted score of top 200 unfiltered results (Target excluded)',
    )

    plt.legend()
    plt.show()


def on_run(self, kunfiltered, unfiltered, filtered_top, target, tokenizer, top_tokens):
    """Print and plot diagnostics for one pipeline run.

    Prints the unfiltered candidates as a table, prints ``filtered_top``,
    draws the two score histograms and, when ``target`` is given, reports
    where the target's token sits in ``top_tokens``.

    Parameters
    ----------
    self
        Object providing ``dget(rows, index)``. This function is defined at
        module level but takes ``self``, so it is presumably meant to be
        attached to a pipeline object as a callback - confirm.
    kunfiltered
        Rows whose fields 0..5 are shown as the columns word, bert,
        normalized, ftext, ftext-sentence and score.
    unfiltered
        Rows used for the histograms: field 2 feeds ``mk_graph`` and field 4
        feeds ``mk_graph2``; field 0 is compared against ``target``.
    filtered_top
        Printed as-is.
    target
        The original word, or ``None`` to skip the position report.
    tokenizer
        Object providing ``encode(text, return_tensors="pt")`` and
        ``tokenize(text)``.
    top_tokens
        Iterable of ``(token_id, score)`` pairs.
    """
    # NOTE(review): `lget` is not defined in the visible cells; it must be
    # available in the notebook namespace when this runs - confirm its origin.
    print("Unfiltered top:")

    print(pd.DataFrame({
        'word': lget(kunfiltered, 0),
        'bert': self.dget(kunfiltered, 1),
        'normalized': self.dget(kunfiltered, 2),
        'ftext': self.dget(kunfiltered, 3),
        'ftext-sentence': self.dget(kunfiltered, 4),
        'score': lget(kunfiltered, 5),
    }))

    print("Filtered top:")

    print(filtered_top)

    mk_graph(lget(unfiltered, 2)[:100])
    # The weighted-score histogram leaves out the rows for the target word itself.
    without_target = [row for row in unfiltered if row[0] != target]
    mk_graph2(lget(without_target, 4))

    if target is None:
        return

    vec = tokenizer.encode(target, return_tensors="pt")[0]
    if len(vec) == 3:
        # Three ids are treated as one word token between two surrounding ids
        # (presumably the tokenizer's special start/end tokens - confirm).
        tk = vec[1].item()
        pos = None
        score = None

        for rank, (token_id, value) in enumerate(top_tokens):
            if token_id == tk:
                # Record the 0-based index as well as the score; previously
                # the position was never assigned and always printed as None.
                pos = rank
                score = value
                break
        print("Original word position: %s; score: %s " % (pos, score))
    elif len(vec) > 3:
        print("Original word is more then 1 token")
        print(tokenizer.tokenize(target))
    else:
        print("Original word wasn't found")


In [None]:
# Build the bertft pipeline object used by the cells below.
# NOTE(review): the meaning of the positional False argument is not visible
# in this notebook - confirm against bertft.Pipeline.
pipeline = bertft.Pipeline(False)

In [None]:
# Example of usage
res = pipeline.find_top(
    # Sentences, each paired with the position of its target word.
    # NOTE(review): the first two entries carry an extra third value whose
    # meaning is not visible in this notebook - confirm against find_top.
    [
        ("what is the local weather forecast?", 3, 4),
        ("what is chances of rain tomorrow?", 4, 2),
        ("is driving a car faster then taking a bus?", 3),
        ("who is the best football player of all time?", 4),
    ],
    k=20,  # keep the k best results (ranked by weighted score)
    top_bert=100,  # size of the initial cut applied to the BERT output
    min_ftext=0.3,  # minimal required fastText score
    min_bert=0.5,  # minimal required BERT score
    weights=[1, 1],  # model weights for the total weighted score: [bert, fast text]
    min_score=0,  # minimum required score
)
print(res)

In [None]:
# A notebook cell only displays the value of its last expression, so four
# bare calls in a row would show just the final similarity. Collect every
# result into one table instead so all four comparisons are visible.
sentence_pairs = [
    ("Current weather in New York", "Would it rain in London?"),
    ("Current weather in London", "Would it rain in London?"),
    ("Current weather in New York", "Buy car in London"),
    ("Current weather in London", "Buy car in London"),
]
pd.DataFrame(
    [
        (first, second, pipeline.sentence_similarity(first, second))
        for first, second in sentence_pairs
    ],
    columns=["sentence_a", "sentence_b", "similarity"],
)