In [2]:
!pip install sentence-transformers



In [3]:
# import dependencies
import json
import torch
import faiss
from pprint import pprint
import numpy as np
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


### Load data and preprocess

In [4]:
# Load documents from the JSON file.
# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying on
# the platform's default locale encoding, which differs between operating systems.
with open('data/data.json', 'r', encoding='utf-8') as file:
    documents = json.load(file)

In [5]:
# Instantiate the pretrained sentence-embedding model by name
MODEL_NAME = 'distilbert-base-nli-stsb-mean-tokens'
embedder = SentenceTransformer(MODEL_NAME)

In [6]:
# Collect the 'text' field of every document, then embed the whole corpus in one call
corpus = [entry['text'] for entry in documents]
corpus_embeddings = embedder.encode(
    corpus,
    convert_to_tensor=True,
)

In [7]:
# The FAISS index is fed a NumPy array below, so the tensor must live on the CPU first
corpus_embeddings = corpus_embeddings.cpu()

### Create a Faiss index

In [8]:
# Size the inner-product index from the embeddings themselves rather than hard-coding
# the model's output width, so swapping the embedding model does not break this cell.
embedding_dim = corpus_embeddings.shape[1]
index = faiss.IndexIDMap(faiss.IndexFlatIP(embedding_dim))

# Each vector's id is its position in `corpus`. FAISS requires int64 ids, so request
# that dtype explicitly instead of relying on the platform's default integer width.
index.add_with_ids(corpus_embeddings.numpy(),
                   np.arange(len(corpus), dtype=np.int64))

# save the index for future use
faiss.write_index(index, 'data/pandemics')

### Search the documents

In [9]:
# Build a search function that finds the most relevant search results
def search(query, documents, k=5):
    """Return up to ``k`` entries of ``documents`` that best match ``query``.

    The query is embedded with the notebook-level ``embedder`` and looked up in
    the notebook-level FAISS ``index``. The ids returned by the index are used
    as positions into ``documents``.

    Args:
        query: The search string.
        documents: Sequence indexed by the ids stored in ``index``.
        k: Maximum number of hits to request from the index.

    Returns:
        A list of ``(document, score)`` tuples in the order the index returned
        them. It may hold fewer than ``k`` items when the index contains fewer
        than ``k`` vectors.
    """
    # The difference from milestone 1 is the use of `embedder`, which is SBERT instead of BERT
    encoded_query = embedder.encode([query])
    scores, ids = index.search(encoded_query, k)
    # FAISS pads missing hits with id -1; skip them, otherwise documents[-1]
    # would silently return the last document as a bogus result.
    return [
        (documents[_id], score)
        for _id, score in zip(ids[0], scores[0])
        if _id != -1
    ]
    

In [10]:
# Show the two best matches for a sample query
top_matches = search("spanish flu casualties", corpus, k=2)
pprint(top_matches)

: 

: 