## Notebook to create hard negatives for fine-tuning the [paraphrase-mpnet-base-v2](https://huggingface.co/sentence-transformers/paraphrase-mpnet-base-v2) model

In [1]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy import spatial
# from torchinfo import summary
import time
from IPython.display import clear_output

  from tqdm.autonotebook import tqdm, trange


## Load Data

In [2]:
# Question -> relevant-URL tables for the train and test splits.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
df_ques_url_train = pd.read_pickle('../../data/questions_relevant_urls_chunks_train.pkl')
df_ques_url_test = pd.read_pickle('../../data/questions_relevant_urls_chunks_test.pkl')

# Print the (rows, columns) shape of each split, train first, one per line.
print(df_ques_url_train.shape, df_ques_url_test.shape, sep='\n')
df_ques_url_train.head(n=3)

(20000, 3)
(5000, 3)


Unnamed: 0,question,relevant_docs_urls,num_rel_chunks
0,What is (are) keratoderma with woolly hair ?,[https://ghr.nlm.nih.gov/condition/keratoderma...,5
1,How many people are affected by keratoderma wi...,[https://ghr.nlm.nih.gov/condition/keratoderma...,5
2,What are the genetic changes related to kerato...,[https://ghr.nlm.nih.gov/condition/keratoderma...,5


In [3]:
# Knowledge base: one row per document chunk, with its source URL, text and stored embedding.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
df_kb = pd.read_pickle('../../data/kb_chunks_emb.pkl')

# Show the (rows, columns) shape, then preview the first three chunks.
print(df_kb.shape)
df_kb.head(n=3)

(33545, 3)


Unnamed: 0,doc_url,chunk_content,embedding
0,https://ghr.nlm.nih.gov/condition/keratoderma-...,keratoderma with woolly hair : medlineplus gen...,"[-0.0039987266, 0.08037464, 0.049785912, -0.12..."
1,https://ghr.nlm.nih.gov/condition/keratoderma-...,"##ma, woolly hair, and a form of cardiomyopath...","[-0.09539697, -0.09132044, 0.0027289127, 0.005..."
2,https://ghr.nlm.nih.gov/condition/keratoderma-...,##pathy in people with this group of condition...,"[0.026278932, 0.060939535, 0.031438153, -0.044..."


## Load Model

In [4]:
model_name = "sentence-transformers/paraphrase-mpnet-base-v2"
# SentenceTransformer picks the device itself when none is given: CUDA when it is
# available, otherwise a fallback such as the CPU. Forcing `.to('cuda')` added nothing
# on GPU hosts and raised on machines without CUDA, so the explicit move is dropped.
model = SentenceTransformer(model_name)

## Create Hard Negatives

In [None]:
def cos_sim(a, b):
    """Return the cosine similarity of two 1-D vectors.

    SciPy exposes the cosine *distance*, so the similarity is recovered as
    ``1 - distance``.
    """
    cosine_distance = spatial.distance.cosine(a, b)
    return 1 - cosine_distance

def retrieve(ques, df_kb, model):
    """Rank every knowledge-base row against a question and return the rows' URLs.

    Parameters
    ----------
    ques : str
        Question text to embed.
    df_kb : pandas.DataFrame
        Knowledge base with a 'doc_url' column and an 'embedding' column holding one
        equal-length vector per row. The frame is not modified.
    model :
        Encoder whose ``encode(ques, convert_to_tensor=True)`` result supports
        ``.cpu().numpy()``.

    Returns
    -------
    numpy.ndarray
        The 'doc_url' value of every row, ordered by descending cosine similarity to
        the question. There is one entry per row, so a URL appears once for each row
        that carries it. Rows with equal similarity keep their order in ``df_kb``;
        rows whose similarity is undefined (zero-norm embedding) are placed last.
    """
    question_embedding = model.encode(ques, convert_to_tensor=True).cpu().numpy()
    question_vec = np.asarray(question_embedding).ravel()

    doc_urls = df_kb['doc_url'].values
    if len(df_kb) == 0:
        # np.stack cannot build a matrix from zero rows; nothing to rank anyway.
        return doc_urls

    # One (n_rows, dim) matrix lets a single matrix-vector product replace a Python-level
    # cosine call per row, which dominated the runtime of the row-wise version.
    kb_matrix = np.stack(df_kb['embedding'].to_numpy())
    norm_products = np.linalg.norm(kb_matrix, axis=1) * np.linalg.norm(question_vec)
    with np.errstate(divide='ignore', invalid='ignore'):
        similarities = (kb_matrix @ question_vec) / norm_products

    # A zero-norm vector yields nan/inf; push those rows to the end of the ranking.
    similarities = np.where(np.isfinite(similarities), similarities, -np.inf)

    # Stable sort on the negated scores: descending similarity, ties in KB order.
    ranking = np.argsort(-similarities, kind='stable')
    return doc_urls[ranking]

def get_hard_negs(row):
    """
    Build the hard-negative URL list for one question row.

    The question in row['question'] is ranked against the whole knowledge base with
    ``retrieve`` (using the notebook-level ``df_kb`` and ``model``). Every returned URL
    that appears in row['relevant_docs_urls'] is dropped; the remaining URLs are
    returned as a list in their ranked order.
    """
    question_text = row['question']
    ranked_urls = retrieve(question_text, df_kb, model)
    relevant_urls = row['relevant_docs_urls']

    negatives = []
    for candidate_url in ranked_urls:
        if candidate_url in relevant_urls:
            # Relevant document: not a negative, skip it.
            continue
        negatives.append(candidate_url)
    return negatives

In [6]:
# Rank the knowledge base once per training question and keep the non-relevant URLs.
# This is slow: every row triggers a full pass over df_kb.
df_ques_url_train['hard_negatives_li'] = df_ques_url_train.apply(
    get_hard_negs,
    axis='columns',
)
# Persist the augmented frame so the ranking does not have to be recomputed.
df_ques_url_train.to_pickle('../../data/ques_rel_url_hard_negs_train.pkl')

In [7]:
# Preview the first five training rows, now including the hard_negatives_li column.
df_ques_url_train.head(n=5)

Unnamed: 0,question,relevant_docs_urls,num_rel_chunks,hard_negatives_li
0,What is (are) keratoderma with woolly hair ?,[https://ghr.nlm.nih.gov/condition/keratoderma...,5,[https://www.nlm.nih.gov/medlineplus/ency/arti...
1,How many people are affected by keratoderma wi...,[https://ghr.nlm.nih.gov/condition/keratoderma...,5,[https://www.nlm.nih.gov/medlineplus/ency/arti...
2,What are the genetic changes related to kerato...,[https://ghr.nlm.nih.gov/condition/keratoderma...,5,[https://ghr.nlm.nih.gov/condition/monilethrix...
3,Is keratoderma with woolly hair inherited ?,[https://ghr.nlm.nih.gov/condition/keratoderma...,5,[https://www.nlm.nih.gov/medlineplus/ency/arti...
4,What are the treatments for keratoderma with w...,[https://ghr.nlm.nih.gov/condition/keratoderma...,5,[https://www.nlm.nih.gov/medlineplus/ency/arti...


In [8]:
# Same procedure as for the training split: rank the knowledge base per test question
# and keep the non-relevant URLs.
df_ques_url_test['hard_negatives_li'] = df_ques_url_test.apply(
    get_hard_negs,
    axis='columns',
)
# Persist the augmented frame so the ranking does not have to be recomputed.
df_ques_url_test.to_pickle('../../data/ques_rel_url_hard_negs_test.pkl')

In [9]:
# Preview the first five test rows, now including the hard_negatives_li column.
df_ques_url_test.head(n=5)

Unnamed: 0,question,relevant_docs_urls,num_rel_chunks,hard_negatives_li
0,What are the symptoms of Diabetic hyperglycemi...,[https://www.nlm.nih.gov/medlineplus/ency/arti...,4,[https://www.nlm.nih.gov/medlineplus/hyperglyc...
1,How to diagnose Diabetic hyperglycemic hyperos...,[https://www.nlm.nih.gov/medlineplus/ency/arti...,4,[https://www.nlm.nih.gov/medlineplus/ency/arti...
2,What are the treatments for Diabetic hyperglyc...,[https://www.nlm.nih.gov/medlineplus/ency/arti...,4,[https://www.nlm.nih.gov/medlineplus/ency/arti...
3,What is the outlook for Diabetic hyperglycemic...,[https://www.nlm.nih.gov/medlineplus/ency/arti...,4,[https://www.nlm.nih.gov/medlineplus/ency/arti...
4,What are the complications of Diabetic hypergl...,[https://www.nlm.nih.gov/medlineplus/ency/arti...,4,[https://www.nlm.nih.gov/medlineplus/ency/arti...
