In [None]:
!pip install -qU \
  datasets \
  openai \
  pinecone-client \
  cohere

In [None]:
!pip install -q datasets
!pip install -q sentence-transformers
!pip install llama_index
!pip install llama-index-embeddings-huggingface
!pip install pinecone-client
!pip install llama-index-vector-stores-pinecone
!pip install openai

In [None]:
!pip install pytorch-metric-learning

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.vector_stores import VectorStoreQuery
from llama_index.llms.openai import OpenAI
import pinecone
from tqdm import tqdm
import os
from datasets import load_dataset
import torch
from pinecone import ServerlessSpec

In [None]:
# Training split of the HotpotQA "fullwiki" configuration.
hpqa = load_dataset("hotpot_qa", "fullwiki", split = 'train')

In [None]:
# Additional splits: the fullwiki test split, plus the train and validation
# splits of the "distractor" configuration.
hpqa_test = load_dataset("hotpot_qa", "fullwiki", split = "test")
hpqa_distractors = load_dataset("hotpot_qa", "distractor", split = "train")
hpqa_distractors_test = load_dataset("hotpot_qa", "distractor", split = "validation")

In [None]:
# Work on a pandas copy so rows can be addressed with .iloc.
hpqa_df = hpqa.to_pandas()

In [None]:
hpqa_df.iloc[0].supporting_facts['title']

In [None]:
hpqa_df.iloc[0].context

In [None]:
def generate_relevance_triplets(df, limit = 100):
    '''
    Generates triples of the form [query, relevant document text, non relevant document text].

    For every processed row, each context document is joined into one string
    (sentences separated by a single space). A document is relevant when its
    title appears in the row's supporting-fact titles, otherwise irrelevant.
    One triple is produced for every (relevant, irrelevant) pair of a row.

    Parameters:
        df: frame with a `question` column, a `supporting_facts` column holding
            a mapping with a 'title' sequence, and a `context` column holding a
            mapping with parallel 'title' and 'sentences' sequences.
        limit: maximum number of leading rows to process. Values larger than
            len(df) are clamped instead of raising IndexError.

    Returns:
        (triples, rele_docs, irrele_docs) where the two dicts map each query to
        its list of relevant / irrelevant texts. A question that occurs in more
        than one row keeps only the lists of its last occurrence.
    '''
    triples = []
    rele_docs = {}
    irrele_docs = {}

    # Clamp so a limit larger than the frame does not index past the last row.
    for row_idx in range(min(limit, len(df))):
        row = df.iloc[row_idx]
        query = row.question
        relevant_doc_titles = set(row.supporting_facts['title'])
        relevant_texts = []
        irrelevant_texts = []
        doc_titles = row.context['title']
        doc_sentences = row.context['sentences']
        # A separate index name: the original reused `i` for both loops.
        for doc_idx in range(len(doc_titles)):
            sentences = doc_sentences[doc_idx]
            # Accept ndarray-like sentence containers as well as plain sequences.
            if hasattr(sentences, 'tolist'):
                sentences = sentences.tolist()
            text = " ".join(sentences)
            if doc_titles[doc_idx] in relevant_doc_titles:
                relevant_texts.append(text)
            else:
                irrelevant_texts.append(text)
        rele_docs[query] = relevant_texts
        irrele_docs[query] = irrelevant_texts
        for relevant_text in relevant_texts:
            for irrelevant_text in irrelevant_texts:
                triples.append([query, relevant_text, irrelevant_text])
    return triples, rele_docs, irrele_docs

In [None]:
# Triples from the first 1000 training rows, plus per-query lookups of the
# relevant and irrelevant texts.
hpqa_triplets, relevant_texts, irrelevant_texts = generate_relevance_triplets(hpqa_df, 1000)

In [None]:
# hpqa_test_df was previously only created much further down the notebook, so this
# cell (and the triplet generation that follows) failed on a fresh Run All.
hpqa_test_df = hpqa_test.to_pandas()
hpqa_test_df.head()

In [None]:
# Same triple construction for the first 1000 rows of the test frame.
# NOTE(review): if the test split ships without supporting facts, every document
# is treated as irrelevant and no triples are produced — confirm.
hpqa_test_triplets, relevant_texts_test, irrelevant_texts_test = generate_relevance_triplets(hpqa_test_df, 1000)

In [None]:
hpqa_triplets[0][2]

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.util import pairwise_dot_score
from torch.utils.data import DataLoader
import torch

In [None]:
# Bi-encoder student that is fine-tuned below; kept on the CPU.
student_model = SentenceTransformer('sentence-transformers/distilbert-base-nli-mean-tokens', device = "cpu")


In [None]:
import cohere
# Never commit API keys: read the key from the COHERE_API_KEY environment variable.
# The key that used to be hard-coded here should be treated as leaked and rotated.
co = cohere.Client(os.environ["COHERE_API_KEY"])

In [None]:
# Example
# Score the relevant text of the first triple against its query with the Cohere reranker.
q, r, i  = hpqa_triplets[0]
res = co.rerank(query = q, documents= [r])

In [None]:
res.results[0].relevance_score

In [None]:
# Same query, scored against the irrelevant text of that triple.
ires = co.rerank(query = q, documents= [i])

In [None]:
ires.results[0].relevance_score

In [None]:
def get_relevance_scores(triples, co):
    '''
    Collect a reranker relevance score for every distinct (query, text) pair.

    Each triple contributes its (query, relevant) and (query, irrelevant) pair.
    A pair that already has a score is not sent to `co.rerank` again.

    Parameters:
        triples: sequence of (query, relevant text, irrelevant text) items.
        co: client exposing `rerank(query=..., documents=[...])` whose response
            carries `results[0].relevance_score`.

    Returns:
        dict mapping (query, text) to the relevance score of that pair.
    '''
    scored_pairs = {}
    for idx in tqdm(range(len(triples))):
        query, relevant, irrelevant = triples[idx]
        # Relevant text first, then the irrelevant one: same call order as before.
        for text in (relevant, irrelevant):
            pair = (query, text)
            if pair in scored_pairs:
                continue
            response = co.rerank(query = query, documents= [text])
            scored_pairs[pair] = response.results[0].relevance_score
    return scored_pairs



In [None]:
# Teacher scores for the training triples (remote rerank calls for every distinct pair).
r_scores = get_relevance_scores(hpqa_triplets, co)

In [None]:
# Teacher scores for the test triples.
r_scores_test = get_relevance_scores(hpqa_test_triplets, co)

In [None]:
def generate_labels(triples, scores):
    '''
    Turn teacher scores into one margin label per triple.

    Parameters:
        triples: iterable of (query, relevant text, irrelevant text).
        scores: mapping from (query, text) to a relevance score; it must
            contain both pairs of every triple.

    Returns:
        torch tensor holding score(query, relevant) - score(query, irrelevant)
        for each triple, in input order.
    '''
    margins = [
        scores[(query, positive)] - scores[(query, negative)]
        for query, positive, negative in triples
    ]
    return torch.tensor(margins)

In [None]:
# One margin label (relevant score minus irrelevant score) per training triple.
labels = generate_labels(hpqa_triplets, r_scores)

In [None]:
# Pair each [query, relevant, irrelevant] triple with its margin label.
train_input_examples = [InputExample(texts=x, label=labels[i]) for i,x in enumerate(hpqa_triplets)]

In [None]:
# Margin-MSE distillation loss bound to the student bi-encoder.
train_loss = losses.MarginMSELoss(model = student_model)

In [None]:
# Shuffled loader over the triple examples, one example per batch.
train_dataloader = DataLoader(train_input_examples, shuffle=True, batch_size=1)

In [None]:
# Fine-tune the student for a single epoch on the triple loader with the margin-MSE loss.
student_model.fit(
    [(train_dataloader, train_loss)],
    epochs=1
)

In [None]:
# Sanity check on the first triple: teacher scores next to the student's score.
with torch.no_grad():
  q,r,i = hpqa_triplets[0]
  print(r_scores[(q,r)])
  print(r_scores[(q,i)])
  # SentenceTransformer has no predict(); a bi-encoder scores a pair through its
  # embeddings, so print the dot product of the query and relevant-text vectors.
  q_emb, r_emb = student_model.encode([q, r], convert_to_tensor=True)
  print(torch.dot(q_emb, r_emb).item())




In [None]:
import pickle

# Persist the teacher scores to r_scores.pkl in the working directory.
with open('r_scores.pkl', 'wb') as f:
    pickle.dump(r_scores, f)

In [None]:
# Write the training frame to a file named "hpqa_df" (no extension).
# NOTE(review): the context / supporting_facts cells hold nested objects, which CSV
# presumably stores as their string repr — confirm this round-trips as needed.
hpqa_df.to_csv("hpqa_df")

In [None]:
from sentence_transformers import CrossEncoder

In [None]:
# Pretrained TinyBERT cross-encoder, loaded on the CPU.
ce_model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2", device="cpu")

In [None]:
# train_input_examples = [InputExample(texts=x, label=labels[i]) for i,x in enumerate(hpqa_triplets)]

In [None]:
# Pointwise training set for the cross-encoder: one example per scored
# (query, text) pair, labelled with the teacher's relevance score.
labels = list(r_scores.values())
train_input_examples = [
    InputExample(texts=[query, text], label = score)
    for (query, text), score in r_scores.items()
]
train_dataloader = DataLoader(train_input_examples, shuffle=True, batch_size=1)
# The fit call on ce_model passes loss_fct explicitly rather than this object.
train_loss = losses.MarginMSELoss(model = ce_model)

In [None]:
# NOTE(review): repeats the train_loss assignment that ends the previous cell.
train_loss = losses.MarginMSELoss(model = ce_model)

In [None]:
# Fine-tune the cross-encoder for one epoch, regressing onto the teacher scores with MSE.
ce_model.fit(
    train_dataloader = train_dataloader,
    loss_fct = torch.nn.MSELoss(),
    epochs=1
)

In [None]:
# Sanity check on the first triple: teacher scores followed by the cross-encoder's
# predictions for the relevant and the irrelevant text.
with torch.no_grad():
  q,r,i = hpqa_triplets[0]
  print(r_scores[(q,r)])
  print(r_scores[(q,i)])
  print(ce_model.predict([q,r]))
  print(ce_model.predict([q,i]))

In [None]:
# model.fit(
#     train_dataloader=train_dataloader,
#     loss_fct=torch.nn.MSELoss(),
#     evaluator=evaluator,
#     epochs=num_epochs,
#     evaluation_steps=5000,
#     warmup_steps=warmup_steps,
#     output_path=model_save_path,
#     optimizer_params={"lr": 7e-6},
#     use_amp=True,
# )

In [None]:
# Second student: a MiniLM cross-encoder with a single regression output and an
# identity activation, loaded on the CPU.
model_name = "microsoft/MiniLM-L12-H384-uncased"
model = CrossEncoder(
    model_name,
    num_labels=1,
    max_length=512,
    default_activation_function=torch.nn.Identity(),
    device = "cpu",
)

# Rebuild the pointwise training set: one example per scored (query, text) pair.
labels = list(r_scores.values())
train_input_examples = [
    InputExample(texts=[query, text], label = score)
    for (query, text), score in r_scores.items()
]
train_dataloader = DataLoader(train_input_examples, shuffle=True, batch_size=1)

In [None]:
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator

In [None]:
# One epoch of MSE regression onto the teacher scores for the MiniLM cross-encoder.
model.fit(train_dataloader=train_dataloader, loss_fct = torch.nn.MSELoss(), epochs = 1)

In [None]:
# Reload the pretrained TinyBERT cross-encoder (replacing the earlier ce_model
# object) and fine-tune it for one epoch on the current train_dataloader.
ce_model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2", device = "cpu")
ce_model.fit(train_dataloader = train_dataloader, loss_fct = torch.nn.MSELoss(), epochs = 1)

In [None]:
import time

In [None]:
def compute_reranking(reranker_model, index, queries, rele_docs, top_k = 5):
    '''
    Retrieve candidates for every query, rerank them and accumulate hit / MRR statistics.

    For each query the top `top_k` nodes are fetched from `index`, their texts
    are reordered with `reranker_model.rank`, and the reranked list is compared
    with the query's supporting documents by exact string equality.

    Parameters:
        reranker_model: object exposing `rank(query, documents, return_documents=True)`
            that returns dicts carrying a 'text' key, best first.
        index: object exposing `as_query_engine(similarity_top_k=...)`.
        queries: sequence of query strings.
        rele_docs: mapping from query to its list of supporting document texts.
        top_k: number of candidates retrieved per query (default 5, as before).

    Returns:
        (num_hit, num_hit_rel, mrr_sum, hits_per_q):
        num_hit      total supporting documents found among the candidates,
        num_hit_rel  number of queries with at least one supporting document found,
        mrr_sum      sum over queries of 1 / rank of the first supporting document
                     in the reranked list (not yet divided by the query count),
        hits_per_q   supporting documents found, per query, in `queries` order.
    '''
    num_hit = 0
    num_hit_rel = 0
    mrr_sum = 0
    hits_per_q = []
    # The engine does not depend on the query, so build it once instead of per iteration.
    engine = index.as_query_engine(similarity_top_k = top_k)
    for i in tqdm(range(len(queries))):
        query = queries[i]
        query_result = engine.query(query)
        supporting_docs = rele_docs[query]
        # Pause for a minute every 499 queries (presumably to respect an API rate limit).
        if i > 0 and i % 499 == 0:
            time.sleep(60)
        candidates = [node.text for node in query_result.source_nodes]

        # Skip the reranker when retrieval returned nothing.
        if candidates:
            ranked = reranker_model.rank(query, candidates, return_documents = True)
            hits = [x['text'] for x in ranked]
        else:
            hits = []

        hit_perc = sum(1 for doc in supporting_docs if doc in hits)
        num_hit += hit_perc
        if hit_perc > 0:
            num_hit_rel += 1

        # Reciprocal rank of the first supporting document after reranking.
        for rank, hit in enumerate(hits, start = 1):
            if hit in supporting_docs:
                mrr_sum += 1 / rank
                break

        hits_per_q.append(hit_perc)

    return num_hit, num_hit_rel, mrr_sum, hits_per_q

In [None]:
# Rank the relevant texts of the first training query with the fine-tuned TinyBERT model.
with torch.no_grad():
    q, _,_ = hpqa_triplets[0]
    texts = relevant_texts[q]
    print(ce_model.rank(q, texts, return_documents=True))

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.core import Document

# Per-query text lists: all relevant lists first, then all irrelevant lists.
contexts = list(relevant_texts.values()) + list(irrelevant_texts.values())

# One Document per text, preserving that order.
documents = [Document(text=text) for texts in contexts for text in texts]

# NOTE(review): confirm that from_documents actually honours the top_k and llm
# keyword arguments; retrieval depth is set again on each query engine further down.
index = VectorStoreIndex.from_documents(
    documents,
    top_k = 5,
    llm = OpenAI(temperature = 0, model = 'gpt-3.5-turbo')
)

In [None]:
# Distinct queries in first-seen order. dict.fromkeys keeps the order stable across
# kernel sessions, unlike list(set(...)), whose string ordering changes per process.
all_queries = list(dict.fromkeys(q for q, _ in r_scores))
n, nr , m ,h = compute_reranking(ce_model, index, all_queries, relevant_texts)

In [None]:
# Per-query averages: supporting docs retrieved, share of queries with at least
# one hit, and mean reciprocal rank. (A stray `index.as_query_engine` expression
# that only echoed the bound method was removed.)
print(n/len(all_queries), nr/len(all_queries), m/len(all_queries))

In [None]:
h

In [None]:
# Distinct queries in first-seen order. dict.fromkeys keeps the order stable across
# kernel sessions, unlike list(set(...)), whose string ordering changes per process.
all_queries = list(dict.fromkeys(q for q, _ in r_scores))
n, nr , m ,h = compute_reranking(model, index, all_queries, relevant_texts)

In [None]:
len(all_queries)

In [None]:
# How many queries had 0, 1 or 2 supporting documents among the retrieved candidates.
x = len(h)
for wanted in (0, 1, 2):
    print(sum(h[pos] == wanted for pos in range(x)))

In [None]:
# NOTE(review): hard-coded figures with no visible derivation — presumably read off
# an earlier output. TODO compute this from h / all_queries instead.
603/994

In [None]:
nr/len(all_queries)

In [None]:
# Retrieve the top 5 nodes for the first query and keep only their texts.
b = index.as_query_engine(similarity_top_k = 5)
qr = b.query(all_queries[0])
hs = [node.text for node in qr.source_nodes]

In [None]:
hs

In [None]:
relevant_texts[all_queries[0]]

In [None]:
ce_model.rank(all_queries[0], hs, return_documents=True)

In [None]:
# Re-run the retrieval + reranking evaluation with the MiniLM cross-encoder (`model`).
n, nr , m ,h = compute_reranking(model, index, all_queries, relevant_texts)

In [None]:
# How many queries had 0, 1 or 2 supporting documents among the retrieved candidates.
x = len(h)
for wanted in (0, 1, 2):
    print(sum(h[pos] == wanted for pos in range(x)))

In [None]:
# Fraction of supporting documents retrieved.
# NOTE(review): the factor 2 presumably assumes two supporting documents per query — confirm.
n/(2*len(all_queries))

In [None]:
m

In [None]:
from sklearn.metrics import ndcg_score

In [None]:
# Calling predict() with no arguments raises TypeError (the sentences argument is
# required) and stops Run All; show the method's signature and docstring instead.
help(model.predict)

In [None]:
# MiniLM predictions and teacher scores for every relevant (query, text) pair
# that has a teacher score.
scores = []
true_scores = []
for query, texts in relevant_texts.items():
    for text in texts:
        pair = (query, text)
        if pair not in r_scores:
            continue
        scores.append(model.predict(sentences=[query, text]))
        true_scores.append(r_scores[pair])

In [None]:
# NDCG of the predicted scores against the teacher scores, treating all collected
# pairs (across every query) as a single ranking.
ndcg_score([true_scores], [scores])

In [None]:
# TinyBERT predictions and teacher scores for every relevant (query, text) pair
# that has a teacher score.
scores = []
true_scores = []
for query, texts in relevant_texts.items():
    for text in texts:
        pair = (query, text)
        if pair not in r_scores:
            continue
        scores.append(ce_model.predict(sentences=[query, text]))
        true_scores.append(r_scores[pair])

In [None]:
ndcg_score([true_scores], [scores])

In [None]:
# Third student: the pretrained MS MARCO MiniLM-L-6 cross-encoder, on the CPU.
ce_model_2 = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", device = "cpu")

In [None]:
# One epoch of MSE regression onto the teacher scores.
ce_model_2.fit(train_dataloader = train_dataloader, loss_fct = torch.nn.MSELoss(), epochs = 1)

In [None]:
# Distinct queries in first-seen order. dict.fromkeys keeps the order stable across
# kernel sessions, unlike list(set(...)), whose string ordering changes per process.
all_queries = list(dict.fromkeys(q for q, _ in r_scores))
n, nr , m ,h = compute_reranking(ce_model_2, index, all_queries, relevant_texts)

In [None]:
# MS MARCO MiniLM predictions and teacher scores for every relevant (query, text)
# pair that has a teacher score.
scores = []
true_scores = []
for query, texts in relevant_texts.items():
    for text in texts:
        pair = (query, text)
        if pair not in r_scores:
            continue
        scores.append(ce_model_2.predict(sentences=[query, text]))
        true_scores.append(r_scores[pair])

In [None]:
ndcg_score([true_scores], [scores])

TIME TO RERANK

In [None]:
%%timeit
# Retrieve the top 5 nodes for one query, then rerank them through the Cohere API.
query = all_queries[0]
query_result = index.as_query_engine(similarity_top_k = 5).query(query)
inter_hits = [node.text for node in query_result.source_nodes]
results = co.rerank(query=query, documents = inter_hits, return_documents=True)
    



In [None]:
%%timeit
# Retrieve the top 5 nodes for one query, then rerank them with the MiniLM cross-encoder.
query = all_queries[0]
query_result = index.as_query_engine(similarity_top_k = 5).query(query)
inter_hits = [node.text for node in query_result.source_nodes]
hits = model.rank(query, inter_hits, return_documents = True)

In [None]:
%%timeit
# Retrieve the top 5 nodes for one query, then rerank them with the TinyBERT cross-encoder.
query = all_queries[0]
query_result = index.as_query_engine(similarity_top_k = 5).query(query)
inter_hits = [node.text for node in query_result.source_nodes]
hits = ce_model.rank(query, inter_hits, return_documents = True)

In [None]:
%%timeit
# Retrieve the top 5 nodes for one query, then rerank them with the MS MARCO MiniLM cross-encoder.
query = all_queries[0]
query_result = index.as_query_engine(similarity_top_k = 5).query(query)
inter_hits = [node.text for node in query_result.source_nodes]
hits = ce_model_2.rank(query, inter_hits, return_documents = True)

Test Queries

In [None]:
# pandas view of the fullwiki test split.
hpqa_test_df = hpqa_test.to_pandas()

In [None]:
# Parallel lists: the test questions and the context entry of each test row.
test_queries = hpqa_test_df['question'].to_list()
test_contexts = hpqa_test_df['context'].to_list()

In [None]:
%%timeit
# Retrieve + Cohere rerank for the first 100 test queries.
for test_q in test_queries[:100]:
    engine = index.as_query_engine(similarity_top_k = 5)
    query_result = engine.query(test_q)
    inter_hits = [node.text for node in query_result.source_nodes]
    results = co.rerank(query=test_q, documents = inter_hits, return_documents=True)

In [None]:
%%timeit
# Retrieve + MiniLM cross-encoder rerank for the first 100 test queries.
for test_q in test_queries[:100]:
    engine = index.as_query_engine(similarity_top_k = 5)
    query_result = engine.query(test_q)
    inter_hits = [node.text for node in query_result.source_nodes]
    hits = model.rank(test_q, inter_hits, return_documents = True)

In [None]:
%%timeit
# Retrieve + TinyBERT cross-encoder rerank for the first 100 test queries.
for test_q in test_queries[:100]:
    engine = index.as_query_engine(similarity_top_k = 5)
    query_result = engine.query(test_q)
    inter_hits = [node.text for node in query_result.source_nodes]
    hits = ce_model.rank(test_q, inter_hits, return_documents = True)

In [None]:
%%timeit
# Retrieve + MS MARCO MiniLM cross-encoder rerank for the first 100 test queries.
for test_q in test_queries[:100]:
    engine = index.as_query_engine(similarity_top_k = 5)
    query_result = engine.query(test_q)
    inter_hits = [node.text for node in query_result.source_nodes]
    hits = ce_model_2.rank(test_q, inter_hits, return_documents = True)

In [None]:
# For every test row, the list of per-document sentence containers from its context.
test_sentences = [list(test_contexts[y]['sentences']) for y in range(len(test_contexts))]


In [None]:
test_queries[0]

In [None]:
# Flat list of every test document joined into one string. It gets its own name:
# rebinding test_sentences here destroyed the per-query nesting that the all_docs
# cell relies on, and made this cell fail or change meaning when re-run.
test_docs_flat = [" ".join(x) for y in range(len(test_sentences)) for x in test_sentences[y]]

In [None]:
# For every test query, concatenate the sentences of each context document
# (no separator) into a single string.
all_docs = [
    ["".join(doc_sentences.tolist()) for doc_sentences in query_docs]
    for query_docs in test_sentences
]

In [None]:
all_docs[0]

In [None]:
test_queries[607]

In [None]:
# Teacher (Cohere) reranking of every test query's own context documents.
# Queries without documents are skipped, so this list can be shorter than
# test_queries and its positions are not query indices.
test_rerank_results = []
for i in tqdm(range(len(test_queries))):
    query = test_queries[i]
    docs = all_docs[i]
    if len(docs) == 0:
        continue
    res = co.rerank(query = query, documents = docs, return_documents=True)
    test_rerank_results.append(res)




In [None]:
test_rerank_results[0].results[0].relevance_score

In [None]:
# true_results was only assigned inside the evaluation loop further down, so this
# cell raised NameError on a fresh run; inspect the first stored result instead.
true_results = test_rerank_results[0]
true_r_scores = [result.relevance_score for result in true_results.results]

In [None]:
true_r_scores

In [None]:
# Compare each fine-tuned cross-encoder with the Cohere teacher on the first 500
# test queries. The three accumulators hold NDCG *sums*; num_ndcg_queries counts
# the queries that actually contributed.
res_ptr = 0
tiny_bert_avg_ndcg = 0
minilm_avg_ndcg = 0
uncased_mini_lm_ndcg = 0
num_ndcg_queries = 0

for i in tqdm(range(500)):
    query = test_queries[i]
    docs = all_docs[i]
    if len(docs) == 0:
        # No teacher result was stored for queries without documents.
        continue
    # A result exists for every query with at least one document, so advance the
    # pointer before the single-document skip; otherwise the two lists drift apart.
    true_results = test_rerank_results[res_ptr]
    res_ptr += 1
    if len(docs) <= 1:
        # NDCG needs at least two documents to rank.
        continue

    # Rerank results come back sorted by relevance; put each score back at the
    # position of its document so it lines up with the predictions below.
    true_r_scores = [0.0] * len(docs)
    for result in true_results.results:
        true_r_scores[result.index] = result.relevance_score

    pairs = [[query, doc] for doc in docs]

    # TinyBERT
    scores = ce_model.predict(sentences = pairs)
    tiny_bert_avg_ndcg += ndcg_score([true_r_scores], [scores])

    # MS Marco
    scores = ce_model_2.predict(sentences = pairs)
    minilm_avg_ndcg += ndcg_score([true_r_scores], [scores])

    # uncased
    scores = model.predict(sentences = pairs)
    uncased_mini_lm_ndcg += ndcg_score([true_r_scores], [scores])

    num_ndcg_queries += 1


    
    

In [None]:
# Average over the queries that were actually scored: the evaluation loop skips
# queries with fewer than two documents, so dividing by 500 understated the mean.
num_scored = max(1, sum(1 for docs in all_docs[:500] if len(docs) > 1))
print(minilm_avg_ndcg / num_scored)
print(uncased_mini_lm_ndcg / num_scored)
print(tiny_bert_avg_ndcg / num_scored)