In [None]:
import json
import numpy as np
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Load the evidence corpus. Only the values of the resulting mapping are used
# by the indexing cell that follows.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default (e.g. cp1252 on Windows), which can corrupt or reject
# non-ASCII text.
with open('Copy of corpus_evidence_unified.json', 'r', encoding='utf-8') as f:
    evidence_data = json.load(f)

In [None]:
# Build the BM25 index over the evidence corpus.
# Snippets keep the iteration order of `evidence_data`, so position i in the
# BM25 score vector corresponds to evidence_snippets[i].
evidence_snippets = list(evidence_data.values())

# Whitespace tokenization only (no lowercasing or stemming); queries are
# tokenized the same way at retrieval time.
tokenized_corpus = [snippet.split() for snippet in evidence_snippets]

bm25 = BM25Okapi(tokenized_corpus)

In [None]:
def retrieve_evidence_bm25(claim, k=100):
    """Return the ``k`` evidence snippets with the highest BM25 score for a claim.

    Relies on the module-level ``bm25`` index and the ``evidence_snippets``
    list it was built from.

    Args:
        claim: Claim text; it is whitespace-tokenized, the same way the corpus
            was tokenized when the index was built.
        k: Maximum number of snippets to return. Values ``<= 0`` yield an
            empty list.

    Returns:
        A list of at most ``k`` snippet strings, best match first.
    """
    # Guard explicitly: with the slice form `[-k:]`, k == 0 becomes `[-0:]`
    # and silently returns the entire corpus; negative k is equally surprising.
    if k <= 0:
        return []

    tokenized_query = claim.split()
    scores = bm25.get_scores(tokenized_query)

    # argsort is ascending, so reverse it and keep the first k positions.
    top_k_indices = np.argsort(scores)[::-1][:k]
    return [evidence_snippets[i] for i in top_k_indices]

In [None]:
def rerank_evidence(claim, evidence_list, reranker_model):
    """Re-order candidate evidence by embedding similarity to the claim.

    Args:
        claim: Claim text used as the query.
        evidence_list: Candidate evidence items. Each item is either a plain
            string (which is what ``retrieve_evidence_bm25`` returns) or a
            sequence whose first element is the text, e.g. ``(text, score)``.
        reranker_model: Object exposing ``encode(texts, convert_to_tensor=True)``,
            such as the ``SentenceTransformer`` created in this notebook.

    Returns:
        A new list holding the same items as ``evidence_list``, ordered from
        highest to lowest cosine similarity with the claim. An empty input
        yields an empty list.
    """
    if len(evidence_list) == 0:
        return []

    # Only unwrap non-string items: indexing a plain string with [0] yields
    # just its first character, which would make the model score single
    # letters instead of the snippets (and fail on empty strings).
    evidence_texts = [
        item if isinstance(item, str) else item[0] for item in evidence_list
    ]

    claim_embedding = reranker_model.encode(claim, convert_to_tensor=True)
    evidence_embeddings = reranker_model.encode(evidence_texts, convert_to_tensor=True)

    # Row 0 holds the similarities of the single claim against every snippet.
    cos_scores = util.pytorch_cos_sim(claim_embedding, evidence_embeddings)[0]
    cos_scores_cpu = cos_scores.cpu().numpy()

    # argsort is ascending; reverse it for best-first order.
    reranked_indices = np.argsort(cos_scores_cpu)[::-1]

    return [evidence_list[i] for i in reranked_indices]

In [None]:
# Load the claims to process; each entry is read for its 'claim' and 'label'
# fields by the retrieval loop.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default.
with open('train_claims_quantemp.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Alternative input, currently disabled:
# with open('ollama-decomposed-test.json','r') as f:
#     test_data = json.load(f)

In [None]:
# Two-stage retrieval for every claim: BM25 candidate retrieval followed by
# embedding-based re-ranking, then dump the results to JSON.
BM25_CANDIDATES = 100  # snippets pulled by BM25 before re-ranking
RERANK_TOP_K = 5       # snippets kept after re-ranking

results = []
reranker_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

for entry in train_data:
    claim = entry['claim']
    label = entry['label']
    # subqueries = entry['subqueries']  # disabled; presumably only present in decomposed inputs - verify

    # Stage 1: lexical candidates from the BM25 index.
    initial_evidence = retrieve_evidence_bm25(claim, k=BM25_CANDIDATES)

    # Stage 2: re-rank the candidates and keep the best few.
    reranked_evidence = rerank_evidence(claim, initial_evidence, reranker_model)[:RERANK_TOP_K]

    # Join with a space: concatenating with an empty separator fuses the last
    # word of one snippet with the first word of the next.
    combined_evidence = " ".join(reranked_evidence)

    results.append({
        "claim": claim,
        "label": label,
        # 'subqueries': subqueries,
        "doc": combined_evidence,
        "retrieved_evidence": reranked_evidence
    })

# NOTE(review): the input above is the *train* claims file while the output is
# named 'val_...' - confirm the intended split before relying on this file.
with open('val_claims_quantemp_bm25.json', 'w') as f:
    json.dump(results, f)

print('done')