In [None]:
!pip install datasets
!pip install langchain_openai
!pip install nltk
!pip install rank_bm25
!pip install langchain_text_splitters
!pip install torch
!pip install transformers
!pip install sentence_transformers

In [None]:
import gc
import json
import os
import re
import time

import nltk
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from torch.nn import DataParallel
from transformers import AutoTokenizer, AutoModel

In [None]:
# Tokenizer data used by nltk.word_tokenize. NLTK 3.9+ looks up "punkt_tab"
# instead of the older "punkt" package, and the notebook installs NLTK
# unpinned, so both are requested. Asking for a package the installed NLTK
# does not list only logs an error and returns False.
nltk.download("punkt")
nltk.download("punkt_tab")

# Azure Chat OpenAI Model (GPT-4o)
azure_configs = {
    "base_url": "https://open-ai-aus.openai.azure.com/",
    "model_deployment": "gpt-4o",
    "model_name": "gpt-4o",
    "embedding_model_name": "text-embedding-ada-002",
    # Never commit the key: it is read from the environment and falls back to
    # the previous empty placeholder when the variable is not set.
    "openai_api_key": os.environ.get("AZURE_OPENAI_API_KEY", ""),
    "openai_api_version": "2023-03-15-preview",
}

# Chat model used for answer generation and for LLM-based reranking;
# temperature 0.0 is passed explicitly.
azure_model = AzureChatOpenAI(
    openai_api_version=azure_configs["openai_api_version"],
    azure_deployment=azure_configs["model_deployment"],
    azure_endpoint=azure_configs["base_url"],
    api_key=azure_configs["openai_api_key"],
    temperature=0.0,
)

# Embedding client backing the "TextAda002" retriever.
azure_embedding_model = AzureOpenAIEmbeddings(
    model=azure_configs["embedding_model_name"],
    azure_endpoint=azure_configs["base_url"],
    api_key=azure_configs["openai_api_key"],
    openai_api_version=azure_configs["openai_api_version"],
)

# DRAGON+ dual encoder. Only the query-encoder tokenizer is loaded and it is
# reused for context chunks as well.
# NOTE(review): this appears to match the published DRAGON+ usage example - confirm.
query_tokenizer = AutoTokenizer.from_pretrained("facebook/dragon-plus-query-encoder")
query_encoder = AutoModel.from_pretrained("facebook/dragon-plus-query-encoder").to("cuda")
context_encoder = AutoModel.from_pretrained("facebook/dragon-plus-context-encoder").to("cuda")

# SentenceTransformer-wrapped dense retrievers, all placed on the GPU.
contriever_model = SentenceTransformer(
    "facebook/contriever",
    trust_remote_code=True,
).cuda()

contriever_ms_model = SentenceTransformer(
    "facebook/contriever-msmarco",
    trust_remote_code=True,
).cuda()

stella_model = SentenceTransformer(
    "dunzhang/stella_en_400M_v5",
    trust_remote_code=True,
    device="cuda",
    config_kwargs={"use_memory_efficient_attention": False, "unpad_inputs": False}
).cuda()

In [None]:
# Retriever names accepted by retrieve_context.
_KNOWN_RETRIEVERS = (
    "BM25",
    "Dragon",
    "Contriever",
    "ContrieverMsmarco",
    "Stella",
    "TextAda002",
)


def _release_gpu_memory():
    """Run the garbage collector and, if CUDA is available, empty torch's CUDA cache."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def _rank_chunks_bm25(context_chunks, input_question, top_k):
    """Return positions of the best `top_k` chunks by BM25 score, best first."""
    # Both chunks and question are tokenized by plain whitespace splitting.
    tokenized_chunks = [chunk.split() for chunk in context_chunks]
    bm25 = BM25Okapi(tokenized_chunks)
    scores = bm25.get_scores(input_question.split())
    return np.argsort(scores)[::-1][:top_k].tolist()


def _rank_chunks_dragon(context_chunks, input_question, top_k, batch_size):
    """Return positions of the best `top_k` chunks by DRAGON+ dot-product score."""
    with torch.no_grad():
        # Truncate the question like the chunks so an over-long query cannot
        # exceed the 512-token limit used for the encoder inputs.
        query_input = query_tokenizer(
            input_question, truncation=True, max_length=512, return_tensors="pt"
        ).to("cuda")
        # The embedding is the hidden state at the first token position.
        query_emb = query_encoder(**query_input).last_hidden_state[:, 0, :]

        batch_scores = []
        for start in range(0, len(context_chunks), batch_size):
            ctx_input = query_tokenizer(
                context_chunks[start:start + batch_size],
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            ).to("cuda")
            ctx_emb = context_encoder(**ctx_input).last_hidden_state[:, 0, :]

            # (1, dim) x (dim, batch) -> (1, batch); drop the query axis and
            # keep the scores on the CPU so GPU memory can be released.
            batch_scores.append(torch.matmul(query_emb, ctx_emb.T).squeeze(0).cpu())

            del ctx_input, ctx_emb
            _release_gpu_memory()

        all_scores = torch.cat(batch_scores)
        ranked = torch.topk(all_scores, k=min(top_k, len(all_scores))).indices.tolist()

    del query_input, query_emb, batch_scores, all_scores
    _release_gpu_memory()
    return ranked


def _rank_chunks_sentence_transformer(model, context_chunks, input_question, top_k, batch_size):
    """Return positions of the best `top_k` chunks using `model.encode` and `model.similarity`."""
    query_embedding = model.encode(
        [input_question], show_progress_bar=False, convert_to_tensor=True, device="cuda"
    )

    batch_embeddings = []
    for start in range(0, len(context_chunks), batch_size):
        batch_embeddings.append(
            model.encode(
                context_chunks[start:start + batch_size],
                show_progress_bar=False,
                convert_to_tensor=True,
                device="cuda",
            )
        )
        _release_gpu_memory()

    context_embeddings = torch.cat(batch_embeddings, dim=0)

    # similarity() returns a (1, n_chunks) matrix; flatten it to one score per chunk.
    similarities = model.similarity(query_embedding, context_embeddings)
    similarities = similarities.squeeze(0).cpu().numpy()
    ranked = np.argsort(-similarities)[:top_k].tolist()

    del query_embedding, batch_embeddings, context_embeddings, similarities
    _release_gpu_memory()
    return ranked


def _rank_chunks_azure_embeddings(context_chunks, input_question, top_k, batch_size):
    """Return positions of the best `top_k` chunks by dot product of Azure embeddings."""
    query_embedding = azure_embedding_model.embed_documents([input_question])

    batch_embeddings = [
        azure_embedding_model.embed_documents(context_chunks[start:start + batch_size])
        for start in range(0, len(context_chunks), batch_size)
    ]
    context_embeddings = np.vstack(batch_embeddings)

    # Plain dot product, not an explicitly normalized cosine.
    # NOTE(review): equivalent to cosine only if the service returns
    # unit-length vectors - confirm for text-embedding-ada-002.
    similarities = np.dot(query_embedding, context_embeddings.T).squeeze(0)
    return np.argsort(-similarities)[:top_k].tolist()


def retrieve_context(context, input_question, retriever="BM25", top_k=5, batch_size=32):
    """Split `context` into chunks and return those most relevant to the question.

    The text is split by a RecursiveCharacterTextSplitter configured with
    chunk_size=200 and chunk_overlap=20, with length measured in NLTK word
    tokens. The chunks are then scored against `input_question` by the chosen
    retriever.

    Args:
        context: Full document text to retrieve from.
        input_question: The query string.
        retriever: One of "BM25", "Dragon", "Contriever", "ContrieverMsmarco",
            "Stella" or "TextAda002".
        top_k: Maximum number of chunks to return.
        batch_size: Number of chunks encoded per batch by the embedding-based
            retrievers (unused by BM25).

    Returns:
        A list of at most `top_k` chunk strings, highest score first. An empty
        or whitespace-only `context`, or one that yields no chunks, gives `[]`.

    Raises:
        ValueError: If `retriever` is not a known retriever name.
    """
    # Fail fast on a typo before any tokenization or model work is done.
    if retriever not in _KNOWN_RETRIEVERS:
        raise ValueError(f"Unknown retriever type: {retriever}")

    # Nothing to retrieve from: every scorer below would fail on zero chunks.
    if not context or not context.strip():
        return []

    def word_count(text):
        return len(word_tokenize(text))

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=200,  # Chunk size is 200 words
        chunk_overlap=20,
        length_function=word_count,
    )
    context_chunks = splitter.split_text(context)
    if not context_chunks:
        return []

    if retriever == "BM25":
        ranked = _rank_chunks_bm25(context_chunks, input_question, top_k)
    elif retriever == "Dragon":
        ranked = _rank_chunks_dragon(context_chunks, input_question, top_k, batch_size)
    elif retriever == "Contriever":
        ranked = _rank_chunks_sentence_transformer(
            contriever_model, context_chunks, input_question, top_k, batch_size
        )
    elif retriever == "ContrieverMsmarco":
        ranked = _rank_chunks_sentence_transformer(
            contriever_ms_model, context_chunks, input_question, top_k, batch_size
        )
    elif retriever == "Stella":
        # NOTE(review): the question is encoded without a query prompt; check
        # the Stella model card on whether queries should pass a
        # `prompt_name` to encode().
        ranked = _rank_chunks_sentence_transformer(
            stella_model, context_chunks, input_question, top_k, batch_size
        )
    else:  # "TextAda002"
        ranked = _rank_chunks_azure_embeddings(
            context_chunks, input_question, top_k, batch_size
        )

    return [context_chunks[position] for position in ranked]

In [None]:
def prompt_builder(prompt_format, context, input_question):
    """Fill a prompt template with the retrieved passages and the question.

    `context` is an iterable of strings; they are joined with newlines and
    substituted for the `{context}` field, while `input_question` fills the
    `{input}` field of `prompt_format`.
    """
    joined_passages = "\n".join(context)
    template_fields = {"context": joined_passages, "input": input_question}
    return prompt_format.format(**template_fields)


# Test splits of the three LongBench subsets evaluated in this notebook.
qasper_ds, narrativeqa_ds, multifieldqa_en_ds = (
    load_dataset("THUDM/LongBench", subset_name, split="test")
    for subset_name in ("qasper", "narrativeqa", "multifieldqa_en")
)

In [None]:
import time

def invoke_azure_model(
    ds, output_file, dataset_name, retriever="BM25", batch_size=10
):
    """Run retrieval-augmented generation over `ds` and save predictions to CSV.

    For each row, the chunks returned by `retrieve_context` are formatted into
    the dataset's prompt template and sent to `azure_model`. Each saved row is
    the original row with `context` replaced by the retrieved chunks and a
    `prediction` column added.

    Args:
        ds: Indexable dataset whose rows are mappings with at least the
            "context" and "input" keys.
        output_file: CSV path to write. If it already exists, nothing is run.
        dataset_name: Key into the dataset2prompt.json prompt templates.
        retriever: Retriever name forwarded to `retrieve_context`.
        batch_size: Number of samples buffered between writes to the CSV.

    Returns:
        None.
    """
    # NOTE(review): an interrupted run leaves a partial file that makes later
    # calls skip the dataset; delete the file to restart.
    if os.path.exists(output_file):
        print(f"Output file {output_file} already exists.")
        return

    with open("/kaggle/input/cos40011-dataset/dataset2prompt.json", "r") as f:
        dataset2prompt = json.load(f)
    prompt_format = dataset2prompt[dataset_name]

    total_samples = len(ds)
    results = []

    for index in range(total_samples):
        row = ds[index]
        input_question = row["input"]
        context = retrieve_context(
            row["context"], input_question, retriever=retriever
        )
        prompt = prompt_builder(prompt_format, context, input_question)

        # `predict` is deprecated in LangChain; `invoke` returns a message
        # object whose `.content` holds the generated text.
        prediction = azure_model.invoke(prompt).content

        # "context" already exists in the row, so it keeps its column
        # position while its value becomes the retrieved chunks.
        results.append({**row, "prediction": prediction, "context": context})
        print(f"Process {dataset_name} completed: {index + 1} / {total_samples} with context length: {len(context)} and {[ len(x) for x in context ]}")
        time.sleep(0.5)  # Pause 0.5 seconds between requests

        # Flush periodically so a crash loses at most one batch. Appending
        # avoids re-reading and rewriting the rows that were already saved.
        if (index + 1) % batch_size == 0 or (index + 1) == total_samples:
            pd.DataFrame(results).to_csv(
                output_file,
                mode="a",
                header=not os.path.exists(output_file),
                index=False,
            )
            results = []

        # Release per-sample objects and cached GPU memory before the next row.
        del context, input_question, prompt, prediction
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [None]:
top_k = 5


def rerank_context(question, contexts, use_relevance=True, model=None, limit=None):
    """Score each context with the chat model and return the best ones.

    Each context is sent to the model with a 1-10 rating prompt: a relevance
    prompt when `use_relevance` is true, otherwise a usefulness prompt.
    Contexts are then sorted by score, highest first; ties keep their input
    order.

    Args:
        question: The question the contexts should help answer.
        contexts: Iterable of candidate context strings.
        use_relevance: Selects the relevance prompt (True) or the usefulness
            prompt (False).
        model: Chat model exposing `invoke(prompt)` that returns an object
            with a `.content` string. Defaults to the module-level
            `azure_model`.
        limit: Maximum number of contexts to return. Defaults to the
            module-level `top_k`.

    Returns:
        A list with at most `limit` context strings, best score first.
    """
    if model is None:
        model = azure_model
    if limit is None:
        limit = top_k

    if use_relevance:
        instruction = (
            "On a scale from 1 to 10, how relevant is this context for answering the given question? "
            "Please respond with a single number between 1 (least relevant) and 10 (most relevant) and do not provide any other reasoning or anything just a single number."
        )
    else:
        instruction = (
            "On a scale from 1 to 10, how useful and supportive is this context for forming a complete and accurate answer to the given question? "
            "Please respond with a single number between 1 (least useful) and 10 (most useful)  and do not provide any other reasoning or anything just a single number."
        )

    ranked_contexts = []
    for context in contexts:
        rating_prompt = (
            f"Question:\n{question}\n\n"
            f"Context:\n{context}\n\n"
            f"{instruction}"
        )
        # `predict` is deprecated in LangChain; `invoke(...).content` yields
        # the same reply text.
        reply = model.invoke(rating_prompt).content.strip()

        # The model does not always answer with a bare number ("Score: 9/10",
        # "8."), so take the first integer in the reply. A reply without any
        # digits scores 0 and therefore ranks last instead of aborting the run.
        number = re.search(r"\d+", reply)
        score = int(number.group()) if number else 0
        ranked_contexts.append((context, score))

    ranked_contexts.sort(key=lambda pair: pair[1], reverse=True)
    return [context for context, _ in ranked_contexts[:limit]]

def invoke_azure_model_with_reranker(
    ds, output_file, dataset_name, retrievers=["BM25", "Dragon"], batch_size=10, use_relevance = True,
    candidates_per_retriever=20
):
    """Run multi-retriever RAG with LLM reranking over `ds` and save predictions.

    For each row, every retriever in `retrievers` contributes up to
    `candidates_per_retriever` chunks. Duplicate chunks are dropped (first
    occurrence kept), the rest are reranked by `rerank_context`, and the
    result is formatted into the dataset's prompt template and sent to
    `azure_model`. Each saved row is the original row with `context` replaced
    by the reranked chunks and a `prediction` column added.

    Args:
        ds: Indexable dataset whose rows are mappings with at least the
            "context" and "input" keys.
        output_file: CSV path to write. If it already exists, nothing is run.
        dataset_name: Key into the dataset2prompt.json prompt templates.
        retrievers: Retriever names forwarded one by one to `retrieve_context`.
            The default list is only iterated, never mutated.
        batch_size: Number of samples buffered between writes to the CSV.
        use_relevance: Forwarded to `rerank_context` to select the rating prompt.
        candidates_per_retriever: `top_k` requested from each retriever before
            reranking.

    Returns:
        None.
    """
    # NOTE(review): an interrupted run leaves a partial file that makes later
    # calls skip the dataset; delete the file to restart.
    if os.path.exists(output_file):
        print(f"Output file {output_file} already exists.")
        return

    with open("/kaggle/input/cos40011-dataset/dataset2prompt.json", "r") as f:
        dataset2prompt = json.load(f)
    prompt_format = dataset2prompt[dataset_name]

    total_samples = len(ds)
    results = []

    for index in range(total_samples):
        row = ds[index]
        input_question = row["input"]

        contexts = []
        for retriever in retrievers:
            contexts.extend(
                retrieve_context(
                    row["context"],
                    input_question,
                    retriever=retriever,
                    top_k=candidates_per_retriever,
                )
            )

        # dict.fromkeys removes duplicates while keeping first-seen order.
        unique_contexts = list(dict.fromkeys(contexts))
        context = rerank_context(input_question, unique_contexts, use_relevance)
        prompt = prompt_builder(prompt_format, context, input_question)

        # `predict` is deprecated in LangChain; `invoke` returns a message
        # object whose `.content` holds the generated text.
        prediction = azure_model.invoke(prompt).content

        # "context" already exists in the row, so it keeps its column
        # position while its value becomes the reranked chunks.
        results.append({**row, "prediction": prediction, "context": context})
        print(f"Process {dataset_name} completed: {index + 1} / {total_samples} with context length: {len(context)} and {[ len(x) for x in context ]}")
        time.sleep(0.5)  # Pause 0.5 seconds between requests

        # Flush periodically so a crash loses at most one batch. Appending
        # avoids re-reading and rewriting the rows that were already saved.
        if (index + 1) % batch_size == 0 or (index + 1) == total_samples:
            pd.DataFrame(results).to_csv(
                output_file,
                mode="a",
                header=not os.path.exists(output_file),
                index=False,
            )
            results = []

        # Release per-sample objects and cached GPU memory before the next row.
        del context, contexts, unique_contexts, input_question, prompt, prediction
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [None]:
# Reranked RAG with the relevance rating prompt, run on each dataset in turn.
retrievers = ["BM25", "Dragon"]
use_relevance = True

for dataset_name, dataset in (
    ("multifieldqa_en", multifieldqa_en_ds),
    ("qasper", qasper_ds),
    ("narrativeqa", narrativeqa_ds),
):
    invoke_azure_model_with_reranker(
        dataset,
        f"{dataset_name}_rag_reranker_{'with_relevance_prompt' if use_relevance else 'with_usefulness_prompt'}_llm_predictions.csv",
        dataset_name,
        retrievers=retrievers,
        use_relevance=use_relevance,
    )

In [None]:
# !rm -rf /kaggle/working

In [None]:
# Reranked RAG with the usefulness rating prompt, run on each dataset in turn.
retrievers = ["BM25", "Dragon"]
use_relevance = False

for dataset_name, dataset in (
    ("multifieldqa_en", multifieldqa_en_ds),
    ("qasper", qasper_ds),
    ("narrativeqa", narrativeqa_ds),
):
    invoke_azure_model_with_reranker(
        dataset,
        f"{dataset_name}_rag_reranker_{'with_relevance_prompt' if use_relevance else 'with_usefulness_prompt'}_llm_predictions.csv",
        dataset_name,
        retrievers=retrievers,
        use_relevance=use_relevance,
    )

In [None]:
# Single-retriever RAG using the Azure text-embedding-ada-002 retriever.
retriever = "TextAda002"

# multifieldqa_en is deliberately not run for this retriever; add
# ("multifieldqa_en", multifieldqa_en_ds) to the tuple below to include it.
for dataset_name, dataset in (
    ("qasper", qasper_ds),
    ("narrativeqa", narrativeqa_ds),
):
    invoke_azure_model(
        dataset,
        f"{dataset_name}_rag_{retriever}_llm_predictions.csv",
        dataset_name,
        retriever=retriever,
    )

In [None]:
# Single-retriever RAG using Contriever fine-tuned on MS MARCO.
retriever = "ContrieverMsmarco"

for dataset_name, dataset in (
    ("multifieldqa_en", multifieldqa_en_ds),
    ("qasper", qasper_ds),
    ("narrativeqa", narrativeqa_ds),
):
    invoke_azure_model(
        dataset,
        f"{dataset_name}_rag_{retriever}_llm_predictions.csv",
        dataset_name,
        retriever=retriever,
    )

In [None]:
# Single-retriever RAG using the Stella embedding model.
retriever = "Stella"

for dataset_name, dataset in (
    ("multifieldqa_en", multifieldqa_en_ds),
    ("qasper", qasper_ds),
    ("narrativeqa", narrativeqa_ds),
):
    invoke_azure_model(
        dataset,
        f"{dataset_name}_rag_{retriever}_llm_predictions.csv",
        dataset_name,
        retriever=retriever,
    )