LOADING REQUIRED LIBRARIES

In [1]:
from transformers import AutoTokenizer, AutoModel
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
import numpy as np
import faiss
import requests
from langchain_community.document_loaders import DirectoryLoader, TextLoader

In [2]:
def load_documents(PROBLEM_PATH, EDITORIAL_PATH):

    class UTF8TextLoader(TextLoader):
        def __init__(self, file_path):
            super().__init__(file_path, encoding="utf-8")

    loader1 = DirectoryLoader(PROBLEM_PATH, glob="*.txt", loader_cls = UTF8TextLoader)
    documents1 = loader1.load()
    loader2 = DirectoryLoader(EDITORIAL_PATH, glob="*.txt", loader_cls = UTF8TextLoader)
    documents2 = loader2.load()

    documents = documents2 + documents1
    return documents

EMBEDDER

In [3]:
def generate_embeddings_data(merged_text):

    text_splitter = RecursiveCharacterTextSplitter(    
    chunk_size = 550,
    chunk_overlap = 50,
    length_function = len,
    add_start_index = True,
    )

    chunks = text_splitter.split_text(merged_text)
    print(len(chunks))

    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    model = AutoModel.from_pretrained("microsoft/codebert-base")

    tokens = tokenizer(chunks, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**tokens)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings, chunks

VECTORSTORE

In [4]:
def vectorstore(embed, query_embed, k):
    np_embed = embed.numpy()
    np.save("embeddings.npy", np_embed)
    embeddings = np.load("embeddings.npy")

    np_query_embed = query_embed.numpy()
    np.save("query.npy", np_query_embed)
    query_vector = np.load("query.npy")

    if len(query_vector.shape) == 1:
        query_vector = np.expand_dims(query_vector, axis=0)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, "vector_store.index")

    index = faiss.read_index("vector_store.index")
    distances, indices = index.search(query_vector, k)

    return distances, indices


RETRIEVER

In [5]:

def retrieve_and_generate(tokenizer, model, merged_text, query_text, k):
    
    embeddings, chunks = generate_embeddings_data(merged_text)

    query_tokens = tokenizer(query_text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        query_outputs = model(**query_tokens)
    query_embed = query_outputs.last_hidden_state.mean(dim=1)

    distances, indices = vectorstore(embeddings, query_embed, k)

    top_chunks = [chunks[i] for i in indices[0]]

    return top_chunks

CHATBOT

In [8]:

API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom" 
API_TOKEN = "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"  

HEADERS = {
    "Authorization": f"Bearer {API_TOKEN}"
}

PROMPT_TEMPLATE = """
Answer the question using the provided context.

Context:
{context}

Question: {question}

Answer:
"""



def query_huggingface_api(payload):
    """Send request to Hugging Face Inference API."""
    response = requests.post(API_URL, headers=HEADERS, json=payload)
    response.raise_for_status()
    return response.json()


def generate_response_with_prompt(query, context_chunks):
    context = "\n".join(context_chunks)
    prompt = PROMPT_TEMPLATE.format(context=context, question=query)

    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens":125,
            "temperature": 0.9,
            "top_p": 0.7,
            "do_sample": True
        }
    }
    response = query_huggingface_api(payload)
    return response[0]["generated_text"]

if __name__ == "__main__":
    query = "How do I solve Problem ID 2053D?"
    DATA_PATH = "Project\data\problems"
    EDITORIAL_PATH = "Project\data\editorials"
    docs = load_documents(DATA_PATH, EDITORIAL_PATH)
    merged_text = "\n".join(doc.page_content for doc in docs)
    

    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    model_e = AutoModel.from_pretrained("microsoft/codebert-base")

    context_chunks = retrieve_and_generate(tokenizer, model_e, merged_text, query, k=5)
    
    response = generate_response_with_prompt(query, context_chunks)
    print("Chatbot Response:")
    print(response)

98
Chatbot Response:

Answer the question using the provided context.

Context:
Problem_ID: 2053D The problem makes no difference when both a and b can be rearranged. Let the rearranged arrays of a and b be c and d respectively. If q=0 , we can write c as SORTED( a 1 , a 2 …, a n ) and d as SORTED( b 1 , b 2 …, b n ) . It can be proved that this reaches the maximum value: if not so, then There must be some pair (i,j) such that c i < c j , d i > d j . Since min( c i , d i )⋅min( c j , d j )= c i ⋅min( c j , d j )≤ c i ⋅min( c j , d i )=min( c i , d j )⋅min( c j , d i ) , we can swap d i and d j , and the product does not
Input Specifications:
Each test contains multiple test cases. The first line of input contains a single integer tt (1≤t≤1051≤t≤105) — the number of test cases. The description of test cases follows. The only line of each test case contains two integers nn and kk (1≤k≤n≤2⋅1091≤k≤n≤2⋅109).

Output Specifications:
For each test case, output a single integer — the final luc