In [23]:
import json
from transformers import RobertaTokenizer, RobertaModel
import torch
import faiss
import numpy as np

In [24]:
class CodeBERTEmbedder:
    """Embed text with the pretrained CodeBERT encoder.

    Text is tokenized (truncated to 512 tokens), passed through the model
    without gradient tracking, and mean-pooled over its tokens so that every
    input text becomes one row of the returned tensor.
    """

    def __init__(self):
        # Pre-set model name to the standard CodeBERT model
        model_name = "microsoft/codebert-base"
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        self.model = RobertaModel.from_pretrained(model_name)

    def _encode(self, texts):
        """Embed one string or a list of strings in a single forward pass.

        Returns a tensor with one mean-pooled row per input text.
        """
        # padding=True lets texts of different lengths share one batch; for a
        # single string it is a no-op.
        tokens = self.tokenizer(
            texts, return_tensors="pt", truncation=True, max_length=512, padding=True
        )
        # Generate embeddings using CodeBERT (no gradient computation needed)
        with torch.no_grad():
            output = self.model(**tokens)
        hidden = output.last_hidden_state

        mask = tokens.get("attention_mask")
        if mask is None:
            # No mask available: every position is a real token.
            return hidden.mean(dim=1)

        # Average only over real tokens so padding positions added for
        # batching do not dilute the embedding.
        weights = mask.unsqueeze(-1).to(hidden.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1)
        return (hidden * weights).sum(dim=1) / token_counts

    def generate_embedding(self, text):
        """Return the mean-pooled embedding of ``text`` as a one-row tensor."""
        return self._encode(text)

    def batch_generate_embeddings(self, texts, batch_size=16):
        """Embed many texts and stack the results row by row.

        Args:
            texts: Iterable of strings to embed.
            batch_size: Number of texts per forward pass (must be >= 1).

        Returns:
            Tensor with one row per text, in input order. An empty input
            yields an empty ``(0, hidden_size)`` tensor instead of raising.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        texts = list(texts)
        if not texts:
            # torch.cat rejects an empty list, so build the empty result directly.
            return torch.empty((0, self.model.config.hidden_size))

        chunks = [
            self._encode(texts[start:start + batch_size])
            for start in range(0, len(texts), batch_size)
        ]
        # Concatenate embeddings into a single tensor
        return torch.cat(chunks, dim=0)

In [25]:
class VectorStore:
    """Flat FAISS inner-product index over fixed-width embedding rows."""

    def __init__(self, dim=768):
        """Create an empty index.

        Args:
            dim: Width of every stored vector. Defaults to 768, the
                embedding dimension pre-set for CodeBERT.
        """
        self.dim = dim
        self.index = faiss.IndexFlatIP(self.dim)  # Use inner product for similarity

    def _as_matrix(self, vectors):
        """Coerce ``vectors`` into a C-contiguous float32 array of shape (n, dim).

        Raises:
            ValueError: If the input cannot be viewed as rows of width ``dim``.
        """
        # Convert PyTorch tensors to NumPy for FAISS compatibility; detach()
        # is required because numpy() refuses tensors that track gradients.
        if isinstance(vectors, torch.Tensor):
            vectors = vectors.detach().cpu().numpy()
        matrix = np.ascontiguousarray(vectors, dtype=np.float32)

        if matrix.size == 0:
            return np.empty((0, self.dim), dtype=np.float32)
        if matrix.ndim == 1:
            # A single vector is treated as a one-row batch.
            matrix = matrix.reshape(1, -1)
        if matrix.ndim != 2 or matrix.shape[1] != self.dim:
            raise ValueError(
                f"expected vectors of shape (n, {self.dim}), got {matrix.shape}"
            )
        return matrix

    def add_vectors(self, embeddings):
        """Append ``embeddings`` (tensor, array or nested list) to the index."""
        matrix = self._as_matrix(embeddings)
        if matrix.shape[0] == 0:
            return  # nothing to add
        self.index.add(matrix)

    def search(self, query_embedding, top_k=5):
        """Return ``(distances, indices)`` of the ``top_k`` best inner products.

        Both arrays have one row per query vector. Per the FAISS documentation,
        slots with no result (fewer than ``top_k`` stored vectors) carry index -1.
        """
        queries = self._as_matrix(query_embedding)
        if queries.shape[0] == 0:
            return (
                np.empty((0, top_k), dtype=np.float32),
                np.empty((0, top_k), dtype=np.int64),
            )
        distances, indices = self.index.search(queries, top_k)
        return distances, indices

In [27]:
class RAGRetriever:
    """Embed texts, store them in a vector index, and look up nearest contexts."""

    def __init__(self, normalize=True):
        """Build the embedder and the vector store.

        Args:
            normalize: When True (default), embeddings are L2-normalized
                before being stored or searched, so the inner-product index
                ranks by cosine similarity instead of letting vector length
                bias the result. Pass False for raw inner-product scores.
        """
        self.embedder = CodeBERTEmbedder()
        self.vector_store = VectorStore()
        self.normalize = normalize

    def _prepare(self, embeddings):
        """Apply the configured normalization to a batch of embedding rows."""
        if not self.normalize:
            return embeddings
        # Stored and query vectors must be scaled the same way for the inner
        # product to equal cosine similarity.
        return torch.nn.functional.normalize(torch.as_tensor(embeddings), p=2, dim=1)

    def add_contexts(self, texts):
        """Embed ``texts`` and append them to the vector store (no-op if empty)."""
        texts = list(texts)
        if not texts:
            return
        # Generate embeddings for the provided contexts and store them
        embeddings = self.embedder.batch_generate_embeddings(texts)
        self.vector_store.add_vectors(self._prepare(embeddings))

    def retrieve_context(self, query, top_k=5):
        """Return ``(distances, indices)`` of the ``top_k`` stored contexts closest to ``query``.

        Indices refer to the order in which contexts were added.
        """
        # Generate query embedding
        query_embedding = self._prepare(self.embedder.generate_embedding(query))
        distances, indices = self.vector_store.search(query_embedding, top_k)
        return distances, indices

In [90]:
class CPChatbot:
    """Answer competitive-programming queries by retrieving the closest stored problem."""

    def __init__(self):
        self.retriever = RAGRetriever()
        self.system_message = "I am here to assist with Competitive Programming problems."
        # Knowledge-base entries in the order they were indexed, so a
        # retrieved index can be mapped back to its record.
        self.problems = []

    @staticmethod
    def _context_text(context):
        """Return the text that should be embedded for one knowledge-base entry."""
        if isinstance(context, str):
            return context
        if isinstance(context, dict):
            # The embedder only accepts strings, so flatten a problem record
            # into its title and description.
            fields = (context.get('title'), context.get('description'))
            return "\n".join(str(field) for field in fields if field)
        return str(context)

    def add_knowledge_base(self, contexts):
        """Index ``contexts`` for retrieval.

        Args:
            contexts: Iterable of plain-text contexts or problem dicts (with
                the keys read by ``chat``: title, description, time_limit,
                memory_limit, tags, solution). Empty input is a no-op.
        """
        contexts = list(contexts)
        if not contexts:
            return
        # Add contexts (editorials or metadata) to the vector store
        self.retriever.add_contexts([self._context_text(entry) for entry in contexts])
        self.problems.extend(contexts)

    def chat(self, query):
        """Return a formatted description of the problem most relevant to ``query``."""
        # Retrieve relevant contexts (indices and distances)
        distances, indices = self.retriever.retrieve_context(query, top_k=1)

        best = int(indices[0][0])
        if best < 0:
            # The index reports -1 when it has nothing to return; without this
            # guard a negative index would silently select the last record.
            return "No relevant problem found."

        record = self.problems[best] if best < len(self.problems) else None
        if not isinstance(record, dict):
            # Entry was indexed as plain text: fall back to the notebook-level
            # `data` list, which must then be ordered like the indexed contexts.
            record = data[best]
        most_relevant_problem = record

        # Construct a response that includes all the details of the most relevant problem
        response = "Most relevant problem:\n"
        response += f"Title: {most_relevant_problem['title']}\n"
        response += f"Description: {most_relevant_problem['description']}\n"
        response += f"Time Limit: {most_relevant_problem['time_limit']}\n"
        response += f"Memory Limit: {most_relevant_problem['memory_limit']}\n"
        response += f"Tags: {', '.join(most_relevant_problem['tags'])}\n"
        response += f"Solution: {most_relevant_problem['solution']}\n"

        return response

In [89]:
# Load the problem records from disk. CPChatbot.chat indexes `data` by
# position and reads the title/description/time_limit/memory_limit/tags/solution
# keys, so this is expected to be a JSON array of problem objects.
with open('Problem-Data.json', mode='r', encoding='utf-8') as file:
    raw_json = file.read()
data = json.loads(raw_json)

In [56]:
chatbot = CPChatbot()
# The tokenizer behind the embedder only accepts strings, so index each
# problem by its title and description rather than passing the raw dicts.
# Order is kept identical to `data` so a retrieved index maps back to its record.
problem_texts = [f"{problem['title']}\n{problem['description']}" for problem in data]
chatbot.add_knowledge_base(problem_texts)

In [None]:
# Ask about a problem by (approximate) name and show the formatted answer.
query = "Can you tell me about Catch the coin problem?"
response = chatbot.chat(query=query)
print(response)

Most relevant problem:
Title: A. Catch the Coin
Description: Monocarp visited a retro arcade club with arcade cabinets. There got curious about the "Catch the Coin" cabinet.
The game is pretty simple. The screen represents a coordinate grid such that:
the X-axis is directed from left to right;
the Y-axis is directed from bottom to top;
the center of the screen has coordinates $$$(0, 0)$$$.
At the beginning of the game, the character is located in the center, and $$$n$$$ coins appear on the screen — the $$$i$$$-th coin is at coordinates $$$(x_i, y_i)$$$. The coordinates of all coins are different and not equal to $$$(0, 0)$$$.
In one second, Monocarp can move the character in one of eight directions. If the character is at coordinates $$$(x, y)$$$, then it can end up at any of the coordinates $$$(x, y + 1)$$$, $$$(x + 1, y + 1)$$$, $$$(x + 1, y)$$$, $$$(x + 1, y - 1)$$$, $$$(x, y - 1)$$$, $$$(x - 1, y - 1)$$$, $$$(x - 1, y)$$$, $$$(x - 1, y + 1)$$$.
If the character ends up at the coord