In [6]:
import os
import warnings

import numpy as np
import torch
from sklearn.neighbors import NearestNeighbors
from transformers import AutoTokenizer, AutoModel

In [7]:
# Restrict PyTorch to a single thread. The author's stated intent is better
# efficiency for the small-scale workloads in this notebook (not measured here).
torch.set_num_threads(1)

# Load text files from a directory
def load_text_files_from_directory(directory):
    """Read every ``.txt`` file that sits directly inside ``directory``.

    Entries are visited in sorted (lexicographic) file-name order so the
    result is deterministic. ``os.listdir`` makes no ordering guarantee, and
    callers that pair the results of two directories by position need both
    listings to come back in the same, reproducible order.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list with the whitespace-stripped contents of each ``.txt`` file,
        ordered by file name. Empty if the directory has no ``.txt`` files.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    files_content = []
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        # isfile() filters out sub-directories that happen to end in ".txt".
        if os.path.isfile(file_path) and file_path.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as file:
                files_content.append(file.read().strip())
    return files_content

In [4]:
# Combine problem statements and editorials
def combine_problems_and_editorials(problem_statements_path, editorials_path):
    """Build one retrieval document per (problem statement, editorial) pair.

    Both directories are loaded with ``load_text_files_from_directory`` and
    the two resulting lists are paired by position, so the i-th problem is
    combined with the i-th editorial.

    Args:
        problem_statements_path: Directory holding the problem statement files.
        editorials_path: Directory holding the editorial files.

    Returns:
        A list of strings of the form
        ``"Problem: <statement>\\n\\nEditorial: <editorial>"``.

    Warns:
        UserWarning: If the two directories yield a different number of
            documents. ``zip`` stops at the shorter list, so the surplus
            documents would otherwise be dropped without any indication.
    """
    problems = load_text_files_from_directory(problem_statements_path)
    editorials = load_text_files_from_directory(editorials_path)

    if len(problems) != len(editorials):
        warnings.warn(
            f"Found {len(problems)} problem statement(s) but {len(editorials)} "
            "editorial(s); unmatched documents are ignored and the remaining "
            "pairs may be misaligned.",
            stacklevel=2,
        )

    combined = [f"Problem: {p}\n\nEditorial: {e}" for p, e in zip(problems, editorials)]
    return combined

In [3]:
# Embedding Generation
class CodeBERTEmbedder:
    """Turn text into fixed-size vectors by mean-pooling a transformer's output.

    The tokenizer and model are loaded with ``AutoTokenizer`` / ``AutoModel``
    from ``model_name`` (``'microsoft/codebert-base'`` by default). An
    embedding is the average of ``last_hidden_state`` over the sequence axis,
    counting only positions the tokenizer marks as real tokens in
    ``attention_mask``.
    """

    def __init__(self, model_name='microsoft/codebert-base'):
        """Load the tokenizer and model identified by ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    @staticmethod
    def _masked_mean(last_hidden_state, attention_mask):
        """Average hidden states over the sequence axis, skipping padding.

        A plain ``mean(dim=1)`` also averages the vectors at padded positions,
        which makes a text's embedding depend on the longest text in its batch.
        Weighting by the attention mask removes that dependency.

        Assumes ``last_hidden_state`` is (batch, seq_len, hidden) and
        ``attention_mask`` is (batch, seq_len) with 1 for real tokens.
        """
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
        summed = (last_hidden_state * mask).sum(dim=1)
        # clamp guards against division by zero for an all-padding row.
        counts = mask.sum(dim=1).clamp(min=1.0)
        return summed / counts

    def generate_embedding(self, text):
        """Embed a single text.

        No explicit ``max_length`` is passed here, so truncation falls back to
        the tokenizer's own limit (unlike ``batch_generate_embeddings``).

        Args:
            text: The string to embed.

        Returns:
            A NumPy array holding the pooled embedding with singleton
            dimensions squeezed away (1-D for a single string).
        """
        tokens = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = self.model(**tokens)
            pooled = self._masked_mean(outputs.last_hidden_state, tokens['attention_mask'])
        return pooled.squeeze().numpy()

    def batch_generate_embeddings(self, texts, batch_size=2, max_length=128):
        """Embed many texts, running the model on ``batch_size`` texts at a time.

        Args:
            texts: Non-empty sequence of strings.
            batch_size: Number of texts per forward pass; must be >= 1.
            max_length: Token limit each text is truncated to (default 128,
                the value this method has always used).

        Returns:
            A 2-D NumPy array with one row per input text, in input order.

        Raises:
            ValueError: If ``texts`` is empty or ``batch_size`` is below 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        if len(texts) == 0:
            raise ValueError("texts must contain at least one document to embed")

        all_embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            tokens = self.tokenizer(
                batch,
                return_tensors='pt',
                truncation=True,
                padding=True,
                max_length=max_length,
            )
            with torch.no_grad():
                outputs = self.model(**tokens)
                pooled = self._masked_mean(outputs.last_hidden_state, tokens['attention_mask'])
            all_embeddings.append(pooled.numpy())
        return np.vstack(all_embeddings)

In [2]:
# Vector Store Implementation
class VectorStore:
    """In-memory nearest-neighbour index over document embeddings.

    Uses scikit-learn's ``NearestNeighbors`` with the cosine metric. Search
    results report ``1 - cosine distance`` as the score.
    """

    def __init__(self):
        self.data = []
        self.texts = []
        # Fitted NearestNeighbors index; stays None until add_embeddings runs.
        self.nn = None

    def add_embeddings(self, texts, embeddings):
        """Index ``embeddings`` and remember the text each row belongs to.

        Despite the name, this replaces any previously stored texts and
        embeddings rather than appending to them.

        Args:
            texts: Sequence of documents, one per embedding row.
            embeddings: Embedding rows, aligned with ``texts``.

        Raises:
            ValueError: If ``texts`` and ``embeddings`` differ in length
                (search would otherwise map neighbours to the wrong text or
                index past the end of ``texts``).
        """
        if len(texts) != len(embeddings):
            raise ValueError(
                f"texts ({len(texts)}) and embeddings ({len(embeddings)}) must have the same length"
            )
        self.data = embeddings
        self.texts = texts
        self.nn = NearestNeighbors(n_neighbors=1, metric='cosine').fit(embeddings)

    def search(self, query_embedding, k=1):
        """Return up to ``k`` stored texts closest to ``query_embedding``.

        Args:
            query_embedding: A single embedding vector.
            k: Maximum number of neighbours to return. Values larger than the
                number of stored texts are clamped to that number.

        Returns:
            A list of ``(text, score)`` tuples, where ``score`` is
            ``1 - cosine distance``, in the order returned by ``kneighbors``.
            Empty if nothing has been indexed yet.
        """
        if self.nn is None or len(self.texts) == 0:
            return []
        # Asking for more neighbours than indexed samples makes kneighbors raise.
        k = min(k, len(self.texts))
        distances, indices = self.nn.kneighbors([query_embedding], n_neighbors=k)
        results = [(self.texts[idx], 1 - distances[0][i]) for i, idx in enumerate(indices[0])]
        return results

In [1]:
# RAG Implementation
class RAGRetriever:
    """Glue between an embedder and a vector store for context retrieval."""

    def __init__(self, embedder, vector_store):
        """Keep references to the embedder and the vector store to query."""
        self.embedder, self.vector_store = embedder, vector_store

    def retrieve_context(self, query, top_k=1):
        """Embed ``query`` and return the store's search results for it.

        Args:
            query: Text to look up.
            top_k: Forwarded to the store's ``search`` as ``k``.

        Returns:
            Whatever ``vector_store.search`` returns for the query embedding.
        """
        return self.vector_store.search(
            self.embedder.generate_embedding(query),
            k=top_k,
        )

In [8]:
# Chatbot Integration
class CPChatbot:
    """Builds a prompt-style response from a system message and retrieved context."""

    def __init__(self, retriever, system_message):
        """Store the retriever used for lookups and the fixed system message."""
        self.retriever = retriever
        self.system_message = system_message

    def chat(self, user_query):
        """Return the system message followed by the best-matching context.

        Only the first retrieved result is used. The user query itself is not
        echoed in the response; it only drives retrieval.

        Args:
            user_query: The user's question, passed to the retriever.

        Returns:
            ``"System: <system_message>\\n\\nContext: <text>"``, where ``<text>``
            is the text of the first retrieved result, or a placeholder when
            the retriever returns nothing.
        """
        contexts = self.retriever.retrieve_context(user_query)
        response = f"System: {self.system_message}\n\n"
        if contexts:
            response += f"Context: {contexts[0][0]}"
        else:
            # An empty result used to surface as an IndexError on contexts[0].
            response += "Context: (no matching context found)"
        return response

# Main script
if __name__ == "__main__":
    # Set paths. The CP_EDITORIALS_PATH / CP_PROBLEM_STATEMENTS_PATH environment
    # variables override the defaults, so the notebook can run on another machine
    # without editing this cell; the defaults are the original local paths.
    EDITORIALS_PATH = os.environ.get(
        "CP_EDITORIALS_PATH",
        r"C:\Users\Aaryan\Desktop\Chatbot2\data\editorials",
    )
    PROBLEM_STATEMENTS_PATH = os.environ.get(
        "CP_PROBLEM_STATEMENTS_PATH",
        r"C:\Users\Aaryan\Desktop\Chatbot2\data\problem_statements",
    )

    # Combine problem statements and editorials
    documents = combine_problems_and_editorials(PROBLEM_STATEMENTS_PATH, EDITORIALS_PATH)
    if not documents:
        # Fail early with a readable message instead of deep inside the embedder.
        raise ValueError(
            "No problem/editorial documents were found in "
            f"{PROBLEM_STATEMENTS_PATH!r} and {EDITORIALS_PATH!r}"
        )

    # Initialize components
    embedder = CodeBERTEmbedder()
    vector_store = VectorStore()
    retriever = RAGRetriever(embedder, vector_store)

    # Generate embeddings and populate the vector store
    embeddings = embedder.batch_generate_embeddings(documents)
    vector_store.add_embeddings(documents, embeddings)

    # Create chatbot
    system_message = (
        "I am solving a Competitive Programming problem, and I need help understanding its editorial.\n"
        "Answer my questions regarding the editorial.\n"
    )
    chatbot = CPChatbot(retriever, system_message)

    # Single query input
    user_query = input("Enter your query: ")
    print(chatbot.chat(user_query))

System: I am solving a Competitive Programming problem, and I need help understanding its editorial.
Answer my questions regarding the editorial.


Context: Problem: You are given a positive integer n.
In this problem, the MEX of a collection of integers c1,c2,…,ck is defined as the smallest positive integer x which does not occur in the collection c. 
The primality of an array a1,…,an is defined as the number of pairs (l,r) such that 1≤l≤r≤n and MEX(al,…,ar) is a prime number. 
Find any permutation of 1,2,…,n with the maximum possible primality among all permutations of 1,2,…,n. 
Note: 
  A prime number is a number greater than or equal to 2 that is not divisible by any positive integer except 1 and itself. For example, 2,5,13 are prime numbers, but 1 and 6 are not prime numbers.  A permutation of 1,2,…,n is an array consisting of n distinct integers from 1 to n in arbitrary order. For example, [2,3,1,5,4] is a permutation, but [1,2,2] is not a permutation (2 appears twice in the arra