In [1]:
import os
#os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
from sklearn.neighbors import NearestNeighbors
from transformers import AutoTokenizer, AutoModel
import torch
torch.set_num_threads(1)
#import faiss
import numpy as np

In [2]:
import os
#os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
from sklearn.neighbors import NearestNeighbors
from transformers import AutoTokenizer, AutoModel
import torch
torch.set_num_threads(1)
#import faiss
import numpy as np

def load_markdown_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# 3.1 Embedding Generation
class CodeBERTEmbedder:
    def __init__(self, model_name='microsoft/codebert-base'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def generate_embedding(self, text):
        tokens = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = self.model(**tokens)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze()
        return embedding.numpy()

    def batch_generate_embeddings(self, texts, batch_size=2):
        all_embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            tokens = self.tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=128)
            with torch.no_grad():
                outputs = self.model(**tokens)
            embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
            all_embeddings.append(embeddings)
        return np.vstack(all_embeddings)

# 3.2 Vector Store Implementation

class VectorStore:
    def __init__(self, dim=768):
        self.data = []
        self.texts = []

    def add_embeddings(self, texts, embeddings):
        self.data = embeddings
        self.texts = texts
        self.nn = NearestNeighbors(n_neighbors=5, metric='cosine').fit(embeddings)

    def search(self, query_embedding, k=5):
        distances, indices = self.nn.kneighbors([query_embedding], n_neighbors=k)
        results = [(self.texts[idx], 1 - distances[0][i]) for i, idx in enumerate(indices[0])]
        return results

'''class VectorStore:
    def __init__(self, dim=768, index_type='flat_l2'):
        if index_type == 'flat_ip':
            self.index = faiss.IndexFlatIP(dim)
        elif index_type == 'flat_l2':
            self.index = faiss.IndexFlatL2(dim)
        else:
            raise ValueError("Unsupported index type")
        self.id_to_text = {}

    def add_embeddings(self, texts, embeddings):
        if isinstance(embeddings, torch.Tensor):
            embeddings = embeddings.detach().numpy()
        self.index.add(embeddings)
        for idx, text in enumerate(texts):
            self.id_to_text[len(self.id_to_text)] = text

    def search(self, query_embedding, k=5):
        if isinstance(query_embedding, torch.Tensor):
            query_embedding = query_embedding.detach().numpy()
        if len(query_embedding.shape) == 1:
           query_embedding = query_embedding.reshape(1, -1)  # Ensure 2D array
           distances, indices = self.index.search(query_embedding, k)
           results = [(self.id_to_text[idx], distances[0][i]) for i, idx in enumerate(indices[0])]
           return results'''


# 3.3 RAG Implementation
class RAGRetriever:
    def __init__(self, embedder, vector_store):
        self.embedder = embedder
        self.vector_store = vector_store

    def retrieve_context(self, query, top_k=5):
        query_embedding = self.embedder.generate_embedding(query)
        results = self.vector_store.search(query_embedding, k=top_k)
        return results

# 3.4 Chatbot Integration
class CPChatbot:
    def __init__(self, retriever, system_message):
        self.retriever = retriever
        self.system_message = system_message

    def chat(self, user_query):
        contexts = self.retriever.retrieve_context(user_query)
        response = f"System: {self.system_message}\n\n"
        response += "\n".join([f"Context {i+1}: {context}" for i, (context, _) in enumerate(contexts)])
        return response

# 4 Example Usage
if __name__ == "__main__":
    # Initialize components
    embedder = CodeBERTEmbedder()
    vector_store = VectorStore()
    retriever = RAGRetriever(embedder, vector_store)

    # Load Markdown file and split it into lines or paragraphs
    file_path = "C:/Users/91902/Desktop/combined.md"
    markdown_content = load_markdown_file(file_path)
    documents = [doc.strip() for doc in markdown_content.split("\n") if doc.strip()]

    # Add data from the Markdown file
    embeddings = embedder.batch_generate_embeddings(documents)
    vector_store.add_embeddings(documents, embeddings)

    # Create chatbot
    system_message = (
        "I am solving a Competitive Programming problem, and I need help understanding its editorial.\n"
        "Answer my questions regarding the editorial.\n"
        "Let me know if I'm misunderstanding anything.\n"
        "Do not write or debug code."
    )
    chatbot = CPChatbot(retriever, system_message)

    # Chat example
    user_query = input("Enter your query: ")  # Accept user input dynamically
    print(chatbot.chat(user_query))



Enter your query:  How do I solve problem C from Contest#792?


System: I am solving a Competitive Programming problem, and I need help understanding its editorial.
Answer my questions regarding the editorial.
Let me know if I'm misunderstanding anything.
Do not write or debug code.

Context 1: Gmail was in beta for 5 years.  What's the rush?
Context 2: Hmm... Codeforces is not beta anymore. May the picture be changed?
Context 3: Have you yourself written translation of this entry into English?
Context 4: Why doesnt this post have any upvote or downvote @@
Context 5: Hi. Though I don't take this blog a proper place for making new friends...
