In [3]:
# INSTALLING THE NEEDED PACKAGES AND LIBRARIES

!pip install -q langchain
!pip install -q torch
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q faiss-cpu
!pip install -U langchain-community
!pip install -U langchain-huggingface
!pip install gradio
!pip install langchain langchain_openai openai
!pip install langchain langchain_openai openai




Collecting langchain-openai
  Downloading langchain_openai-0.3.4-py3-none-any.whl.metadata (2.3 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading langchain_openai-0.3.4-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.7/54.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken, langchain-openai
Successfully installed langchain-openai-0.3.4 tiktoken-0.8.0


In [6]:
#IMPORTING NEEDED PACKAGES

import os
import json
import re

import gradio as gr
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

from langchain_openai import ChatOpenAI

from langchain.schema import Document
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain import HuggingFacePipeline
from langchain_huggingface import HuggingFaceEmbeddings


In [7]:
# Set OpenAI API key
# A key already present in the environment is kept as-is. Otherwise the key is
# requested interactively so that it is never written into the notebook source.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")


In [19]:
# --- Problem statements ---
# Path of the text file that holds the problem statements.
problem_file = "location of the problem file"
#"/content/drive/MyDrive/GDG_Project/GDGChatbot/data/problems.txt"

# The file is cut into records on a line made of 50 dashes.
record_separator = "-" * 50 + "\n"
with open(problem_file, "r", encoding="utf-8") as file:
    content = file.read().split(record_separator)

# Maps problem id -> "<statement> <input specs> <output specs>".
problems = {}
for record in content[:-1]:  # the piece after the last separator is not parsed
    rows = record.split("\n")
    problem_id = rows[0].split(":")[1].strip()  # text after the first ':' on row 0
    statement = rows[2].strip()
    input_specs = rows[4][5:]    # row 4 without its first 5 characters
    output_specs = rows[6][6:]   # row 6 without its first 6 characters
    problems[problem_id] = " ".join((statement, input_specs, output_specs))


# Metadata file loader
def parse_metadata(metadata_file):
    """Read a UTF-8 encoded JSON file and return its decoded content.

    Args:
        metadata_file: Path of the JSON file to read.

    Returns:
        The Python object produced by decoding the file's JSON text.
    """
    with open(metadata_file, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# --- Editorials (tutorials) ---
# Maps problem id -> {"explanation": [lines], "code": [lines]}.
tutorials = {}
tutorial_file = "location of the tutorial file"
#"/content/drive/MyDrive/GDG_Project/GDGChatbot/data/editorials.txt"
with open(tutorial_file, "r", encoding="utf-8") as file:
    editorial_content = file.read().split("-" * 50 + "\n")

# A line shorter than 50 characters that contains one of these words is taken
# as the heading that separates the explanation from the code.
code_heading_keywords = ("code", "Code", "Implementation", "implementation",
                         "Solution", "solution")

for editorial in editorial_content[:-1]:  # the piece after the last separator is not parsed
    lines = editorial.split("\n")
    problem_id = lines[0].split(":")[1].strip()
    full_explanation = lines[1:]

    # Defaults for editorials without a code heading: everything after the
    # first line is explanation and there is no code. Without these defaults
    # the values of the previous editorial would be stored again (or a
    # NameError raised for the very first record).
    explanation = full_explanation[1:]
    Code_block = []

    # `line` / `position` are used instead of `str` / `index` so the builtin
    # `str` is not overwritten for the rest of the notebook.
    for position, line in enumerate(full_explanation):
        is_code_heading = len(line) < 50 and any(
            keyword in line for keyword in code_heading_keywords
        )
        if is_code_heading:
            Code_block = full_explanation[position + 1:]
            explanation = full_explanation[1:position]
            break

    tutorials[problem_id] = {"explanation": explanation, "code": Code_block}


# Load the metadata records from the JSON file.
metadata = parse_metadata("location of the metadata file")
#"/content/drive/MyDrive/GDG_Project/GDGChatbot/data/metadata.json"

# Problem ids, in the order they appear in the metadata.
ids = [record['problem_id'] for record in metadata]

# "key: value" rendering of the first metadata record, joined by spaces.
metadata_text = " ".join(f"{key}: {value}" for key, value in metadata[0].items())



# Combine data for use
# One record per metadata entry, merging the metadata fields with the parsed
# problem statement and the code part of the editorial.
combined_data = []

for item in metadata:
    problem_id = item["problem_id"]

    # Problems without an editorial get an empty tutorial instead of raising
    # KeyError on the missing "code" entry.
    tutorial_code = tutorials.get(problem_id, {}).get("code", [])

    # Did this so it will be easy to get the chunks if user just mentioned problem id in the query:
    # the statement is prefixed with "( <id> <title> ) ".
    problem_content = "( " + problem_id + " " + item["problem_title"] + " ) " + problems.get(problem_id, "")

    combined_data.append({
        "problem_id": problem_id,
        "problem_title": item["problem_title"],
        "time_limit": item["time_limit"],
        "memory_limit": item["memory_limit"],
        "problem_tags": item["problem_tags"],
        "problem_content": problem_content,
        "tutorial": "\n".join(tutorial_code),
    })

In [20]:
# Text splitter: LangChain's recursive character splitter, producing chunks of
# at most 1000 characters that overlap by 150 characters to preserve context.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=1000)

In [27]:
# Chunking logic
#
# For every combined record two kinds of documents are produced, all carrying
# the same metadata (every field except "problem_content" and "tutorial"):
#   1. one short document holding only the problem id and title;
#   2. the chunks of the problem statement produced by `text_splitter`.

documents = []
for item in combined_data:
    # Named `chunk_metadata` so the module-level `metadata` list loaded from
    # the JSON file is not overwritten by this loop.
    chunk_metadata = {key: value for key, value in item.items()
                      if key not in ("problem_content", "tutorial")}
    problem_id = item["problem_id"]
    problem_title = item["problem_title"]

    # 1. Create dedicated chunk for problem ID and title
    id_title_chunk = Document(
        page_content=f"PROBLEM ID: {problem_id}\nTITLE: {problem_title}",
        metadata=chunk_metadata
    )
    documents.append(id_title_chunk)

    # 2. Create chunks for the problem content without the "( <id> <title> ) "
    #    prefix. The exact prefix is removed (rather than splitting on the
    #    first ")") so that a ")" inside the title or the statement cannot
    #    truncate the content.
    id_title_prefix = f"( {problem_id} {problem_title} ) "
    problem_content = item["problem_content"]
    if problem_content.startswith(id_title_prefix):
        problem_content = problem_content[len(id_title_prefix):]
    problem_content = problem_content.strip()

    problem_chunks = text_splitter.split_documents([Document(
        page_content=problem_content,
        metadata=chunk_metadata
    )])
    documents.extend(problem_chunks)



In [30]:
# `docs` is a second name for the `documents` list (same object, no copy).
docs= documents

In [31]:
# Initialising the embedder with a Hugging Face sentence-transformer model
# (CodeBERT was not giving good results, and these models are specifically
# trained for natural language). Runs on CPU, embeddings are not normalised.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=False),
)


In [32]:
# Embed every chunk and store the vectors in a FAISS index (via LangChain).
db = FAISS.from_documents(documents=docs, embedding=embeddings)

In [35]:
# Chat model used to generate the answers.
llm = ChatOpenAI(temperature=0.7, model="gpt-3.5-turbo")

# Conversation memory for the retrieval chain: history is kept under
# "chat_history" as message objects, and only the chain's "answer" output
# is stored.
memory = ConversationBufferMemory(
    output_key="answer",
    return_messages=True,
    memory_key="chat_history",
)

# Prompt Template
# Instruction text sent to the model. It contains three placeholders that are
# filled in at query time: {context}, {question} and {chat_history}.
# The raw string is also formatted directly with str.format in the chat loop,
# so it must not contain any other curly braces.
prompt_template = """You are an intelligent coding assistant that helps users **understand and implement coding solutions step-by-step**.
Your job is to **explain** and **guide** the user, ensuring clarity at every step.

---

### **📝 How You Should Respond:**

🔹 **For First-Time Questions:**
   1️⃣ Explain the problem in simple terms.
   2️⃣ Describe the thought process behind the solution.
   3️⃣ Discuss edge cases and optimizations if relevant.
   4️⃣ If code is involved, guide the user through it step by step rather than just providing it.

🔹 **For Follow-Up Questions (Based on Chat History):**
   - **Do not repeat previous explanations** unless needed.
   - **Answer only what the user has asked** while keeping past context in mind.
   - If the user asks for code explanation, break it down clearly instead of just stating the logic.
   - If the user needs clarification, expand on the specific part they are confused about.

---

### **User Query Details:**
🔹 **Context (Relevant Information):**
{context}

🔹 **User's Question:**
{question}

🔹 **Previous Chat History (for follow-ups):**
{chat_history}

---

### **✏️ Your Answer:**
- **For first-time questions**: Provide a structured, step-by-step explanation.
- **For follow-ups**: Answer **only what is asked**, using previous context to avoid repetition.
- **If unsure, state that clearly** rather than making up an answer.

💡 Always **explain first, then guide**, instead of directing the user to just copy code."""


# LangChain wrapper around the same template, declaring its three input
# variables; this object is what the retrieval chain receives as its prompt.
QA_PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question", "chat_history"]
)

# Conversational retrieval chain (used for the first query only): fetches the
# 4 nearest chunks from the FAISS index, answers with `llm` using QA_PROMPT,
# records the exchange in `memory`, and returns the retrieved documents
# alongside the answer.
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=db.as_retriever(search_kwargs=dict(k=4)),
    combine_docs_chain_kwargs=dict(prompt=QA_PROMPT),
    memory=memory,
    output_key="answer",
    return_source_documents=True,
)

# Full problem records selected for the conversation. They are looked up once,
# for the first question that retrieves something, and reused for follow-ups.
stored_context = None

# Start Chat Loop
print("\n🔹 Chatbot Started! Type 'exit' to stop.\n")

# Initialize chat history: list of (question, answer) tuples
chat_history = []

while True:
    # Get user input; end-of-input or an interrupt ends the chat like "exit"
    try:
        question = input("User: ").strip()
    except (EOFError, KeyboardInterrupt):
        print("\n🔹 Chatbot: Ending conversation. Goodbye! 👋")
        break

    # Nothing typed: ask again without calling the model
    if not question:
        continue

    # Exit condition
    if question.lower() == "exit":
        print("🔹 Chatbot: Ending conversation. Goodbye! 👋")
        break

    try:
        if not stored_context:  # Retrieve until a context has been found
            # Only the retrieved documents are needed here, so the chain's
            # retriever is queried directly. Running the whole chain would
            # also generate an answer that is never shown, costing a second
            # LLM call for the first question.
            source_documents = qa.retriever.invoke(question)

            # Extract retrieved problem IDs
            problem_ids = {doc.metadata["problem_id"] for doc in source_documents}

            # Fetch full problem data (tutorials, code, etc.) from combined_data
            matched_items = [item for item in combined_data if item["problem_id"] in problem_ids]

            # An empty result is not stored, so the next question retrieves again
            if not matched_items:
                print("\n🔹 No relevant documents found. Try rephrasing the question.")
                continue

            stored_context = matched_items

        # Use stored context for follow-up questions (No new retrieval)
        final_prompt = prompt_template.format(
            context=stored_context,
            question=question,
            chat_history=chat_history
        )

        # Get final answer
        final_result = llm.invoke(final_prompt)

        # Update chat history
        chat_history.append((question, final_result.content))

        # Print chatbot response
        print(f"\n🔹 Chatbot: {final_result.content}\n")

    except Exception as e:
        # Best effort: report the failure and keep the chat session alive
        print(f"Error: {repr(e)}")


🔹 Chatbot Started! Type 'exit' to stop.

User: how can i solve coffee break problem?

🔹 Chatbot: To solve the Coffee Break problem, we can approach it using dynamic programming. Here's a step-by-step guide on how to implement the solution:

1. **Understanding the Problem**:
   - In this problem, we are given the number of students around each coffee machine along a corridor.
   - The goal is to find the maximum number of students that can be gathered around a single coffee machine after turning off lights in certain rooms.
   - We can turn off lights in rooms to manipulate the students' positions.

2. **Approach**:
   - We can maintain two arrays, `lhs` and `rhs`, to store the maximum number of students that can be moved to the left and right of each coffee machine respectively.
   - To calculate the maximum number of students that can be moved to the right of each machine, we can use the `get_right_out` function.
   - Similarly, to calculate the maximum number of students that can be