In [None]:
# Uncomment the two lines below only when running in Google Colab
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# Install everything from requirements.txt
!pip install -r requirements.txt

Collecting bitsandbytes (from -r requirements.txt (line 1))
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting langchain_community (from -r requirements.txt (line 2))
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain_openai (from -r requirements.txt (line 3))
  Downloading langchain_openai-0.3.18-py3-none-any.whl.metadata (2.3 kB)
Collecting pypdf (from -r requirements.txt (line 4))
  Downloading pypdf-5.5.0-py3-none-any.whl.metadata (7.2 kB)
Collecting unstructured (from -r requirements.txt (line 7))
  Downloading unstructured-0.17.2-py3-none-any.whl.metadata (24 kB)
Collecting gradio (from -r requirements.txt (line 8))
  Downloading gradio-5.32.0-py3-none-any.whl.metadata (16 kB)
Collecting python-docx (from -r requirements.txt (line 9))
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community->-r requirements.txt (li

In [3]:
# Import libraries
import os
import yaml
import torch
import gradio as gr
from pathlib import Path

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline as hf_pipeline,
)
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from huggingface_hub import login as hf_login
from langchain.document_loaders import UnstructuredWordDocumentLoader, UnstructuredXMLLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.chat_models import ChatOpenAI
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_core.runnables import RunnableMap

# Load the application settings from config.yaml in the working directory.
# The rest of this cell reads the "data", "chunking", "embeddings", "llm",
# "prompts" and "retrieval" sections of this mapping.
config_path = "config.yaml"

# Open as UTF-8 explicitly so the prompt text parses the same way regardless of
# the platform's default locale encoding.
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)


# Helpers to load backend docs & chunk them
def _source_files(location, pattern):
    """Return the files to load for one configured location, in sorted order.

    Args:
        location: path (str or Path) taken from the ``data`` section of the config.
        pattern: glob pattern (e.g. ``"*.pdf"``) applied when ``location`` is a folder.

    Returns:
        ``[location]`` when it is a single file, the sorted glob matches when it
        is a folder, and an empty list when the path does not exist.
    """
    root = Path(location)
    if root.is_file():
        return [root]
    if root.is_dir():
        # Sort so the chunk order does not depend on the order the file system
        # happens to list the directory in.
        return sorted(root.glob(pattern))
    return []


def load_and_split_documents():
    """Load every configured source document and split it into chunks.

    Reads three optional entries from ``config["data"]``; each may point at a
    single file or at a folder:

    * ``xml_folder`` -> ``*.xml``  files, loaded with ``UnstructuredXMLLoader``
    * ``word_doc``   -> ``*.docx`` files, loaded with ``UnstructuredWordDocumentLoader``
    * ``pdf``        -> ``*.pdf``  files, loaded with ``PyPDFLoader``

    Missing/empty entries and non-existent paths are skipped.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``
        using ``chunk_size`` / ``chunk_overlap`` from ``config["chunking"]``.
    """
    # (config key, glob pattern, loader class) for each supported document type.
    sources = (
        ("xml_folder", "*.xml", UnstructuredXMLLoader),
        ("word_doc", "*.docx", UnstructuredWordDocumentLoader),
        ("pdf", "*.pdf", PyPDFLoader),
    )

    data_cfg = config["data"]
    docs_list = []
    for key, pattern, loader_cls in sources:
        location = data_cfg.get(key)
        if not location:
            continue
        for file_path in _source_files(location, pattern):
            # Loaders take one file each; pass a plain str so every loader gets
            # the same argument type.
            docs_list.extend(loader_cls(str(file_path)).load())

    # Chunk
    chunk_cfg = config["chunking"]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_cfg["chunk_size"],
        chunk_overlap=chunk_cfg["chunk_overlap"],
    )
    return splitter.split_documents(docs_list)

# Create RAG function
def initialize_rag(hf_token):
    """
    Build every component of the RAG pipeline for one user session.

    1) Validate the token, store it in the environment and log in to the HF hub.
    2) Embed the chunked documents into an InMemoryVectorStore and create a
       retriever that returns ``config["retrieval"]["top_k"]`` documents.
    3) Load the configured LLM (Mistral 7B per the original notes) via
       Transformers + bitsandbytes 4-bit quantization.
    4) Build PromptTemplate + LLMChain and prepend the retrieval step.

    Args:
        hf_token: Hugging Face access token as typed by the user; surrounding
            whitespace is ignored.

    Returns:
        dict with keys "vectorstore", "retriever" and "llm_chain". "llm_chain"
        is the full retrieval + generation runnable; it is invoked with
        ``{"question": ...}``.

    Raises:
        ValueError: if the token is blank, or if the configured embeddings type
            or LLM provider is not "huggingface".
    """
    token = (hf_token or "").strip()
    if not token:
        raise ValueError("A Hugging Face token is required.")

    # Validate the configuration before any expensive download / model load.
    emb_cfg = config["embeddings"]
    if emb_cfg["type"] != "huggingface":
        raise ValueError("Unsupported embeddings type; must be 'huggingface'")
    llm_cfg = config["llm"]
    if llm_cfg["provider"] != "huggingface":
        raise ValueError("Unsupported LLM provider; must be 'huggingface'")

    retrieval_cfg = config["retrieval"]
    top_k = retrieval_cfg["top_k"]
    max_characters = retrieval_cfg["max_context_chars"]

    # Set env variables for HF
    os.environ["HUGGINGFACE_TOKEN"] = token
    hf_login(token=token, add_to_git_credential=False)

    # Build embeddings
    embedding_model = HuggingFaceEmbeddings(model_name=emb_cfg["model_name"])

    # Load & split documents
    splits = load_and_split_documents()

    # Build InMemoryVectorStore from splits. The retriever is told how many
    # documents to return; slicing the default-sized result afterwards would
    # silently cap top_k at the retriever's default.
    vectorstore = InMemoryVectorStore.from_documents(splits, embedding_model)
    retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

    # Load & quantize the llm
    model_id = llm_cfg["model_id"]
    quant_cfg = llm_cfg["quantization"]
    if quant_cfg.get("compute_dtype") == "float16":
        compute_dtype = torch.float16
    else:
        compute_dtype = torch.float32

    bnb_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=quant_cfg.get("double_quant", False),
        bnb_4bit_quant_type=quant_cfg.get("quant_type", "nf4"),
    )
    # Pass the token explicitly; `use_auth_token` is deprecated in Transformers.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        quantization_config=bnb_cfg,
        token=token,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)

    hf_pipe = hf_pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
        device_map="auto",
        # Only the generated continuation is returned, not the prompt.
        return_full_text=False
    )
    llm = HuggingFacePipeline(pipeline=hf_pipe)

    # Build the PromptTemplate
    prompts_cfg = config["prompts"]
    instruction_text = prompts_cfg["instruction"].strip()
    example_question = prompts_cfg["format_example"].strip()
    example_answer = prompts_cfg['format_answer'].strip()

    prompt_str = """
{instruction_text}

Question:
{example_question}

Answer:
{example_answer}


––––––––––––––––––––––––––––––––––––––––––––––––

Now use ONLY the CONTEXT below to answer the QUESTION. Follow the same format as the example.

Context:
{context}

Question:
{question}

Answer:
"""
    prompt_template = PromptTemplate(
        input_variables=["instruction_text", "example_question", "example_answer", "context", "question"],
        template=prompt_str
    )
    llm_chain = LLMChain(llm=llm, prompt=prompt_template)

    def build_limited_context(inputs: dict) -> str:
        """Retrieve documents for the question and join them, capped in length."""
        docs = retriever.invoke(inputs["question"])
        combined = "\n\n".join(doc.page_content for doc in docs)
        if len(combined) > max_characters:
            # Cut at the character budget, then back up to the last newline so
            # the context does not end in the middle of a line.
            combined = combined[:max_characters].rsplit("\n", 1)[0]
        return combined

    # Wrap retrieval + prompt/LLM into a RunnableMap: every prompt variable is
    # produced from the single {"question": ...} input.
    qa_chain = RunnableMap({
        "instruction_text": lambda _: instruction_text,
        "example_question": lambda _: example_question,
        "example_answer": lambda _: example_answer,
        "context": build_limited_context,
        "question": lambda x: x["question"],
    }) | llm_chain

    # Return all pieces in a single dict
    return {
        "vectorstore": vectorstore,
        "retriever": retriever,
        "llm_chain": qa_chain
    }

# “answer_question” callback uses the chain in state

def answer_question(state, user_question: str) -> str:
    """
    Run the stored QA chain on the user's question and return the answer text.

    state: the dict built by initialize_rag(...), whose 'llm_chain' entry is
           invoked with {"question": user_question}; None means the app has
           not been initialized yet.
    user_question: raw text from the query box; blank input is rejected.
    """
    not_initialized = state is None
    if not_initialized:
        return "❗ Please click ‘Initialize’ after entering your tokens."

    question_is_blank = user_question.strip() == ""
    if question_is_blank:
        return "Please enter a question."

    chain_output = state["llm_chain"].invoke({"question": user_question})

    # The chain's result is a mapping; the generated answer lives under "text".
    return chain_output.get("text", "No answer returned.")


# Gradio UI
with gr.Blocks(title="InMemory RAG + User Tokens") as demo:
    gr.Markdown("## Enter Your API Token")
    with gr.Row():
        hf_input = gr.Textbox(
            label="Hugging Face Token",
            placeholder="hf_…",
            type="password",
            interactive=True,
        )

    init_btn = gr.Button("Initialize RAG App")
    # Per-session holder for the dict returned by initialize_rag(); None until
    # initialization has succeeded.
    state = gr.State(None)
    init_output = gr.Textbox(label="Initialization Status", interactive=False)

    def on_init_clicked(hf_token):
        """Build the RAG pipeline and return (new state, status message).

        Any failure is reported in the status box and leaves the state as None
        so the question handler keeps asking the user to initialize.
        """
        try:
            new_state = initialize_rag(hf_token)
            return new_state, "✅ Initialization successful! You can now prompt the application."
        except Exception as e:
            return None, f"❌ Initialization failed: {e}"

    init_btn.click(
        fn=on_init_clicked,
        inputs=[hf_input],
        outputs=[state, init_output],
    )

    gr.Markdown("## Enter Student Information")

    question_input = gr.Textbox(
        label="Query",
        placeholder="Enter student information and assessment results if available",
        lines=2,
        interactive=True,
    )
    submit_btn = gr.Button("Submit")

    answer_output = gr.Textbox(label="Answer", interactive=False)
    status_output = gr.Markdown("")  # shows loading or status

    # Wrapped version of answer_question with loading indicator
    def on_question_submit(state, user_question):
        """Yield (answer, status) updates for one submitted question.

        Written as a generator so the "Processing..." status reaches the page
        before the (slow) generation call starts; the final yield replaces it
        with either the answer or an error message.
        """
        if state is None:
            yield "", "Please click ‘Initialize’ after entering your tokens."
            return
        if not user_question.strip():
            yield "", "Please enter a question."
            return

        yield "", "Processing..."
        try:
            result = answer_question(state, user_question)
        except Exception as e:
            yield "", f"Error: {e}"
        else:
            yield result, ""

    # The Submit button and pressing Enter in the query box share one wiring.
    submit_wiring = dict(
        fn=on_question_submit,
        inputs=[state, question_input],
        outputs=[answer_output, status_output],
    )
    submit_btn.click(**submit_wiring)
    question_input.submit(**submit_wiring)

# share=True publishes the app on a public gradio.live URL, and this app accepts
# Hugging Face tokens: share the link only with people you trust.
# debug=True keeps the cell running so errors and logs stay visible.
demo.launch(share=True, debug=True)



Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://90cbcd04dbd06ce090.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://90cbcd04dbd06ce090.gradio.live


