In [None]:
# Uncomment the two lines below only when running in Google Colab with the data stored in Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# Clone the GitHub repo; later cells read end-2-end-nlp/requirements.txt and
# end-2-end-nlp/config.yaml from the cloned folder.
# Comment this line out if the repo has already been cloned into the working directory.
!git clone https://github.com/cdietrich03/end-2-end-nlp.git

Cloning into 'end-2-end-nlp'...
remote: Enumerating objects: 42, done.[K
remote: Counting objects: 100% (42/42), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 42 (delta 12), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (42/42), 8.74 MiB | 9.37 MiB/s, done.
Resolving deltas: 100% (12/12), done.


In [2]:
# Install everything from requirements.txt (the file lives inside the cloned end-2-end-nlp folder)
# NOTE(review): the install log requests packages without version constraints —
# consider pinning versions in requirements.txt so a re-run resolves the same set.
!pip install -r end-2-end-nlp/requirements.txt

Collecting bitsandbytes (from -r end-2-end-nlp/requirements.txt (line 1))
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting langchain_community (from -r end-2-end-nlp/requirements.txt (line 2))
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain_openai (from -r end-2-end-nlp/requirements.txt (line 3))
  Downloading langchain_openai-0.3.18-py3-none-any.whl.metadata (2.3 kB)
Collecting pypdf (from -r end-2-end-nlp/requirements.txt (line 4))
  Downloading pypdf-5.5.0-py3-none-any.whl.metadata (7.2 kB)
Collecting unstructured (from -r end-2-end-nlp/requirements.txt (line 7))
  Downloading unstructured-0.17.2-py3-none-any.whl.metadata (24 kB)
Collecting gradio (from -r end-2-end-nlp/requirements.txt (line 8))
  Downloading gradio-5.32.0-py3-none-any.whl.metadata (16 kB)
Collecting python-docx (from -r end-2-end-nlp/requirements.txt (line 9))
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (

In [3]:
# Import libraries
import os
import yaml
import torch
import gradio as gr
from pathlib import Path

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline as hf_pipeline,
)
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from huggingface_hub import login as hf_login
from langchain.document_loaders import UnstructuredWordDocumentLoader, UnstructuredXMLLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.chat_models import ChatOpenAI
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_core.runnables import RunnableMap

# Load config.yaml
# The encoding is stated explicitly so that non-ASCII text in the config
# is decoded the same way regardless of the platform's default locale.
config_path = "end-2-end-nlp/config.yaml"

with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)


# Helper to load backend docs and chunk them
def load_and_split_documents():
    """Load every configured source document and split the result into chunks.

    Reads the optional ``xml_folder``, ``word_doc`` and ``pdf`` entries of
    ``config["data"]``. Each entry may point at a directory (every file with
    the matching extension directly inside it is loaded) or at a single file.
    Missing entries and paths that do not exist are skipped.

    Chunking uses ``config["chunking"]["chunk_size"]`` and
    ``config["chunking"]["chunk_overlap"]``.

    Returns:
        The value returned by ``RecursiveCharacterTextSplitter.split_documents``
        for all loaded documents (XML first, then Word, then PDF).
    """
    data_cfg = config["data"]

    def _source_files(location, pattern):
        # Resolve one configured location to a concrete list of files.
        if not location:
            return []
        path = Path(location)
        if path.is_dir():
            # Sorted so the load order does not depend on filesystem ordering.
            return sorted(path.glob(pattern))
        if path.is_file():
            return [path]
        return []

    # (config key, filename pattern, loader class) — the loaders take one file
    # each, so folders are expanded to files before a loader is built.
    sources = (
        ("xml_folder", "*.xml", UnstructuredXMLLoader),
        ("word_doc", "*.docx", UnstructuredWordDocumentLoader),
        ("pdf", "*.pdf", PyPDFLoader),
    )

    docs_list = []
    for key, pattern, loader_cls in sources:
        for file_path in _source_files(data_cfg.get(key), pattern):
            # Pass a plain string path, which every loader version accepts.
            docs_list.extend(loader_cls(str(file_path)).load())

    # Chunk
    chunk_cfg = config["chunking"]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_cfg["chunk_size"],
        chunk_overlap=chunk_cfg["chunk_overlap"],
    )
    return splitter.split_documents(docs_list)

# Create RAG function
def initialize_rag(hf_token):
    """Authenticate with Hugging Face and assemble the RAG pipeline from ``config``.

    Steps: log in to the Hub, build the embedding model, load and chunk the
    documents, index them in an ``InMemoryVectorStore``, load the causal LM
    with 4-bit quantization, and wire retrieval + prompt + LLM into one chain.

    Args:
        hf_token: Hugging Face access token; surrounding whitespace is ignored.

    Returns:
        dict with the keys ``"vectorstore"``, ``"retriever"`` and
        ``"llm_chain"``. ``"llm_chain"`` is the complete QA runnable; invoke it
        with ``{"question": ...}`` (``answer_question`` reads the answer from
        the ``"text"`` entry of its result).

    Raises:
        ValueError: if ``config["embeddings"]["type"]`` or
            ``config["llm"]["provider"]`` is not ``"huggingface"``.
    """
    token = hf_token.strip()

    # Validate the configuration up front so an unsupported setting fails
    # before any download, document loading or indexing starts.
    emb_cfg = config["embeddings"]
    if emb_cfg["type"] != "huggingface":
        raise ValueError("Unsupported embeddings type; must be 'huggingface'")
    llm_cfg = config["llm"]
    if llm_cfg["provider"] != "huggingface":
        raise ValueError("Unsupported LLM provider; must be 'huggingface'")

    # Retrieval limits: number of chunks to fetch and cap on context length.
    retrieval_cfg = config["retrieval"]
    top_k = retrieval_cfg["top_k"]
    max_characters = retrieval_cfg["max_context_chars"]

    # Set env variables for HF
    os.environ["HUGGINGFACE_TOKEN"] = token
    hf_login(token=token, add_to_git_credential=False)

    # Build embeddings
    embedding_model = HuggingFaceEmbeddings(model_name=emb_cfg["model_name"])

    # Load and split documents
    splits = load_and_split_documents()

    # Build InMemoryVectorStore from splits. The retriever is asked for top_k
    # results directly; with the default k a larger top_k could never be met.
    vectorstore = InMemoryVectorStore.from_documents(splits, embedding_model)
    retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

    # Load and quantize llm
    model_id = llm_cfg["model_id"]
    # A missing "quantization" section falls back to the defaults below.
    quant_cfg = llm_cfg.get("quantization") or {}
    if quant_cfg.get("compute_dtype") == "float16":
        compute_dtype = torch.float16
    else:
        compute_dtype = torch.float32

    bnb_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=quant_cfg.get("double_quant", False),
        bnb_4bit_quant_type=quant_cfg.get("quant_type", "nf4"),
    )
    # `token` replaces the deprecated `use_auth_token` argument.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        quantization_config=bnb_cfg,
        token=token,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
    hf_pipe = hf_pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
        device_map="auto",
        return_full_text=False
    )
    llm = HuggingFacePipeline(pipeline=hf_pipe)

    # Build the PromptTemplate
    prompts_cfg = config["prompts"]
    instruction_text = prompts_cfg["instruction"].strip()
    example_question = prompts_cfg["format_example"].strip()
    example_answer = prompts_cfg['format_answer'].strip()

    prompt_str = """
{instruction_text}

Question:
{example_question}

Answer:
{example_answer}


––––––––––––––––––––––––––––––––––––––––––––––––

Now use ONLY the CONTEXT below to answer the QUESTION. Follow the same format as the example.

Context:
{context}

Question:
{question}

Answer:
"""
    prompt_template = PromptTemplate(
        input_variables=["instruction_text", "example_question", "example_answer", "context", "question"],
        template=prompt_str
    )
    # LLMChain is kept because answer_question reads the "text" key it returns.
    llm_chain = LLMChain(llm=llm, prompt=prompt_template)

    # Retrieve the top_k most relevant chunks and join them into one context
    # string of at most max_characters characters.
    def build_limited_context(inputs: dict) -> str:
        docs = retriever.invoke(inputs["question"])[:top_k]
        combined = "\n\n".join(doc.page_content for doc in docs)
        if len(combined) > max_characters:
            # Cut at the last newline inside the limit so the context does not
            # end in the middle of a line (when a newline is available).
            combined = combined[:max_characters].rsplit("\n", 1)[0]
        return combined

    # Create qa chain to combine all segments of prompt
    qa_chain = RunnableMap({
        "instruction_text": lambda _: instruction_text,
        "example_question": lambda _: example_question,
        "example_answer": lambda _: example_answer,
        "context": build_limited_context,
        "question": lambda x: x["question"],
    }) | llm_chain

    # Return all pieces in a single dict
    return {
        "vectorstore": vectorstore,
        "retriever": retriever,
        "llm_chain": qa_chain
    }

# “answer_question” callback uses the chain in state

def answer_question(state, user_question: str) -> str:
    """
    Answer a user question with the chain stored in the app state.

    state: dict containing 'llm_chain' from initialize_rag(...), or None when
        the app has not been initialized yet.
    user_question: the raw text typed by the user.

    Returns the chain result's "text" entry, or a short guidance message when
    the state is missing, the question is blank, or no text came back.
    """
    # Guard: nothing to query until initialization has produced a state.
    if state is None:
        return "❗ Please click ‘Initialize’ after entering your tokens."

    # Guard: whitespace-only input is treated as no question at all.
    question_is_blank = user_question.strip() == ""
    if question_is_blank:
        return "Please enter a question."

    chain_output = state["llm_chain"].invoke({"question": user_question})
    answer_text = chain_output.get("text", "No answer returned.")
    return answer_text


# Gradio UI
with gr.Blocks(title="InMemory RAG + User Tokens") as demo:
    gr.Markdown("## Enter Your API Token")
    with gr.Row():
        # Password-type textbox so the token is masked while it is typed
        hf_input = gr.Textbox(
            label="Hugging Face Token",
            placeholder="hf_…",
            type="password",
            interactive=True,
        )

    # Initializing app
    init_btn = gr.Button("Initialize RAG App")
    # Holds the dict returned by initialize_rag; None until initialization succeeds
    state = gr.State(None)
    init_output = gr.Textbox(label="Initialization Status", interactive=False)

    # Status of app
    def on_init_clicked(hf_token):
        """Build the RAG pipeline and return (new state, status message)."""
        # Reject a blank token before starting any initialization work.
        if not hf_token or not hf_token.strip():
            return None, "❌ Initialization failed: please enter a Hugging Face token."
        try:
            new_state = initialize_rag(hf_token)
            return new_state, "✅ Initialization successful! You can now prompt the application."
        except Exception as e:
            # Surface the failure in the UI instead of crashing the callback.
            return None, f"❌ Initialization failed: {e}"

    init_btn.click(
        fn=on_init_clicked,
        inputs=[hf_input],
        outputs=[state, init_output],
    )

    gr.Markdown("## Enter Student Information")

    # Box for entering the query
    question_input = gr.Textbox(
        label="Query",
        placeholder="Enter student information and assessment results if available",
        lines=2,
        interactive=True,
    )
    submit_btn = gr.Button("Submit")

    answer_output = gr.Textbox(label="Answer", interactive=False)
    status_output = gr.Markdown("")  # shows loading or status

    # Populate answer box
    def on_question_submit(state, user_question):
        """Yield (answer, status) pairs for the answer box and the status line.

        Written as a generator so that the "Processing..." status is shown
        while the chain runs, then replaced by the answer or an error message.
        """
        if state is None:
            yield "", "Please click ‘Initialize’ after entering your tokens."
            return
        if not user_question.strip():
            yield "", "Please enter a question."
            return

        # First update: clear the old answer and show the in-progress status.
        yield "", "Processing..."
        try:
            result = answer_question(state, user_question)
        except Exception as e:
            yield "", f"Error: {e}"
            return
        # Final update: show the answer and clear the status line.
        yield result, ""

    # Button to click submit
    submit_btn.click(
        fn=on_question_submit,
        inputs=[state, question_input],
        outputs=[answer_output, status_output],
    )

    # Pressing Enter also triggers submission
    question_input.submit(
        fn=on_question_submit,
        inputs=[state, question_input],
        outputs=[answer_output, status_output],
    )

demo.launch(share=True, debug=True)



Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://c736820f493243e140.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


  embedding_model = HuggingFaceEmbeddings(model_name=emb_cfg["model_name"])
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=hf_pipe)
  llm_chain = LLMChain(llm=llm, prompt=prompt_template)
  docs = retriever.get_relevant_documents(inputs["question"])[:top_k]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://c736820f493243e140.gradio.live


