In [1]:
# ================== STUDYMATE – PDF Q&A (IBM Granite 3.3 2B) ==================
# This ENTIRE project runs in ONE Colab cell with NO dependency conflicts.
# ==============================================================================

!pip install -q --upgrade pip
!pip uninstall -y yfinance google-genai dataproc-spark-connect google-adk websockets >/dev/null 2>&1

# Install safe versions (Gradio ≥ 4.48 handles latest websockets)
!pip install -q \
    gradio>=4.48 \
    transformers \
    sentence-transformers \
    accelerate \
    pymupdf \
    faiss-cpu \
    torch

# ======================================================================
# IMPORTS
# ======================================================================
import getpass
import os

import fitz
import gradio as gr
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# ======================================================================
# OPTIONAL HF TOKEN (for Granite model)
# ======================================================================
# Prompt without echo so the secret is not written into the notebook output.
HF_TOKEN = getpass.getpass("Enter your HuggingFace Token (press ENTER to skip): ").strip()
if HF_TOKEN:
    # huggingface_hub reads the token from HF_TOKEN.
    os.environ["HF_TOKEN"] = HF_TOKEN
    # Still set the name this script used before, in case anything reads it.
    os.environ["HUGGINGFACE_HUB_TOKEN"] = HF_TOKEN

# ======================================================================
# DEVICE SETUP
# ======================================================================
# Device index handed to the text-generation pipeline: 0 when CUDA is
# available, otherwise -1.
if torch.cuda.is_available():
    DEVICE = 0
else:
    DEVICE = -1
print("Using device:", "CPU" if DEVICE != 0 else "GPU")

# ======================================================================
# PDF TEXT EXTRACTION
# ======================================================================
def extract_pdf(pdf_bytes):
    """Return the plain text of a PDF given its raw bytes.

    Args:
        pdf_bytes: The PDF file content.

    Returns:
        The text of all pages joined with newlines. Returns "" when the
        document cannot be opened; a page whose extraction fails contributes
        an empty string instead of aborting the whole document.
    """
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception:
        # Best effort: an unreadable upload yields no text rather than a crash.
        return ""

    pages = []
    try:
        for pg in doc:
            try:
                pages.append(pg.get_text("text"))
            except Exception:
                # Keep going; one bad page should not discard the others.
                pages.append("")
    finally:
        # Release the document once extraction is done.
        doc.close()
    return "\n".join(pages)

# ======================================================================
# TEXT CHUNKING
# ======================================================================
def chunk_text(text, chunk_size=1200, overlap=300, max_chunks=2000):
    """Split text into overlapping, whitespace-stripped character windows.

    Carriage returns and newlines are replaced by spaces before splitting.
    Consecutive windows start ``chunk_size - overlap`` characters apart, and
    splitting stops as soon as a window reaches the end of the text, so the
    tail is emitted exactly once.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Characters shared between consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size`` so every window advances.
        max_chunks: Upper bound on the number of chunks returned.

    Returns:
        A list of non-empty chunk strings (empty list for empty text).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    text = text.replace("\r", " ").replace("\n", " ")
    total = len(text)
    step = chunk_size - overlap

    chunks = []
    for start in range(0, total, step):
        chunk = text[start:start + chunk_size].strip()
        if chunk:
            chunks.append(chunk)
        # Stop once this window covered the end of the text (otherwise the
        # overlap would make us re-emit the tail) or the cap is reached.
        if start + chunk_size >= total or len(chunks) >= max_chunks:
            break
    return chunks

# ======================================================================
# EMBEDDING MODEL
# ======================================================================
# Module-level sentence-embedding model used by embed_chunks() and
# embed_query().
print("Loading embedding model...")
embedder = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

def embed_chunks(chunks):
    """Encode a list of text chunks with the module-level embedder.

    Returns whatever ``embedder.encode`` produces with
    ``convert_to_numpy=True``; the progress bar is disabled.
    """
    vectors = embedder.encode(
        chunks,
        show_progress_bar=False,
        convert_to_numpy=True,
    )
    return vectors

def embed_query(q):
    """Encode a single query string and return its embedding.

    The query is encoded as a one-element batch and the first (only) row of
    the result is returned.
    """
    batch = embedder.encode([q], convert_to_numpy=True)
    return batch[0]

def cosine_sim(embs, q_emb):
    """Return the cosine similarity between each row of ``embs`` and ``q_emb``.

    Norms are taken along axis 1 of ``embs``; a small epsilon is added to
    every norm so all-zero vectors give 0 instead of dividing by zero.
    """
    row_norms = np.linalg.norm(embs, axis=1, keepdims=True)
    unit_rows = embs / (row_norms + 1e-10)
    query_norm = np.linalg.norm(q_emb)
    unit_query = q_emb / (query_norm + 1e-10)
    return unit_rows.dot(unit_query)

# ======================================================================
# LOAD IBM GRANITE 3.3 2B INSTRUCT
# ======================================================================
print("Loading IBM Granite 3.3 2B Instruct (HuggingFace)...")
try:
    generator = pipeline(
        "text-generation",
        model="ibm-granite/granite-3.3-2b-instruct",
        # NOTE(review): this allows code shipped in the model repo to run
        # locally; confirm it is still required for this checkpoint.
        trust_remote_code=True,
        device=DEVICE,
        # Hand the entered token to the loader explicitly instead of relying
        # on an environment variable; None means anonymous access.
        token=HF_TOKEN or None,
    )
    print("Model loaded successfully!")
except Exception as e:
    # Keep the app usable without the model: generate_answer() checks for
    # generator being None and reports it to the user.
    print("ERROR loading Granite model:", e)
    generator = None

# ======================================================================
# ANSWER GENERATION
# ======================================================================
def generate_answer(question, context_chunks):
    """Answer a question from the retrieved context using the loaded model.

    Args:
        question: The user's question.
        context_chunks: Text chunks to place in the prompt as context.

    Returns:
        The generated answer with surrounding whitespace removed, or a
        human-readable message when there is no context, the model failed to
        load, or generation raised an error.
    """
    if not context_chunks:
        return "No relevant context found."

    if generator is None:
        return "Model not available. Check HF token or runtime environment."

    context = "\n\n".join(context_chunks)

    prompt = f"""You are StudyMate, an academic assistant.
Use ONLY the provided context to answer the question. If the answer is not in the context, say you don't know.

Context:
{context}

Question: {question}
Answer:
"""

    try:
        out = generator(
            prompt,
            max_new_tokens=250,
            # Sampling is off, so no temperature is passed: it would be
            # ignored and only trigger a configuration warning.
            do_sample=False,
            # Ask the pipeline for the continuation only instead of stripping
            # the echoed prompt by hand.
            return_full_text=False,
        )
    except Exception as e:
        return f"Model error: {e}"

    return out[0]["generated_text"].strip()

# ======================================================================
# MAIN STUDYMATE FUNCTION
# ======================================================================
def _read_upload(upload):
    """Return the raw bytes of one uploaded file.

    Accepts raw bytes, a filesystem path (including str subclasses), an
    object exposing the path as ``.name``, or a readable file-like object.
    """
    if isinstance(upload, (bytes, bytearray)):
        return bytes(upload)
    if isinstance(upload, (str, os.PathLike)):
        path = upload
    else:
        path = getattr(upload, "name", None)
    if path is not None:
        with open(path, "rb") as fh:
            return fh.read()
    # No path available: fall back to reading the object directly.
    return upload.read()


def studymate_app(question, files, top_k):
    """Answer a question about the uploaded PDFs.

    Extracts text from every upload, splits it into chunks, ranks the chunks
    by cosine similarity to the question, and asks the model to answer from
    the ``top_k`` best chunks.

    Args:
        question: The user's question.
        files: The uploaded PDFs as provided by the UI.
        top_k: How many chunks to retrieve; clamped to ``[1, len(chunks)]``.

    Returns:
        A ``(answer, retrieved_context)`` tuple of strings. On missing input
        or when no text can be extracted, ``answer`` is an explanatory
        message and the context is "".
    """
    if not files:
        return "Upload at least one PDF.", ""
    if not question or not question.strip():
        # Nothing to retrieve for; skip the expensive extraction and encoding.
        return "Please enter a question.", ""

    # Each document's text is followed by a newline so files do not run
    # together.
    text = "".join(extract_pdf(_read_upload(f)) + "\n" for f in files)

    chunks = chunk_text(text)
    if not chunks:
        return "Could not extract text.", ""

    embeddings = embed_chunks(chunks)
    q_emb = embed_query(question)
    sims = cosine_sim(embeddings, q_emb)

    top_k = min(max(1, int(top_k)), len(chunks))
    # argsort is ascending: take the last top_k indices and reverse them so
    # the best match comes first.
    top_ids = np.argsort(sims)[-top_k:][::-1]
    retrieved = [chunks[i] for i in top_ids]

    answer = generate_answer(question, retrieved)
    ctx = "\n\n--------------------\n\n".join(retrieved)

    return answer, ctx

# ======================================================================
# GRADIO INTERFACE
# ======================================================================
# Page heading and blurb shown above the form.
title = "📘 StudyMate — PDF Q&A (IBM Granite 3.3 2B)"
description = "Upload PDFs and ask questions. StudyMate retrieves relevant text and answers using IBM Granite 3.3 2B."

# The inputs map positionally onto studymate_app(question, files, top_k);
# the two outputs receive its (answer, context) return value.
ui = gr.Interface(
    title=title,
    description=description,
    fn=studymate_app,
    inputs=[
        gr.Textbox(label="Ask a Question"),
        gr.Files(file_types=[".pdf"], file_count="multiple", label="Upload PDFs"),
        gr.Slider(minimum=1, maximum=8, step=1, value=4, label="Top-K Chunks"),
    ],
    outputs=[
        gr.Textbox(lines=8, label="Answer"),
        gr.Textbox(lines=12, label="Retrieved Context"),
    ],
    allow_flagging="never",
)

ui.launch()


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25hEnter your HuggingFace Token (press ENTER to skip): <redacted>
Using device: GPU
Loading embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading IBM Granite 3.3 2B Instruct (HuggingFace)...


config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/207 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

Device set to use cuda:0


Model loaded successfully!




It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0b97c6887b7e00a3af.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


