<a href="https://colab.research.google.com/github/anjani-19/Infinite-Recursions/blob/main/Study_Mate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# ==============================
# 1) Install dependencies
# ==============================
!pip install gradio pymupdf sentence-transformers faiss-cpu ibm-watsonx-ai python-dotenv numpy gTTS -q

import os
import io
import json
import fitz  # PyMuPDF
import numpy as np
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
import faiss
import gradio as gr
from gtts import gTTS

# --- Optional: watsonx ---
# The watsonx SDK is treated as optional. If importing it fails for any reason,
# _WATSONX_AVAILABLE is flipped to False so code that needs the SDK can check
# the flag instead of failing at import time.
_WATSONX_AVAILABLE = True
try:
    from ibm_watsonx_ai.foundation_models import Model
    from ibm_watsonx_ai import Credentials
except Exception:
    _WATSONX_AVAILABLE = False

# ---------------------------
# PDF Extraction
# ---------------------------
def extract_pdf_text(file: io.BytesIO):
    """Extract the plain text of every page of a PDF.

    Args:
        file: A binary file-like object positioned at the start of the PDF;
            only its ``read()`` method is used.

    Returns:
        A list of ``{"text": str, "page": int}`` dicts in document order,
        with 1-based page numbers. Soft hyphens (U+00AD) are removed from
        the text.
    """
    # Opening the document in a ``with`` block closes it once the text has
    # been pulled out, instead of leaving it open until garbage collection.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return [
            {"text": page.get_text("text").replace("\u00ad", ""), "page": number}
            for number, page in enumerate(doc, start=1)
        ]

def clean_text(t: str) -> str:
    """Normalise whitespace in text extracted from a PDF page.

    Carriage returns become spaces, every run of consecutive newlines is
    collapsed to a single newline, and leading/trailing whitespace is
    stripped.

    Args:
        t: Raw page text.

    Returns:
        The cleaned text.
    """
    cleaned = t.replace("\r", " ")
    # A single replace pass only halves each run of newlines, so repeat until
    # no blank lines remain.
    while "\n\n" in cleaned:
        cleaned = cleaned.replace("\n\n", "\n")
    return cleaned.strip()

# ---------------------------
# Chunking
# ---------------------------
def chunk_text(text: str, chunk_size: int = 900, overlap: int = 150):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        A list of chunk strings, each with its words joined by single spaces.
        The window advances by ``max(1, chunk_size - overlap)`` words and
        stops as soon as a chunk reaches the end of the text.
    """
    tokens = text.split()
    total = len(tokens)
    stride = max(1, chunk_size - overlap)
    pieces = []
    pos = 0
    while pos < total:
        window = tokens[pos:pos + chunk_size]
        if not window:
            break
        pieces.append(" ".join(window))
        # The current window already covers the tail of the text.
        if pos + chunk_size >= total:
            break
        pos += stride
    return pieces

# ---------------------------
# Embeddings + FAISS
# ---------------------------
# Sentence-embedding model, loaded once when this cell runs and shared by
# indexing and querying.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Module-level retrieval state; faiss_index stays None until build_faiss() runs.
faiss_index = None
# Chunk dicts, kept in the same order as the vectors added to faiss_index so a
# FAISS row id can be used directly as a list index.
all_chunks = []
recent_questions = []  # store last 5 questions

def build_faiss(chunks):
    """Embed *chunks* and replace the global FAISS index with the result.

    Args:
        chunks: A list of dicts, each with at least a ``"text"`` key.

    Returns:
        A human-readable status message. When *chunks* is empty a warning is
        returned and the existing index (if any) is left untouched.
    """
    global faiss_index, all_chunks
    if not chunks:
        # Encoding an empty list gives no embedding dimension to build from.
        return "⚠️ No text found to index."
    texts = [c["text"] for c in chunks]
    embs = embedder.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
    # FAISS expects C-contiguous float32 vectors.
    embs = np.ascontiguousarray(embs, dtype=np.float32)
    # Inner product on normalised vectors is cosine similarity.
    index = faiss.IndexFlatIP(embs.shape[1])
    index.add(embs)
    # Publish both globals together so the index and chunk list never disagree
    # if an earlier step raised.
    faiss_index = index
    all_chunks = chunks
    return f"✅ {len(chunks)} chunks indexed."

def top_k(query, k=5):
    """Return the indexed chunks most similar to *query*.

    Args:
        query: The search text.
        k: Number of neighbours to request from the index.

    Returns:
        A list of chunk dicts (copies of the entries in ``all_chunks``), each
        extended with a float ``"score"`` key, in the order FAISS returned
        them. Slots FAISS reports as ``-1`` (no neighbour) are skipped.
    """
    query_vec = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, indices = faiss_index.search(query_vec, k)
    return [
        {**all_chunks[pos], "score": float(sim)}
        for pos, sim in zip(indices[0], scores[0])
        if pos != -1
    ]

# ---------------------------
# LLM: Watsonx (with fallback)
# ---------------------------
class WatsonxAnswerer:
    """Answer questions with an IBM watsonx.ai model, with a local fallback.

    Configuration is read from the environment (after ``load_dotenv()``):
    ``WATSONX_API_KEY``, ``WATSONX_URL``, ``WATSONX_PROJECT_ID``,
    ``WATSONX_SPACE_ID`` and ``WATSONX_MODEL_ID``. When the SDK could not be
    imported, a required setting is missing, or model construction fails,
    ``ok`` is False and ``answer()`` returns excerpts of the retrieved
    context instead of a generated answer.
    """

    def __init__(self):
        load_dotenv()
        self.api_key = os.getenv("WATSONX_API_KEY")
        self.url = os.getenv("WATSONX_URL", "https://us-south.ml.cloud.ibm.com")
        self.project_id = os.getenv("WATSONX_PROJECT_ID")
        self.space_id = os.getenv("WATSONX_SPACE_ID")
        self.model_id = os.getenv("WATSONX_MODEL_ID", "mistralai/mixtral-8x7b-instruct-v01")
        # The remote model is usable only if the SDK imported and the key,
        # URL and project id are all set.
        self.ok = _WATSONX_AVAILABLE and all([self.api_key, self.url, self.project_id])
        self._model = None
        if self.ok:
            try:
                creds = Credentials(url=self.url, api_key=self.api_key)
                params = {"decoding_method": "greedy", "max_new_tokens": 500, "temperature": 0.2}
                self._model = Model(
                    model_id=self.model_id,
                    params=params,
                    credentials=creds,
                    project_id=self.project_id,
                    space_id=self.space_id,
                )
            except Exception as e:
                # Best effort: a failed init degrades to the fallback answer.
                self.ok = False
                print("Watsonx init failed:", e)

    @staticmethod
    def _extract_generated_text(res):
        """Pull the generated text out of whatever ``generate()`` returned."""
        # Some SDK responses wrap the payload behind get_result(); others
        # (NOTE(review): ibm-watsonx-ai's generate() is expected to) return the
        # payload dict directly. Unwrap first, then treat both the same way.
        out = res.get_result() if hasattr(res, "get_result") else res
        if not isinstance(out, dict):
            return str(out)
        # Tolerate a missing or empty "results" list instead of raising.
        results = out.get("results") or [{}]
        first = results[0] if isinstance(results[0], dict) else {}
        return (
            first.get("generated_text")
            or out.get("generated_text")
            or json.dumps(out)
        )

    def answer(self, question, context_blocks):
        """Answer *question* using *context_blocks* as grounding.

        Args:
            question: The user's question.
            context_blocks: Retrieved chunks; each is a dict with ``"page"``
                and ``"text"`` keys.

        Returns:
            The model's generated text when the remote model is available and
            the call succeeds; otherwise a fallback string listing the first
            300 characters of each context block.
        """
        system = "You are StudyMate, a friendly academic tutor. Cite sources as (p.<page>)."
        context_text = "\n\n".join([f"[Source p.{c['page']}] {c['text']}" for c in context_blocks])
        prompt = f"{system}\n\nQuestion: {question}\n\nContext:\n{context_text}\n"

        if self.ok and self._model is not None:
            try:
                return self._extract_generated_text(self._model.generate(prompt))
            except Exception as e:
                # Any remote failure falls through to the local fallback below.
                print("Watsonx error:", e)

        return "🤖 Relevant context:\n" + "\n".join([c["text"][:300] for c in context_blocks])

# Single shared answerer, constructed once when this cell runs.
answerer = WatsonxAnswerer()

# ---------------------------
# Extra Features
# ---------------------------
def summarize_pdf():
    """Return a preview "summary" of the indexed text.

    Returns:
        A warning string when nothing has been indexed; otherwise a header
        followed by the first 1000 characters of all chunk texts joined by
        spaces, then ``...``.
    """
    if all_chunks:
        joined = " ".join(chunk["text"] for chunk in all_chunks)
        preview = joined[:1000]
        return f"📑 Summary:\n{preview}..."
    return "⚠️ Please upload and process PDFs first."

def export_notes(answer, context):
    """Write the answer and its retrieved context to a text file.

    The file is always ``StudyMate_Notes.txt`` in the current working
    directory and is overwritten on every call.

    Args:
        answer: The answer text; ``None`` is treated as empty.
        context: The retrieved-context text; ``None`` is treated as empty.

    Returns:
        The name of the file that was written.
    """
    filename = "StudyMate_Notes.txt"
    # Answers and references contain emoji and dashes, so force UTF-8 rather
    # than relying on the platform's default encoding.
    with open(filename, "w", encoding="utf-8") as f:
        f.write("Answer:\n")
        f.write((answer or "") + "\n\n")
        f.write("Context:\n")
        f.write(context or "")
    return filename

def generate_flashcards():
    """Return a single example flashcard built from the indexed text.

    Returns:
        A warning string when nothing has been indexed; otherwise a fixed
        question paired with the first 200 characters of the first three
        chunks' text joined by spaces, then ``...``.
    """
    if all_chunks:
        leading_text = " ".join(chunk["text"] for chunk in all_chunks[:3])
        return f"🎴 Example Flashcard:\nQ: What is the main idea?\nA: {leading_text[:200]}..."
    return "⚠️ Please process PDFs first."

def glossary_builder():
    """List up to ten distinct long words from the indexed text.

    A "term" is any whitespace-separated token longer than seven characters.
    Terms are reported in order of first appearance, so the output is the
    same on every run for the same documents.

    Returns:
        A warning string when nothing has been indexed; otherwise a header
        followed by one term per line.
    """
    if not all_chunks:
        return "⚠️ Please process PDFs first."
    long_words = (
        word
        for chunk in all_chunks
        for word in chunk["text"].split()
        if len(word) > 7
    )
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # make the selected ten terms arbitrary.
    unique_terms = list(dict.fromkeys(long_words))[:10]
    return "📘 Glossary Terms:\n" + "\n".join(unique_terms)

def tts_answer(answer):
    """Convert *answer* to speech and save it as ``answer.mp3``.

    Args:
        answer: The text to speak.

    Returns:
        The name of the saved MP3 file, or ``None`` when *answer* is empty or
        only whitespace (there is nothing to synthesise).
    """
    # gTTS rejects empty text, so bail out before calling it.
    if not answer or not answer.strip():
        return None
    tts = gTTS(answer)
    filename = "answer.mp3"
    tts.save(filename)
    return filename

def process_pdfs(files):
    """Extract, clean and index the text of the uploaded PDFs.

    Args:
        files: The uploaded files. Each item is either a path string or an
            object whose ``name`` attribute is the path. ``None`` or an empty
            list means nothing was uploaded.

    Returns:
        A status message: the result of ``build_faiss`` on success, or a
        warning when there is nothing to index.
    """
    if not files:
        return "⚠️ Please upload at least one PDF first."
    docs = []
    for f in files:
        # Accept both plain path strings and upload objects exposing .name.
        path = getattr(f, "name", f)
        doc_name = os.path.basename(path)
        with open(path, "rb") as fobj:
            # extract_pdf_text only needs .read(), so hand it the open file
            # directly instead of copying the bytes into a BytesIO first.
            pages = extract_pdf_text(fobj)
        docs.extend(
            {"text": clean_text(p["text"]), "page": p["page"], "doc": doc_name}
            for p in pages
        )
    if not docs:
        return "⚠️ No pages could be read from the uploaded PDFs."
    return build_faiss(docs)

def ask_question(question):
    """Answer *question* from the indexed PDFs.

    Args:
        question: The user's question text.

    Returns:
        A ``(answer, references, recent_questions)`` tuple. ``references`` is
        a formatted listing of the retrieved chunks (document, page, score and
        the first 400 characters of text). ``recent_questions`` is the shared
        list of the last five questions asked. When no index exists or the
        question is blank, ``answer`` is a warning and ``references`` is empty.
    """
    if faiss_index is None:
        return "⚠️ Please upload and process PDFs first.", "", []
    question = (question or "").strip()
    if not question:
        # Don't spend a retrieval/LLM call on, or record, a blank question.
        return "⚠️ Please enter a question.", "", recent_questions
    hits = top_k(question, k=5)
    answer = answerer.answer(question, hits)
    refs = "\n\n".join([f"📖 {h['doc']} – p.{h['page']} (score {h['score']:.3f})\n{h['text'][:400]}" for h in hits])

    # Track recent questions in place (the list is shared module state),
    # keeping only the newest five.
    recent_questions.append(question)
    del recent_questions[:-5]

    return answer, refs, recent_questions

# ---------------------------
# 🎨 Themed Gradio Interface
# ---------------------------
# Stylesheet passed to gr.Blocks(css=...): gradient page background, gradient
# rounded buttons, and rounded purple borders on textboxes and file inputs.
custom_css = """
body {
    background: linear-gradient(135deg, #9d50bb, #6e48aa, #ff758c, #6a82fb);
    font-family: 'Segoe UI', sans-serif;
}
.gr-button {
    background: linear-gradient(90deg, #a18cd1, #fbc2eb);
    color: white !important;
    border: none !important;
    font-weight: bold;
    border-radius: 12px !important;
}
.gr-textbox, .gr-file {
    border-radius: 12px !important;
    border: 2px solid #a18cd1 !important;
}
"""

# Build the tabbed UI. Each tab wires one button to one of the functions
# defined above.
with gr.Blocks(css=custom_css) as demo:
    # Banner shown above all tabs.
    gr.Markdown("""<div style='text-align: center;
                    padding: 20px;
                    background: linear-gradient(90deg, #ff758c, #6a82fb);
                    color: white;
                    border-radius: 12px;
                    font-size: 28px;
                    font-weight: bold;'>
        📘 Welcome to StudyMate (Student Edition)
        </div>
        """)

    # Tab 1: upload PDFs and build the search index; the status message from
    # process_pdfs is shown in the textbox.
    with gr.Tab("📂 Upload PDFs"):
        pdfs = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs")
        process_btn = gr.Button("🚀 Process PDFs")
        status = gr.Textbox(label="Status")
        process_btn.click(process_pdfs, inputs=pdfs, outputs=status)

    # Tab 2: question answering. ask_question returns three values, mapped in
    # order to the answer, retrieved-context and recent-questions boxes.
    with gr.Tab("💬 Ask Questions"):
        q = gr.Textbox(label="❓ Your Question")
        ask_btn = gr.Button("💬 Ask")
        ans = gr.Textbox(label="✅ Answer", lines=5)
        ctx = gr.Textbox(label="📖 Retrieved Context", lines=10)
        recent_out = gr.Textbox(label="🕒 Recent Questions (last 5)")
        ask_btn.click(ask_question, inputs=q, outputs=[ans, ctx, recent_out])

        # Text-to-speech: speaks whatever is currently in the answer box.
        tts_btn = gr.Button("🔊 Listen Answer")
        audio_out = gr.Audio()
        tts_btn.click(tts_answer, inputs=ans, outputs=audio_out)

        # Export notes: saves the current answer and context boxes to a file
        # offered for download.
        export_btn = gr.Button("📝 Export Notes")
        file_out = gr.File()
        export_btn.click(export_notes, inputs=[ans, ctx], outputs=file_out)

    # Tab 3: text preview produced by summarize_pdf (takes no inputs).
    with gr.Tab("⭐ Summarize PDF"):
        sum_btn = gr.Button("📑 Generate Summary")
        summary_out = gr.Textbox(label="Summary", lines=10)
        sum_btn.click(summarize_pdf, outputs=summary_out)

    # Tab 4: example flashcard produced by generate_flashcards.
    with gr.Tab("🎴 Flashcards"):
        flash_btn = gr.Button("🎴 Generate Flashcards")
        flash_out = gr.Textbox(label="Flashcards Preview", lines=5)
        flash_btn.click(generate_flashcards, outputs=flash_out)

    # Tab 5: term list produced by glossary_builder.
    with gr.Tab("📘 Glossary"):
        gloss_btn = gr.Button("📘 Build Glossary")
        gloss_out = gr.Textbox(label="Glossary Terms", lines=10)
        gloss_btn.click(glossary_builder, outputs=gloss_out)

# share=True asks Gradio for a public share URL in addition to the local server.
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0f9ca7391af579863e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


