In [None]:
# StudyMate - AI-Powered Academic Assistant
# Complete Implementation for Google Colab

# ============================================================
# SECTION 1: INSTALLATION & SETUP
# ============================================================

print("üì¶ Installing required packages...")
!pip install -q transformers accelerate sentence-transformers faiss-cpu PyMuPDF gradio pillow torch

print("‚úÖ Packages installed successfully!")

# ============================================================
# SECTION 2: IMPORTS
# ============================================================

import fitz  # PyMuPDF
import faiss
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import gradio as gr
from typing import List, Tuple, Dict
import re

print("‚úÖ All imports successful!")

# ============================================================
# SECTION 3: PDF PROCESSING
# ============================================================

class PDFProcessor:
    """Extract page text from PDF documents and split it into overlapping chunks.

    ``chunk_size`` is the maximum number of whitespace-separated words in a
    chunk; ``overlap`` is the number of words shared by consecutive chunks of
    the same page. Chunks never span a page boundary.
    """

    def __init__(self, chunk_size=500, overlap=50):
        """Store the chunking parameters.

        Raises:
            ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
                negative or not smaller than ``chunk_size``.
        """
        self._validate(chunk_size, overlap)
        self.chunk_size = chunk_size
        self.overlap = overlap

    @staticmethod
    def _validate(chunk_size, overlap):
        """Reject parameter pairs that would give a non-positive or gapped stride."""
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    def extract_text_from_pdf(self, pdf_path: str) -> List[Dict]:
        """Extract the text of every non-blank page of a PDF.

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            One dict per page that contains text, with keys ``page_number``
            (1-based), ``text`` and ``source`` (the given path).
        """
        pages_data = []

        # The context manager closes the document even if a page fails to parse.
        with fitz.open(pdf_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                text = page.get_text()

                if text.strip():
                    pages_data.append({
                        'page_number': page_number,
                        'text': text,
                        'source': pdf_path
                    })

        return pages_data

    def chunk_text(self, pages_data: List[Dict]) -> List[Dict]:
        """Split each page's text into overlapping word windows.

        Args:
            pages_data: Page dicts as produced by ``extract_text_from_pdf``.

        Returns:
            A list of dicts with keys ``text``, ``page_number`` and ``source``.

        Raises:
            ValueError: If the instance's ``chunk_size``/``overlap`` were
                changed to an invalid combination after construction.
        """
        # Re-check here because the attributes are public and may have been reassigned.
        self._validate(self.chunk_size, self.overlap)
        stride = self.chunk_size - self.overlap

        chunks = []
        for page_data in pages_data:
            words = page_data['text'].split()

            for start in range(0, len(words), stride):
                chunks.append({
                    'text': ' '.join(words[start:start + self.chunk_size]),
                    'page_number': page_data['page_number'],
                    'source': page_data['source']
                })

                # This window already reached the end of the page; a further
                # window would only repeat words from the overlap region.
                if start + self.chunk_size >= len(words):
                    break

        return chunks

# ============================================================
# SECTION 4: VECTOR SEARCH ENGINE
# ============================================================

class VectorSearchEngine:
    """FAISS-based semantic search over text chunks.

    ``index`` is ``None`` until ``build_index`` has been called with at least
    one chunk; ``chunks`` holds the dicts whose positions match the vectors
    stored in the index.
    """

    def __init__(self):
        """Load the sentence-embedding model and start with an empty index."""
        print("üîÑ Loading embedding model...")
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = None
        self.chunks = []
        print("‚úÖ Embedding model loaded!")

    def build_index(self, chunks: List[Dict]):
        """Embed the chunk texts and (re)build the FAISS index from them.

        Args:
            chunks: Dicts that each carry a ``text`` key. An empty list
                clears the current index instead of building one.
        """
        if not chunks:
            # Nothing to embed: an empty embedding array has no second
            # dimension to size the index with, so just reset the state.
            self.chunks = []
            self.index = None
            return

        print(f"üîÑ Building search index for {len(chunks)} chunks...")

        self.chunks = chunks
        texts = [chunk['text'] for chunk in chunks]

        # Generate embeddings
        embeddings = self.embedder.encode(texts, show_progress_bar=True)
        embeddings = np.array(embeddings).astype('float32')

        # Build FAISS index
        dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        self.index.add(embeddings)

        print(f"‚úÖ Index built with {self.index.ntotal} vectors!")

    def search(self, query: str, top_k: int = 3) -> List[Dict]:
        """Return up to ``top_k`` chunks closest to ``query``.

        Args:
            query: Free-text query to embed and look up.
            top_k: Maximum number of results wanted.

        Returns:
            Copies of the matching chunk dicts, each with an added
            ``similarity_score`` equal to ``1 / (1 + distance)``. Empty when
            no index has been built or ``top_k`` is not positive.
        """
        if self.index is None:
            return []

        # Never ask for more neighbours than the index holds.
        k = min(top_k, self.index.ntotal)
        if k <= 0:
            return []

        query_embedding = self.embedder.encode([query])
        query_embedding = np.array(query_embedding).astype('float32')

        distances, indices = self.index.search(query_embedding, k)

        results = []
        for idx, dist in zip(indices[0], distances[0]):
            # FAISS pads missing neighbours with label -1; without the lower
            # bound that would silently select the last chunk.
            if 0 <= idx < len(self.chunks):
                chunk = self.chunks[idx].copy()
                chunk['similarity_score'] = float(1 / (1 + dist))
                results.append(chunk)

        return results

# ============================================================
# SECTION 5: LLM ANSWER GENERATOR
# ============================================================

class AnswerGenerator:
    """Answer questions with the IBM Granite instruct model, grounded in retrieved chunks."""

    def __init__(self):
        """Load the Granite tokenizer and causal language model."""
        print("üîÑ Loading IBM Granite model...")
        checkpoint = "ibm-granite/granite-3.3-2b-instruct"

        # Half precision when a CUDA device is available, full precision otherwise
        if torch.cuda.is_available():
            weights_dtype = torch.float16
        else:
            weights_dtype = torch.float32

        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.model = AutoModelForCausalLM.from_pretrained(
            checkpoint, torch_dtype=weights_dtype, device_map="auto"
        )

        print("‚úÖ Granite model loaded successfully!")

    def generate_answer(self, question: str, context_chunks: List[Dict]) -> Tuple[str, List[Dict]]:
        """Produce an answer to ``question`` from the supplied context chunks.

        Only the first three chunks, truncated to 300 characters each, are
        placed in the prompt. Returns the stripped model output together with
        the ``context_chunks`` list that was passed in; when that list is
        empty, a fixed "no content" message and an empty list are returned
        without calling the model.
        """
        if not context_chunks:
            return "‚ùå No relevant content found in the uploaded PDFs.", []

        # Assemble the page-tagged excerpts shown to the model
        excerpts = []
        for item in context_chunks[:3]:
            excerpts.append(f"[Page {item['page_number']}]: {item['text'][:300]}...")
        context = "\n\n".join(excerpts)

        prompt = f"""You are StudyMate, an AI academic assistant. Answer the student's question based ONLY on the provided context from their study materials.

Context from PDFs:
{context}

Student's Question: {question}

Instructions:
- Provide a clear, accurate answer based on the context
- If the context doesn't contain the answer, say so
- Be concise but informative
- Reference page numbers when relevant

Answer:"""

        # Single-turn chat, tokenized through the model's chat template
        chat = [{"role": "user", "content": prompt}]
        encoded = self.tokenizer.apply_chat_template(
            chat,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )
        encoded = encoded.to(self.model.device)

        sampling = dict(max_new_tokens=300, temperature=0.7, top_p=0.9, do_sample=True)
        with torch.no_grad():
            generated = self.model.generate(**encoded, **sampling)

        # Drop the echoed prompt tokens so only the newly generated text is decoded
        prompt_length = encoded["input_ids"].shape[-1]
        completion = generated[0][prompt_length:]
        answer = self.tokenizer.decode(completion, skip_special_tokens=True)

        return answer.strip(), context_chunks

# ============================================================
# SECTION 6: FEATURE GENERATORS
# ============================================================

class FeatureGenerator:
    """Generate summaries, flashcards, and MCQs from document chunks.

    Reuses the tokenizer and model held by the ``AnswerGenerator`` passed to
    the constructor, so no additional model is loaded.
    """

    def __init__(self, answer_generator: "AnswerGenerator"):
        """Keep a reference to the object exposing ``tokenizer`` and ``model``."""
        self.answer_generator = answer_generator

    @staticmethod
    def _sample_text(chunks: List[Dict], max_chunks: int, max_chars: int) -> str:
        """Join the first ``max_chunks`` chunk texts, each cut to ``max_chars`` characters."""
        return "\n\n".join([chunk['text'][:max_chars] for chunk in chunks[:max_chunks]])

    def _generate(self, prompt: str, max_new_tokens: int, temperature: float) -> str:
        """Run one single-turn chat completion and return only the new text, stripped."""
        tokenizer = self.answer_generator.tokenizer
        model = self.answer_generator.model

        messages = [{"role": "user", "content": prompt}]
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                # Sampling must be switched on explicitly; otherwise the
                # temperature setting is reported as an invalid generation
                # flag and ignored.
                do_sample=True
            )

        # Skip the prompt tokens that generate() returns ahead of the completion.
        prompt_length = inputs["input_ids"].shape[-1]
        text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        return text.strip()

    def generate_summary(self, chunks: List[Dict], summary_type: str = "brief") -> str:
        """Summarize the first five chunks (400 characters each).

        Args:
            chunks: Chunk dicts carrying a ``text`` key.
            summary_type: Word inserted into the prompt to steer the format.

        Returns:
            The generated summary, or a fixed message when ``chunks`` is empty.
        """
        if not chunks:
            return "No content available for summary."

        sample_text = self._sample_text(chunks, max_chunks=5, max_chars=400)

        prompt = f"""Summarize the following academic content in a {summary_type} format:

{sample_text}

Provide a clear, structured summary with key points."""

        return self._generate(prompt, max_new_tokens=400, temperature=0.7)

    def generate_flashcards(self, chunks: List[Dict], num_cards: int = 5) -> str:
        """Create Q/A flashcards from the first three chunks (300 characters each).

        Args:
            chunks: Chunk dicts carrying a ``text`` key.
            num_cards: Number of flashcards requested in the prompt.

        Returns:
            The generated flashcard text, or a fixed message when ``chunks``
            is empty.
        """
        if not chunks:
            return "No content available for flashcards."

        sample_text = self._sample_text(chunks, max_chunks=3, max_chars=300)

        prompt = f"""Create {num_cards} educational flashcards from this content:

{sample_text}

Format each flashcard as:
Q: [Question]
A: [Answer]

Make questions clear and answers concise."""

        return self._generate(prompt, max_new_tokens=500, temperature=0.8)

    def generate_mcqs(self, chunks: List[Dict], num_questions: int = 5) -> str:
        """Create multiple-choice questions from the first three chunks (300 characters each).

        Args:
            chunks: Chunk dicts carrying a ``text`` key.
            num_questions: Number of questions requested in the prompt.

        Returns:
            The generated question text, or a fixed message when ``chunks``
            is empty.
        """
        if not chunks:
            return "No content available for MCQs."

        sample_text = self._sample_text(chunks, max_chunks=3, max_chars=300)

        prompt = f"""Create {num_questions} multiple choice questions from this content:

{sample_text}

Format each question as:
Q: [Question]
A) [Option]
B) [Option]
C) [Option]
D) [Option]
Correct Answer: [Letter]

Make questions challenging but fair."""

        return self._generate(prompt, max_new_tokens=600, temperature=0.8)

# ============================================================
# SECTION 7: STUDYMATE MAIN CLASS
# ============================================================

class StudyMate:
    """Main StudyMate application: wires PDF processing, search and generation together.

    ``all_chunks`` always mirrors the chunks the search index was last built
    from, so question answering and the feature generators see the same data.
    """

    def __init__(self):
        """Create the pipeline components; this loads the embedding and language models."""
        self.pdf_processor = PDFProcessor()
        self.search_engine = VectorSearchEngine()
        self.answer_generator = AnswerGenerator()
        self.feature_generator = FeatureGenerator(self.answer_generator)
        self.all_chunks = []

    def upload_pdfs(self, pdf_files):
        """Extract, chunk and index the uploaded PDFs.

        Args:
            pdf_files: Iterable of uploaded files; each item is either a path
                string or an object whose ``name`` attribute is the path.

        Returns:
            A status message for display. The previously indexed material is
            replaced only when every file was processed and some text was
            extracted; on any failure the existing chunks and index are left
            untouched.
        """
        if not pdf_files:
            return "‚ùå Please upload at least one PDF file."

        # Collect into a local list first so a failure partway through cannot
        # leave all_chunks out of sync with the search index.
        new_chunks = []

        for pdf_file in pdf_files:
            pdf_path = getattr(pdf_file, "name", pdf_file)
            try:
                pages = self.pdf_processor.extract_text_from_pdf(pdf_path)
                new_chunks.extend(self.pdf_processor.chunk_text(pages))
            except Exception as e:
                # UI boundary: report the problem to the user instead of raising.
                return f"‚ùå Error processing {pdf_path}: {str(e)}"

        if not new_chunks:
            return "‚ùå No text could be extracted from the PDFs."

        self.search_engine.build_index(new_chunks)
        self.all_chunks = new_chunks
        return f"‚úÖ Successfully processed {len(pdf_files)} PDF(s) with {len(self.all_chunks)} text chunks!"

    def answer_question(self, question: str) -> Tuple[str, str]:
        """Answer a question from the indexed PDFs.

        Args:
            question: The user's question text.

        Returns:
            ``(answer, source_text)``. ``source_text`` lists up to three
            source excerpts and is empty when validation fails or no source
            chunks were returned.
        """
        if not question or not question.strip():
            return "‚ùå Please enter a question.", ""

        if not self.all_chunks:
            return "‚ùå Please upload PDFs first.", ""

        # Search for relevant chunks
        relevant_chunks = self.search_engine.search(question, top_k=3)

        # Generate answer
        answer, sources = self.answer_generator.generate_answer(question, relevant_chunks)

        if not sources:
            # Avoid showing a "Sources" header with nothing underneath it.
            return answer, ""

        # Format sources
        entries = [
            f"\n{i}. Page {chunk['page_number']}: {chunk['text'][:150]}...\n"
            for i, chunk in enumerate(sources[:3], 1)
        ]
        source_text = "\n\nüìö **Sources:**\n" + "".join(entries)

        return answer, source_text

# ============================================================
# SECTION 8: GRADIO INTERFACE
# ============================================================

def create_interface():
    """Assemble the StudyMate Gradio Blocks app and return it without launching.

    A single StudyMate instance is created here and shared by every tab's
    callback.
    """

    mate = StudyMate()
    features = mate.feature_generator

    with gr.Blocks(theme=gr.themes.Soft(), title="StudyMate") as demo:

        gr.Markdown("""
        # üìö StudyMate - AI Academic Assistant
        ### Upload your study materials and get instant answers, summaries, flashcards, and more!
        """)

        # --- Tab 1: upload and index PDFs ---
        with gr.Tab("üì§ Upload PDFs"):
            files_in = gr.File(label="Upload PDF Files", file_count="multiple", file_types=[".pdf"])
            process_button = gr.Button("Process PDFs", variant="primary")
            status_box = gr.Textbox(label="Status", lines=2)

            process_button.click(mate.upload_pdfs, [files_in], [status_box])

        # --- Tab 2: question answering with source excerpts ---
        with gr.Tab("üí¨ Ask Questions"):
            question_box = gr.Textbox(
                label="Your Question", placeholder="e.g., What is photosynthesis?", lines=2
            )
            answer_button = gr.Button("Get Answer", variant="primary")
            answer_box = gr.Textbox(label="Answer", lines=8)
            sources_box = gr.Textbox(label="Source References", lines=6)

            answer_button.click(mate.answer_question, [question_box], [answer_box, sources_box])

        # --- Tab 3: summary of the uploaded material ---
        with gr.Tab("üìù Generate Summary"):
            kind_choice = gr.Radio(choices=["brief", "detailed"], value="brief", label="Summary Type")
            summarize_button = gr.Button("Generate Summary", variant="primary")
            summary_box = gr.Textbox(label="Summary", lines=10)

            # all_chunks is read at click time so the latest upload is used
            summarize_button.click(
                lambda kind: features.generate_summary(mate.all_chunks, kind),
                [kind_choice],
                [summary_box],
            )

        # --- Tab 4: flashcards ---
        with gr.Tab("üé¥ Flashcards"):
            card_count = gr.Slider(minimum=3, maximum=10, value=5, step=1, label="Number of Flashcards")
            cards_button = gr.Button("Generate Flashcards", variant="primary")
            cards_box = gr.Textbox(label="Flashcards", lines=12)

            cards_button.click(
                lambda count: features.generate_flashcards(mate.all_chunks, int(count)),
                [card_count],
                [cards_box],
            )

        # --- Tab 5: multiple-choice questions ---
        with gr.Tab("üìä MCQ Generator"):
            question_count = gr.Slider(minimum=3, maximum=10, value=5, step=1, label="Number of Questions")
            questions_button = gr.Button("Generate MCQs", variant="primary")
            questions_box = gr.Textbox(label="Multiple Choice Questions", lines=15)

            questions_button.click(
                lambda count: features.generate_mcqs(mate.all_chunks, int(count)),
                [question_count],
                [questions_box],
            )

        gr.Markdown("""
        ---
        ### üöÄ Features:
        - **Conversational Q&A**: Ask natural language questions
        - **Multi-PDF Support**: Upload multiple documents
        - **Source References**: See page numbers and context
        - **Smart Summaries**: Generate brief or detailed summaries
        - **Flashcards**: Auto-generate study flashcards
        - **MCQ Generator**: Create practice questions

        *Powered by IBM Granite 3.3B + FAISS + Sentence Transformers*
        """)

    return demo

# ============================================================
# SECTION 9: LAUNCH APPLICATION
# ============================================================

# Startup banner.
# NOTE(review): "All systems ready!" is printed before create_interface()
# runs, i.e. before the embedding and Granite models have been loaded.
print("\n" + "="*60)
print("üéì StudyMate - AI Academic Assistant")
print("="*60)
print("\n‚úÖ All systems ready!")
print("üöÄ Launching interface...\n")

# create_interface() instantiates StudyMate, which loads both models.
app = create_interface()
# share=True exposes the app through a public gradio.live URL; debug=True
# keeps the Colab cell running so errors and logs stay visible.
app.launch(share=True, debug=True)

üì¶ Installing required packages...
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m23.6/23.6 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m24.1/24.1 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[?25h‚úÖ Packages installed successfully!
‚úÖ All imports successful!

üéì StudyMate - AI Academic Assistant

‚úÖ All systems ready!
üöÄ Launching interface...

üîÑ Loading embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

‚úÖ Embedding model loaded!
üîÑ Loading IBM Granite model...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/207 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

‚úÖ Granite model loaded successfully!
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://4119a1385cf24612f3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


üîÑ Building search index for 30 chunks...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

‚úÖ Index built with 30 vectors!


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [None]:
# @title AI prompt cell

import ipywidgets as widgets
from IPython.display import display, HTML, Markdown,clear_output
from google.colab import ai

# Model picker; created empty and filled by update_model_list() below.
dropdown = widgets.Dropdown(
    options=[],
    layout={'width': 'auto'}
)

def update_model_list(new_options):
    """Replace the choices offered by the model dropdown with ``new_options``."""
    dropdown.options = new_options
# Populate the dropdown with whatever ai.list_models() returns.
update_model_list(ai.list_models())

# Multi-line prompt entry.
text_input = widgets.Textarea(
    placeholder='Ask me anything....',
    layout={'width': 'auto', 'height': '100px'},
)

# Submit button; its click handler is attached further down.
button = widgets.Button(
    description='Submit Text',
    disabled=False,
    tooltip='Click to submit the text',
    icon='check'
)

# Scrollable area that receives the rendered response.
output_area = widgets.Output(
     layout={'width': 'auto', 'max_height': '300px','overflow_y': 'scroll'}
)

def on_button_clicked(b):
    """Stream the reply for the current prompt into the output area.

    The text typed in ``text_input`` is sent to the model selected in
    ``dropdown``; each streamed piece is appended to the running text, which
    is re-rendered as Markdown. ``b`` (the clicked button) is unused.
    """
    with output_area:
        output_area.clear_output(wait=False)
        rendered = ""
        stream = ai.generate_text(prompt=text_input.value, model_name=dropdown.value, stream=True)
        for piece in stream:
            # None pieces carry no text and are skipped
            if piece is not None:
                rendered += piece
                # wait=True defers the clear until the next display call
                clear_output(wait=True)
                display(Markdown(rendered))

# Attach the click handler and group the four widgets into one container.
button.on_click(on_button_clicked)
vbox = widgets.GridBox([dropdown, text_input, button, output_area])

# Inject CSS that sets the font of the dropdown and the textarea.
display(HTML("""
<style>
.widget-dropdown select {
    font-size: 18px;
    font-family: "Arial", sans-serif;
}
.widget-textarea textarea {
    font-size: 18px;
    font-family: "Arial", sans-serif;
}
</style>
"""))
# Render the assembled prompt UI in the cell output.
display(vbox)


In [None]:
# @title AI prompt cell

import ipywidgets as widgets
from IPython.display import display, HTML, Markdown,clear_output
from google.colab import ai

# NOTE(review): this cell repeats the preceding "AI prompt cell" line for
# line; running it rebinds the same names to fresh widgets.
# Model picker; created empty and filled by update_model_list() below.
dropdown = widgets.Dropdown(
    options=[],
    layout={'width': 'auto'}
)

def update_model_list(new_options):
    """Replace the choices offered by the model dropdown with ``new_options``."""
    dropdown.options = new_options
# Populate the dropdown with whatever ai.list_models() returns.
update_model_list(ai.list_models())

# Multi-line prompt entry.
text_input = widgets.Textarea(
    placeholder='Ask me anything....',
    layout={'width': 'auto', 'height': '100px'},
)

# Submit button; its click handler is attached further down.
button = widgets.Button(
    description='Submit Text',
    disabled=False,
    tooltip='Click to submit the text',
    icon='check'
)

# Scrollable area that receives the rendered response.
output_area = widgets.Output(
     layout={'width': 'auto', 'max_height': '300px','overflow_y': 'scroll'}
)

def on_button_clicked(b):
    """Stream the reply for the current prompt into the output area.

    The text typed in ``text_input`` is sent to the model selected in
    ``dropdown``; each streamed piece is appended to the running text, which
    is re-rendered as Markdown. ``b`` (the clicked button) is unused.
    """
    with output_area:
        output_area.clear_output(wait=False)
        rendered = ""
        stream = ai.generate_text(prompt=text_input.value, model_name=dropdown.value, stream=True)
        for piece in stream:
            # None pieces carry no text and are skipped
            if piece is not None:
                rendered += piece
                # wait=True defers the clear until the next display call
                clear_output(wait=True)
                display(Markdown(rendered))

# Attach the click handler and group the four widgets into one container.
button.on_click(on_button_clicked)
vbox = widgets.GridBox([dropdown, text_input, button, output_area])

# Inject CSS that sets the font of the dropdown and the textarea.
display(HTML("""
<style>
.widget-dropdown select {
    font-size: 18px;
    font-family: "Arial", sans-serif;
}
.widget-textarea textarea {
    font-size: 18px;
    font-family: "Arial", sans-serif;
}
</style>
"""))
# Render the assembled prompt UI in the cell output.
display(vbox)
