In [39]:
# Cell 1: Imports and Setup
import os
import json
import pandas as pd
from pathlib import Path
from datetime import datetime
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Safe path handling for notebook vs script
if "__file__" in globals():
    PROJECT_ROOT = Path(__file__).resolve().parents[1]
else:
    PROJECT_ROOT = Path().resolve().parent

OUTLINE_DIR = PROJECT_ROOT / "output"
PDF_INPUT_DIR = PROJECT_ROOT / "input"
FINAL_OUTPUT_PATH = OUTLINE_DIR / "05_rag_output.json"

# Define embedding model
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
print("✅ Loaded embedding model")


✅ Loaded embedding model


In [40]:
# Cell 2: Load structured outlines (support optional filenames)
def load_outlines(folder, file_names=None):
    outlines = []

    # If specific file names are passed, use those
    if file_names:
        paths = [folder / name for name in file_names]
    else:
        paths = list(folder.glob("*.json"))

    for path in paths:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
            if isinstance(data, dict) and "outline" in data:
                title = data.get("title", "Untitled")
                for block in data["outline"]:
                    outlines.append({
                        "document": path.stem.replace("_headings", ""),
                        "title": title,
                        **block
                    })
            else:
                print(f"⚠️ Skipped: {path.name} (unexpected format)")

    return pd.DataFrame(outlines)

# Example usage
# outline_df = load_outlines(OUTLINE_DIR)  # Load all
outline_df = load_outlines(OUTLINE_DIR, 
                           file_names=["sample_text.json",
                                       "sample.json",
                                    #    "sample_ja.json",
                                       "sample_git.json",
                                       "sample5.json"])  # Only specific ones
print(f"📚 Loaded {len(outline_df)} heading blocks")


📚 Loaded 339 heading blocks


In [41]:
# Cell 3: Chunk Sections by Heading

def chunk_sections(df):
    chunks = []
    current_chunk = None

    for i, row in df.iterrows():
        if row["level"] == "H1":
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = {
                "document": row["document"],
                "page": row["page"],
                "section_title": row["text"],
                "chunk_text": row["text"]
            }
        elif current_chunk:
            current_chunk["chunk_text"] += " " + row["text"]

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

chunks = chunk_sections(outline_df)
print(f"🔪 Chunked into {len(chunks)} sections")


🔪 Chunked into 9 sections


In [42]:
# Cell 4: Rank Chunks by Similarity to Task
persona = "PhD Researcher in Computational Biology"
job = "Prepare a literature review on GNNs for drug discovery"
query = f"{persona} needs to: {job}"
query_emb = model.encode(query, convert_to_tensor=True)

for c in chunks:
    c["embedding"] = model.encode(c["chunk_text"], convert_to_tensor=True)
    c["similarity"] = util.cos_sim(query_emb, c["embedding"]).item()

# Sort by relevance
ranked_chunks = sorted(chunks, key=lambda x: x["similarity"], reverse=True)



In [43]:
# Cell 5: Assemble Final Output
final = {
    "metadata": {
        "documents": list(set([c["document"] for c in ranked_chunks])),
        "persona": persona,
        "job_to_be_done": job,
        "processed_at": datetime.now().isoformat()
    },
    "extracted_sections": [
        {
            "document": c["document"],
            "page": c["page"],
            "section_title": c["section_title"],
            "importance_rank": i + 1
        }
        for i, c in enumerate(ranked_chunks[:5])
    ],
    "subsection_analysis": [
        {
            "document": c["document"],
            "page": c["page"],
            "section_title": c["section_title"],
            "refined_text": c["chunk_text"]
        }
        for c in ranked_chunks[:5]
    ]
}

with open(FINAL_OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(final, f, indent=2, ensure_ascii=False)

print(f"✅ RAG Output saved to: {FINAL_OUTPUT_PATH}")

✅ RAG Output saved to: C:\Users\Adi Awaskar\Documents\GitHub\Adobe-Hackathon\output\05_rag_output.json


In [44]:
# Cell 6: Preview
print(json.dumps(final, indent=2, ensure_ascii=False)[:1500])

{
  "metadata": {
    "documents": [
      "sample_text",
      "sample",
      "sample5",
      "sample_git"
    ],
    "persona": "PhD Researcher in Computational Biology",
    "job_to_be_done": "Prepare a literature review on GNNs for drug discovery",
    "processed_at": "2025-07-20T12:58:14.008157"
  },
  "extracted_sections": [
    {
      "document": "sample_text",
      "page": 1,
      "section_title": "Mid Internship Report",
      "importance_rank": 1
    },
    {
      "document": "sample_git",
      "page": 1,
      "section_title": "RFP: R RFP: R RFP: R RFP: Request f quest f quest f quest for Pr r Pr r Pr r Proposal oposal oposal oposal",
      "importance_rank": 2
    },
    {
      "document": "sample_git",
      "page": 1,
      "section_title": "To Present a Proposal for Developing the Business Plan for the Ontario Digital Library",
      "importance_rank": 3
    },
    {
      "document": "sample",
      "page": 1,
      "section_title": "Chevron Engineering",
      