# 🚀 ResearchGPT Quickstart

This notebook walks through the full pipeline on the sample paper:
- Load PDF
- Extract metadata
- Clean & chunk text
- Build index & run search
- Summarize & analyze chunks
- Save metadata JSON


In [None]:
import json
import os
import sys
from pathlib import Path

from dotenv import load_dotenv

# --- 1) Project root handling ---
# Find the project root by walking up from the working directory to the first
# folder that contains the local `src/` package. This keeps the notebook
# runnable whether the kernel starts in notebooks/ or in the project root.
# If no such folder is found, fall back to the parent of the working
# directory (i.e. assume we were launched from notebooks/).
start_dir = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if (candidate / "src").is_dir()
    ),
    start_dir.parent,
)
# Make `src.*` importable without installing the project as a package.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print("✅ Project root:", project_root)

# --- 2) Load environment variables ---
load_dotenv()
# Only report whether the key is set; never print the secret itself.
print("✅ MISTRAL_API_KEY loaded?", bool(os.getenv("MISTRAL_API_KEY")))

# --- 3) Import local modules ---
# These imports must come after the sys.path update above.
from src.config import MISTRAL_API_KEY
from src.pdf_utils import load_all_pdfs_text
from src.text_utils import clean_text, chunk_text
from src.indexer import build_index, search
from src.summarizer import summarize_chunks
from src.analyst import analyze_chunks
from src.metadata_utils import extract_metadata
from src.io_utils import safe_stem

print("✅ Imports successful")

In [None]:
# Path to the sample paper this quickstart is written around
sample_pdf = project_root / "data/sample_papers/attention_is_all_you_need.pdf"

# Load all PDFs in that folder as (path, text) pairs
pairs = load_all_pdfs_text(sample_pdf.parent)

if not pairs:
    raise FileNotFoundError(f"No PDFs found in {sample_pdf.parent.resolve()}")

# Prefer the sample paper itself: the folder may hold several PDFs, and the
# order returned by the loader is not something this notebook controls.
# Fall back to the first loaded PDF when the sample is not among them.
pdf_path, raw_text = next(
    ((path, text) for path, text in pairs if path.name == sample_pdf.name),
    pairs[0],
)
print("✅ Loaded PDF:", pdf_path.name)
print("\n--- First 500 chars of raw text ---\n")
print(raw_text[:500])

In [None]:
# Pull metadata out of the PDF loaded above; `meta` is reused when the
# metadata JSON is written at the end of the notebook.
meta = extract_metadata(pdf_path)

print("✅ Metadata extracted:")
# Pretty-print the result as indented JSON for inspection.
meta_pretty = json.dumps(meta, indent=2)
print(meta_pretty)

In [None]:
# Clean the raw PDF text, then split it into chunks
# (max_chars=1500 and overlap=150 are passed to the chunker).
cleaned = clean_text(raw_text)
chunks = chunk_text(cleaned, max_chars=1500, overlap=150)

print(f"✅ Total chunks: {len(chunks)}")
print("\n--- First 2 chunks ---\n")
# Preview only: first 400 characters of each of the first two chunks.
for number, chunk in enumerate(chunks[:2], start=1):
    preview = chunk[:400]
    print(f"[Chunk {number}]\n{preview}...\n")

In [None]:
# Pair every chunk with a 1-based label ("chunk 1", "chunk 2", ...) and
# build the search index over those (label, text) pairs.
labelled_chunks = [
    (f"chunk {position}", body) for position, body in enumerate(chunks, start=1)
]
index = build_index(labelled_chunks)

# Ask for the five best matches to the query.
hits = search(index, "Summarize contributions and limitations.", k=5)

print("✅ Top hits:")
# Each hit unpacks as (score, (label, text)).
for hit in hits:
    score, (label, text) = hit
    print(f"- {label} (score={score:.3f})")
    print(text[:200], "\n")

In [None]:
# Keep only the chunk text from each (score, (label, text)) hit.
top_chunks = [text for _, (_, text) in hits]

# Same title for both calls, so name it once.
paper_title = "Attention Is All You Need"

summary = summarize_chunks(MISTRAL_API_KEY, paper_title, top_chunks)
analysis = analyze_chunks(MISTRAL_API_KEY, paper_title, top_chunks)

# Show a short preview of each result rather than the full output.
summary_preview = summary[:500]
analysis_preview = analysis[:500]

print("✅ Summary (first 500 chars):\n", summary_preview)
print("\n---\n")
print("✅ Analysis (first 500 chars):\n", analysis_preview)

In [None]:
# Compute the stem once; it is reused for all three output file names.
stem = safe_stem(pdf_path)
results_dir = project_root / "results"

meta_out = {
    "file": pdf_path.name,
    # Use `or` rather than a .get() default: the default only applies when
    # the key is missing, so a present-but-empty or None value would
    # otherwise end up in the saved JSON.
    "title": meta.get("title") or pdf_path.stem,
    "authors": meta.get("authors") or "Unknown",
    "abstract": meta.get("abstract"),
    "query_used": "Summarize contributions and limitations.",
    # NOTE: this cell only records these paths; it does not write the
    # summary/analysis Markdown files itself.
    "outputs": {
        "summary_md": str(results_dir / "summaries" / f"{stem}_summary.md"),
        "analysis_md": str(results_dir / "analyses" / f"{stem}_analysis.md"),
    },
}

meta_dir = results_dir / "metadata"
meta_dir.mkdir(parents=True, exist_ok=True)

# The file is written as UTF-8, so keep non-ASCII characters (e.g. accented
# author names) readable instead of \uXXXX-escaping them.
meta_json = json.dumps(meta_out, indent=2, ensure_ascii=False)

meta_path = meta_dir / f"{stem}_meta.json"
meta_path.write_text(meta_json, encoding="utf-8")

print("✅ Metadata JSON saved to:", meta_path)
print(meta_json)