In [2]:

# Fetch the project repository into the current working directory.
!git clone https://github.com/adikan2k/Final-Project-Group-LexiCore.git

# Make the repository root the working directory for the following cells,
# then list its top-level contents as a quick sanity check.
%cd /content/Final-Project-Group-LexiCore
!ls

Cloning into 'Final-Project-Group-LexiCore'...
remote: Enumerating objects: 439, done.[K
remote: Counting objects: 100% (55/55), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 439 (delta 23), reused 18 (delta 9), pack-reused 384 (from 2)[K
Receiving objects: 100% (439/439), 319.90 MiB | 30.02 MiB/s, done.
Resolving deltas: 100% (181/181), done.
/content/Final-Project-Group-LexiCore
 code  'Final Group Presentation'     LICENSE
 data  'Final Group Project Report'   README.md


In [4]:
%%writefile run_pipeline.py
"""
Final Pipeline Orchestrator

This script:
  1. Loads the cleaned corpus (output of Day 1+2).
  2. Runs retrieval over title + abstract.
  3. Calls the summarization engine (generate_summaries).
  4. Saves a final digest JSON.
"""

import argparse
import json
import os
import time
from pathlib import Path
from typing import Dict, Any, List

import numpy as np
import pandas as pd
import sys



# Directory containing this script; data/ and code/ are resolved relative to it.
ROOT = Path(__file__).resolve().parent
CODE_DIR = ROOT / "code"
# Make the project's code/ directory importable so `summarization` can be found.
sys.path.append(str(CODE_DIR))

try:

    from summarization import generate_summaries  # type: ignore
except Exception as import_err:
    # Best-effort: keep the pipeline runnable without the real summarizer, but
    # report why it is missing. Without this message the digest silently
    # contains truncated abstracts that look like genuine summaries.
    print(
        f"WARNING: could not import generate_summaries from {CODE_DIR} "
        f"({type(import_err).__name__}: {import_err}); "
        "using the truncation fallback instead.",
        file=sys.stderr,
    )

    def generate_summaries(text: str) -> Dict[str, Any]:
        """Fallback summary used only if real summarization.py is not available.

        Args:
            text: Text to "summarise"; None or empty input is treated as "".

        Returns:
            Dict with keys "one_sentence", "three_sentence" and "five_bullets".
            All three are derived from the first 250 characters of the
            stripped text; "five_bullets" holds a single bullet.
        """
        snippet = (text or "").strip()
        one_sentence = snippet[:250]
        return {
            "one_sentence": one_sentence,
            "three_sentence": one_sentence,
            "five_bullets": [
                f"- {one_sentence}"
            ]
        }


def load_corpus() -> pd.DataFrame:
    """
    Read the cleaned paper corpus from data/processed under the project root.

    Two file names are attempted in order, since the upstream naming varies:
      cleaned_paper.parquet
      cleaned_papers.parquet

    Returns:
        The DataFrame from the first file that loads successfully.

    Raises:
        FileNotFoundError: if neither candidate can be read; the message
        lists the attempted paths and the last error encountered.
    """
    processed_dir = ROOT / "data" / "processed"
    file_names = ("cleaned_paper.parquet", "cleaned_papers.parquet")
    candidates = [processed_dir / name for name in file_names]

    last_err = None
    for candidate in candidates:
        try:
            print(f"Trying to load: {candidate}")
            corpus = pd.read_parquet(candidate)
            print(f"Loaded {len(corpus)} rows from {candidate.name}")
            return corpus
        except Exception as exc:
            # Remember the failure and move on to the next candidate name.
            last_err = exc

    tried = ", ".join(str(candidate) for candidate in candidates)
    message = f"Could not load cleaned parquet. Tried {tried}.\nLast error: {last_err}"
    raise FileNotFoundError(message)


def simple_retrieval(query: str, df: pd.DataFrame, top_k: int = 5) -> List[int]:
    """
    Very simple keyword retrieval over title + abstract.

    The query is matched as a literal, case-insensitive substring (not as a
    regular expression), so queries such as "c++" or "u.s." behave as typed.

    (If you later want to plug in BM25/FAISS hybrid, replace this
     with calls to your Day-2 retrieval engine.)

    Args:
        query: Phrase to look for in the concatenated title and abstract.
        df: Corpus; the "title" column and either "original_abstract" or
            "abstract" are used when present. Missing columns count as empty.
        top_k: Maximum number of index labels to return; values below zero
            are treated as zero.

    Returns:
        Up to top_k index labels of matching rows, in corpus order. When
        nothing matches, the first top_k index labels of the corpus.
    """

    def text_column(*names: str) -> pd.Series:
        # First available column wins; fall back to an all-empty column so a
        # missing field never breaks the string concatenation below.
        for name in names:
            if name in df.columns:
                return df[name].fillna("").astype(str)
        return pd.Series([""] * len(df), index=df.index, dtype=object)

    text_series = text_column("title") + " " + text_column("original_abstract", "abstract")

    # regex=False: user queries are plain text, not patterns.
    mask = text_series.str.contains(query, case=False, na=False, regex=False)
    indices = df[mask].index.tolist()

    if not indices:
        print("No exact keyword matches found; falling back to first top_k papers.")
        indices = df.index.tolist()

    # Clamp so a negative top_k cannot slice from the end of the list.
    return indices[:max(top_k, 0)]


def build_digest(query: str, df: pd.DataFrame, top_k: int = 5) -> Dict[str, Any]:
    """
    Build digest object:
      - retrieval to get top_k docs
      - summarization for each abstract

    Missing values (None / NaN / NA) in a row are treated as absent, so a
    paper without an abstract is summarised from "" rather than the text
    "nan", and missing title/venue/year do not leak NaN into the digest.

    Args:
        query: Search phrase passed to simple_retrieval.
        df: Corpus DataFrame.
        top_k: Maximum number of papers to include.

    Returns:
        Dict with "query", "num_results" and "papers"; each paper entry has
        "rank", "paper_id", "title", "venue", "year" and "summaries".
    """

    def present(row: pd.Series, column: str, default: Any = None) -> Any:
        # Value of `column`, or `default` when the column is absent or holds
        # a scalar missing marker. Non-scalars are returned unchanged.
        value = row.get(column, default)
        if value is None:
            return default
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return default
        return value

    indices = simple_retrieval(query, df, top_k)
    papers_out = []

    for rank, idx in enumerate(indices, start=1):
        row = df.loc[idx]

        # Prefer the original abstract; fall back to "abstract", then "".
        abstract = present(row, "original_abstract", present(row, "abstract", ""))
        summaries = generate_summaries(str(abstract))

        # Only derive an id from the index label when no paper_id exists, and
        # tolerate non-integer labels instead of crashing on int().
        paper_id = present(row, "paper_id")
        if paper_id is None:
            try:
                paper_id = int(idx)
            except (TypeError, ValueError):
                paper_id = idx

        year = present(row, "year")

        papers_out.append(
            {
                "rank": rank,
                "paper_id": paper_id,
                "title": present(row, "title", "Untitled"),
                "venue": present(row, "venue"),
                "year": int(year) if year is not None else None,
                "summaries": summaries,
            }
        )

    return {
        "query": query,
        "num_results": len(papers_out),
        "papers": papers_out,
    }


def save_digest(digest: Dict[str, Any], output_path: str) -> None:
    """
    Save digest to JSON, converting numpy/pandas types to plain Python types.

    Args:
        digest: JSON-like structure, possibly containing numpy scalars,
            numpy arrays or pandas missing markers.
        output_path: Destination file; its parent directory is created if
            it does not exist yet.
    """
    folder = os.path.dirname(output_path)
    if folder:
        os.makedirs(folder, exist_ok=True)

    def convert(o):
        # Called by json only for objects it cannot serialise natively.
        if o is pd.NA or o is pd.NaT:
            return None
        # np.bool_ must be handled explicitly, otherwise it would be written
        # as the string "True"/"False" by the str() fallback below.
        if isinstance(o, np.bool_):
            return bool(o)
        if isinstance(o, (np.integer,)):
            return int(o)
        if isinstance(o, (np.floating,)):
            return float(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        return str(o)

    # Explicit encoding keeps the file identical across platforms.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(digest, f, indent=2, default=convert)




def run_pipeline(
    query: str,
    top_k: int = 5,
    output_path: str = "Outputs/pipeline_digest.json",
) -> Dict[str, Any]:
    """
    Run the whole digest workflow and report progress on stdout.

    Stages:
      1. Load corpus
      2. Retrieve top_k docs and summarize each
      3. Save digest JSON

    Args:
        query: Search phrase for retrieval.
        top_k: Number of papers to include in the digest.
        output_path: Where the digest JSON is written.

    Returns:
        The digest dictionary that was saved.
    """
    started_at = time.time()
    banner = (
        f" Running pipeline for query: '{query}'",
        f"   top_k       = {top_k}",
        f"   output_path = {output_path}",
    )
    print("\n".join(banner))

    print("\n[1] Loading corpus...")
    corpus = load_corpus()
    print(f"    Loaded {len(corpus)} papers.")

    print("\n[2] Building digest (retrieval + summarization)...")
    result = build_digest(query, corpus, top_k=top_k)

    print("\n[3] Saving digest JSON...")
    save_digest(result, output_path)

    duration = time.time() - started_at
    print(f"\n Done in {duration:.2f} seconds.")
    print(f"   Papers in digest: {result['num_results']}")
    print(f"   JSON saved to:    {output_path}")

    return result




def main():
    """Parse --query, --top_k and --output from the command line and run the pipeline."""
    # Each entry is (flag, keyword arguments for add_argument).
    option_specs = (
        (
            "--query",
            {
                "type": str,
                "required": True,
                "help": "Search query, e.g. 'transformer models for NLP'",
            },
        ),
        (
            "--top_k",
            {
                "type": int,
                "default": 5,
                "help": "Number of papers to include in the digest",
            },
        ),
        (
            "--output",
            {
                "type": str,
                "default": "Outputs/pipeline_digest.json",
                "help": "Output path for JSON digest",
            },
        ),
    )

    cli = argparse.ArgumentParser(description="Run Day 3 digest pipeline.")
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    opts = cli.parse_args()

    run_pipeline(query=opts.query, top_k=opts.top_k, output_path=opts.output)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

Overwriting run_pipeline.py


In [5]:
# Work from the repository root: the script and the Outputs/ path below are
# both given relative to the current directory.
%cd /content/Final-Project-Group-LexiCore

# Run the script for a sample query, keeping the top 3 papers.
!python run_pipeline.py --query "transformer models" --top_k 3 --output Outputs/transformer_pipeline_digest.json

# List Outputs/ to confirm the digest file was written.
print("\nFiles in Outputs/:")
!ls Outputs

/content/Final-Project-Group-LexiCore
 Running pipeline for query: 'transformer models'
   top_k       = 3
   output_path = Outputs/transformer_pipeline_digest.json

[1] Loading corpus...
Trying to load: /content/Final-Project-Group-LexiCore/data/processed/cleaned_paper.parquet
Trying to load: /content/Final-Project-Group-LexiCore/data/processed/cleaned_papers.parquet
Loaded 500 rows from cleaned_papers.parquet
    Loaded 500 papers.

[2] Building digest (retrieval + summarization)...

[3] Saving digest JSON...

 Done in 0.22 seconds.
   Papers in digest: 3
   JSON saved to:    Outputs/transformer_pipeline_digest.json

Files in Outputs/:
transformer_pipeline_digest.json


In [6]:
import json

# Reload the digest written by run_pipeline.py and preview its first entries.
with open("Outputs/transformer_pipeline_digest.json", "r") as digest_file:
    digest = json.load(digest_file)

print(f"Query: {digest['query']}")
print(f"Num results: {digest['num_results']}")
print("\nTop 2 paper titles:")

for paper in digest["papers"][:2]:
    print(f"- #{paper['rank']} {paper['title']}")
    paper_summaries = paper["summaries"]
    if isinstance(paper_summaries, dict):
        # Show at most 200 characters of the one-sentence summary.
        one_liner = paper_summaries.get("one_sentence", "")
        print(f"  1-sentence summary: {one_liner[:200]}")
    print()

Query: transformer models
Num results: 3

Top 2 paper titles:
- #1 Controlling changes to attention logits
  1-sentence summary: Stability of neural network weights is critical when training transformer models. The query and key weights are particularly problematic, as they tend to grow large without any intervention. Applying 

- #2 IntAttention: A Fully Integer Attention Pipeline for Efficient Edge Inference
  1-sentence summary: Deploying Transformer models on edge devices is limited by latency and energy budgets. While INT8 quantization effectively accelerates the primary matrix multiplications, it exposes the softmax as the

