In [18]:
# all the imports
import os
import re
import json
import math
import textwrap
import warnings
from dataclasses import dataclass
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import s3fs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import io


In [6]:
# Silence every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings("ignore")

# --- PARAMETERS YOU MAY TUNE ---
SUBSET_MAX_TXT = 80      # default sample size for sample_txt_paths(); keep <100 as requested
TOP_K = 12               # top results per query
SUMMARY_MAX_WORDS = 70   # aim ~2-3 sentences
KEYWORDS_TOP_N = 6       # presumably keywords kept per item -- usage not visible in this section

# Models (small/CPU-friendly)
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# For summarization we'll try a distilled BART checkpoint fine-tuned for CNN/DM.
# If unavailable locally, fall back to t5-small with the "summarize: " prefix.
SUMM_MODEL_PRIMARY = "sshleifer/distilbart-cnn-12-6"
SUMM_MODEL_FALLBACK = "t5-small"

# Optional verifier (zero-shot)
ZS_MODEL = "facebook/bart-large-mnli"  # ok on CPU for small batches

# Candidate labels for the optional verifier
THEMES = ["Deep Learning", "Clinical Trial", "Traditional Methods"]

# Example input queries (from prompt)
QUERIES = [
    "Adverse events with mRNA vaccines in pediatrics",
    "Transformer-based models for protein folding",
    "Clinical trial outcomes for monoclonal antibodies in oncology",
]

In [13]:
# Public S3 bucket per brief:
BUCKET = "pmc-oa-opendata"
# Object key of the file-list CSV that sample_txt_paths() scans for TXT keys.
FILELIST_KEY = "oa_comm/txt/metadata/csv/oa_comm.filelist.csv"
# Key prefix of the plain-text files; listed directly when the CSV yields nothing.
TXT_PREFIX = "oa_comm/txt/all/"

def get_fs():
    """
    Build the S3 filesystem handle used to reach the public bucket.

    Anonymous access is attempted first. If creating the anonymous client
    raises, a credentialed client (anon=False) is created instead, which
    relies on AWS credentials being configured in the environment.

    NOTE(review): the fallback only fires when the constructor itself raises;
    confirm whether s3fs reports blocked anonymous access at construction
    time or only on the first request.
    """
    try:
        filesystem = s3fs.S3FileSystem(anon=True)
    except Exception:
        # Anonymous client could not be created: retry with configured credentials.
        filesystem = s3fs.S3FileSystem(anon=False)
    return filesystem

# Module-level filesystem handle used by the S3 helpers in this notebook
# (sample_txt_paths and read_txt both read the global `fs`).
fs = get_fs()
print(fs)

<s3fs.core.S3FileSystem object at 0x14a3e8830>


In [21]:
# Rebinds the global `fs`; presumably repeated so this cell can run on its own -- confirm.
fs = get_fs()

def _to_s3_uri(path) -> str:
    """Return `path` as an ``s3://`` URI, adding scheme and bucket only when missing."""
    text = str(path)
    if text.startswith("s3://"):
        return text
    text = text.lstrip("/")
    # Prefix listings may come back as "bucket/key" without the scheme.
    if text.startswith(f"{BUCKET}/"):
        return f"s3://{text}"
    return f"s3://{BUCKET}/{text}"


def _txt_keys_from_filelist() -> list[str]:
    """Collect every TXT key/URI under TXT_PREFIX mentioned anywhere in the file-list CSV."""
    s3_csv = f"s3://{BUCKET}/{FILELIST_KEY}"
    if not fs.exists(s3_csv):
        return []

    # Hand the open object to pandas directly rather than buffering all bytes first.
    with fs.open(s3_csv, "rb") as f:
        df = pd.read_csv(f, dtype=str, low_memory=False)

    # Built from the configured bucket/prefix so the constants stay the single source of truth.
    key_pat = rf"(?:s3://{re.escape(BUCKET)}/)?{re.escape(TXT_PREFIX)}[^\s,]+?\.txt"

    found = []
    # Schema is treated as unknown: every column is scanned.
    for col in df.columns:
        values = df[col].dropna().astype(str)
        # Cheap literal pre-filter so the regex only runs on cells that can match.
        values = values[values.str.contains(TXT_PREFIX, regex=False)]
        if values.empty:
            continue
        # Always extract with the regex: a cell that is exactly a key yields that key,
        # and a cell with surrounding text yields only the embedded key(s).
        for hits in values.str.findall(key_pat):
            found.extend(hits)
    return found


def sample_txt_paths(max_txt=SUBSET_MAX_TXT, seed=7, prefer_csv=True):
    """
    Pick a reproducible random subset of TXT objects from the bucket.

    Strategy:
      1) If `prefer_csv`, scan the official file-list CSV for keys under TXT_PREFIX.
      2) If that yields nothing, or the CSV route raises (format/network issue),
         fall back to listing TXT_PREFIX directly.

    Parameters
    ----------
    max_txt : int
        Upper bound on the number of paths returned.
    seed : int
        Seed for the NumPy generator that draws the sample.
    prefer_csv : bool
        When False, skip the CSV and go straight to the prefix listing.

    Returns
    -------
    list[str]
        De-duplicated ``s3://`` URIs, at most `max_txt` of them, drawn without
        replacement.

    Raises
    ------
    RuntimeError
        If neither route finds any TXT file.
    """
    rng = np.random.default_rng(seed)

    # --- 1) CSV route (preferred) ---
    candidates = []
    if prefer_csv:
        try:
            candidates = _txt_keys_from_filelist()
        except Exception as exc:
            # Best effort: report the problem and let the prefix listing take over.
            print(f"File-list CSV route failed ({exc!r}); falling back to prefix listing.")

    # Normalize first, then de-duplicate (order-preserving), so a bare key and its
    # s3:// form collapse into one entry.
    s3_paths = list(dict.fromkeys(_to_s3_uri(p) for p in candidates))

    # --- 2) Fallback: list the txt prefix directly ---
    if not s3_paths:
        prefix = f"s3://{BUCKET}/{TXT_PREFIX}"
        # This can be large; we list once and sample immediately.
        listed = fs.ls(prefix, detail=False)
        s3_paths = list(dict.fromkeys(
            _to_s3_uri(p) for p in listed if str(p).endswith(".txt")
        ))

    if not s3_paths:
        raise RuntimeError("Could not find any TXT files via CSV or prefix listing.")

    # Seeded sample to keep <100 docs as required
    k = min(len(s3_paths), max_txt)
    idx = rng.choice(len(s3_paths), size=k, replace=False)
    return [s3_paths[i] for i in idx]

# Run the sampler with its defaults (max_txt=SUBSET_MAX_TXT, seed=7, CSV route first)
sample_paths = sample_txt_paths()
# Show how many paths were drawn and the first three of them.
len(sample_paths), sample_paths[:3]

(80,
 ['s3://pmc-oa-opendata/oa_comm/txt/all/PMC4893408.txt',
  's3://pmc-oa-opendata/oa_comm/txt/all/PMC9760518.txt',
  's3://pmc-oa-opendata/oa_comm/txt/all/PMC9924556.txt'])

In [22]:
# Sanity check: peek at the start of the first sampled file on a single line.
assert sample_paths, "sample_paths is empty"
first_path = sample_paths[0]
with fs.open(first_path, "r", encoding="utf-8", errors="ignore") as handle:
    preview = handle.read(600)[:600]
# Newlines are flattened so the preview stays on one output line.
print(preview.replace("\n", " "), "...")


 ==== Front ISRN PainISRN PainISRN.PAINISRN Pain2314-4718Hindawi Publishing Corporation 10.1155/2013/726891Research ArticleCharacterization of the Visceral Antinociceptive Effect of Glial Glutamate Transporter GLT-1 Upregulation by Ceftriaxone Roman K.  1 Yang M.  2 Stephens Robert L. Jr. 1  * 1Department of Physiology and Cell Biology, The Ohio State University, 304 Hamilton Hall, 1645 Neil Avenue, Columbus, OH 43210, USA2Department of Gastroenterology, Daping Hospital, Third Military Medical University, Chongqing 400042, China*Robert L. Stephens Jr.: stephens.6@osu.eduAcademic Editors: A. Ul ...


In [23]:
# Line of the form "title: ..." / "article title - ..." (case-insensitive);
# group 2 captures the text after the ":" or "-" separator.
TITLE_PAT = re.compile(r"^\s*(title|article title)\s*[:\-]\s*(.+)$", re.I)
# Line that consists only of the word "abstract" (case-insensitive),
# optionally followed by ":" or "-"; inline text after the word does not match.
ABSTRACT_PAT = re.compile(r"^\s*(abstract)\s*[:\-]?\s*$", re.I)

@dataclass
class Doc:
    """One loaded article: its source path plus the text fields parsed from it."""
    # Path the raw text was read from (load_docs passes the sampled path through).
    s3_path: str
    # Title returned by extract_title_and_abstract; may be None.
    title: str | None
    # Whitespace-normalized abstract (or pseudo-abstract) returned by the same helper.
    abstract: str
    # Full file contents as returned by read_txt.
    raw: str

def read_txt(path: str) -> str:
    """
    Return the full contents of `path` as text.

    The file is opened through the module-level `fs` handle in text mode,
    decoded as UTF-8 with undecodable bytes ignored.
    """
    with fs.open(path, "r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents

def extract_title_and_abstract(raw: str) -> tuple[str | None, str]:
    """
    Heuristics:
    - Title: prefer a line like "Title: ..."
    - Abstract: if a block begins with 'Abstract', take until a blank line;
      else take the first ~1800 chars as a pseudo-abstract.
    """
    lines = [ln.strip() for ln in raw.splitlines()]

    # Title
    title = None
    for ln in lines[:50]:
        m = TITLE_PAT.match(ln)
        if m:
            title = m.group(2).strip()
            break
    if not title:
        # Fallback: first non-empty line
        for ln in lines:
            if ln:
                title = ln[:240]
                break

    # Abstract
    abstract = ""
    for i, ln in enumerate(lines):
        if ABSTRACT_PAT.match(ln):
            # collect until blank line or section break
            chunk = []
            for j in range(i+1, min(i+200, len(lines))):
                if not lines[j]:  # stop at blank
                    break
                chunk.append(lines[j])
            abstract = " ".join(chunk).strip()
            break

    # fallback abstract
    if not abstract:
        abstract = raw.strip().split("\n\n")[0]
        abstract = abstract[:1800]

    # clean whitespace
    abstract = re.sub(r"\s+", " ", abstract).strip()
    return title, abstract

def load_docs(paths: list[str], min_abstract_chars: int = 40) -> list[Doc]:
    """
    Read and parse each path into a Doc, skipping unusable files.

    Parameters
    ----------
    paths : list[str]
        Paths to read with read_txt.
    min_abstract_chars : int
        Documents whose extracted abstract is shorter than this are dropped.

    Returns
    -------
    list[Doc]
        One Doc per file that was read, parsed and had a long enough abstract,
        in input order.

    Loading stays best-effort: a file that fails to read or parse is skipped
    rather than aborting the run, but every skip is counted and reported
    after the loop instead of being silently discarded.
    """
    docs = []
    failures = []   # (path, exception) for files that could not be read/parsed
    too_short = 0   # files dropped by the abstract-length filter
    seen = 0

    for p in tqdm(paths, desc="Reading & parsing"):
        seen += 1
        try:
            raw = read_txt(p)
            title, abstract = extract_title_and_abstract(raw)
        except Exception as exc:
            failures.append((p, exc))
            continue
        if len(abstract) < min_abstract_chars:
            too_short += 1
            continue
        docs.append(Doc(p, title, abstract, raw))

    if failures:
        print(f"load_docs: {len(failures)} of {seen} files could not be read or parsed:")
        # Only the first few are listed to keep the cell output short.
        for path, exc in failures[:5]:
            print(f"  {path}: {exc!r}")
    if too_short:
        print(
            f"load_docs: {too_short} of {seen} files skipped "
            f"(abstract shorter than {min_abstract_chars} chars)"
        )
    return docs

# Read and parse every sampled file; files that fail or have a too-short abstract are dropped.
docs = load_docs(sample_paths)
# Number of documents that survived loading.
len(docs)


Reading & parsing: 100%|██████████| 80/80 [00:34<00:00,  2.33it/s]


80