In [18]:
# all the imports
import os
import re
import json
import math
import textwrap
import warnings
from dataclasses import dataclass
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import s3fs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import io


In [6]:
warnings.filterwarnings("ignore")

# --- PARAMETERS YOU MAY TUNE ---
SUBSET_MAX_TXT = 80      # keep <100 as requested
TOP_K = 12               # top results per query
SUMMARY_MAX_WORDS = 70   # aim ~2-3 sentences
KEYWORDS_TOP_N = 6

# Models (small/CPU-friendly)
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# For summarization we'll try a distilled BART checkpoint fine-tuned for CNN/DM.
# If unavailable locally, fallback to t5-small with the "summarize: " prefix.
SUMM_MODEL_PRIMARY = "sshleifer/distilbart-cnn-12-6"
SUMM_MODEL_FALLBACK = "t5-small"

# Optional verifier (zero-shot)
ZS_MODEL = "facebook/bart-large-mnli"  # ok on CPU for small batches

# Themes for optional verifier
THEMES = ["Deep Learning", "Clinical Trial", "Traditional Methods"]

# Example input queries (from prompt)
QUERIES = [
    "Adverse events with mRNA vaccines in pediatrics",
    "Transformer-based models for protein folding",
    "Clinical trial outcomes for monoclonal antibodies in oncology",
]

In [13]:
# Public S3 bucket per brief:
BUCKET = "pmc-oa-opendata"
FILELIST_KEY = "oa_comm/txt/metadata/csv/oa_comm.filelist.csv"
TXT_PREFIX = "oa_comm/txt/all/"

def get_fs():
    # Anonymous access to the public bucket
    # If your network blocks anon, set AWS creds and drop anon=True.
    try:
        return s3fs.S3FileSystem(anon=True)
    except Exception:
        return s3fs.S3FileSystem(anon=False)

fs = get_fs()
print(fs)

<s3fs.core.S3FileSystem object at 0x14a3e8830>


In [21]:
fs = get_fs()

def sample_txt_paths(max_txt=SUBSET_MAX_TXT, seed=7, prefer_csv=True):
    """
    Optimal strategy for this case study:
      1) Try the official file-list CSV to collect oa_comm/txt/all/*.txt keys.
      2) If that yields nothing (format/network issue), fall back to listing the TXT prefix.
    Returns a deduplicated, seeded random sample of s3:// paths.
    """
    rng = np.random.default_rng(seed)
    candidates = []

    # --- 1) CSV route (preferred) ---
    if prefer_csv:
        s3_csv = f"s3://{BUCKET}/{FILELIST_KEY}"
        if fs.exists(s3_csv):
            with fs.open(s3_csv, "rb") as f:
                raw = f.read()

            # Parse CSV with unknown schema; scan every column for txt/all/*.txt
            df = pd.read_csv(io.BytesIO(raw), dtype=str, low_memory=False)
            patt = re.compile(r"(?:s3://pmc-oa-opendata/)?oa_comm/txt/all/[^\s,]+?\.txt")

            for col in df.columns:
                s = df[col].dropna().astype(str)

                # direct match (column already contains keys/uris)
                direct = s[s.str.contains(r"oa_comm/txt/all/.+\.txt", regex=True)].tolist()
                if direct:
                    candidates.extend(direct)
                    continue

                # otherwise, extract with regex from arbitrary text
                for val in s:
                    candidates.extend(patt.findall(val))

    # Normalize to s3:// paths
    s3_paths = [
        p if str(p).startswith("s3://")
        else f"s3://{BUCKET}/{str(p).lstrip('/')}"
        for p in pd.Series(candidates).dropna().drop_duplicates().tolist()
    ]

    # --- 2) Fallback: list the txt prefix directly ---
    if not s3_paths:
        prefix = f"s3://{BUCKET}/{TXT_PREFIX}"
        # This can be large; we’ll just list and sample immediately.
        listed = [p for p in fs.ls(prefix, detail=False) if str(p).endswith(".txt")]
        s3_paths = listed

    if not s3_paths:
        raise RuntimeError("Could not find any TXT files via CSV or prefix listing.")

    # Seeded sample to keep <100 docs as required
    k = min(len(s3_paths), max_txt)
    idx = rng.choice(len(s3_paths), size=k, replace=False)
    sample = [s3_paths[i] for i in idx]
    return sample

# Run the sampler
sample_paths = sample_txt_paths()
len(sample_paths), sample_paths[:3]

(80,
 ['s3://pmc-oa-opendata/oa_comm/txt/all/PMC4893408.txt',
  's3://pmc-oa-opendata/oa_comm/txt/all/PMC9760518.txt',
  's3://pmc-oa-opendata/oa_comm/txt/all/PMC9924556.txt'])

In [22]:
assert sample_paths, "sample_paths is empty"
with fs.open(sample_paths[0], "r", encoding="utf-8", errors="ignore") as f:
    print(f.read(600)[:600].replace("\n", " "), "...")


 ==== Front ISRN PainISRN PainISRN.PAINISRN Pain2314-4718Hindawi Publishing Corporation 10.1155/2013/726891Research ArticleCharacterization of the Visceral Antinociceptive Effect of Glial Glutamate Transporter GLT-1 Upregulation by Ceftriaxone Roman K.  1 Yang M.  2 Stephens Robert L. Jr. 1  * 1Department of Physiology and Cell Biology, The Ohio State University, 304 Hamilton Hall, 1645 Neil Avenue, Columbus, OH 43210, USA2Department of Gastroenterology, Daping Hospital, Third Military Medical University, Chongqing 400042, China*Robert L. Stephens Jr.: stephens.6@osu.eduAcademic Editors: A. Ul ...


In [29]:
@dataclass
class Doc:
    s3_path: str
    title: str | None
    abstract: str
    raw: str

def clean(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()

# Things we don't want to treat as title/abstract content
META_NOISE = re.compile(
    r"(issn|doi|license|copyright|journal|publisher|bmj|hindawi|frontiers|springer|nature publishing|"
    r"plos|bmc|medrxiv|orcid|ror\.org|grid\.[0-9]|received:|accepted:|correspondence|@|http|www\.)",
    re.I,
)
SECTION_HEAD = re.compile(
    r"^\s*(abstract|summary|background|introduction|methods?|materials|results?|discussion|conclusion[s]?|keywords?)\b[:\s-]*$",
    re.I,
)
CONTENT_TYPE = re.compile(r"\b(Research|Review|Original Article|Case Report|News and Commentary|Article)\b", re.I)
AUTHOR_LINE = re.compile(r"^[A-Z][a-z]+(?:[-\s][A-Z][a-z]+)+(?:\s+\d+.*)?$")  # crude author line

def is_plausible_title(ln: str) -> bool:
    if not (20 <= len(ln) <= 220): return False
    if META_NOISE.search(ln): return False
    if AUTHOR_LINE.match(ln): return False
    letters = sum(c.isalpha() for c in ln)
    if letters / max(1, len(ln)) < 0.6: return False
    if ln.isupper(): return False
    if len(ln.split()) < 5: return False
    return True

def extract_title(lines: list[str]) -> str | None:
    # 1) Anchor on a content-type line (e.g., "Research") and take the first plausible long line after it
    for i, ln in enumerate(lines[:250]):
        if CONTENT_TYPE.search(ln):
            for j in range(i+1, min(i+20, len(lines))):
                cand = lines[j].strip()
                if cand and is_plausible_title(cand):
                    return clean(cand)[:240]
            break  # don't keep scanning beyond first content-type block

    # 2) Otherwise, look near the top for a plausible title
    for ln in lines[:160]:
        cand = ln.strip()
        if is_plausible_title(cand):
            return clean(cand)[:240]

    return None

def extract_abstract(lines: list[str], title_idx_hint: int | None) -> str:
    # A) True "Abstract" section anywhere near the top half
    for i, ln in enumerate(lines[:800]):
        m = re.match(r"^\s*abstract\b[:\s-]*", ln, re.I)
        if m:
            chunk = []
            rest = ln[m.end():].strip()
            if rest:
                chunk.append(rest)
            for j in range(i+1, min(i+260, len(lines))):
                s = lines[j].strip()
                if not s: break
                if s.startswith("=") or SECTION_HEAD.match(s): break
                if META_NOISE.search(s): continue
                chunk.append(s)
            txt = clean(" ".join(chunk))
            if len(txt) >= 40:
                return txt

    # B) Fallback: first substantial paragraph after the title block but before the next section header
    start = (title_idx_hint or 0) + 1
    chunk = []
    for j in range(start, min(start+120, len(lines))):
        s = lines[j].strip()
        if not s: 
            if len(" ".join(chunk)) > 200: break
            continue
        if s.startswith("=") or SECTION_HEAD.match(s):  # don't cross into body sections
            if len(" ".join(chunk)) > 200: break
            else: continue
        if META_NOISE.search(s) or AUTHOR_LINE.match(s): 
            continue
        chunk.append(s)
        if len(" ".join(chunk)) > 800:  # ~ 2–3 sentences worth
            break
    txt = clean(" ".join(chunk))[:1800]
    return txt

def read_txt(path: str) -> str:
    with fs.open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()

def extract_title_and_abstract(raw: str) -> tuple[str | None, str]:
    lines = [ln.rstrip() for ln in raw.splitlines()]

    # find title and its index
    title = None
    title_idx = None
    for i in range(min(300, len(lines))):
        t = extract_title(lines[i:i+60])
        if t:
            title = t
            title_idx = i
            break

    abstract = extract_abstract(lines, title_idx)
    return title, abstract

def load_docs(paths: list[str]) -> list[Doc]:
    docs = []
    for p in tqdm(paths, desc="Reading & parsing (robust)"):
        try:
            raw = read_txt(p)
            title, abstract = extract_title_and_abstract(raw)
            if len(abstract) < 40:
                continue
            docs.append(Doc(p, title, abstract, raw))
        except Exception:
            continue
    return docs

docs = load_docs(sample_paths)
print("Parsed docs:", len(docs))
print("Example title:", (docs[0].title or "(untitled)")[:120])
print("Abstract preview:", docs[0].abstract[:220], "…")

Reading & parsing (robust): 100%|██████████| 80/80 [00:33<00:00,  2.42it/s]

Parsed docs: 80
Example title: 2.3. Electromyographic (EMG) Electrode Implantation
Abstract preview: 1 Yang M. 2 Stephens Robert L. Jr. 1 * 1. Introduction Interstitial cystitis/painful bladder syndrome (IC/PBS) is associated with several symptoms that include changes in bladder function and pain hypersensitivity that c …





In [30]:
embedder = SentenceTransformer(EMBED_MODEL)
doc_texts = [d.abstract for d in docs]
doc_vecs = embedder.encode(doc_texts, batch_size=64, show_progress_bar=True, normalize_embeddings=True)

# Build a small in-memory index
doc_vecs = np.asarray(doc_vecs, dtype=np.float32)

def retrieve(query: str, top_k: int = TOP_K):
    qv = embedder.encode([query], normalize_embeddings=True)
    sims = (doc_vecs @ qv[0])  # cosine because normalized
    idx = np.argsort(-sims)[:top_k]
    results = []
    for r in idx:
        results.append({
            "rank": len(results)+1,
            "score": float(sims[r]),
            "s3_path": docs[r].s3_path,
            "title": docs[r].title,
            "abstract": docs[r].abstract,
            "doc_idx": int(r),
        })
    return results

# Quick smoke test
retrieve(QUERIES[0])[:3]


Batches: 100%|██████████| 2/2 [00:00<00:00,  3.55it/s]


[{'rank': 1,
  'score': 0.2588670253753662,
  's3_path': 's3://pmc-oa-opendata/oa_comm/txt/all/PMC9962628.txt',
  'title': 'PAX2 and CAKUT Phenotypes: Report on Two New Variants and a Review of Mutations from the Leiden Open Variation Database',
  'abstract': 'Int J Mol Sci Int J Mol Sci ijms 1422-0067 MDPI 36835576 10.3390/ijms24044165 ijms-24-04165 Article PAX2 and CAKUT Phenotypes: Report on Two New Variants and a Review of Mutations from the Leiden Open Variation Database Negrisolo Susanna Methodology Formal analysis Investigation Writing – original draft Funding acquisition 12* Benetti Elisa Resources Writing – review & editing 123 1 Laboratory of Immunopathology and Molecular Biology of the Kidney, Department of Women’s and Children’s Health, University of Padova, 35127 Padua, Italy 2 Pediatric Research Institute “IRP Città della Speranza”, 35127 Padua, Italy 3 Pediatric Nephrology, Department of Women’s and Children’s Health, Padua University Hospital, 35128 Padua, Italy 19 2 20

In [28]:
# Use a hit from your last retrieve() or just the first sampled path
hits = retrieve(QUERIES[0], top_k=3)

def peek_file(s3_path, start=0, n=120):
    with fs.open(s3_path, "r", encoding="utf-8", errors="ignore") as f:
        lines = f.read().splitlines()
    for i, ln in enumerate(lines[start:start+n], start=start):
        print(f"{i:04d} | {ln}")
    return lines

_ = peek_file(hits[0]["s3_path"], start=0, n=140)


0000 | 
0001 | ==== Front
0002 | Parasit Vectors
0003 | Parasit Vectors
0004 | Parasites & Vectors
0005 | 1756-3305
0006 | BioMed Central London
0007 | 
0008 | 6559
0009 | 10.1186/s13071-024-06559-0
0010 | Research
0011 | End-point diagnostics of Giardia duodenalis assemblages A and B by combining RPA with CRISPR/Cas12a from human fecal samples
0012 | Wang Yilin 12
0013 | Yu Fuchang 123
0014 | Fu Yin 12
0015 | Zhang Qian 4
0016 | Zhao Jinfeng 12
0017 | Qin Ziyang 12
0018 | Shi Ke 5
0019 | Wu Yayun 12
0020 | Li Junqiang 12
0021 | Li Xiaoying 12
0022 | Zhang Longxian zhanglx8999@henau.edu.cn
0023 | 
0024 | 12
0025 | 1 https://ror.org/04eq83d71 grid.108266.b 0000 0004 1803 0494 College of Veterinary Medicine, Henan Agricultural University, Zhengzhou, Henan 450000 People’s Republic of China
0026 | 2 National International Joint Research Center for Animal Immunology, Zhengzhou, 450000 Henan People’s Republic of China
0027 | 3 https://ror.org/05202v862 grid.443240.5 0000 0004 1760 4679 Colle

In [31]:
def build_summarizer():
    try:
        return pipeline("summarization", model=SUMM_MODEL_PRIMARY, device=-1)
    except Exception:
        # Fallback to t5-small; prepend "summarize: " to inputs
        tok = AutoTokenizer.from_pretrained(SUMM_MODEL_FALLBACK)
        mdl = AutoModelForSeq2SeqLM.from_pretrained(SUMM_MODEL_FALLBACK)
        return pipeline("summarization", model=mdl, tokenizer=tok, device=-1)

summarizer = build_summarizer()

def summarize_text(text: str, max_words=SUMMARY_MAX_WORDS):
    # Convert a rough word limit to token-ish max_length
    # CNN/DM checkpoints do better with ~100-150 tokens; we keep small for speed.
    max_length = 120
    min_length = 40
    txt = text.strip()

    # If fallback is T5, many fine-tuned checkpoints require "summarize: " prefix
    if SUMM_MODEL_FALLBACK in getattr(summarizer.model.config, "_name_or_path", ""):
        txt = "summarize: " + txt

    out = summarizer(txt, max_length=max_length, min_length=min_length, do_sample=False, truncation=True)
    summary = out[0]["summary_text"].strip()
    # Ensure ~2–3 sentences by truncating a bit gently
    summary = " ".join(summary.split())  # normalize spaces
    return summary


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: b2378122-708a-4792-ae71-fd36305721fb)')' thrown while requesting HEAD https://huggingface.co/sshleifer/distilbart-cnn-12-6/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Device set to use cpu


In [32]:
STOPWORDS = frozenset((
    "the a an and or of in for on to with without at from by as is are was were be been being that this those these "
    "we our their there such using use used based via towards toward among amongs during between across under over "
    "into out within across per vs et al study studies method methods results conclusion conclusions objective objectives"
).split())

def top_keywords(text: str, n=KEYWORDS_TOP_N):
    vec = TfidfVectorizer(ngram_range=(1,2), max_features=4000, stop_words="english")
    try:
        X = vec.fit_transform([text])
        feats = np.array(vec.get_feature_names_out())
        scores = X.toarray().ravel()
        idx = np.argsort(-scores)
        kws = []
        for i in idx:
            token = feats[i]
            if any(tok in STOPWORDS for tok in token.split()):
                continue
            if re.search(r"^\d+$", token):
                continue
            kws.append(token)
            if len(kws) >= n:
                break
        return kws
    except Exception:
        return []


In [33]:
def build_verifier():
    try:
        return pipeline("zero-shot-classification", model=ZS_MODEL, device=-1)
    except Exception:
        return None

verifier = build_verifier()

def classify_theme(text: str, labels=THEMES):
    if verifier is None:
        return None, None
    res = verifier(text, candidate_labels=labels, multi_label=False)
    return res["labels"][0], float(res["scores"][0])


Device set to use cpu


In [34]:
def run_query(query: str, top_k: int = TOP_K):
    print(f"\n=== QUERY: {query} ===")
    hits = retrieve(query, top_k=top_k)
    rows = []
    for h in tqdm(hits, desc="Summarizing"):
        summary = summarize_text(h["abstract"])
        kws = top_keywords(h["abstract"])
        theme, theme_score = classify_theme(summary) if verifier else (None, None)
        rows.append({
            "query": query,
            "rank": h["rank"],
            "score": round(h["score"], 4),
            "title": h["title"],
            "s3_path": h["s3_path"],
            "summary": summary,
            "keywords": ", ".join(kws),
            "theme": theme,
            "theme_conf": round(theme_score, 4) if theme_score is not None else None,
        })
    df = pd.DataFrame(rows)
    return df

df_demo = run_query(QUERIES[0], top_k=TOP_K)
df_demo.head(3)



=== QUERY: Adverse events with mRNA vaccines in pediatrics ===


Summarizing:  33%|███▎      | 4/12 [00:15<00:28,  3.56s/it]Your max_length is set to 120, but your input_length is only 71. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Summarizing:  42%|████▏     | 5/12 [00:18<00:23,  3.36s/it]Your max_length is set to 120, but your input_length is only 79. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)
Summarizing:  92%|█████████▏| 11/12 [00:35<00:03,  3.14s/it]Your max_length is set to 120, but your input_length is only 82. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)
Summarizing: 100%|██████████| 12/12 [00:37<00:00,  3.16s/it]


Unnamed: 0,query,rank,score,title,s3_path,summary,keywords,theme,theme_conf
0,Adverse events with mRNA vaccines in pediatrics,1,0.2589,PAX2 and CAKUT Phenotypes: Report on Two New V...,s3://pmc-oa-opendata/oa_comm/txt/all/PMC996262...,PAX2 and CAKUT Phenotypes: Report on Two New V...,"padua, padua italy, italy, ijms, 2023 2023, 35...",Clinical Trial,0.5224
1,Adverse events with mRNA vaccines in pediatrics,2,0.2435,A novel assay based on DNA melting temperature...,s3://pmc-oa-opendata/oa_comm/txt/all/PMC107627...,A novel assay based on DNA melting temperature...,"china, beijing, beijing china, hospital, frien...",Clinical Trial,0.6494
2,Adverse events with mRNA vaccines in pediatrics,3,0.2345,Possible manufacture of test allergens in publ...,s3://pmc-oa-opendata/oa_comm/txt/all/PMC112703...,Possible manufacture of test allergens in publ...,"erlangen, bundeswehr, department, universität,...",Clinical Trial,0.5009


In [35]:
all_reports = []
for q in QUERIES:
    dfq = run_query(q, top_k=TOP_K)
    all_reports.append(dfq)

report = pd.concat(all_reports, ignore_index=True)
display(report.head(10))

# Save to disk (CSV + JSON)
os.makedirs("outputs", exist_ok=True)
csv_path = "outputs/report.csv"
json_path = "outputs/report.json"
report.to_csv(csv_path, index=False)
report.to_json(json_path, orient="records", indent=2)
csv_path, json_path



=== QUERY: Adverse events with mRNA vaccines in pediatrics ===


Summarizing:  33%|███▎      | 4/12 [00:12<00:25,  3.24s/it]Your max_length is set to 120, but your input_length is only 71. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Summarizing:  42%|████▏     | 5/12 [00:15<00:21,  3.13s/it]Your max_length is set to 120, but your input_length is only 79. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)
Summarizing:  92%|█████████▏| 11/12 [00:33<00:03,  3.15s/it]Your max_length is set to 120, but your input_length is only 82. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)
Summarizing: 100%|██████████| 12/12 [00:35<00:00,  2.97s/it]



=== QUERY: Transformer-based models for protein folding ===


Summarizing:  17%|█▋        | 2/12 [00:05<00:27,  2.70s/it]Your max_length is set to 120, but your input_length is only 74. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)
Summarizing:  25%|██▌       | 3/12 [00:07<00:23,  2.58s/it]Your max_length is set to 120, but your input_length is only 68. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=34)
Summarizing:  33%|███▎      | 4/12 [00:10<00:21,  2.73s/it]Your max_length is set to 120, but your input_length is only 45. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
Summarizing:  83%|████████▎ | 10/12 [00:30<00:06,  3.17s/it]Your max_length 


=== QUERY: Clinical trial outcomes for monoclonal antibodies in oncology ===


Summarizing:   0%|          | 0/12 [00:00<?, ?it/s]Your max_length is set to 120, but your input_length is only 97. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
Summarizing:  25%|██▌       | 3/12 [00:08<00:24,  2.70s/it]Your max_length is set to 120, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Summarizing:  42%|████▏     | 5/12 [00:14<00:22,  3.20s/it]Your max_length is set to 120, but your input_length is only 87. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
Summarizing:  83%|████████▎ | 10/12 [00:31<00:06,  3.43s/it]Your max_length is set 

Unnamed: 0,query,rank,score,title,s3_path,summary,keywords,theme,theme_conf
0,Adverse events with mRNA vaccines in pediatrics,1,0.2589,PAX2 and CAKUT Phenotypes: Report on Two New V...,s3://pmc-oa-opendata/oa_comm/txt/all/PMC996262...,PAX2 and CAKUT Phenotypes: Report on Two New V...,"padua, padua italy, italy, ijms, 2023 2023, 35...",Clinical Trial,0.5224
1,Adverse events with mRNA vaccines in pediatrics,2,0.2435,A novel assay based on DNA melting temperature...,s3://pmc-oa-opendata/oa_comm/txt/all/PMC107627...,A novel assay based on DNA melting temperature...,"china, beijing, beijing china, hospital, frien...",Clinical Trial,0.6494
2,Adverse events with mRNA vaccines in pediatrics,3,0.2345,Possible manufacture of test allergens in publ...,s3://pmc-oa-opendata/oa_comm/txt/all/PMC112703...,Possible manufacture of test allergens in publ...,"erlangen, bundeswehr, department, universität,...",Clinical Trial,0.5009
3,Adverse events with mRNA vaccines in pediatrics,4,0.2287,"2 Medical Faculty, 9171 Friedrich-Alexander-Un...",s3://pmc-oa-opendata/oa_comm/txt/all/PMC976051...,The development of children is strongly affect...,"parental, closures, erlangen, germany, stress,...",Clinical Trial,0.3956
4,Adverse events with mRNA vaccines in pediatrics,5,0.2084,End-point diagnostics of Giardia duodenalis as...,s3://pmc-oa-opendata/oa_comm/txt/all/PMC115589...,Parasites & Vectors 1756-3305 BioMed Central L...,"duodenalis, duodenalis assemblages, end, end p...",Clinical Trial,0.7117
5,Adverse events with mRNA vaccines in pediatrics,6,0.2027,Familial autoimmunity in patients with idiopat...,s3://pmc-oa-opendata/oa_comm/txt/all/PMC100928...,W. I. Che et al. Familial autoimmunity in pati...,"familial autoimmunity, intern, intern med, aut...",Clinical Trial,0.5509
6,Adverse events with mRNA vaccines in pediatrics,7,0.2016,"Stigma, Post-traumatic Stress, and COVID-19 Va...",s3://pmc-oa-opendata/oa_comm/txt/all/PMC991511...,Sattler David N. Conceptualization Methodology...,"writing, writing review, writing original, met...",Traditional Methods,0.4852
7,Adverse events with mRNA vaccines in pediatrics,8,0.192,The authors declare no conflict of interest.,s3://pmc-oa-opendata/oa_comm/txt/all/PMC538649...,Granzymes are a family of homologous serine pr...,"cells, granzymes, target cell, perforin, cell,...",Traditional Methods,0.4805
8,Adverse events with mRNA vaccines in pediatrics,9,0.185,Identification of transmission foci of Schisto...,s3://pmc-oa-opendata/oa_comm/txt/all/PMC724588...,Schistosoma mansoni infection is a significant...,"mansoni, schistosoma mansoni, schistosoma, tra...",Clinical Trial,0.5824
9,Adverse events with mRNA vaccines in pediatrics,10,0.1815,Predictors of Mortality in COVID-19 Patients i...,s3://pmc-oa-opendata/oa_comm/txt/all/PMC852567...,Pulmonology Predictors of Mortality in COVID-1...,"hemet, cureus, global medical, usa, hemet glob...",Clinical Trial,0.802


('outputs/report.csv', 'outputs/report.json')