In [18]:
"""
Title: Retrieve All Citing Papers Using OpenAlex API (Paginated Version)
Description: Input by DOI or Title, retrieves citing papers with open access metadata,
saves results to a timestamped CSV, and stores pointer files for reuse in downstream steps.
Author: Karthik Anand Balasubramanian
"""

import requests
import pandas as pd
import os, time, json
from datetime import datetime
from pathlib import Path
from IPython.display import display  # for display(df.head())

# === CONFIGURATION ===
EMAIL = "karthik.nb@yahoo.com"
BASE_URL = "https://api.openalex.org"
OUTPUT_FOLDER = r"C:\Users\Karthik\Desktop\MSc Project\data"  # CSVs & pointers live here
DATA_DIR = Path(OUTPUT_FOLDER)

def get_openalex_id_from_title(title: str):
    """
    Search for a paper on OpenAlex using its title.

    Args:
        title (str): The title of the paper to search for.

    Returns:
        tuple:
            - str: OpenAlex Work ID (without URL prefix) if found, else None.
            - dict: Full OpenAlex JSON metadata for the best-matching paper, else None.
    """
    url = f"{BASE_URL}/works"
    params = {"search": title, "per_page": 1, "mailto": EMAIL}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    results = r.json().get("results", [])
    if not results:
        return None, None
    rec = results[0]
    return rec["id"].split("/")[-1], rec

def fetch_work_by_openalex_id(openalex_id: str):
    """
    Fetch the full OpenAlex metadata for a paper using its Work ID.

    Args:
        openalex_id (str): OpenAlex Work ID (e.g., 'W123456789').

    Returns:
        dict: Full metadata JSON from OpenAlex.
    """
    url = f"{BASE_URL}/works/{openalex_id}"
    r = requests.get(url, params={"mailto": EMAIL}, timeout=30)
    r.raise_for_status()
    return r.json()

def get_all_citing_papers(openalex_id: str):
    """
    Retrieve all papers that cite a given OpenAlex Work ID, using cursor-based pagination.

    Args:
        openalex_id (str): OpenAlex Work ID of the target paper.

    Returns:
        list[dict]: List of citing paper metadata records from OpenAlex.
    """
    all_results, cursor, per_page = [], "*", 25
    while True:
        url = f"{BASE_URL}/works"
        params = {"filter": f"cites:{openalex_id}", "per_page": per_page, "cursor": cursor, "mailto": EMAIL}
        r = requests.get(url, params=params, timeout=60)
        if r.status_code != 200:
            print("Error fetching data:", r.status_code)
            break
        data = r.json()
        all_results.extend(data.get("results", []))
        cursor = data.get("meta", {}).get("next_cursor")
        if not cursor:
            break
        time.sleep(1)  # polite delay
    return all_results

def save_to_csv(papers, filename_prefix="citing_papers", output_folder=OUTPUT_FOLDER):
    """
    Save a list of citing paper metadata to a timestamped CSV file.

    Args:
        papers (list[dict]): List of citing papers' metadata dictionaries.
        filename_prefix (str): Prefix for the generated CSV filename.
        output_folder (str): Directory to save the CSV.

    Returns:
        tuple:
            - pandas.DataFrame: The saved dataframe.
            - str: Full path to the saved CSV file.
    """
    os.makedirs(output_folder, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{filename_prefix}_{timestamp}.csv"
    full_path = os.path.join(output_folder, filename)

    records = []
    for p in papers:
        records.append({
            "title": p.get("title"),
            "year": p.get("publication_year"),
            "doi": p.get("doi"),
            "openalex_id": p.get("id"),
            "is_oa": (p.get("open_access") or {}).get("is_oa"),
            "oa_status": (p.get("open_access") or {}).get("oa_status"),
            "oa_url": (p.get("open_access") or {}).get("oa_url"),
        })

    df = pd.DataFrame(records)
    df.to_csv(full_path, index=False)
    print(f"\nSaved {len(df)} citing papers to: {full_path}")
    return df, full_path

def _get_first_author(paper_json):
    """
    Extract the first author's name from OpenAlex metadata.

    Args:
        paper_json (dict): Full OpenAlex JSON record for a paper.

    Returns:
        str: First author display name, or empty string if unavailable.
    """
    auths = paper_json.get("authorships") or []
    if not auths:
        return ""
    return (auths[0].get("author") or {}).get("display_name", "") or ""

# === MAIN EXECUTION ===
choice = input("Search using (1) DOI or (2) Title? Enter 1 or 2: ").strip()

selected_paper = None
openalex_id = None

if choice == "1":
    doi = input("Enter DOI of the target paper: ").strip()
    # Works-by-DOI returns the full record directly
    url = f"{BASE_URL}/works/https://doi.org/{doi}"
    r = requests.get(url, params={"mailto": EMAIL}, timeout=30)
    if r.status_code == 200:
        selected_paper = r.json()
        openalex_id = selected_paper["id"].split("/")[-1]
    else:
        print("DOI lookup failed:", r.status_code)

elif choice == "2":
    title = input("Enter the title of the target paper: ").strip()
    openalex_id, selected_paper = get_openalex_id_from_title(title)
    if openalex_id and not selected_paper:
        # Fallback: fetch full record if search helper didn't return it
        selected_paper = fetch_work_by_openalex_id(openalex_id)
else:
    print("Invalid input. Exiting.")

if openalex_id:
    citing_papers = get_all_citing_papers(openalex_id)
    df, output_csv_path = save_to_csv(citing_papers, output_folder=OUTPUT_FOLDER)
    display(df.head())
else:
    print("Could not retrieve citing papers due to missing or invalid ID.")

# --- Save a pointer to the latest CSV and the selected target ---
DATA_DIR.mkdir(parents=True, exist_ok=True)

if openalex_id and selected_paper:
    # 1) Save path to latest CSV file
    (DATA_DIR / "latest_citing_papers_path.txt").write_text(str(output_csv_path), encoding="utf-8")

    # 2) Save metadata for the target paper for later pipeline steps
    target_meta = {
        "openalex_id": (selected_paper.get("id") or "").split("/")[-1],
        "doi": (selected_paper.get("doi") or "").replace("https://doi.org/", "") or None,
        "title": selected_paper.get("title"),
        "year": selected_paper.get("publication_year"),
        "first_author": _get_first_author(selected_paper),
    }
    (DATA_DIR / "target_work.json").write_text(json.dumps(target_meta, ensure_ascii=False, indent=2), encoding="utf-8")
    print("[TARGET SAVED]", target_meta)
else:
    print("[WARN] target not saved (missing openalex_id or selected_paper).")



Saved 34 citing papers to: C:\Users\Karthik\Desktop\MSc Project\data\citing_papers_20250913_140840.csv


Unnamed: 0,title,year,doi,openalex_id,is_oa,oa_status,oa_url
0,Analogy Powered by Prediction and Structural I...,2022,https://doi.org/10.1021/jacs.2c02653,https://openalex.org/W4281888261,True,hybrid,https://doi.org/10.1021/jacs.2c02653
1,Average minimum distances of periodic point se...,2021,https://doi.org/10.46793/match.87-3.529w,https://openalex.org/W3214916886,True,bronze,https://match.pmf.kg.ac.rs/electronic_versions...
2,Recognizing Rigid Patterns of Unlabeled Point ...,2023,https://doi.org/10.1109/cvpr52729.2023.00129,https://openalex.org/W4386076138,True,green,https://arxiv.org/pdf/2303.15385
3,Material Property Prediction Using Graphs Base...,2024,https://doi.org/10.1007/s40192-024-00351-9,https://openalex.org/W4394843284,True,hybrid,https://link.springer.com/content/pdf/10.1007/...
4,Generic families of finite metric spaces with ...,2024,https://doi.org/10.1007/s41468-024-00177-6,https://openalex.org/W4396719438,True,hybrid,https://link.springer.com/content/pdf/10.1007/...


[TARGET SAVED] {'openalex_id': 'W3162861102', 'doi': '10.1007/978-3-030-76657-3_16', 'title': 'An Isometry Classification of Periodic Point Sets', 'year': 2021, 'first_author': 'Olga Anosova'}


In [19]:
from pathlib import Path
import pandas as pd

pointer_path = Path(r"C:\Users\Karthik\Desktop\MSc Project\data\latest_citing_papers_path.txt")
print("Pointer file exists:", pointer_path.exists())

if pointer_path.exists():
    csv_path = Path(pointer_path.read_text().strip())
    print("CSV path from pointer:", csv_path)
    print("CSV file exists:", csv_path.exists())
    if csv_path.exists():
        df = pd.read_csv(csv_path, nrows=1)
        print("Columns in CSV:", list(df.columns))

Pointer file exists: True
CSV path from pointer: C:\Users\Karthik\Desktop\MSc Project\data\citing_papers_20250913_140840.csv
CSV file exists: True
Columns in CSV: ['title', 'year', 'doi', 'openalex_id', 'is_oa', 'oa_status', 'oa_url']


In [20]:
"""
Step 2 (FINAL, patched): Download OA PDFs for citing papers
- Robust against 403s (MDPI/Wiley fixes), uses Unpaywall early for strict publishers,
  retries with a single Session, validates saved PDFs, and logs manual fetches as JSONL.

Inputs (from Step 1 CSV):
    title, year, doi, openalex_id, is_oa, oa_status, oa_url

Outputs:
    downloaded_files_<timestamp>/...  (PDFs)
    manual_download_required.txt      (JSONL: items to fetch manually)
    latest_download_dir.txt           (pointer for Step 3 auto-detect)
"""

# ======= IMPORTS =======
import os, re, time, random, shutil, json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin, quote, urlparse
from requests.adapters import HTTPAdapter, Retry

# ======= CONFIGURATION =======
DATA_DIR = Path(r"C:\Users\Karthik\Desktop\MSc Project\data")
if not DATA_DIR.exists():
    DATA_DIR = Path.cwd() / "data"
    DATA_DIR.mkdir(parents=True, exist_ok=True)

POINTER = DATA_DIR / "latest_citing_papers_path.txt"  # written by Step 1
REQUIRED_COLS = {"title","year","doi","openalex_id","is_oa","oa_status","oa_url"}

YOUR_EMAIL = "karthik.nb@yahoo.com"  # REQUIRED for Unpaywall/OpenAlex etiquette
if "@" not in YOUR_EMAIL:
    print("[WARN] Set YOUR_EMAIL to a real address for Unpaywall/OpenAlex etiquette.")

BROWSER_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

# Strict pacing/strategies per host
STRICT_DOMAINS = {"pubs.acs.org", "onlinelibrary.wiley.com", "tandfonline.com", "mdpi.com"}
STRICT_PUBS     = ("mdpi.com","onlinelibrary.wiley.com","sciencedirect.com","elsevier.com","springer.com")

DEMO = False    # set True during recording to cap runtime
DEMO_MAX = 12   # max rows to attempt when DEMO=True

# ======= SESSION (reused) with retries =======
def make_session():
    s = requests.Session()
    retries = Retry(
        total=3,
        backoff_factor=0.8,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD","GET","OPTIONS"]
    )
    s.mount("http://", HTTPAdapter(max_retries=retries))
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.headers.update(BROWSER_HEADERS.copy())
    return s

SESSION = make_session()

# ======= HELPERS =======
def safe_str(x):
    s = str(x or "").strip()
    return None if s.lower() in {"", "nan", "none", "null"} else s

def _find_latest_citing_csv(data_dir: Path, pointer: Path) -> Path:
    # 1) Prefer pointed file if present and valid
    if pointer.exists():
        p = Path(pointer.read_text(encoding="utf-8").strip())
        if p.exists():
            return p
    # 2) Else pick newest matching file in data_dir (then CWD)
    patterns = ["citing_papers_*.csv", "*citing*papers*.csv"]
    candidates = []
    for pat in patterns:
        candidates.extend(data_dir.glob(pat))
    if not candidates:
        candidates = list(Path(".").glob("citing_papers_*.csv"))
    if not candidates:
        raise FileNotFoundError(f"No citing_papers CSV found in {data_dir} or current directory.")
    candidates.sort(key=lambda p: p.stat().st_mtime, reverse=True)
    # 3) Choose newest with expected columns
    for p in candidates:
        try:
            cols = {c.strip().lower() for c in pd.read_csv(p, nrows=0).columns}
            if REQUIRED_COLS.issubset(cols):
                return p
        except Exception:
            continue
    return candidates[0]

def sanitize_filename(title: str) -> str:
    t = (title or "").strip()
    t = re.sub(r"[\\/:*?\"<>|\r\n]+", "_", t)
    t = re.sub(r"_+", "_", t).strip("_")
    return (t[:120] or "paper")

def normalize_doi(doi):
    #remove redundant part of url for doi
    if not doi:
        return None
    d = doi.strip()
    if d.lower().startswith("http"):
        d = d.replace("https://doi.org/", "").replace("http://doi.org/", "").replace("doi.org/", "")
    return d.strip() or None

def _resolve_pdf_links_from_html(html: str, base_url: str) -> list:
    
    links = re.findall(r'href=["\']([^"\']+\.pdf[^"\']*)["\']', html, flags=re.I)
    links += re.findall(r'href=["\']([^"\']+(?:/pdf|format=pdf|type=pdf)[^"\']*)["\']', html, flags=re.I)
    out = []
    for href in links:
        full = urljoin(base_url, href)
        if full not in out:
            out.append(full)
    return out

def domain_sleep(url: str):
    try:
        host = urlparse(url).hostname or ""
    except Exception:
        host = ""
    if any(h in host for h in STRICT_DOMAINS):
        time.sleep(3 + random.random()*2)  # 3–5s
    else:
        time.sleep(0.5 + random.random()*0.5)

MIN_BYTES = 30_000
def is_valid_pdf(path: str) -> bool:
    #Checks the file exists, is at least 30KB (filters junk), and starts with %PDF- (magic header). Prevents saving HTML as .pdf.
    try:
        if not os.path.exists(path): return False
        if os.path.getsize(path) < MIN_BYTES: return False
        with open(path, "rb") as f:
            head = f.read(5)
        return head == b"%PDF-"
    except Exception:
        return False

def _stream_save_resp_to_pdf(resp: requests.Response, dest_path: str) -> bool:
    #Verifies the server is actually sending a PDF (or an octet-stream + .pdf URL). Returns True if it looked like a PDF based on headers/URL.
    
    ctype = (resp.headers.get("Content-Type") or "").lower()
    # Accept octet-stream if URL ends with .pdf
    if ("pdf" not in ctype) and ("octet-stream" not in ctype) and (not resp.url.lower().endswith(".pdf")):
        return False
    with open(dest_path, "wb") as f:
        for chunk in resp.iter_content(1 << 14):
            if chunk:
                f.write(chunk)
    return True

def save_and_validate(resp: requests.Response, dest_path: str) -> bool:
    
    """Writes the response with _stream_save_resp_to_pdf, then validates with is_valid_pdf. If invalid, it removes the file. 
    Returns True only if all checks pass.
    """
    
    if not _stream_save_resp_to_pdf(resp, dest_path):
        return False
    if not is_valid_pdf(dest_path):
        try: os.remove(dest_path)
        except Exception: pass
        return False
    return True

def _download_pdf_with_headers(session: requests.Session, pdf_url: str, dest_path: str, referer: str | None) -> bool:
    """Requests the pdf_url with Accept: application/pdf. If referer is provided, it adds it (some sites require a valid referrer). 
       Then calls save_and_validate.
    """
    headers = {"Accept": "application/pdf"}
    if referer: headers["Referer"] = referer
    r = session.get(pdf_url, headers=headers, timeout=30, stream=True, allow_redirects=True)
    r.raise_for_status()
    return save_and_validate(r, dest_path)

def mdpi_fix(u: str) -> str:
    #MDPI often requires download=1 in the query string for direct PDF. This function appends it when appropriate.
    try:
        host = urlparse(u).hostname or ""
    except Exception:
        host = ""
    if "mdpi.com" in host and "pdf" in u and "download=" not in u:
        sep = "&" if "?" in u else "?"
        return f"{u}{sep}download=1"
    return u

def try_wiley_epdf(session: requests.Session, doi: str, dest_path: str) -> bool:
    """Wiley sometimes serves PDFs via an ePDF viewer page. This function loads the ePDF page, scrapes it for actual PDF links, 
    tries to download those PDFs with the correct Referer. Returns True if it manages to save a valid PDF.
    """
    
    if not doi: return False
    epdf = f"https://onlinelibrary.wiley.com/doi/epdf/{doi}"
    try:
        r = session.get(epdf, headers={"Referer":"https://onlinelibrary.wiley.com/"}, timeout=25, allow_redirects=True)
        if r.status_code != 200:
            return False
        links = _resolve_pdf_links_from_html(r.text, r.url)
        for l in links:
            try:
                if _download_pdf_with_headers(session, l, dest_path, referer=r.url):
                    print(f"[WILEY EPDF → PDF SAVED] {dest_path}")
                    return True
            except Exception:
                continue
    except Exception:
        return False
    return False

def prefer_unpaywall_first(url: str | None) -> bool:
    #Returns True if the OA URL looks like it belongs to a strict publisher. 
    #If so, the code will try Unpaywall first (repositories are often easier/freer to fetch).

    if not url: return False
    try:
        host = urlparse(url).hostname or ""
    except Exception:
        return False
    return any(h in host for h in STRICT_PUBS)

def get_unpaywall_candidate_pdfs(session: requests.Session, doi: str | None, email=YOUR_EMAIL) -> list:
    #Calls Unpaywall API for the DOI.
    # Collects repository url_for_pdf first (best), then repository url, then non-repo url_for_pdf, then non-repo url.
    # Deduplicates and returns a list (repository first). (Works without paywalls.)
    
    if not doi:
        return []
    url = f"https://api.unpaywall.org/v2/{doi}?email={email}"
    try:
        r = session.get(url, timeout=20)
        if r.status_code != 200:
            return []
        j = r.json()
    except Exception:
        return []
    locs = j.get("oa_locations") or []
    def is_repo(loc): return (loc or {}).get("host_type") == "repository"
    def get_pdf(loc): return (loc or {}).get("url_for_pdf")
    def get_url(loc): return (loc or {}).get("url")

    cands = []
    for loc in locs:
        if is_repo(loc) and get_pdf(loc): cands.append(get_pdf(loc))
    for loc in locs:
        if is_repo(loc) and get_url(loc): cands.append(get_url(loc))
    for loc in locs:
        if (not is_repo(loc)) and get_pdf(loc): cands.append(get_pdf(loc))
    for loc in locs:
        if (not is_repo(loc)) and get_url(loc): cands.append(get_url(loc))

    out, seen = [], set()
    for u in cands:
        if u and (u not in seen):
            seen.add(u); out.append(u)
    return out

def get_openalex_repository_pdfs(session: requests.Session, openalex_id: str | None, email=YOUR_EMAIL) -> list:
    #Calls OpenAlex for the work, looks at primary_location + locations, and collects pdf_url only where host_type == "repository". Returns a deduped list.
    
    if not openalex_id:
        return []
    wid = openalex_id.split("/")[-1] if "/" in openalex_id else openalex_id
    try:
        j = session.get(
            f"https://api.openalex.org/works/{wid}",
            params={"mailto": email}, timeout=20
        ).json()
    except Exception:
        return []
    locs = []
    if isinstance(j.get("primary_location"), dict): locs.append(j["primary_location"])
    locs.extend(j.get("locations") or [])
    out, seen = [], set()
    for loc in locs:
        if isinstance(loc, dict) and loc.get("host_type") == "repository":
            u = loc.get("pdf_url") or ""
            if u and u not in seen:
                seen.add(u); out.append(u)
    return out

def download_pdf_from_doi(session: requests.Session, doi_or_url: str, dest_path: str) -> bool:
    """
    Robust DOI→PDF:
      1) DOI content-negotiation (Accept: application/pdf)
      2) Resolve landing and scrape PDF links
      3) Try publisher direct PDF endpoints (ACS/Springer/Wiley/T&F)
      4) Landing again with Accept: application/pdf
    """
    plain_doi = normalize_doi(doi_or_url)
    if not plain_doi:
        return False
    doi_url = f"https://doi.org/{plain_doi}"

    # 1) Content negotiation (direct pdf download if possible)
    try:
        r = session.get(doi_url, headers={"Accept":"application/pdf"}, timeout=25, allow_redirects=True, stream=True)
        if r.status_code == 200 and save_and_validate(r, dest_path):
            print(f"[PDF SAVED via DOI content-negotiation] {dest_path}")
            return True
    except Exception:
        pass

    # 2) If HTML, scrape for PDF links
    landing_url, html = None, None
    try:
        r = session.get(doi_url, timeout=25, allow_redirects=True)
        r.raise_for_status()
        landing_url, html = r.url, r.text
    except Exception:
        landing_url, html = None, None

    if html and landing_url:
        for pdf_url in _resolve_pdf_links_from_html(html, landing_url):
            try:
                if _download_pdf_with_headers(session, pdf_url, dest_path, referer=landing_url):
                    print(f"[PDF SAVED via DOI landing scrape] {dest_path}")
                    return True
            except Exception:
                pass

    # 3) try known direct PDF URL shapes for ACS, Springer, Wiley, Taylor & Francis.
    candidates = []
    # ACS
    candidates += [f"https://pubs.acs.org/doi/pdf/{plain_doi}",
                   f"https://pubs.acs.org/doi/pdfdirect/{plain_doi}"]
    # Springer
    springer_doi = quote(plain_doi, safe="")
    candidates += [f"https://link.springer.com/content/pdf/{plain_doi}.pdf",
                   f"https://link.springer.com/content/pdf/{springer_doi}.pdf"]
    # Wiley
    candidates += [f"https://onlinelibrary.wiley.com/doi/pdfdirect/{plain_doi}?download=true",
                   f"https://onlinelibrary.wiley.com/doi/pdf/{plain_doi}"]
    # Taylor & Francis
    candidates += [f"https://www.tandfonline.com/doi/pdf/{plain_doi}?needAccess=true"]

    for cand in candidates:
        try:
            resp = session.get(cand, headers={"Accept":"application/pdf"}, timeout=25, allow_redirects=True, stream=True)
            if resp.status_code == 200 and save_and_validate(resp, dest_path):
                print(f"[PDF SAVED via publisher pattern] {cand} -> {dest_path}")
                return True
        except Exception:
            pass

    # 4) Try landing again with Accept: application/pdf (sometimes serves PDF on second try with the right header).
    if landing_url:
        try:
            r2 = session.get(landing_url, headers={"Accept":"application/pdf"}, timeout=25, allow_redirects=True, stream=True)
            if r2.status_code == 200 and save_and_validate(r2, dest_path):
                print(f"[PDF SAVED via landing accept] {dest_path}")
                return True
        except Exception:
            pass

    print(f"[GAVE UP] Couldn’t fetch PDF via DOI workflow for {plain_doi}")
    return False

def log_manual_download(row: pd.Series, output_dir: str):
    #to-do list for manual fetching
    
    log_path = os.path.join(output_dir, "manual_download_required.txt")
    fields = {
        "title": row.get("title",""),
        "doi": row.get("doi",""),
        "openalex_id": row.get("openalex_id",""),
        "oa_url": row.get("oa_url",""),
        "publisher": row.get("host_venue.publisher","") if "host_venue.publisher" in row else row.get("publisher",""),
        "oa_status": row.get("oa_status",""),
        "year": row.get("year",""),
    }
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(json.dumps(fields, ensure_ascii=False) + "\n")

# ======= LOAD INPUT CSV =======
latest_csv = _find_latest_citing_csv(DATA_DIR, POINTER)
csv_path = str(latest_csv)
print(f"[AUTO] Using latest citing CSV: {csv_path}")

df = pd.read_csv(csv_path)

# sanitize key columns to avoid 'nan' or "None" requests
df["doi"] = df["doi"].map(safe_str)
df["oa_url"] = df["oa_url"].map(safe_str)
df["openalex_id"] = df["openalex_id"].map(safe_str)

# Candidates: (OA AND has oa_url) OR has a DOI (Unpaywall/OpenAlex repo may still work)
is_oa_bool = df["is_oa"].astype(str).str.lower().isin({"true", "1", "yes"}) | (df["is_oa"] == True)
oa_url_valid = df["oa_url"].astype(str).str.strip().ne("") & df["oa_url"].notna()
doi_ok = df["doi"].astype(str).str.strip().ne("") & df["doi"].notna()

candidates = df[(is_oa_bool & oa_url_valid) | doi_ok].copy()
oa_url_hits = int(((is_oa_bool & oa_url_valid)).sum())
doi_only_hits = int((doi_ok & ~oa_url_valid).sum())
print(f"Rows in CSV: {len(df)} | candidates: {len(candidates)}  (OA+URL={oa_url_hits}, DOI-only={doi_only_hits})")

# ======= OUTPUT DIR =======
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = f"downloaded_files_{timestamp}"
os.makedirs(output_dir, exist_ok=True)

# ======= MAIN DOWNLOAD FUNCTION =======
def download_and_save_file(row: pd.Series):
    url = safe_str(row.get("oa_url"))
    title = safe_str(row.get("title")) or ""
    openalex_id = safe_str(row.get("openalex_id")) or ""
    # handle full OpenAlex URLs
    if openalex_id and "/" in openalex_id:
        openalex_id = openalex_id.split("/")[-1]
    filename = sanitize_filename(title or openalex_id or "paper")
    dest_pdf = os.path.join(output_dir, f"{filename}.pdf")

    # Skip if a valid PDF already exists
    if os.path.exists(dest_pdf) and is_valid_pdf(dest_pdf):
        # print(f"[SKIP] Already downloaded: {filename}")
        return

    doi = normalize_doi(safe_str(row.get("doi")))

    # 0) Domain-friendly pacing (if we have a URL)
    if url:
        domain_sleep(url)

    # 1) Unpaywall first for strict publishers or when URL missing
    tried_unpaywall_early = False
    if doi and (prefer_unpaywall_first(url) or not url):
        tried_unpaywall_early = True
        for cand in get_unpaywall_candidate_pdfs(SESSION, doi):
            domain_sleep(cand)
            try:
                rpdf = SESSION.get(cand, timeout=30, stream=True, allow_redirects=True)
                if save_and_validate(rpdf, dest_pdf):
                    print(f"[UNPAYWALL LOCATION SAVED] {dest_pdf}")
                    return
                if "html" in (rpdf.headers.get("Content-Type") or "").lower():
                    links = _resolve_pdf_links_from_html(rpdf.text, rpdf.url)
                    for l in links:
                        domain_sleep(l)
                        try:
                            if _download_pdf_with_headers(SESSION, l, dest_pdf, referer=rpdf.url):
                                print(f"[UNPAYWALL LANDING → PDF SAVED] {dest_pdf}")
                                return
                        except Exception:
                            pass
            except Exception:
                pass

    # 2) Try OA URL directly (with MDPI fix and strong Referer)
    if url:
        fixed = mdpi_fix(url)
        try:
            hdrs = {"Accept":"application/pdf"}
            # set a referer if domain might care
            try_host = urlparse(fixed).hostname or ""
            if "mdpi.com" in try_host:
                hdrs["Referer"] = "https://www.mdpi.com/"
            r = SESSION.get(fixed, headers=hdrs, timeout=25, allow_redirects=True, stream=True)
            if save_and_validate(r, dest_pdf):
                print(f"[PDF SAVED] {dest_pdf}")
                return
            # if HTML, scrape embedded PDF links
            if "html" in (r.headers.get("Content-Type") or "").lower():
                soup = BeautifulSoup(r.content, "html.parser")
                for a in soup.find_all("a", href=True):
                    href = a["href"]
                    if (".pdf" in href) or ("format=pdf" in href) or href.rstrip("/").endswith("/pdf"):
                        pdf_url = urljoin(r.url, href)
                        domain_sleep(pdf_url)
                        try:
                            if _download_pdf_with_headers(SESSION, pdf_url, dest_pdf, referer=r.url):
                                print(f"[PDF FOUND IN HTML & SAVED] {dest_pdf}")
                                return
                        except Exception:
                            pass
        except Exception as e:
            print(f"[DOWNLOAD ERROR] {fixed} — {e}")

    # 3) Wiley EPDF viewer fallback (if Wiley)
    if doi and url:
        host = urlparse(url).hostname or ""
        if "onlinelibrary.wiley.com" in host:
            if try_wiley_epdf(SESSION, doi, dest_pdf):
                return

    # 4) DOI fallback (content-negotiation + landing scrape + publisher patterns)
    if doi:
        try:
            if download_pdf_from_doi(SESSION, doi, dest_pdf):
                return
        except Exception as e:
            print(f"[DOI FALLBACK ERROR] {doi} — {e}")

    # 5) Unpaywall (if not tried earlier)
    if doi and not tried_unpaywall_early:
        for cand in get_unpaywall_candidate_pdfs(SESSION, doi):
            domain_sleep(cand)
            try:
                rpdf = SESSION.get(cand, timeout=30, stream=True, allow_redirects=True)
                if save_and_validate(rpdf, dest_pdf):
                    print(f"[UNPAYWALL LOCATION SAVED] {dest_pdf}")
                    return
                if "html" in (rpdf.headers.get("Content-Type") or "").lower():
                    links = _resolve_pdf_links_from_html(rpdf.text, rpdf.url)
                    for l in links:
                        domain_sleep(l)
                        try:
                            if _download_pdf_with_headers(SESSION, l, dest_pdf, referer=rpdf.url):
                                print(f"[UNPAYWALL LANDING → PDF SAVED] {dest_pdf}")
                                return
                        except Exception:
                            pass
            except Exception:
                pass

    # 6) OpenAlex repository pdf_url(s) (e.g., institutional repositories).
    for cand in get_openalex_repository_pdfs(SESSION, openalex_id):
        domain_sleep(cand)
        try:
            rpdf = SESSION.get(cand, timeout=30, stream=True, allow_redirects=True)
            if save_and_validate(rpdf, dest_pdf):
                print(f"[OPENALEX REPO PDF SAVED] {dest_pdf}")
                return
        except Exception:
            pass

    # 7) Give up → log for manual
    log_manual_download(row, output_dir)

# ======= DOWNLOAD LOOP (demo-cap aware) =======
iter_df = candidates.head(DEMO_MAX) if DEMO else candidates
for _, row in iter_df.iterrows():
    download_and_save_file(row)

print(f"\n Done downloading files to: {output_dir}")

# ======= SUMMARY & POINTER FOR STEP 3 =======
saved = len([f for f in os.listdir(output_dir) if f.lower().endswith(".pdf") and is_valid_pdf(os.path.join(output_dir,f))])
manual_log = os.path.join(output_dir, "manual_download_required.txt")
manual = sum(1 for _ in open(manual_log, "r", encoding="utf-8")) if os.path.exists(manual_log) else 0
print(f"Summary: PDFs saved = {saved}, manual downloads logged = {manual}")

(DATA_DIR / "latest_download_dir.txt").write_text(str(Path(output_dir).resolve()), encoding="utf-8")

# ======= OPTIONAL: MANUAL UPLOADS PROCESSING =======
manual_uploads_dir = "manual_uploads"  # drop files manually here
os.makedirs(manual_uploads_dir, exist_ok=True)

def process_manual_uploads():
    print("\n=== Processing manual uploads ===")
    for filename in os.listdir(manual_uploads_dir):
        file_path = os.path.join(manual_uploads_dir, filename)
        name, ext = os.path.splitext(filename)
        out_pdf = os.path.join(output_dir, f"{name}.pdf")
        out_txt = os.path.join(output_dir, f"{name}.txt")

        if ext.lower() == ".pdf":
            if not os.path.exists(out_pdf):
                shutil.copy(file_path, out_pdf)
                print(f"[PDF COPIED] {filename}")
            else:
                print(f"[SKIP] Exists: {filename}")
        elif ext.lower() == ".txt":
            if not os.path.exists(out_txt):
                shutil.copy(file_path, out_txt)
                print(f"[TEXT COPIED] {filename}")
            else:
                print(f"[SKIP] Exists: {filename}")
        elif ext.lower() == ".html":
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    html = f.read()
                soup = BeautifulSoup(html, "html.parser")
                text = "\n".join(p.get_text() for p in soup.find_all("p"))
                with open(out_txt, "w", encoding="utf-8") as f_out:
                    f_out.write(text.strip())
                print(f"[HTML TEXT EXTRACTED] {filename}")
            except Exception as e:
                print(f"[ERROR] HTML extract failed {filename} — {e}")
        else:
            print(f"[IGNORED] Unsupported type: {filename}")

process_manual_uploads()


[AUTO] Using latest citing CSV: C:\Users\Karthik\Desktop\MSc Project\data\citing_papers_20250913_140840.csv
Rows in CSV: 34 | candidates: 32  (OA+URL=29, DOI-only=2)
[PDF SAVED via DOI landing scrape] downloaded_files_2025-09-13_14-09-30\Analogy Powered by Prediction and Structural Invariants_ Computationally Led Discovery of a Mesoporous Hydrogen-Bonded O.pdf
[PDF SAVED] downloaded_files_2025-09-13_14-09-30\Average minimum distances of periodic point sets – foundational invariants for mapping periodic crystals.pdf
[PDF SAVED] downloaded_files_2025-09-13_14-09-30\Recognizing Rigid Patterns of Unlabeled Point Clouds by Complete and Continuous Isometry Invariants with no False Negati.pdf
[UNPAYWALL LOCATION SAVED] downloaded_files_2025-09-13_14-09-30\Material Property Prediction Using Graphs Based on Generically Complete Isometry Invariants.pdf
[UNPAYWALL LANDING → PDF SAVED] downloaded_files_2025-09-13_14-09-30\Generic families of finite metric spaces with identical or trivial 1-dimensi

In [9]:
# Health check: GROBID should return HTTP 200
import requests
GROBID_URL = "http://localhost:8070"
r = requests.get(f"{GROBID_URL}/api/isalive", timeout=5)
print("Status:", r.status_code, "| Body:", r.text[:80])

Status: 200 | Body: true


In [11]:
# === CONFIG: PDF → TEI pipeline (auto-pick latest; pointer-aware) ===
import json
from pathlib import Path
from datetime import datetime
import requests

# Paths/pointers
DATA_DIR = Path(r"C:\Users\Karthik\Desktop\MSc Project\data")
POINTER_DL = DATA_DIR / "latest_download_dir.txt"   # written by Step 2

# 1) Find the PDF folder created in Cell  2 (using a pointer file; otherwise guessing sensibly).
PDF_DIR = None
if POINTER_DL.exists():
    try:
        hinted = Path(POINTER_DL.read_text(encoding="utf-8").strip())
        if hinted.exists():
            PDF_DIR = hinted
            print(f"[AUTO] Using PDF dir from pointer: {PDF_DIR}")
    except Exception:
        pass

if PDF_DIR is None:
    candidates = sorted(Path.cwd().glob("downloaded_files_*"),
                        key=lambda p: p.stat().st_mtime, reverse=True)
    if not candidates:
        vs_code_dir = Path(r"C:\Users\Karthik\Documents\VS Code")
        if vs_code_dir.exists():
            candidates = sorted(vs_code_dir.glob("downloaded_files_*"),
                                key=lambda p: p.stat().st_mtime, reverse=True)
    PDF_DIR = candidates[0] if candidates else Path(r"C:\Users\Karthik\Documents\VS Code\downloaded_files_2025-08-18_03-04-47")
    print(f"[AUTO] Using PDF dir (glob/fallback): {PDF_DIR}")

assert PDF_DIR.exists(), f"PDF_DIR not found: {PDF_DIR}"

# 2) Outputs (Create output folder, timestamped to avoid overwriting) + pointer for Step 4
OUT_DIR = PDF_DIR / f"grobid_outputs_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
TEI_DIR = OUT_DIR / "tei"
TEI_DIR.mkdir(parents=True, exist_ok=True)
print(f"[OUT] TEI dir: {TEI_DIR}")
(DATA_DIR / "latest_tei_dir.txt").write_text(str(TEI_DIR.resolve()), encoding="utf-8")

# 3) GROBID server health check
GROBID_URL = "http://localhost:8070"
try:
    alive = requests.get(f"{GROBID_URL}/api/isalive", timeout=5).text.strip().lower()
    print("GROBID alive:", alive)
except Exception as e:
    raise SystemExit(f"GROBID not reachable at {GROBID_URL}. Start it, then re-run. ({e})")

# 4) Auto-load TARGET from Step 1 (with fallback)
target_path = DATA_DIR / "target_work.json"
if target_path.exists():
    T = json.loads(target_path.read_text(encoding="utf-8"))
    TARGET = {
        "doi": T.get("doi"),
        "title": T.get("title"),
        "authors": [T.get("first_author","")],
        "year": T.get("year"),
    }
    print(f"[AUTO] Loaded TARGET: {TARGET['title']}  (DOI={TARGET['doi']})")
else:
    TARGET = {
        "doi": "10.1007/978-3-319-08434-3_7",
        "title": "Detecting unknots via equational reasoning, I: Exploration",
        "authors": ["Andrew Fish", "Alexei Lisitsa"],
        "year": 2014,
    }
    print("[FALLBACK TARGET] Using hardcoded Lisitsa paper.")

# 5) Quick preview of PDFs detected
pdfs = sorted([p for p in PDF_DIR.glob("*.pdf") if p.stat().st_size > 0])
print(f"PDFs found: {len(pdfs)}")
if not pdfs:
    print(" No PDFs in this folder. Did Step 2 finish? Check pointer or directory.")
for p in pdfs[:8]:
    print(" -", p.name)

[AUTO] Using PDF dir from pointer: C:\Users\Karthik\Desktop\downloaded_files_2025-09-13_08-42-10
[OUT] TEI dir: C:\Users\Karthik\Desktop\downloaded_files_2025-09-13_08-42-10\grobid_outputs_2025-09-13_08-52-18\tei
GROBID alive: true
[AUTO] Loaded TARGET: An Isometry Classification of Periodic Point Sets  (DOI=10.1007/978-3-030-76657-3_16)
PDFs found: 25
 - A continuous map of 2.6+ million 2D lattices from the Cambridge Structural Database.pdf
 - A unique and continuous code of all periodic crystals.pdf
 - Analogy Powered by Prediction and Structural Invariants_ Computationally Led Discovery of a Mesoporous Hydrogen-Bonded O.pdf
 - Average minimum distances of periodic point sets – foundational invariants for mapping periodic crystals.pdf
 - Bounds for the Regularity Radius of Delone Sets.pdf
 - Continuous chiral distances for two‐dimensional lattices.pdf
 - Continuous Invariant-Based Maps of the Cambridge Structural Database.pdf
 - Continuous maps of molecules and atomic clouds in large

In [12]:
# === PDFs → TEI via GROBID ===
# This cell converts *all* detected PDFs into TEI XML using a locally running GROBID server.
# It assumes the previous cell already defined:
#   - `pdfs`   : a list of Path objects pointing to PDFs to process
#   - `TEI_DIR`: a Path to the output directory where TEI files should be written
#   - `GROBID_URL`: the base URL of your local GROBID service, e.g., "http://localhost:8070"
#
# Flow:
#   1) Prepare a per-file converter function `pdf_to_tei(...)`.
#   2) Build the list of PDFs to run (all non-empty PDFs).
#   3) Convert each PDF → TEI, skipping ones already converted.
#   4) Print a summary and log failures to a text file.

import time, requests
from pathlib import Path
from requests.exceptions import RequestException

# Process all PDFs; set to an integer only if ever want to cap the count.
# Keeping a small pause between files avoids overwhelming GROBID on large batches.
MAX_PDFS = None
PER_FILE_PAUSE = 0.3  # keep a short pause so GROBID isn't hammered; set 0 if you want

def pdf_to_tei(
    pdf_path: Path,
    out_path: Path,
    base_url: str = GROBID_URL,
    consolidate_citations: int = 1,
    retry: int = 2,
    sleep_s: float = 1.5,
) -> bool:
    """
    Convert a single PDF to TEI using a running GROBID service.

    This function sends the PDF to GROBID's /api/processFulltextDocument endpoint and,
    on success, writes the returned TEI XML to `out_path`. It includes basic robustness:
    - small retry loop on network or non-5xx issues,
    - early bail-out on 5xx (server) errors,
    - simple validation that the response contains a <TEI tag.

    Args:
        pdf_path (Path): Absolute or relative path to the input PDF file.
        out_path (Path): Path where the resulting TEI XML should be saved.
        base_url (str): Base URL of the GROBID server (e.g., "http://localhost:8070").
        consolidate_citations (int): GROBID option for citation normalization (1 is a solid default).
        retry (int): Number of retries after the initial attempt (total attempts = retry + 1).
        sleep_s (float): Base seconds to sleep between retries. Increases slightly each attempt.

    Returns:
        bool: True if TEI was received (HTTP 200 and contains '<TEI') and saved; False otherwise.

    """
    # Fulltext conversion endpoint exposed by GROBID
    url = f"{base_url}/api/processFulltextDocument"

    # Attempt the request up to (retry + 1) times
    for attempt in range(retry + 1):
        try:
            # Send the PDF as multipart form-data with expected field name "input"
            with open(pdf_path, "rb") as f:
                files = {"input": (pdf_path.name, f, "application/pdf")}
                data = {
                    "consolidateCitations": consolidate_citations,
                    "includeRawCitations": 0,
                    "includeRawAffiliations": 0,
                    "teiCoordinates": "ref,biblStruct",
                }
                # Generous timeout for large or complex PDFs
                r = requests.post(url, files=files, data=data, timeout=120)

            # Read response body as text (TEI is XML text). Strip BOM/leading whitespace if present.
            txt = (r.text or "")
            txt_l = txt.lstrip("\ufeff \t\r\n")  # trim BOM/whitespace

            # Minimal check: valid TEI should contain a '<TEI' element somewhere.
            # (We don't enforce XML prolog since some servers omit it.)
            looks_like_tei = ("<TEI" in txt_l)

            # Success path: HTTP 200 + TEI detected → write to disk and return True
            if r.status_code == 200 and looks_like_tei:
                out_path.write_text(txt, encoding="utf-8")
                return True

            # Prepare a short snippet for logging/warnings (first ~140 chars, single line)
            snippet = txt_l[:140].replace("\n", " ")

            # For 5xx server errors, retries rarely help; log a warning and break out early.
            if r.status_code >= 500:
                print(f"[WARN] {pdf_path.name}: HTTP {r.status_code} — {snippet}…")
                break

            # For other statuses (e.g., 4xx or odd HTML), warn and allow the retry loop to continue.
            print(f"[WARN] {pdf_path.name}: HTTP {r.status_code} — {snippet}…")

        except RequestException as e:
            # Network-level errors (timeouts, connection issues) get logged; we may retry.
            print(f"[ERR]  {pdf_path.name}: {e}")

        # Backoff before the next attempt (if any remain). Increases a bit each time.
        if attempt < retry:
            time.sleep(sleep_s * (1 + attempt))

    # If we exit the loop without returning True, the conversion failed.
    return False

# Build the list of PDFs to run. We skip zero-byte files (they're not real PDFs).
# With MAX_PDFS=None we do NOT apply any cap: all PDFs will be considered.
pdfs_to_run = [p for p in pdfs if p.stat().st_size > 0]
print("Converting:", len(pdfs_to_run), "files")

ok = bad = 0
fails = []

# Process each PDF → TEI.
for pdf in pdfs_to_run:
    # Output TEI path mirrors the PDF name but ends with ".tei.xml"
    tei_path = TEI_DIR / (pdf.stem + ".tei.xml")

    # If TEI already exists and is non-empty, skip to avoid rework on re-runs.
    if tei_path.exists() and tei_path.stat().st_size > 0:
        print(f"[SKIP] {tei_path.name}")
        ok += 1
        continue

    # Attempt conversion and log outcome.
    if pdf_to_tei(pdf, tei_path):
        print(f"[OK]   {pdf.name} → {tei_path.name}")
        ok += 1
    else:
        print(f"[FAIL] {pdf.name}")
        bad += 1
        fails.append(pdf.name)

    # Small pause between files to keep GROBID responsive.
    time.sleep(PER_FILE_PAUSE)

# Print a concise summary and write failures (if any) to a text file for easy re-tries later.
print(f"\nTEI done. ok={ok}, failed={bad}. Out: {TEI_DIR}")
if fails:
    fail_log = TEI_DIR.parent / "grobid_failures.txt"
    fail_log.write_text("\n".join(fails), encoding="utf-8")
    print(f"[LOG] Failure list → {fail_log}")


Converting: 25 files
[OK]   A continuous map of 2.6+ million 2D lattices from the Cambridge Structural Database.pdf → A continuous map of 2.6+ million 2D lattices from the Cambridge Structural Database.tei.xml
[OK]   A unique and continuous code of all periodic crystals.pdf → A unique and continuous code of all periodic crystals.tei.xml
[OK]   Analogy Powered by Prediction and Structural Invariants_ Computationally Led Discovery of a Mesoporous Hydrogen-Bonded O.pdf → Analogy Powered by Prediction and Structural Invariants_ Computationally Led Discovery of a Mesoporous Hydrogen-Bonded O.tei.xml
[OK]   Average minimum distances of periodic point sets – foundational invariants for mapping periodic crystals.pdf → Average minimum distances of periodic point sets – foundational invariants for mapping periodic crystals.tei.xml
[OK]   Bounds for the Regularity Radius of Delone Sets.pdf → Bounds for the Regularity Radius of Delone Sets.tei.xml
[OK]   Continuous chiral distances for two‐dimensi

In [13]:
# === TEI robust matcher + counter (regex targets; any ref/ptr; arXiv-in-text; authors/year; xml:id fix) ===
# This cell parses GROBID TEI XML files, matches the correct target reference,
# counts in-text citations, and saves outputs (counts, contexts, debug info).

from pathlib import Path
from xml.etree import ElementTree as ET
import re, csv, json, difflib
from datetime import datetime

# --- Preconditions ---
assert 'TEI_DIR' in globals() and TEI_DIR.exists(), "TEI_DIR missing. Run the PDFs→TEI step first."
assert 'OUT_DIR' in globals() and OUT_DIR.exists(), "OUT_DIR missing. Run the CONFIG cell first."
assert 'TARGET' in globals(), "TARGET not defined."

XML_ID = "{http://www.w3.org/XML/1998/namespace}id"  # fully-qualified xml:id

# ---------- Helpers ----------

def _norm(s: str) -> str:
    """Normalize a string: lowercase, collapse whitespace."""
    return re.sub(r"\s+", " ", (s or "").lower()).strip()

def _fuzzy(a: str, b: str) -> float:
    """Returns similarity score between two strings (0 to 1). Used for title matching.."""
    return difflib.SequenceMatcher(None, a, b).ratio()

def _get_text(el) -> str:
    """Safely pull all visible text from an XML node (including nested).."""
    return " ".join(el.itertext()).strip() if el is not None else ""

# extract all #bXXX ids from a @target string (handles spaces, commas, semicolons, ranges)
_TARGET_ID_RE = re.compile(r"#([A-Za-z0-9_-]+)")

def extract_target_ids(targ_attr: str):
    #In TEI, in-text citations use target="#b12" (or multiple IDs like "#b3 #b12").
    # This finds all IDs referenced inside a target attribute and returns them as a list (without the #).
    
    if not targ_attr:
        return []
    return _TARGET_ID_RE.findall(targ_attr)

def collect_bibl_structs(root: ET.Element):
    """
    Collect all <biblStruct> nodes (the references).
    Returns (list_of_bibls, set_of_all_ids).
    Each bibl dict contains: id, title, authors, year, idnos, text.
    """
    
    out = []
    all_ids = set()
    for b in root.findall(".//{*}biblStruct"):
        bid = b.get(XML_ID)
        all_ids.add(bid)

        # Title (prefer analytic/monogr titles)
        tnode = (b.find(".//{*}analytic/{*}title") or
                 b.find(".//{*}monogr/{*}title") or
                 b.find(".//{*}title"))
        title = _get_text(tnode)

        # Authors (surnames only)
        authors = []
        for pers in b.findall(".//{*}author"):
            sname = pers.find(".//{*}surname")
            if sname is not None and (sname.text or "").strip():
                authors.append(sname.text.strip())

        # Year
        year = None
        d = b.find(".//{*}date")
        if d is not None:
            y = (d.get("when") or d.get("notBefore") or d.get("notAfter") or "").strip()
            m = re.search(r"(19|20)\d{2}", y)
            if m:
                year = int(m.group(0))
        if year is None:
            m = re.search(r"(19|20)\d{2}", _get_text(b))
            if m:
                year = int(m.group(0))

        # idnos (DOI, arXiv, URL, etc.)
        idnos = {}
        for idno in b.findall(".//{*}idno"):
            idtype = (idno.get("type") or "").lower()
            idnos.setdefault(idtype, set()).add(_norm(idno.text))

        out.append({
            "id": bid,
            "title": title,
            "authors": authors,
            "year": year,
            "idnos": idnos,
            "text": _get_text(b)
        })
    return out, all_ids

def _find_bnode_by_id(root: ET.Element, bid: str):
    """Find the <biblStruct> with a given xml:id. 
    Utility for scanning inside a known reference node (used for arXiv pointer checks)."""
    
    if not bid:
        return None
    for cand in root.findall(".//{*}biblStruct"):
        if cand.get(XML_ID) == bid:
            return cand
    return None

def score_bibl(bibl: dict, target: dict, root_for_attr_scan=None):
    """
    Multi-cue score:
      - DOI exact (+1.0)
      - arXiv ID present (+0.9)
      - fuzzy title (+ratio)
      - author surname overlap (+0.15 each)
      - year match (+0.1)
    """
    score = 0.0
    reasons = []

    # DOI exact
    tgt_doi = _norm(target.get("doi", ""))
    if tgt_doi and "doi" in bibl["idnos"] and tgt_doi in bibl["idnos"]["doi"]:
        score += 1.0; reasons.append("doi")

    # arXiv id detection
    arx = None
    if tgt_doi:
        m = re.search(r"arxiv\.(\d{4}\.\d{4,5})", tgt_doi, flags=re.I)
        arx = m.group(1).lower() if m else None

    if arx:
        arx_norm = arx
        has_arxiv_idno = ("arxiv" in bibl["idnos"] and any(arx_norm in x for x in bibl["idnos"]["arxiv"]))
        has_arxiv_url  = ("url" in bibl["idnos"]   and any(arx_norm in x for x in bibl["idnos"]["url"]))
        has_arxiv_ptr  = False
        if root_for_attr_scan is not None and bibl["id"]:
            bnode = _find_bnode_by_id(root_for_attr_scan, bibl["id"])
            if bnode is not None:
                for el in bnode.findall(".//{*}ptr"):
                    tgt = _norm(el.get("target", "") or "")
                    if arx_norm in tgt: has_arxiv_ptr = True; break
                if not has_arxiv_ptr:
                    for el in bnode.findall(".//{*}ref"):
                        tgt = _norm(el.get("target", "") or "")
                        if arx_norm in tgt: has_arxiv_ptr = True; break
        has_arxiv_text = bool(re.search(re.escape(arx_norm), _norm(bibl["text"])))
        if has_arxiv_idno or has_arxiv_url or has_arxiv_ptr or has_arxiv_text:
            score += 0.9; reasons.append("arxiv_id")

    # fuzzy title
    tgt_title = _norm(target.get("title", ""))
    if tgt_title and bibl["title"]:
        r = _fuzzy(tgt_title, _norm(bibl["title"]))
        if r >= 0.40:
            score += r; reasons.append(f"title_fuzzy:{r:.3f}")

    # author surnames
    tgt_authors = [a.lower() for a in (target.get("authors") or [])]
    if tgt_authors and bibl["authors"]:
        overlap = len(set(tgt_authors) & set(a.lower() for a in bibl["authors"]))
        if overlap:
            score += 0.15 * overlap; reasons.append(f"authors:{overlap}")

    # year
    if target.get("year") and bibl["year"] and int(target["year"]) == int(bibl["year"]):
        score += 0.1; reasons.append("year")

    return score, reasons

# --------- Block helpers ---------
_BLOCK_TAGS = {"p","head","note","figure","figDesc","table","cell","item","caption","s","div"}

def build_parent_map(root: ET.Element):
    """Map child->parent for all nodes."""
    mp = {}
    for parent in root.iter():
        for child in list(parent):
            mp[child] = parent
    return mp

def climb_to_block(elem, parent_map):
    """Climb from element up to nearest block (paragraph, figure, table, etc.)."""
    cur = elem
    while cur is not None:
        tag = cur.tag.split('}')[-1]
        if tag in _BLOCK_TAGS:
            return cur
        cur = parent_map.get(cur)
    return None

# --- Math-index deflation (to avoid false counts from [1] in formulas) ---
_NUMERIC_1 = re.compile(r"^\s*1\s*$")
_LETTER_CLASS = r"A-Za-z\u0370-\u03FF"
_STOPWORDS = {"in","as","of","by","we","on","at","to","if","or","and","for","with","from",
              "see","cf","eg","e.g","via","per","vs","et","al","fig","fig.","eq","eq.",
              "sec","section","theorem","lemma","prop","proof"}

def _make_markerized_block_text(parent, ref_el):
    """Return block text with markers around a specific <ref> element."""
    parts = []
    def walk(node):
        if node is ref_el:
            parts.append(">>>REF<<<")
            parts.append("".join(ref_el.itertext()) or "")
            parts.append(">>>/REF<<<")
        else:
            if node.text:
                parts.append(node.text)
            for ch in list(node):
                walk(ch)
                if ch.tail:
                    parts.append(ch.tail)
    walk(parent)
    return "".join(parts)

def _looks_like_b0_math_index(ref_el, parent_block, bibl_id: str) -> bool:
    """Return True if ref=b0 with text '1' and preceded by variable-like token."""
    if bibl_id != "b0":
        return False
    ref_txt = ("".join(ref_el.itertext()) or "").strip()
    if not _NUMERIC_1.match(ref_txt):
        return False

    s = _make_markerized_block_text(parent_block, ref_el)
    s_norm = re.sub(r"\s+", " ", s)

    pat_out = re.compile(rf"([{_LETTER_CLASS}]{{1,2}})\s*\[\s*>>>REF<<<\s*1\s*>>>/REF<<<\s*\]")
    pat_in  = re.compile(rf"([{_LETTER_CLASS}]{{1,2}})\s*>>>REF<<<\s*\[\s*1\s*\]\s*>>>/REF<<<")

    m = pat_out.search(s_norm) or pat_in.search(s_norm)
    if not m:
        return False
    token = (m.group(1) or "").lower()
    if token in _STOPWORDS:
        return False
    if len(token) <= 2:
        return True
    return False

def count_refs_and_contexts(root: ET.Element, bibl_id: str, valid_ids: set, max_ctx=20):
    """
    Count all <ref>/<ptr> pointing to bibl_id.
    Applies targeted math deflation for [1] false positives.
    """
    if not bibl_id:
        return 0, []
    parent_map = build_parent_map(root)
    kept, ctx = 0, []
    for xp in (".//{*}ref", ".//{*}ptr"):
        for el in root.findall(xp):
            ids = [i for i in extract_target_ids(el.get("target","") or "") if i in valid_ids]
            if bibl_id in ids:
                blk = climb_to_block(el, parent_map)
                if blk is not None and _looks_like_b0_math_index(el, blk, bibl_id):
                    continue
                kept += 1
                text = " ".join(blk.itertext()).strip() if blk is not None else " ".join(el.itertext()).strip()
                if text:
                    ctx.append(text)
                    if len(ctx) >= max_ctx:
                        return kept, ctx
    return kept, ctx

def quick_count(root: ET.Element, bid: str, valid_ids: set) -> int:
    """Fast count of <ref>/<ptr> for a given id (no contexts)."""
    if not bid: return 0
    n = 0
    for xp in (".//{*}ref", ".//{*}ptr"):
        for el in root.findall(xp):
            ids = [i for i in extract_target_ids(el.get("target","") or "") if i in valid_ids]
            if bid in ids: n += 1
    return n

# ---------- Run ----------
tei_files = sorted(TEI_DIR.glob("*.tei.xml"))
print("TEI files:", len(tei_files))

rows = []
match_debug = []
ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
csv_counts = OUT_DIR / f"tei_counts_robust_{ts}.csv"
jsonl_ctx  = OUT_DIR / f"tei_contexts_robust_{ts}.jsonl"
csv_dbg    = OUT_DIR / f"tei_match_debug_{ts}.csv"

with open(jsonl_ctx, "w", encoding="utf-8") as jout:
    for tei in tei_files:
        root = ET.parse(tei).getroot()
        bibls, all_ids = collect_bibl_structs(root)

        # score all candidates
        scored = []
        for b in bibls:
            s, reasons = score_bibl(b, TARGET, root_for_attr_scan=root)
            scored.append((s, reasons, b))
        scored.sort(key=lambda x: x[0], reverse=True)

        # debug: keep top 5 candidates
        for rank, (s, reasons, b) in enumerate(scored[:5], start=1):
            match_debug.append({
                "file": tei.name, "rank": rank, "bibl_id": b["id"] or "",
                "score": f"{s:.3f}", "reasons": ";".join(reasons),
                "title": b["title"], "authors": ", ".join(b["authors"]),
                "year": b["year"] or "",
                "idnos_doi": ", ".join(sorted(b["idnos"].get("doi", []))),
                "idnos_arxiv": ", ".join(sorted(b["idnos"].get("arxiv", []))),
                "idnos_url": ", ".join(sorted(b["idnos"].get("url", []))),
            })

        # select: top score, but fallback to one with actual in-text hits
        topK = scored[:8]
        if topK:
            best_s, best_reasons, best_b = topK[0]
            best_id = best_b["id"]
            best_count = quick_count(root, best_id, all_ids)

            if best_count == 0:
                for s, reasons, b in topK[1:]:
                    c = quick_count(root, b["id"], all_ids)
                    if c > best_count or (c == best_count and s > best_s):
                        best_s, best_reasons, best_b = s, reasons, b
                        best_id, best_count = b["id"], c

            accept = (best_count > 0) or (best_s >= 0.45)
            bibl_id    = best_id if (best_id and accept) else None
            matched_by = ",".join(best_reasons) if bibl_id else ""
            score_val  = float(f"{best_s:.3f}") if bibl_id else 0.0
        else:
            bibl_id = None; matched_by = ""; score_val = 0.0

        total, contexts = count_refs_and_contexts(root, bibl_id, valid_ids=all_ids)

        rows.append({
            "file": tei.name.replace(".tei.xml",""),
            "bibl_id": bibl_id or "",
            "matched_by": matched_by,
            "match_score": f"{score_val:.3f}" if score_val else "",
            "total_intext": total,
        })

        jout.write(json.dumps({
            "file": tei.name.replace(".tei.xml",""),
            "bibl_id": bibl_id, "matched_by": matched_by,
            "match_score": score_val, "total_intext": total,
            "contexts": contexts
        }, ensure_ascii=False) + "\n")

# save CSVs
with open(csv_counts, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["file","bibl_id","matched_by","match_score","total_intext"])
    w.writeheader(); w.writerows(rows)

with open(csv_dbg, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["file","rank","bibl_id","score","reasons","title","authors","year","idnos_doi","idnos_arxiv","idnos_url"])
    w.writeheader(); w.writerows(match_debug)

print(f" Robust TEI counts saved:\n- CSV: {csv_counts}\n- JSONL: {jsonl_ctx}\n- Debug: {csv_dbg}\nFiles: {len(rows)}")


TEI files: 25
 Robust TEI counts saved:
- CSV: C:\Users\Karthik\Desktop\downloaded_files_2025-09-13_08-42-10\grobid_outputs_2025-09-13_08-52-18\tei_counts_robust_2025-09-13_08-58-28.csv
- JSONL: C:\Users\Karthik\Desktop\downloaded_files_2025-09-13_08-42-10\grobid_outputs_2025-09-13_08-52-18\tei_contexts_robust_2025-09-13_08-58-28.jsonl
- Debug: C:\Users\Karthik\Desktop\downloaded_files_2025-09-13_08-42-10\grobid_outputs_2025-09-13_08-52-18\tei_match_debug_2025-09-13_08-58-28.csv
Files: 25


In [16]:
BACKEND = "ollama"
OLLAMA_MODEL = "phi3:mini"



In [None]:
# === LLM stance classification using local Ollama (supportive / rebuttal / neutral) ===
# Requirements:
#   - Ollama running locally (http://localhost:11434)
#   - Model pulled, e.g.: `ollama pull phi3:mini`  (or "llama3:8b-instruct" if your machine can handle it)
#
# What it does:
#   1) Loads latest TEI contexts + counts from OUT_DIR
#   2) Extracts citation windows (citing sentence ±1)
#   3) Calls Ollama with a strict JSON instruction
#   4) Aggregates per paper and saves CSV + JSONL
#
# Outputs:
#   - stance_llm_<timestamp>.csv
#   - stance_llm_evidence_<timestamp>.jsonl

from pathlib import Path
import os, re, json, time
from datetime import datetime
import pandas as pd
import requests

# ---------------- CONFIG ----------------
assert 'OUT_DIR' in globals() and OUT_DIR.exists(), "Set OUT_DIR first (folder with TEI outputs)."

OLLAMA_HOST  = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
OLLAMA_MODEL = "phi3:mini"        # or "llama3:8b-instruct" (bigger, stronger)
WINDOW_SIZE  = 1                  # citing sentence ±1 (0 = only the citing sentence)
ONLY_FILES_MIN_CITES = 0          # set to 2 to classify only papers with >=2 in-text mentions
MAX_WINDOWS_PER_FILE = 30         # cap per file to keep it fast; set None to disable
CALL_PAUSE_S = 0.3                # gentle pacing between calls
TIMEOUT_S    = 60                 # HTTP timeout per call

# -------------- Load latest outputs --------------
ctx_path    = sorted(OUT_DIR.glob("tei_contexts_robust_*.jsonl"))[-1]
counts_path = sorted(OUT_DIR.glob("tei_counts_robust_*.csv"))[-1]
print("Using:", ctx_path.name, "and", counts_path.name)

counts_df = pd.read_csv(counts_path)
if ONLY_FILES_MIN_CITES > 0:
    keep_files = set(counts_df.loc[counts_df["total_intext"] >= ONLY_FILES_MIN_CITES, "file"])
else:
    keep_files = set(counts_df["file"])

# -------------- Utilities --------------
def split_sentences(text: str) -> list[str]:
    """
    Minimal, regex-only sentence splitter (fixed-width lookbehind).
    Splits on [.?!] followed by space + capital or '('.
    Normalizes patterns like 'text.) Next' -> 'text. Next'
    """
    t = re.sub(r'\s+', ' ', (text or '').strip())
    if not t:
        return []
    t = re.sub(r'([.!?])\s*\)+', r'\1 ', t)
    parts = re.split(r'(?<=[.!?])\s+(?=[A-Z(])', t)
    return [s.strip() for s in parts if s.strip()]

def extract_cited_windows(paragraph: str, window: int = 1) -> list[str]:
    """
    Return sentence windows centered on a sentence that looks like it contains a citation.
    Numeric [..] or author-year (...) patterns.
    Window=1 => prev + current + next as one string.
    """
    sents = split_sentences(paragraph)
    out = []
    for i, s in enumerate(sents):
        looks_numeric = bool(re.search(r'\[\s*\d+(?:\s*[,–-]\s*\d+)*\s*\]', s))
        looks_authyr  = bool(re.search(r'\([A-Z][A-Za-z\-]+.*(?:19|20)\d{2}[a-z]?\)', s))
        if looks_numeric or looks_authyr:
            i0 = max(0, i - window); i1 = min(len(sents), i + 1 + window)
            out.append(" ".join(sents[i0:i1]))
    # de-dup and keep order
    seen, uniq = set(), []
    for w in out:
        k = re.sub(r'\s+', ' ', w.strip())
        if k and k not in seen:
            seen.add(k); uniq.append(k)
    return uniq

SYSTEM_INSTRUCTIONS = (
    "You label the attitude of a citing passage toward a TARGET paper.\n"
    "Labels:\n"
    "- SUPPORTIVE: uses/adopts/extends/validates the TARGET’s method/findings.\n"
    "- REBUTTAL: contradicts, shows error/failure, provides counterexample, disputes claims.\n"
    "- NEUTRAL: background or mention without clear positive/negative judgment.\n"
    "Rules:\n"
    "- Use ONLY the provided text window; judge stance toward the cited TARGET.\n"
    "- If ambiguous or mixed, choose NEUTRAL.\n"
    "Respond in compact JSON: {\"label\":\"supportive|rebuttal|neutral\",\"rationale\":\"<=25 words\"}."
)

def extract_json(text: str):
    """Robustly pull the first JSON object out of a model reply."""
    m = re.search(r'\{.*\}', text, flags=re.S)
    if not m:
        return None
    try:
        return json.loads(m.group(0))
    except Exception:
        return None

def call_ollama_json(text: str, model: str = OLLAMA_MODEL, host: str = OLLAMA_HOST, retries: int = 2) -> dict:
    """
    Call Ollama's local /api/chat and return a dict with keys: label, rationale.
    On any error or malformed output, returns {'label':'neutral','rationale':'fallback'}.
    """
    url = f"{host}/api/chat"
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": SYSTEM_INSTRUCTIONS},
            {"role": "user",   "content": f"Text:\n{text}"}
        ],
        "stream": False,
        "options": {"temperature": 0.0}
    }
    for r in range(retries + 1):
        try:
            resp = requests.post(url, json=payload, timeout=TIMEOUT_S)
            resp.raise_for_status()
            content = resp.json().get("message", {}).get("content", "")
            data = extract_json(content) or {}
            label = str(data.get("label", "")).strip().lower()
            rationale = str(data.get("rationale", "")).strip()
            if label not in {"supportive", "rebuttal", "neutral"}:
                label = "neutral"
            return {"label": label, "rationale": rationale}
        except Exception:
            if r == retries:
                return {"label": "neutral", "rationale": "fallback (error)"}
            time.sleep(0.8 * (r + 1))

def assign_paper_label(group: pd.DataFrame) -> pd.Series:
    """
    Aggregate window-level labels into a paper-level label with a simple margin rule.
    """
    sup = int((group["label"] == "supportive").sum())
    reb = int((group["label"] == "rebuttal").sum())
    neu = int((group["label"] == "neutral").sum())
    if reb >= sup + 1:
        stance = "rebuttal"
    elif sup >= reb + 1:
        stance = "supportive"
    else:
        stance = "neutral"
    return pd.Series({
        "supportive_windows": sup,
        "rebuttal_windows":   reb,
        "neutral_windows":    neu,
        "stance_label":        stance,
        "evidence_support":   group.loc[group["label"]=="supportive","text"].head(3).tolist(),
        "evidence_rebuttal":  group.loc[group["label"]=="rebuttal","text"].head(3).tolist(),
    })

# -------------- Build windows from contexts --------------
windows = []  # rows: {file, text}
with open(ctx_path, encoding="utf-8") as f:
    for line in f:
        o = json.loads(line)
        file = o["file"]
        if file not in keep_files:
            continue
        paras = o.get("contexts", []) or []
        file_windows = []
        for p in paras:
            file_windows.extend(extract_cited_windows(p, window=WINDOW_SIZE))
        # cap per file (optional)
        if MAX_WINDOWS_PER_FILE is not None and len(file_windows) > MAX_WINDOWS_PER_FILE:
            file_windows = file_windows[:MAX_WINDOWS_PER_FILE]
        for w in file_windows:
            windows.append({"file": file, "text": w})

win_df = pd.DataFrame(windows).drop_duplicates()
print(f"Windows extracted: {len(win_df)} across {win_df['file'].nunique()} files")

# -------------- Classify with Ollama --------------
results = []
for i, row in win_df.iterrows():
    out = call_ollama_json(row["text"])
    results.append({"file": row["file"], "text": row["text"], **out})
    if CALL_PAUSE_S > 0:
        time.sleep(CALL_PAUSE_S)

pred_df = pd.DataFrame(results)
print("Windows classified:", len(pred_df))

# -------------- Aggregate to paper-level --------------
if len(pred_df):
    agg = pred_df.groupby("file").apply(assign_paper_label).reset_index()
else:
    # handle empty gracefully
    agg = pd.DataFrame(columns=["file","supportive_windows","rebuttal_windows","neutral_windows",
                                "stance_label","evidence_support","evidence_rebuttal"])

final = counts_df.merge(agg, on="file", how="left").fillna({
    "supportive_windows":0, "rebuttal_windows":0, "neutral_windows":0, "stance_label":"neutral"
})

# -------------- Save --------------
ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
stance_csv  = OUT_DIR / f"stance_llm_{ts}.csv"
ev_jsonl    = OUT_DIR / f"stance_llm_evidence_{ts}.jsonl"

final.to_csv(stance_csv, index=False)
with open(ev_jsonl, "w", encoding="utf-8") as f:
    if len(pred_df):
        for _, r in pred_df.groupby("file").apply(lambda g: pd.Series({
            "file": g["file"].iloc[0],
            "evidence": g[["text","label","rationale"]].to_dict(orient="records")
        })).reset_index(drop=True).iterrows():
            f.write(json.dumps(r._asdict() if hasattr(r, "_asdict") else dict(r), ensure_ascii=False) + "\n")

print(f"Saved:\n- CSV:   {stance_csv}\n- JSONL: {ev_jsonl}")

Using: tei_contexts_robust_2025-09-13_08-58-28.jsonl and tei_counts_robust_2025-09-13_08-58-28.csv
Windows extracted: 46 across 22 files
Windows classified: 46
✅ Saved:
- CSV:   C:\Users\Karthik\Desktop\downloaded_files_2025-09-13_08-42-10\grobid_outputs_2025-09-13_08-52-18\stance_llm_2025-09-13_09-21-39.csv
- JSONL: C:\Users\Karthik\Desktop\downloaded_files_2025-09-13_08-42-10\grobid_outputs_2025-09-13_08-52-18\stance_llm_evidence_2025-09-13_09-21-39.jsonl


  agg = pred_df.groupby("file").apply(assign_paper_label).reset_index()
  for _, r in pred_df.groupby("file").apply(lambda g: pd.Series({
