In [None]:
# Step 1 PMID_to_PMCID_mapping
import requests
import time

def read_pmids_from_file(input_file):
    """
    Read a list of PMIDs from a text file.

    The identifiers may be separated by commas, whitespace (spaces, tabs,
    newlines), or any mix of the two, so both ``1,2,3`` and a
    one-PMID-per-line file are accepted. Empty tokens are discarded.

    Args:
        input_file: Path of the text file to read.

    Returns:
        A list of PMID strings in file order. An empty list is returned
        (after printing a message) when the file is missing or unreadable.
    """
    import re  # local import: only this function needs it

    try:
        # 'utf-8-sig' transparently drops a leading byte-order mark, which
        # would otherwise end up attached to the first PMID.
        with open(input_file, 'r', encoding='utf-8-sig') as file:
            data = file.read()
    except FileNotFoundError:
        print(f"Error: The file '{input_file}' does not exist.")
        return []
    except Exception as e:
        # Deliberately broad: reading the input is best-effort and the caller
        # treats an empty list as "nothing to do".
        print(f"An error occurred while reading the file: {e}")
        return []

    # Split on any run of commas and/or whitespace; drop empty leftovers
    # produced by leading/trailing separators.
    return [token for token in re.split(r"[,\s]+", data) if token]

def fetch_pmc_metadata(pmcid):
    """
    Download the Europe PMC full-text XML for a single PMCID.

    Returns the response body as text when the server answers HTTP 200.
    For any other status, or if anything raises while making the request,
    a diagnostic line is printed and None is returned.
    """
    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
    try:
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=10,
        )
        # Guard clause: anything but 200 is reported and treated as "no data".
        if response.status_code != 200:
            print(f"[{pmcid}] Metadata not found or inaccessible (HTTP {response.status_code})")
            return None
        return response.text
    except Exception as e:
        print(f"[{pmcid}] Error fetching metadata: {e}")
        return None

def validate_pmc_matches_pmid(pmcid, pmid):
    """
    Check that the Europe PMC full text for ``pmcid`` belongs to ``pmid``.

    The check prefers the article's own identifier element
    (``<article-id pub-id-type="pmid">...</article-id>``). When the XML
    declares at least one such element, the PMID must equal one of them.
    When the XML declares none, the function falls back to looking for the
    PMID as a whole number (not embedded in a longer run of digits) anywhere
    in the text, so "123" no longer matches "PMC1234567".

    Args:
        pmcid: PMC identifier passed to ``fetch_pmc_metadata``.
        pmid: PubMed identifier to look for; converted with ``str()``.

    Returns:
        True if the PMID is confirmed by the XML, False otherwise (including
        when the PMID is blank or the XML could not be fetched).
    """
    import re  # local import: only this function needs it

    target = str(pmid).strip()
    if not target:
        # An empty string is a substring of everything; never accept it.
        return False

    xml_text = fetch_pmc_metadata(pmcid)
    if not xml_text:
        return False

    # PMIDs the article declares for itself. Citations in the reference list
    # use <pub-id>, not <article-id>, so they are not picked up here.
    declared_pmids = re.findall(
        r"<article-id\b[^>]*\bpub-id-type\s*=\s*[\"']pmid[\"'][^>]*>"
        r"\s*([^<]*?)\s*</article-id>",
        xml_text,
        flags=re.IGNORECASE,
    )
    if declared_pmids:
        return target in declared_pmids

    # Fallback: no self-declared PMID, so accept a digit-bounded occurrence.
    whole_number = rf"(?<!\d){re.escape(target)}(?!\d)"
    return re.search(whole_number, xml_text) is not None

def convert_pmid_to_pmcid(pmids):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
    params = {
        "dbfrom": "pubmed",
        "db": "pmc",
        "retmode": "json",
        "linkname": "pubmed_pmc",
    }

    pmcid_map = {}

    for pmid in pmids:
        print(f"Processing PMID: {pmid}...")
        params["id"] = pmid
        valid_pmcids = []

        for attempt in range(3):
            try:
                response = requests.get(base_url, params=params, timeout=10)
                if response.status_code == 200:
                    data = response.json()
                    linksets = data.get("linksets", [])
                    if linksets and "linksetdbs" in linksets[0]:
                        for linksetdb in linksets[0]["linksetdbs"]:
                            if linksetdb.get("linkname") == "pubmed_pmc":
                                pmc_ids = linksetdb.get("links", [])
                                for pmc in pmc_ids:
                                    pmc_full = f"PMC{pmc}"
                                    print(f"Validating PMCID {pmc_full} for PMID {pmid} ...")
                                    if validate_pmc_matches_pmid(pmc_full, pmid):
                                        print(f"Validated: {pmc_full} matches PMID {pmid}")
                                        valid_pmcids.append(pmc_full)
                                    else:
                                        print(f"Rejected: {pmc_full} does not match PMID {pmid}")
                                break
                        pmcid_map[pmid] = valid_pmcids if valid_pmcids else None
                    else:
                        pmcid_map[pmid] = None
                else:
                    print(f"Failed to retrieve data for PMID {pmid} (HTTP {response.status_code})")
                    pmcid_map[pmid] = None
                break
            except requests.exceptions.RequestException as e:
                print(f"Error fetching data for PMID {pmid}: {e}")
                time.sleep(2)
        else:
            pmcid_map[pmid] = None

    return pmcid_map


if __name__ == "__main__":
    # Script entry point: read PMIDs, resolve them to PMCIDs, and write a
    # two-column tab-separated mapping file.
    input_file = r"C:/Users/Bart Westerman XPS/AppData/Local/Programs/Python/Python36-32/GBM1000PMIDs.txt"
    output_file = r"C:/Users/Bart Westerman XPS/AppData/Local/Programs/Python/Python36-32/pmid_to_pmcid_mapping.txt"

    pmids = read_pmids_from_file(input_file)

    if not pmids:
        print("No PMIDs found in the input file.")
    else:
        print(f"Found {len(pmids)} PMIDs in the file.")
        result = convert_pmid_to_pmcid(pmids)

        with open(output_file, 'w') as outfile:
            # Header row, then one row per PMID; PMIDs without a validated
            # PMCID are written with the literal text "None".
            outfile.write("PMID\tPMCIDs\n")
            for pmid, pmcids in result.items():
                outfile.write(
                    f"{pmid}\t{','.join(pmcids)}\n" if pmcids else f"{pmid}\tNone\n"
                )

        print(f"PMID to PMCID mapping saved to '{output_file}'.")
