# In [None]:
# Step 2: Text cleaning. Writes a cleaned extracted-text file for each PMCID, named with the PMCID and drug name.
#!pip install pandas fuzzywuzzy python-Levenshtein transformers torch PyMuPDF
import os
import re
import csv
import requests
import pandas as pd
from xml.etree import ElementTree as ET
from fuzzywuzzy import fuzz
import fitz  # PyMuPDF for PDF extraction
import requests

# ------------------- File Paths ------------------- #
# Tab-separated mapping file read by main(); the "PMCID" and "Drug Synonyms"
# columns are the ones the script accesses.
pmcid_to_drug_file = r"C:/Users/Bart Westerman XPS/AppData/Local/Programs/Python/Python36-32/pmcid_to_drug_mapping.txt"
# Root folder that main() searches for a "PMC<pmcid>" sub-folder containing
# .nxml and/or .pdf files.
raw_text_dir = r"C:/Users/Bart Westerman XPS/AppData/Local/Programs/Python/Python36-32/PMC_Raw_Texts/"
# CSV written at the end of main() with one row per processed PMCID.
output_result_file = r"C:/Users/Bart Westerman XPS/AppData/Local/Programs/Python/Python36-32/drug_synonym_search_results.csv"
# Folder that receives the extracted text files and any PDFs downloaded by
# the Europe PMC fallback.
output_text_folder = r"C:/Users/Bart Westerman XPS/AppData/Local/Programs/Python/Python36-32/extracted_text/"

# Dummy search function placeholder
def search_for_terms_in_text(text, synonyms, threshold=80):
    """Return the lines of *text* that fuzzily match any of *synonyms*.

    Every synonym is compared, case-insensitively, against every line of
    *text* with ``fuzz.partial_ratio``; a line is reported (stripped of
    surrounding whitespace) each time its score reaches *threshold*.

    Results are grouped by synonym, in the order the synonyms were given,
    and a line matching several synonyms appears once per matching synonym.
    """
    def is_hit(term, candidate):
        # Both sides are lower-cased so the comparison ignores case.
        return fuzz.partial_ratio(term.lower(), candidate.lower()) >= threshold

    return [
        candidate.strip()
        for term in synonyms
        for candidate in text.splitlines()
        if is_hit(term, candidate)
    ]

# ------------------- PubMed XML Cleaning ------------------- #
def _element_text(elem):
    """Return all text inside *elem*, including text nested in child tags."""
    # elem.text alone stops at the first child element, which would drop
    # everything after inline markup such as <italic> or <xref>.
    return "".join(elem.itertext()).strip()


def _section_paragraph_text(section):
    """Join the text of every outermost <p> below *section* with spaces."""
    paragraphs = section.findall(".//p")
    # A <p> nested inside another <p> (e.g. within a list item) is already
    # covered by its ancestor's itertext(); skip it to avoid duplicated text.
    nested_ids = {
        id(inner)
        for outer in paragraphs
        for inner in outer.iter("p")
        if inner is not outer
    }
    texts = (_element_text(p) for p in paragraphs if id(p) not in nested_ids)
    return " ".join(t for t in texts if t)


def clean_pubmed_xml(file_path, output_folder, pmcid, drug_name):
    """Extract title, abstract and body text from an article XML file.

    The readable text is written to
    ``<output_folder>/extracted_text_<pmcid>_<drug_name>.txt`` (spaces and
    characters that are illegal in Windows file names are replaced by
    underscores in the drug name) and also returned.

    Args:
        file_path: Path of the XML (.nxml) file to parse.
        output_folder: Existing folder that receives the text file.
        pmcid: Identifier used in the output file name.
        drug_name: Drug name (string) used in the output file name.

    Returns:
        The extracted text. On any failure nothing is raised; instead a
        string starting with ``"Error processing file"`` is returned, which
        callers use as the failure marker.
    """
    print(f"Cleaning XML data from: {file_path}...")
    try:
        root = ET.parse(file_path).getroot()
        text_sections = []

        title = root.find(".//article-title")
        if title is not None:
            title_text = _element_text(title)
            if title_text:
                text_sections.append(f"Title: {title_text}")

        abstract = root.find(".//abstract")
        if abstract is not None:
            text_sections.append(f"Abstract:\n{_section_paragraph_text(abstract)}")

        body = root.find(".//body")
        if body is not None:
            text_sections.append(f"Body:\n{_section_paragraph_text(body)}")

        readable_text = "\n\n".join(text_sections)

        safe_drug = re.sub(r'[\\/*?:"<>|]', "_", drug_name.strip().replace(" ", "_"))
        output_file_path = os.path.join(output_folder, f"extracted_text_{pmcid}_{safe_drug}.txt")

        with open(output_file_path, 'w', encoding='utf-8') as f:
            f.write(readable_text)
        print(f"Extracted text saved to: {output_file_path}")
        return readable_text
    except Exception as e:
        # Best-effort by design: the caller inspects the returned string
        # rather than handling exceptions.
        print(f"Error processing file {file_path}: {e}")
        return f"Error processing file: {e}"

# ------------------- PDF Text Extraction ------------------- #
def extract_text_from_pdf(file_path, output_folder, pmcid, drug_name):
    """Extract the text of every page of a PDF and save it to a text file.

    The text is written to
    ``<output_folder>/extracted_text_<pmcid>_<drug_name>.txt`` (spaces and
    characters that are illegal in Windows file names are replaced by
    underscores in the drug name) and also returned.

    Args:
        file_path: Path of the PDF file to read.
        output_folder: Existing folder that receives the text file.
        pmcid: Identifier used in the output file name.
        drug_name: Drug name (string) used in the output file name.

    Returns:
        The extracted text. On any failure nothing is raised; instead a
        string starting with ``"Error processing file"`` is returned, which
        callers use as the failure marker.
    """
    print(f"Extracting text from PDF: {file_path}...")
    try:
        doc = fitz.open(file_path)
        try:
            # join() avoids repeated string concatenation on long documents.
            text = "".join(page.get_text() for page in doc)
        finally:
            # Always release the file handle; an open document keeps the
            # PDF locked on Windows.
            doc.close()

        safe_drug = re.sub(r'[\\/*?:"<>|]', "_", drug_name.strip().replace(" ", "_"))
        output_file_path = os.path.join(output_folder, f"extracted_text_{pmcid}_{safe_drug}.txt")

        with open(output_file_path, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f"Extracted text from PDF saved to: {output_file_path}")
        return text
    except Exception as e:
        # Best-effort by design: the caller inspects the returned string
        # rather than handling exceptions.
        print(f"Error processing PDF file {file_path}: {e}")
        return f"Error processing file: {e}"

# --- Added fallback: Download PDF from Europe PMC backend ---

def download_pdf_from_europepmc(pmcid):
    """Download the PDF of an article from the Europe PMC render backend.

    The PDF is saved as ``<output_text_folder>/<pmcid>_downloaded.pdf``.
    If a module-level ``pdf_backup_root`` is defined and non-empty, a second
    copy is written to ``<pdf_backup_root>/PMC<pmcid>/PMC<pmcid>/``; a
    missing setting or a failure while writing that copy does not affect
    the result.

    Args:
        pmcid: PMC identifier without the ``PMC`` prefix (the prefix is
            added when building the URL).

    Returns:
        Path of the downloaded PDF, or ``None`` when the request fails, the
        response is not a PDF, or the file cannot be written. Never raises.
    """
    url = f"https://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC{pmcid}&blobtype=pdf"
    headers = {"User-Agent": "Mozilla/5.0"}
    print(f"[PMC{pmcid}] Attempting to download PDF from Europe PMC backend...")

    try:
        response = requests.get(url, headers=headers, timeout=20)
        content_type = response.headers.get('Content-Type', '')
        if response.status_code != 200 or not content_type.startswith('application/pdf'):
            print(f"[PMC{pmcid}] PDF not available or wrong content type. Status: {response.status_code}")
            return None

        # Save to default output folder
        filename = os.path.join(output_text_folder, f"{pmcid}_downloaded.pdf")
        with open(filename, "wb") as f:
            f.write(response.content)
        print(f"[PMC{pmcid}] PDF downloaded successfully to {filename}")
    except Exception as e:
        print(f"[PMC{pmcid}] Error downloading PDF: {e}")
        return None

    # The backup copy is optional. Previously an undefined pdf_backup_root
    # raised NameError here, which discarded an otherwise successful
    # download; now the setting is looked up defensively and any failure
    # while writing the backup is only reported.
    backup_root = globals().get("pdf_backup_root")
    if backup_root:
        try:
            pmc_str = f"PMC{pmcid}"
            dest_folder = os.path.join(backup_root, pmc_str, pmc_str)
            os.makedirs(dest_folder, exist_ok=True)
            dest_path = os.path.join(dest_folder, f"{pmcid}_downloaded.pdf")
            with open(dest_path, "wb") as f:
                f.write(response.content)
            print(f"[PMC{pmcid}] PDF also saved to backup folder: {dest_path}")
        except Exception as e:
            print(f"[PMC{pmcid}] Could not save backup copy: {e}")

    return filename

# ------------------- Main Script ------------------- #
def _is_extraction_error(text):
    """Return True if *text* is the failure marker returned by the extractors."""
    # The extractors return "Error processing file: ..." on failure. A prefix
    # check avoids rejecting a real article that merely contains that phrase.
    return text.startswith("Error processing file")


def _find_local_article_files(folder_path):
    """Return ``(nxml_files, pdf_files)`` found anywhere below *folder_path*."""
    nxml_files = []
    pdf_files = []
    for root_dir, _dirs, files in os.walk(folder_path):
        for file_name in files:
            # Compare case-insensitively so ".PDF" / ".NXML" are found too.
            lowered = file_name.lower()
            if lowered.endswith('.nxml'):
                nxml_files.append(os.path.join(root_dir, file_name))
            elif lowered.endswith('.pdf'):
                pdf_files.append(os.path.join(root_dir, file_name))
    return nxml_files, pdf_files


def main():
    """Extract article text for every PMCID in the mapping file.

    For each row of the PMCID-to-drug mapping the script looks for a local
    ``PMC<pmcid>`` folder and extracts text from its NXML files, or from its
    PDF files when there is no NXML, or from a PDF downloaded from Europe PMC
    when there are no local files at all. One summary row per PMCID is
    written to ``output_result_file``.

    NOTE(review): the extractors name their output file after the PMCID and
    drug field only, so several source files for the same PMCID overwrite
    each other's text file.
    """
    print("Starting the script...")
    result_data = []

    # Ensure output folder exists
    os.makedirs(output_text_folder, exist_ok=True)

    # Step 1: Load PMCID to Drug Mapping Data
    print("Loading PMCID to Drug Mapping file...")
    try:
        pmcid_to_drug_df = pd.read_csv(pmcid_to_drug_file, sep='\t', encoding='ISO-8859-1')
        print("✅ Successfully loaded PMCID to Drug Mapping file.")
    except Exception as e:
        print(f"❌ Error loading PMCID to Drug Mapping file: {e}")
        return

    missing_columns = [
        column for column in ("PMCID", "Drug Synonyms")
        if column not in pmcid_to_drug_df.columns
    ]
    if missing_columns:
        print(f"❌ Error loading PMCID to Drug Mapping file: missing column(s) {missing_columns}")
        return

    # Clean PMCID column. Missing values must be dropped BEFORE the string
    # conversion: astype(str) turns NaN into the string "nan", after which
    # notna() can no longer detect it.
    pmcid_to_drug_df = pmcid_to_drug_df[pmcid_to_drug_df["PMCID"].notna()].copy()
    pmcid_to_drug_df["PMCID"] = pmcid_to_drug_df["PMCID"].astype(str).str.split(".").str[0]
    pmcid_to_drug_df = pmcid_to_drug_df[pmcid_to_drug_df["PMCID"] != "0"]

    # Step 2: Process each PMCID
    print("Processing each PMCID...")
    for _, row in pmcid_to_drug_df.iterrows():
        pmcid = row['PMCID']
        drug_name_field = row['Drug Synonyms']
        if isinstance(drug_name_field, str):
            drug_synonyms = [syn.strip() for syn in drug_name_field.split(',')]
            drug_label = drug_name_field
        else:
            # Missing drug field (e.g. NaN): the extractors call .strip() on
            # the name, so give them a real string instead of failing.
            drug_synonyms = []
            drug_label = "unknown"

        print(f"🔍 Processing PMCID{pmcid} for drug(s): {drug_name_field}")

        folder_path = os.path.join(raw_text_dir, f"PMC{pmcid}")
        has_local_folder = os.path.isdir(folder_path)
        if has_local_folder:
            nxml_files, pdf_files = _find_local_article_files(folder_path)
        else:
            print(f"⚠️ Raw text folder not found for {pmcid}. Skipping local files extraction.")
            nxml_files, pdf_files = [], []

        extracted_texts = []

        # Try NXML extraction first if available
        if nxml_files:
            for file_path in nxml_files:
                print(f"Processing NXML file: {file_path}")
                cleaned_text = clean_pubmed_xml(file_path, output_text_folder, pmcid, drug_label)
                if not _is_extraction_error(cleaned_text):
                    extracted_texts.append(cleaned_text)

        # If no NXML files, try PDF extraction from local PDF files
        elif pdf_files:
            for file_path in pdf_files:
                print(f"Processing PDF file: {file_path}")
                extracted_text = extract_text_from_pdf(file_path, output_text_folder, pmcid, drug_label)
                if not _is_extraction_error(extracted_text):
                    extracted_texts.append(extracted_text)

        # --- Added fallback: If no local files, try downloading PDF from Europe PMC and extract ---
        else:
            print(f"No NXML or PDF files found locally for PMCID {pmcid}. Trying Europe PMC PDF download fallback...")
            pdf_path = download_pdf_from_europepmc(pmcid)
            if pdf_path:
                extracted_text = extract_text_from_pdf(pdf_path, output_text_folder, pmcid, drug_label)
                if not _is_extraction_error(extracted_text):
                    extracted_texts.append(extracted_text)
                else:
                    print(f"[PMC{pmcid}] Extraction from downloaded PDF failed.")
            else:
                print(f"[PMC{pmcid}] PDF download fallback failed. No text extracted.")

        combined_text = "\n\n".join(extracted_texts)

        # Search for drug mentions
        # NOTE(review): the matching lines are only counted here; they are
        # not persisted in the results CSV. Confirm whether they should be.
        efficacy_results = search_for_terms_in_text(combined_text, drug_synonyms, threshold=80)
        print(f"[PMC{pmcid}] {len(efficacy_results)} line(s) matched the drug synonyms.")

        result_data.append({
            'PMCID': pmcid,
            'DrugName': drug_name_field,
            'Drug Synonym': ', '.join(drug_synonyms),
            'File': folder_path if has_local_folder else "No local folder"
        })

    # Step 3: Save results to CSV
    if result_data:
        result_df = pd.DataFrame(result_data)
        result_df.to_csv(output_result_file, index=False)
        print(f"✅ Search results saved to {output_result_file}.")
    else:
        print("❌ No results found.")

    print("Script execution complete.")

if __name__ == "__main__":
    main()

