# In [None]:
import pandas as pd
import pubchempy as pcp
import requests
from rdkit import Chem
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import re
import json

# ---------- COMPREHENSIVE DRUG NAME RESOLUTION ----------

# Static lookup table consulted before any network-based resolution.
# Lookups are performed with the upper-cased identifier, so every key below
# must itself be written in upper case.
# NOTE(review): several values are compound-class descriptions (for example
# 'BCL-2 inhibitor') rather than specific compound names, and the CAS-style
# pairings have not been verified here -- confirm them against a registry.
DRUG_CODE_MAP = {
    # CAS-style numbers and other catalogue identifiers
    '1001645-58-4': 'TAS-116',
    '10030-85-0': 'Ascorbic acid',
    '129-56-6': 'Rhodamine',
    '10058-F4': 'c-Myc inhibitor',
    '10074-G5': 'c-Myc inhibitor II',
    'BAY 11-7082': 'BAY11-7082',
    '001, RAD': 'Everolimus',

    # Research / development codes
    'A-1210477': 'BCL-XL inhibitor',
    'A-196': 'SUV39H1 inhibitor',
    'ABT-737': 'BCL-2 inhibitor',
    'AZD-2014': 'AZD2014',

    # Brand names mapped to generic names
    'XARELTO': 'Rivaroxaban',
    'ADRIAMYCIN': 'Doxorubicin',
    'XELODA': 'Capecitabine',
    'GLEEVEC': 'Imatinib',
    'NEXIUM': 'Esomeprazole',
    'LIPITOR': 'Atorvastatin',
}

# URL templates for remote lookups; `{}` is filled in with the identifier.
CHEMBL_API = "https://www.ebi.ac.uk/chembl/api/data/molecule/{}.json"
PUBCHEM_API = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/JSON"

def resolve_drug_identifier(identifier):
    """Convert a raw drug identifier into a name suitable for database search.

    Resolution is attempted in this order: the static ``DRUG_CODE_MAP``
    table, CAS-format numbers, ``CHEMBL`` / ``ZINC`` / ``NCGC`` prefixed
    accessions, and finally short alphanumeric research codes.  Anything
    that matches none of these is returned unchanged.

    Args:
        identifier: Raw value taken from the input table; may be NaN/None.

    Returns:
        The resolved name, or ``None`` when ``identifier`` is missing.
    """
    if pd.isna(identifier):
        return None

    identifier = str(identifier).strip()
    upper = identifier.upper()

    # 1. Static table first (its keys are upper-case).
    if upper in DRUG_CODE_MAP:
        return DRUG_CODE_MAP[upper]

    # 2. CAS numbers (format: 00000-00-0).
    if re.match(r'^\d{2,7}-\d{2}-\d$', identifier):
        return resolve_cas_number(identifier)

    # 3. CHEMBL IDs.
    if identifier.startswith('CHEMBL'):
        return resolve_chembl_id(identifier)

    # 4./5. ZINC and NCGC IDs.  Their resolvers are not defined next to this
    # function, so look them up at call time and fall back to the raw
    # identifier instead of raising NameError when they are absent.
    for prefix, resolver_name in (('ZINC', 'resolve_zinc_id'),
                                  ('NCGC', 'resolve_ncgc_id')):
        if identifier.startswith(prefix):
            resolver = globals().get(resolver_name)
            return resolver(identifier) if callable(resolver) else identifier

    # 6. Research codes: letters, an optional hyphen, then digits.
    if re.match(r'^[A-Z]+-?\d+$', identifier):
        return resolve_drug_code(identifier)

    # 7. Everything else (including names that already look chemical) is
    # passed through untouched.
    return identifier

def resolve_cas_number(cas):
    """Convert a CAS number to a chemical name.

    PubChem is queried first; the first hit's IUPAC name is preferred and
    its first synonym is used when no IUPAC name is available.  If the
    lookup fails or yields no usable name, a small built-in table is
    consulted.

    Args:
        cas: CAS number as a string.

    Returns:
        A chemical name, or ``cas`` itself when nothing could be resolved.
    """
    try:
        compounds = pcp.get_compounds(cas, 'name')
        if compounds:
            best = compounds[0]
            # The original one-liner parsed as `(iupac or syn[0]) if syn else
            # cas`, discarding a valid IUPAC name whenever the synonym list
            # was empty.  Evaluate the two sources separately instead.
            name = best.iupac_name
            if not name:
                synonyms = best.synonyms
                name = synonyms[0] if synonyms else None
            if name:
                return name
    except Exception:
        # Best-effort lookup: any failure falls through to the local table.
        pass

    # NOTE(review): these pairings have not been verified here -- confirm
    # each CAS number against a registry before relying on them.
    cas_map = {
        '51-56-9': 'Nitrogen mustard', '55-86-7': 'Cyclophosphamide',
        '57-22-7': 'Paclitaxel', '83-88-5': 'Riboflavin'
    }
    return cas_map.get(cas, cas)

def resolve_chembl_id(chembl_id):
    """Convert a CHEMBL ID to the molecule's preferred name.

    Args:
        chembl_id: Accession such as ``"CHEMBL25"``.

    Returns:
        The ``pref_name`` reported by the ChEMBL API, or ``chembl_id``
        unchanged when the request fails, is not HTTP 200, or the record has
        no preferred name.
    """
    try:
        response = requests.get(CHEMBL_API.format(chembl_id), timeout=5)
        if response.status_code == 200:
            # `pref_name` can be present but null; `.get(key, default)` would
            # then return None, so fall back explicitly with `or`.
            return response.json().get('pref_name') or chembl_id
    except Exception:
        # Best-effort lookup: network or decoding problems keep the raw ID.
        pass
    return chembl_id

def resolve_drug_code(code):
    """Annotate a research code with a label describing its apparent origin.

    Codes starting (case-insensitively) with BAY, AZD (AstraZeneca) or GSK
    get a "(<prefix> compound)" suffix; codes made of a single letter, a
    hyphen and digits get "(research compound)".  Anything else is returned
    unchanged.  The original spelling of ``code`` is always kept in the
    result.
    """
    normalized = code.upper()

    # Vendor prefixes are mutually exclusive, so the order is irrelevant.
    for vendor in ('BAY', 'AZD', 'GSK'):
        if normalized.startswith(vendor):
            return f"{code} ({vendor} compound)"

    # Generic single-letter research codes such as "A-1210477".
    looks_generic = re.match(r'^[A-Z]-\d+', normalized) is not None
    return f"{code} (research compound)" if looks_generic else code

def is_likely_chemical_name(name):
    """Return True when ``name`` contains a marker typical of chemical names.

    The check is a case-insensitive substring test against a fixed list of
    indicators (salt forms, pharmacological classes, stereo descriptors).

    Args:
        name: Text to inspect.

    Returns:
        bool: True if any indicator occurs in ``name``.
    """
    chemical_indicators = (
        # 'INHIBITOR' was previously misspelled 'INIBITOR' and never matched.
        'ACID', 'INHIBITOR', 'ANTAGONIST', 'AGONIST', 'HYDROCHLORIDE',
        'SULFATE', 'CITRATE', 'MALEATE', 'ESTER', 'DERIVATIVE',
        '(+)-', '(-)-', '(R)-', '(S)-', 'BETA-', 'ALPHA-',
    )

    name_upper = name.upper()
    return any(indicator in name_upper for indicator in chemical_indicators)

# ---------- ENHANCED SEARCH FUNCTIONS ----------

def comprehensive_pubchem_search(resolved_name, original_name):
    """Search PubChem by name using several candidate spellings.

    Candidates are tried in order: the resolved name, the original name, and
    (when the resolved name contains parentheses) the resolved name with all
    parenthesised parts removed.  Missing, empty and duplicate candidates
    are skipped so no term is queried twice.

    Args:
        resolved_name: Name produced by identifier resolution; may be None.
        original_name: Name exactly as it appeared in the input.

    Returns:
        ``(canonical_smiles, inchikey)`` of the first compound accepted by
        ``verify_compound_match``, or ``(None, None)`` if nothing matched.
    """
    candidates = [resolved_name, original_name]

    if resolved_name and '(' in resolved_name and ')' in resolved_name:
        # Drop parenthesised fragments (stereo descriptors, annotations) to
        # get a broader search term.
        candidates.append(re.sub(r'\([^)]*\)', '', resolved_name).strip())

    # Keep order, drop None / empty strings / repeats.
    search_terms = []
    for candidate in candidates:
        if candidate and candidate not in search_terms:
            search_terms.append(candidate)

    for term in search_terms:
        try:
            # NOTE(review): pubchempy appears to forward extra keyword
            # arguments as request parameters rather than as a socket
            # timeout -- confirm `timeout=5` has the intended effect.
            compounds = pcp.get_compounds(term, 'name', timeout=5)
            for compound in compounds or []:
                # Accept only hits that plausibly match the search term.
                if verify_compound_match(compound, term):
                    return compound.canonical_smiles, compound.inchikey
        except Exception:
            # Best-effort: a failure on one term must not stop the others.
            continue

    return None, None

def verify_compound_match(compound, search_term):
    """Heuristically decide whether ``compound`` matches ``search_term``.

    A compound is accepted when any of the following holds:
      * the search term contains no alphabetic words at all,
      * the search term and the compound's IUPAC name share a word,
      * the search term and one of the compound's synonyms contain each
        other (case-insensitively).

    Args:
        compound: Object exposing ``iupac_name`` and, optionally,
            ``synonyms`` (a list of strings, possibly None).
        search_term: The name that was searched for.

    Returns:
        bool: True if the compound is considered a reasonable match.
    """
    search_lower = search_term.lower()
    search_words = set(re.findall(r'[a-z]+', search_lower))

    # With no alphabetic words there is nothing to compare: accept the hit.
    if not search_words:
        return True

    # Cheap check first: word overlap with the IUPAC name.
    compound_name = (compound.iupac_name or '').lower()
    if search_words & set(re.findall(r'[a-z]+', compound_name)):
        return True

    # Synonyms are read only when needed.
    # NOTE(review): on pubchempy compounds this attribute may trigger an
    # additional request and may be None -- confirm.
    for synonym in getattr(compound, 'synonyms', None) or []:
        synonym_lower = synonym.lower()
        # Skip empty synonyms: '' is a substring of every string and would
        # otherwise accept any compound.
        if synonym_lower and (search_lower in synonym_lower
                              or synonym_lower in search_lower):
            return True

    return False

def comprehensive_chembl_search(resolved_name):
    """Search ChEMBL for ``resolved_name`` and return the first valid SMILES.

    The name is sent as a quoted phrase to the molecule search endpoint.
    Each returned molecule is inspected in order; the first one whose
    canonical SMILES can be parsed by RDKit wins.

    Args:
        resolved_name: Name to search for; None or empty yields no search.

    Returns:
        A SMILES string, or ``None`` when nothing usable was found or the
        request failed.
    """
    if not resolved_name:
        return None

    try:
        # Pass the query via `params` so spaces, '&', '#', '+' and the
        # surrounding quotes are URL-encoded instead of corrupting the URL.
        response = requests.get(
            'https://www.ebi.ac.uk/chembl/api/data/molecule/search.json',
            params={'q': f'"{resolved_name}"'},
            timeout=5,
        )

        if response.status_code == 200:
            for molecule in response.json().get('molecules') or []:
                # `molecule_structures` may be present but null; guard so
                # one structure-less record does not abort the whole scan.
                structures = molecule.get('molecule_structures') or {}
                smiles = structures.get('canonical_smiles')
                if smiles and Chem.MolFromSmiles(smiles) is not None:
                    return smiles
    except Exception:
        # Best-effort: network/JSON problems are treated as "not found".
        pass

    return None

# ---------- MAIN FETCHING FUNCTION ----------

def fetch_comprehensive_smiles(row):
    """Resolve one input row to a SMILES string.

    Pipeline: resolve the identifier to a searchable name, search PubChem,
    then ChEMBL, then PubChem again using only the original name.

    Args:
        row: Mapping-like object with a ``'drug_name'`` entry.

    Returns:
        Tuple ``(smiles, inchikey, status, resolved_name)``.  ``status`` is
        ``"OK"`` when a structure was found and ``"Manual"`` otherwise.  For
        a missing drug name the result is ``(None, None, "Manual", None)``.
    """
    raw_name = row['drug_name']

    # Check for missing values BEFORE str(): str(nan) is the literal 'nan',
    # which would otherwise be sent to the databases as a drug name.
    if pd.isna(raw_name):
        return None, None, "Manual", None

    original_name = str(raw_name).strip()

    # Step 1: resolve identifier to a proper name; if resolution yields
    # nothing, keep searching with the original text.
    resolved_name = resolve_drug_identifier(original_name) or original_name

    # Step 2: PubChem with the resolved name (also tries the original).
    smiles, inchikey = comprehensive_pubchem_search(resolved_name, original_name)
    if smiles:
        return smiles, inchikey, "OK", resolved_name

    # Step 3: ChEMBL (provides no InChIKey here).
    smiles = comprehensive_chembl_search(resolved_name)
    if smiles:
        return smiles, None, "OK", resolved_name

    # Step 4: PubChem again, driven purely by the original name, which adds
    # the parenthesis-stripped variant of the original spelling.
    if resolved_name != original_name:
        smiles, inchikey = comprehensive_pubchem_search(original_name, original_name)
        if smiles:
            return smiles, inchikey, "OK", original_name

    return None, None, "Manual", resolved_name

# ---------- EXECUTION ----------

def main_comprehensive(input_csv="all_drugs_list.csv",
                       output_csv="comprehensive_drug_smiles.csv",
                       max_workers=10):
    """Resolve every drug in ``input_csv`` and write the results to CSV.

    Reads the ``drug_name`` column, resolves each entry concurrently with
    ``fetch_comprehensive_smiles``, stores ``smiles``, ``inchikey``,
    ``status`` and ``resolved_name`` next to it, writes the table to
    ``output_csv`` and prints a short summary.

    Args:
        input_csv: Path of the CSV file holding a ``drug_name`` column.
        output_csv: Path the result table is written to.
        max_workers: Number of worker threads used for the lookups.
    """
    source = pd.read_csv(input_csv)
    # Rebuild with a clean 0..n-1 index so the labels used below are
    # guaranteed to address the intended rows.
    data = pd.DataFrame({'drug_name': source['drug_name'].reset_index(drop=True)})

    # Add result columns
    for col in ["smiles", "inchikey", "status", "resolved_name"]:
        data[col] = None

    total = len(data)
    print(f"🔬 Processing {total} drugs with comprehensive resolution...")

    start_time = time.time()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(fetch_comprehensive_smiles, data.loc[idx]): idx
            for idx in data.index
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            idx = futures[future]
            try:
                # Futures yielded by as_completed are already finished, so
                # no timeout is needed on result().
                smiles, inchikey, status, resolved_name = future.result()
            except Exception:
                # One failing drug must not abort the whole batch.
                data.at[idx, "status"] = "Error"
                continue
            data.at[idx, "smiles"] = smiles
            data.at[idx, "inchikey"] = inchikey
            data.at[idx, "status"] = status
            data.at[idx, "resolved_name"] = resolved_name

    # Save results
    data.to_csv(output_csv, index=False)

    # Summary
    elapsed = (time.time() - start_time) / 60
    ok_count = int((data["status"] == "OK").sum())
    # Guard against an empty input table.
    success_pct = ok_count / total * 100 if total else 0.0

    print("✅ COMPREHENSIVE RESULTS:")
    print(f"📊 Success rate: {ok_count}/{total} ({success_pct:.1f}%)")
    print(f"⏰ Time: {elapsed:.1f} minutes")

    # Show resolution examples
    print("🔍 Sample resolutions:")
    for _, row in data.head(8).iterrows():
        print(f"   {row['drug_name']} → {row['resolved_name']} → {row['status']}")

# Entry point: run the full pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main_comprehensive()