In [87]:
import requests
import asyncio
import aiohttp
import pandas as pd
import time

In [115]:
# testset
# compound_names = [
#     'Mercaptopurine',
#     'Acetaminophen',
#     'Azathioprine',
#     'Chlorpheniramine maleate',
#     'Clofibrate',
# ]

SEMAPHORE_LIMIT = 5
RATE_LIMIT_DELAY = 0.21  # limitation of pubchem server

async def fetch_cid(session, name, semaphore):
    async with semaphore:
        url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{name.replace(' ', '%20')}/cids/TXT"
        try:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as response:
                if response.status == 200:
                    text = await response.text()
                    cids = text.strip().split()
                    cid = cids[0] if cids else None
                    await asyncio.sleep(RATE_LIMIT_DELAY) 
                    return name, cid
                else:
                    print(f" 1 {name} → HTTP error {response.status}")
                    await asyncio.sleep(RATE_LIMIT_DELAY)
                    return name, None
        except Exception as e:
            print(f" 0 {name} → URL error: {e}")
            await asyncio.sleep(RATE_LIMIT_DELAY)
            return name, None

async def get_cids_async(names):
    semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT)
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_cid(session, name, semaphore) for name in names]
        results = await asyncio.gather(*tasks)
        return dict(results)

def get_smiles_batch(cids, cid_to_smiles):
    if not cids:
        print("cids is empty")
        return cid_to_smiles
    cid_str = ','.join(cids)
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid_str}/property/CanonicalSMILES/JSON"
    try:
        res = requests.get(url, timeout=15)
        if res.status_code == 200:
            data = res.json()
            # CID → SMILES
            for prop in data['PropertyTable']['Properties']:
                cid_to_smiles[str(prop['CID'])] = prop.get('ConnectivitySMILES', '')
            return cid_to_smiles
        else:
            print(f"Batch SMILES request failed: HTTP {res.status_code}")
            return {}
    except Exception as e:
        print(f"Error in batch SMILES request: {e}")
        return {}

# test
# name_to_cid = await get_cids_async(compound_names)
# valid_cids = [cid for cid in name_to_cid.values() if cid is not None]
# cid_to_smiles = get_smiles_batch(valid_cids)

In [143]:
dili_rank = pd.read_excel("DILI_rank.xlsx")
dili_rank.rename(columns={"Drug Induced Liver Injury Rank (DILIrank) Dataset Ver 2.0 | FDA": "LTKBID",
                          "Unnamed: 1": "CompoundName",
                          "Unnamed: 2": "SeverityClass",
                          "Unnamed: 3": "LabelSection",
                          "Unnamed: 4": "vDILI-Concern",
                          "Unnamed: 5": "Comment"}, inplace=True)
dili_rank = dili_rank.drop(0)

In [109]:
compounds_name = dili_rank["CompoundName"].tolist()
name_to_cid = await get_cids_async(compounds_name)
valid_cids = [cid for cid in name_to_cid.values() if cid is not None]

 1 CompoundName → HTTP error 404
 1 Abatacept → HTTP error 404
 1 Abciximab → HTTP error 404
 1 Acetylcholine chloride → HTTP error 503
 1 Adalimumab → HTTP error 404
 1 Adenosine → HTTP error 503
 1 Alaproclate → HTTP error 503
 1 Alemtuzumab → HTTP error 404
 1 Aldesleukin → HTTP error 503
 1 Alglucosidase alfa → HTTP error 404
 1 Almotriptan malate → HTTP error 503
 1 Alosetron hydrochloride → HTTP error 503
 1 Alteplase → HTTP error 404
 1 Ambenonium chloride → HTTP error 503
 1 Aminosalicylic acid  → HTTP error 404
 1 Amiodarone hydrochloride → HTTP error 503
 1 Anakinra → HTTP error 404
 1 Anidulafungin → HTTP error 503
 1 Antithymocyte globulin → HTTP error 404
 1 Argatroban → HTTP error 503
 1 Argipressin → HTTP error 503
 1 Asparaginase → HTTP error 404
 1 Atenolol → HTTP error 503
 1 Avanafil → HTTP error 503
 1 Avapritinib → HTTP error 503
 1 Basiliximab → HTTP error 404
 1 Belumosudil mesylate → HTTP error 503
 1 Benzylpenicilloyl polylysine → HTTP error 404
 1 Betaine → HT

In [127]:
batch_size = 300
cid_to_smiles = {}
for batch in [valid_cids[i:i + batch_size] for i in range(0, len(valid_cids), batch_size)]:
    cid_to_smiles = get_smiles_batch(batch, cid_to_smiles)

In [147]:
dili_rank["CID"] = dili_rank["CompoundName"].map(name_to_cid)
dili_rank["SMILES"] = dili_rank["CID"].map(cid_to_smiles)

In [149]:
dili_rank_clean = dili_rank.dropna(subset=["SMILES"])

In [151]:
dili_rank_clean

Unnamed: 0,LTKBID,CompoundName,SeverityClass,LabelSection,vDILI-Concern,Comment,CID,SMILES
1,LT00040,Abacavir sulfate,8,Warnings & precautions,vMOST-DILI-concern,Unchanged,441384,C1CC1NC2=C3C(=NC(=N2)N)N(C=N3)C4CC(C=C4)CO.C1C...
2,LT03618,Abaloparatide,0,No match,vNo-DILI-concern,New,76943386,CCC(C)C(C(=O)NC(CCC(=O)N)C(=O)NC(CC(=O)O)C(=O)...
5,LT03619,Abemaciclib,3,Warnings & precautions,vLess-DILI-concern,New,46220502,CCN1CCN(CC1)CC2=CN=C(C=C2)NC3=NC=C(C(=N3)C4=CC...
6,LT02339,Abiraterone acetate,8,Warnings & precautions,vMost-DILI-concern,New,9821849,CC(=O)OC1CCC2(C3CCC4(C(C3CC=C2C1)CC=C4C5=CN=CC...
7,LT03614,Acalabrutinib,4,Adverse reactions,Ambiguous-DILI-concern,New,71226662,CC#CC(=O)N1CCCC1C2=NC(=C3N2C=CN=C3N)C4=CC=C(C=...
...,...,...,...,...,...,...,...,...
1332,LT01017,Ziprasidone hydrochloride,3,Adverse reactions,vLess-DILI-concern,Unchanged,219099,C1CN(CCN1CCC2=C(C=C3C(=C2)CC(=O)N3)Cl)C4=NSC5=...
1333,LT01088,Zoledronic acid,0,No match,vLess-DILI-concern,Unchanged,68740,C1=CN(C=N1)CC(O)(P(=O)(O)O)P(=O)(O)O
1334,LT01010,Zolmitriptan,3,Adverse reactions,Ambiguous-DILI-concern,Unchanged,60857,CN(C)CCC1=CNC2=C1C=C(C=C2)CC3COC(=O)N3
1335,LT00164,Zolpidem tartrate,7,Adverse reactions,Ambiguous-DILI-concern,Unchanged,441338,CC1=CC=C(C=C1)C2=C(N3C=C(C=CC3=N2)C)CC(=O)N(C)...


In [153]:
dili_rank_clean.to_csv('DILI.csv', index=False)