In [1]:
import pandas as pd
import requests
from tqdm import tqdm

In [2]:
# Read both input tables in one pass: the fitted dose-response table first,
# then the screened-compounds table (order matters for the unpacking below).
response_df, drug_info_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset/GDSC2_fitted_dose_response_24Jul22.csv",
        "dataset/screened_compounds_rel_8.4.csv",
    )
)


In [3]:
# Left join on DRUG_ID: every dose-response row is kept, and compound
# annotations are attached where a matching DRUG_ID exists.
merged = response_df.merge(drug_info_df, how="left", on="DRUG_ID")

In [4]:
# Keep only the four columns used downstream. "DRUG_NAME_x" is the name column
# as it appears after the merge; it is renamed to plain "DRUG_NAME".
merged = merged.loc[
    :, ["DRUG_ID", "DRUG_NAME_x", "CELL_LINE_NAME", "LN_IC50"]
].rename(columns={"DRUG_NAME_x": "DRUG_NAME"})

In [5]:
# Display the trimmed frame (DRUG_ID, DRUG_NAME, CELL_LINE_NAME, LN_IC50) as the cell output.
merged

Unnamed: 0,DRUG_ID,DRUG_NAME,CELL_LINE_NAME,LN_IC50
0,1003,Camptothecin,PFSK-1,-1.462148
1,1003,Camptothecin,A673,-4.869447
2,1003,Camptothecin,ES5,-3.360684
3,1003,Camptothecin,ES7,-5.045014
4,1003,Camptothecin,EW-11,-3.741620
...,...,...,...,...
242031,2499,N-acetyl cysteine,SNU-175,10.134495
242032,2499,N-acetyl cysteine,SNU-407,8.575555
242033,2499,N-acetyl cysteine,SNU-61,10.520666
242034,2499,N-acetyl cysteine,SNU-C5,10.701430


In [6]:
def fetch_smiles_by_name(name, timeout=30):
    """Look up a compound's SMILES string on PubChem PUG REST by name.

    Args:
        name: Compound name to search for. It is percent-encoded before being
            placed in the URL path, so spaces, slashes and other reserved
            characters cannot corrupt the request.
        timeout: Seconds to wait for the server, forwarded to ``requests.get``.
            Without it a single stalled connection would block forever.

    Returns:
        The first non-empty line of the response body, or ``None`` when
        ``name`` is ``None``/blank, the HTTP request fails, the status code is
        not 200, or the body is empty.
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import quote

    if name is None:
        return None
    query = str(name).strip()
    if not query:
        return None

    encoded = quote(query, safe="")
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{encoded}/property/CanonicalSMILES/TXT"

    # Only network/HTTP-layer failures are treated as "no result"; a bare
    # ``except`` would also swallow KeyboardInterrupt and make a long fetch
    # loop impossible to stop.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None

    if response.status_code != 200:
        return None

    # The TXT body can hold more than one line (NOTE(review): presumably one
    # per matching compound - confirm against PubChem docs); keep the first so
    # callers always get a single SMILES string.
    candidates = [line.strip() for line in response.text.splitlines() if line.strip()]
    return candidates[0] if candidates else None

In [7]:
from tqdm.notebook import tqdm

# Distinct, non-missing drug names: each one is looked up exactly once
# instead of once per dose-response row.
unique_drugs = pd.unique(merged["DRUG_NAME"].dropna())

# name -> SMILES lookup table (value is None when the lookup found nothing)
drug2smiles = {
    drug: fetch_smiles_by_name(drug)
    for drug in tqdm(unique_drugs, desc="Fetching SMILES")
}

# Broadcast the per-drug result onto every row of the full frame
merged["SMILES"] = merged["DRUG_NAME"].map(drug2smiles)

HBox(children=(FloatProgress(value=0.0, description='Fetching SMILES', max=286.0, style=ProgressStyle(descript…




In [17]:
# Discard rows whose SMILES value is missing.
merged = merged.dropna(subset=["SMILES"])

In [21]:
# Persist the result. The row index is written too (to_csv's default), so the
# file gains a leading unnamed column when read back without index_col=0.
merged.to_csv(path_or_buf="dataset/GDSC_SMILES_merged.csv")