In [1]:
import pandas as pd
import numpy as np
import random as rd

from urllib.request import HTTPError
from tqdm import tqdm
from joblib import Parallel, delayed

In [2]:
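# Disabled one-off export: writes the index of `nci60Act` as 'NSC <id>' strings
# (one per row, no header, no index column) to test.csv. `nci60Act` is not
# defined in the cells shown here, so load it before re-enabling this cell.
# pd.DataFrame(['NSC ' + str(i) for i in list(nci60Act.index)]).to_csv(
#     'test.csv', 
#     header=None,
#     index=False
# )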

In [3]:
# Read the headerless, tab-separated 'nsc_to_cid.txt' table, discard rows with
# any missing field and renumber the remaining rows from 0.
nsc_to_cid = (
    pd.read_csv('nsc_to_cid.txt', sep='\t', header=None)
    .dropna()
    .reset_index(drop=True)
)
# Column 0 holds space-separated labels; keep only the second token of each.
nsc_to_cid[0] = [label.split(' ')[1] for label in nsc_to_cid[0]]

In [4]:
# Read the PubChem ID table; its first column is parsed as the index and then
# immediately turned back into an ordinary column.
pubchem_id = pd.read_csv('../data/nci60PubChemID.csv', index_col=0).reset_index()
# Reuse nsc_to_cid's column labels so both frames line up when concatenated.
pubchem_id.columns = nsc_to_cid.columns

In [5]:
# Stack both ID tables, coerce every value to int, remove exact duplicate
# rows and renumber the rows from 0.
t = (
    pd.concat([pubchem_id, nsc_to_cid])
    .astype(int)
    .drop_duplicates()
    .reset_index(drop=True)
)
t.columns = ['NSC', 'CID']
# Move NSC into the index, leaving 'CID' as the only column. The index is kept
# unnamed so that a later reset_index() yields a column called "index".
t = t.set_index('NSC')
t.index.name = None

In [6]:
def get_SMILES_from_pubchemID(pubchem_id, chunk_size=500):
    """
    Get the SMILES from pubchem_id using PubChem API

    The ``CanonicalSMILES`` property is requested from the PubChem PUG REST
    endpoint for every distinct value of the "CID" column, ``chunk_size``
    CIDs per request, and the answers are mapped back onto the index of
    ``pubchem_id``.

    Parameters
    ----------

    pubchem_id : pd.DataFrame
        Must contain a "CID" column. The index values are returned in the
        "NSC" column of the result.

    chunk_size : int, default 500
        Maximum number of CIDs sent in a single request.

    Returns
    -------

    df : pd.DataFrame
        Two columns, "NSC" (index value of ``pubchem_id``) and "SMILES".
        Rows of ``pubchem_id`` whose CID was not returned by any successful
        request are absent (inner merge on "CID"). An empty frame with the
        same two columns is returned when nothing could be fetched.

    Raises
    ------

    ValueError
        If ``chunk_size`` is smaller than 1.

    Notes
    -----

    A request that raises ``HTTPError`` is retried once as two halves. A half
    that fails again is reported with ``print`` and skipped, so the function
    stays best-effort instead of aborting the whole download.

    """

    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    url_head = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/"
    url_tail = "/property/CanonicalSMILES/CSV"

    def _fetch(cid_batch):
        # One request for one batch; join() never leaves a trailing comma.
        return pd.read_csv(url_head + ",".join(cid_batch) + url_tail)

    # Each distinct CID is requested exactly once, in order of appearance.
    cids = [str(cid) for cid in pubchem_id["CID"].drop_duplicates()]

    # Collect the per-request frames and concatenate once at the end instead
    # of growing a DataFrame inside the loop.
    frames = []
    for start in tqdm(range(0, len(cids), chunk_size)):
        batch = cids[start : start + chunk_size]
        try:
            frames.append(_fetch(batch))
        except HTTPError:
            # If the request failed, split the batch in two smaller requests
            # so that a failure only costs half of the batch.
            middle = (len(batch) + 1) // 2
            for half in (batch[:middle], batch[middle:]):
                if not half:
                    continue
                try:
                    frames.append(_fetch(half))
                except HTTPError as e:
                    # Report every failure (not only 403) so lost CIDs are
                    # visible to whoever runs the notebook.
                    print(
                        "error: HTTP {} while fetching {} CID(s); skipped".format(
                            e.code, len(half)
                        )
                    )

    if not frames:
        return pd.DataFrame(columns=["NSC", "SMILES"])

    # De-duplicate on CID, not on the SMILES string: distinct CIDs may share
    # the same SMILES, and dropping them here would make the inner merge
    # below silently lose the corresponding rows of pubchem_id.
    df = (
        pd.concat(frames, ignore_index=True)
        .drop_duplicates("CID")
        .sort_values("CID")
        .reset_index(drop=True)
    )

    # reset_index() inserts the former index as the first column; take its
    # name from the frame instead of assuming it is called "index".
    id_table = pubchem_id.reset_index()
    index_column = id_table.columns[0]
    df = id_table.merge(df, on="CID")[[index_column, "CanonicalSMILES"]]
    df.columns = ["NSC", "SMILES"]

    return df

In [7]:
# Fetch the SMILES for every CID in t; the result has the columns
# "NSC" (taken from t's index) and "SMILES".
df = get_SMILES_from_pubchemID(pubchem_id=t)

100%|██████████| 36/36 [00:09<00:00,  3.97it/s]


In [8]:
# Turn the index of t back into a regular "NSC" column so t can be merged on
# it. The guard keeps the cell idempotent: on a second run t already has the
# "NSC" column, and resetting again would add a third column and make the
# two-name column assignment fail.
if "NSC" not in t.columns:
    t = t.reset_index()
    t.columns = ["NSC", "CID"]

In [9]:
# Attach the CID of every NSC to its SMILES and write the combined table
# without the row index.
nsc_cid_smiles = df.merge(t, on='NSC')
nsc_cid_smiles.to_csv('../data/nsc_cid_smiles.csv', index=False)