In [1]:
import pandas as pd
import numpy as np
import random as rd
import requests as req

from urllib.request import HTTPError
from tqdm import tqdm
from joblib import Parallel, delayed

In [2]:
# NSC identifiers: the row labels (first CSV column) of the NCI-60 activity table.
NSC = pd.read_csv(
    '../DrugCell/data_rcellminer/nci60Act.csv',
    index_col=0,
).index.tolist()

# NSC -> CID pairs that are already stored on disk.
nsc_cid = pd.read_csv('../data/NSC_to_CID.csv')

In [3]:
def _fetch_first_cid(nsc):
    """
    Ask PubChem which CIDs are registered for one DTP.NCI source id (NSC)

    Parameters
    ----------

    nsc : int or str
        NSC number; it is converted with ``str`` before being put in the URL

    returns
    -------

    status_code : int
        HTTP status code of the response

    cid : int, str or None
        For a 200 response, the first line of the body: an ``int`` when that
        line consists only of ASCII digits, otherwise the stripped text as is.
        ``None`` for every other status code

    """
    link = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/sourceid/DTP.NCI/"
    resp = req.get(link + str(nsc) + "/cids/TXT")
    if resp.status_code != 200:
        return resp.status_code, None

    # Only the first listed CID is kept, as before.
    first_line = resp.text.split('\n')[0].strip()
    if first_line.isascii() and first_line.isdigit():
        # Store numeric CIDs as integers so they share a type with CIDs read
        # back from CSV files instead of becoming a str/int mixture.
        return 200, int(first_line)
    return 200, first_line


def get_NSC_CID_table(nsc_list, nsc_cid_table, max_retries=3):

    """
    Obtain the NSC and CID table using API

    Only NSC numbers that are not yet in ``nsc_cid_table['NSC']`` are queried.
    A 200 response adds a row, a 503 response is retried in later rounds, and
    any other status code drops the NSC without adding a row.

    Parameters
    ----------

    nsc_list : list
        List of NSC numbers

    nsc_cid_table : pd.DataFrame
        NSC to CID table which you already have (columns 'NSC' and 'CID')

    max_retries : int, default 3
        Number of extra rounds in which NSCs that answered 503 are queried
        again. Values below 0 are treated as 0

    returns
    -------

    df : pd.DataFrame
        ``nsc_cid_table`` followed by the newly resolved rows, with a fresh
        0..n-1 index. The input frame is not modified

    """

    pending = list(set(nsc_list) - set(nsc_cid_table['NSC']))
    records = []

    # One initial round plus a bounded number of retry rounds. Every round
    # iterates over a fixed list and collects the 503s into a *new* list, so a
    # service that keeps answering 503 cannot keep the loop alive forever.
    for _ in range(1 + max(0, max_retries)):
        if not pending:
            break
        still_unavailable = []
        for nsc in tqdm(pending):
            status_code, cid = _fetch_first_cid(nsc)
            if status_code == 200:
                records.append((nsc, cid))
            elif status_code == 503:
                still_unavailable.append(nsc)
        pending = still_unavailable

    if pending:
        print(
            f"{len(pending)} NSC number(s) still returned HTTP 503 after "
            f"{max(0, max_retries)} retries and were not added"
        )

    if not records:
        return nsc_cid_table.copy()

    # Build the new rows in one go instead of concatenating inside the loop.
    new_rows = pd.DataFrame(records, columns=['NSC', 'CID'])
    return pd.concat([nsc_cid_table, new_rows], ignore_index=True)

In [4]:
# Look up CIDs for the NSC numbers missing from the stored table, then write the
# extended table back to the same CSV file it was loaded from (no index column).
df = get_NSC_CID_table(nsc_list=NSC, nsc_cid_table=nsc_cid)
df.to_csv('../data/NSC_to_CID.csv', index=False)

100%|██████████| 667/667 [02:29<00:00,  4.45it/s]
0it [00:00, ?it/s]


In [5]:
def get_SMILES_from_pubchemID(pubchem_id):
    """
    Get the SMILES from pubchem_id using PubChem API

    Parameters
    ----------

    pubchem_id : pd.DataFrame

    Returns
    -------

    df : pd.DataFrame

    """

    df = pd.DataFrame()
    for i in tqdm(
        list(range(500, pubchem_id.shape[0], 500)) + [pubchem_id.shape[0]]
    ):
        cid = ""
        for j in pubchem_id["CID"][i - 500 : i]:
            cid = str(j) + "," + cid

        try:
            df = pd.concat(
                [
                    pd.read_csv(
                        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/"
                        + cid
                        + "/property/CanonicalSMILES/CSV"
                    ),
                    df,
                ]
            )
        except HTTPError as e:
            for i in np.array_split(np.array(cid.split(",")), 2):
                t = ""
                for l in i:
                    t += l + ","
                try:
                    df = pd.concat(
                        [
                            pd.read_csv(
                                "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/"
                                + t
                                + "/property/CanonicalSMILES/CSV"
                            ),
                            df,
                        ]
                    )
                except HTTPError as e:
                    if e.code == 403:
                        print("error")

    df = pubchem_id.merge(df, on="CID")
    df.columns = ['NSC', 'CID', 'SMILES']

    return df

In [None]:
# Attach SMILES to the NSC/CID table. `df` is rebound to the result here, so
# re-run the preceding cells before running this cell a second time.
df = get_SMILES_from_pubchemID(df)

# Keep the first row for each NSC, renumber the rows and save without the index.
(
    df.drop_duplicates('NSC')
    .reset_index(drop=True)
    .to_csv('../data/nsc_cid_smiles.csv', index=False)
)

 40%|████      | 19/47 [00:06<00:07,  3.82it/s]