In [8]:
from chembl_webresource_client.new_client import new_client
from tqdm import tqdm


In [9]:
# Collect every attribute name of the client that does not start with an
# underscore, then show the list.
resources = list(filter(lambda attr: not attr.startswith('_'), dir(new_client)))
print(resources)



# 1. 
### Retrieve all approved drugs from the ChEMBL database and sort them by approval year and name.

In [10]:
from chembl_webresource_client.new_client import new_client
import pandas as pd

# Query the molecule endpoint for small molecules with max_phase == 4
# (the "approved drugs" this section is about).
approved_molecules = new_client.molecule.filter(
    max_phase=4,
    molecule_type="Small molecule",
)

In [11]:
# Keep only molecules that carry both a first-approval year and a preferred
# name; records missing either one cannot be sorted by the keys used below.
drug_list = [
    {
        'name': mol['pref_name'],
        'chembl_id': mol['molecule_chembl_id'],
        'approval_year': mol['first_approval'],
    }
    for mol in approved_molecules
    if mol.get("first_approval") and mol.get("pref_name")
]

# Naming the columns explicitly keeps the frame well-formed, and therefore
# sortable, even when no record passed the filter above (an empty list would
# otherwise produce a column-less frame and sort_values would raise KeyError).
df = pd.DataFrame(drug_list, columns=['name', 'chembl_id', 'approval_year'])

# Oldest approvals first; ties within a year are ordered alphabetically.
df = df.sort_values(by=['approval_year', 'name'])

print(df.head(20))

                               name      chembl_id  approval_year
74                     BUTABARBITAL      CHEMBL449           1939
1762            BUTABARBITAL SODIUM  CHEMBL1200982           1939
1493   DESOXYCORTICOSTERONE ACETATE  CHEMBL1200542           1939
445                       GUANIDINE      CHEMBL821           1939
1600        GUANIDINE HYDROCHLORIDE  CHEMBL1200728           1939
144                       HISTAMINE       CHEMBL90           1939
2661            HISTAMINE PHOSPHATE  CHEMBL3989520           1939
351                   SULFAPYRIDINE      CHEMBL700           1939
2115                  AMINOPHYLLINE  CHEMBL1370561           1940
967                    THEOPHYLLINE      CHEMBL190           1940
1061                 ERGOCALCIFEROL     CHEMBL1536           1941
60                     SULFADIAZINE      CHEMBL439           1941
1379            SULFADIAZINE SODIUM  CHEMBL1200351           1941
260                      MEPERIDINE      CHEMBL607           1942
1226      

# 2.
### For each drug approved since 2014 that you identified in step (1), retrieve the UniProt accession numbers of the protein targets associated with the drug.

In [12]:
# Preview the first 20 rows whose approval year is 2014 or later.
print(df[df['approval_year'].ge(2014)].head(20))

                              name      chembl_id  approval_year
1135                    APREMILAST   CHEMBL514800           2014
2329                   ASUNAPREVIR  CHEMBL2105735           2014
1050                      ATALUREN   CHEMBL256997           2014
1071                    BELINOSTAT   CHEMBL408513           2014
2293                   CEFTOLOZANE  CHEMBL2103872           2014
2057           CEFTOLOZANE SULFATE  CHEMBL1213250           2014
2479                     CERITINIB  CHEMBL2403108           2014
2229                   DACLATASVIR  CHEMBL2023898           2014
2444   DACLATASVIR DIHYDROCHLORIDE  CHEMBL2303621           2014
2506                     DASABUVIR  CHEMBL3137312           2014
2581  DASABUVIR SODIUM MONOHYDRATE  CHEMBL3544985           2014
925                      DELAMANID   CHEMBL218650           2014
2286                     DROXIDOPA  CHEMBL2103827           2014
2296                 EFINACONAZOLE  CHEMBL2103877           2014
2397                    E

In [13]:
from chembl_webresource_client.new_client import new_client


def get_uniprot_ids_for_chembl(chembl_id):
    """Return the set of UniProt IDs for protein targets of a ChEMBL molecule.

    The molecule's mechanism records are queried, every distinct target they
    reference is fetched once, and each cross-reference on the target's
    components whose source database is "UniProt" is collected.

    Args:
        chembl_id: ChEMBL identifier of the molecule, e.g. "CHEMBL514800".

    Returns:
        set: The collected ``xref_id`` values; empty when none are found.
    """
    uniprot_ids = set()
    seen_targets = set()

    for mech in new_client.mechanism.filter(molecule_chembl_id=chembl_id):
        target_id = mech.get("target_chembl_id")
        # Several mechanisms can point at the same target; fetch it only once.
        if not target_id or target_id in seen_targets:
            continue
        seen_targets.add(target_id)

        target = new_client.target.get(target_id)
        if not target:
            continue

        # `or []` also covers keys that are present but hold a null value,
        # which a `.get(key, [])` default would not catch.
        for comp in target.get("target_components") or []:
            for xref in comp.get("target_component_xrefs") or []:
                if xref.get("xref_src_db") == "UniProt":
                    uniprot_ids.add(xref["xref_id"])

    return uniprot_ids


# ChEMBL IDs of drugs approved in 2014 or later, selected with a single
# .loc call (row mask + column) instead of chained indexing.
chembl_ids = df.loc[df['approval_year'] >= 2014, 'chembl_id'].tolist()

# Every lookup goes through the ChEMBL web client, so report progress while
# the mapping is being built.
chembl_to_uniprot = {
    chembl_id: get_uniprot_ids_for_chembl(chembl_id)
    for chembl_id in tqdm(chembl_ids, desc="Fetching UniProt targets")
}

for chembl_id, uniprot_set in chembl_to_uniprot.items():
    print(f"{chembl_id}: {', '.join(sorted(uniprot_set)) if uniprot_set else 'No UniProt targets found'}")

CHEMBL514800: A5YW33, B3KTC4, O15443, O43433, O75522, O76092, P27815, Q07343, Q08493, Q08499, Q13549, Q13550, Q13551, Q13945, Q16255, Q16691, Q5DM53, Q5TEK4, Q5TEK5, Q5TEK6, Q6PMT2, Q7Z2L8, Q8IV84, Q8IVA7, Q8IVA9, Q8IVD2, Q8IVD3, Q8WUQ3, Q96HL4, Q9H3H2, Q9HCX7, Q9UN44, Q9UN45, Q9UN46, Q9UPJ6
CHEMBL2105735: A3EZI9, D2K2A8
CHEMBL256997: A3KQT0, A5D8V9, A6NG21, A6NIB2, A8K094, A8K0H3, A8K4V7, A8K502, A8K504, A8K505, A8K9V4, A8MZ73, A9C4C1, B2R495, B2R4A6, B2R4B3, B2R4D4, B2R4E3, B2R4F0, B2R4F4, B2R4F5, B2R4H2, B2R4H3, B2R4K2, B2R4M7, B2R4M8, B2R4Q3, B2R4T2, B2R4U4, B2R4Y1, B2R4Y3, B2R549, B2R591, B2R5A8, B2R5B2, B2R5G0, B2R5G5, B2R7N5, B2R801, B2RDD5, B2RDV9, B4DEP9, B4DLX3, B4DW28, B4E3C2, B5ME31, B7Z4K2, C9JB50, D3DP05, D3DQG5, D3DTR8, D3DU82, D3DVJ4, D3DWN2, D3DWW6, D6W634, E7EPK6, E9PB24, F5H1S2, F8VWC5, G5E9L2, J3KN86, J3QL51, P02248, P02249, P02250, P02383, P02403, P02404, P02433, P04643, P04645, P05386, P05388, P06366, P08227, P08526, P08708, P08865, P09058, P09896, P0CW22, P10660,

# 3.
### For each protein with a UniProt accession number that you identified in step (2), retrieve UniProt keywords associated with it.

In [15]:
import requests


def get_uniprot_keywords(uniprot_ids, batch_size=500, timeout=30):
    """Fetch keyword names for a collection of UniProt accessions.

    The accessions are de-duplicated, sorted, and sent to the UniProtKB
    search endpoint in batches, each batch as one OR-joined
    ``accession:`` query.

    Args:
        uniprot_ids: Iterable of accession strings.
        batch_size: Maximum number of accessions per request; also sent as
            the ``size`` parameter of the request.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection fails instead of blocking forever.

    Returns:
        dict: Maps each returned entry's ``primaryAccession`` to a list of
        its keyword names (empty list when the entry has none).
        Requested IDs for which the service returns no entry are absent.
        NOTE(review): keys come from the response, not from the request, so
        they may differ from the requested IDs -- confirm against callers.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        requests.HTTPError: If a request returns an error status.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    url = "https://rest.uniprot.org/uniprotkb/search"
    id_to_keywords = {}

    # De-duplicate and sort so that the batches are the same on every run,
    # regardless of the iteration order of the input (often a set).
    accessions = sorted(set(uniprot_ids))

    for start in range(0, len(accessions), batch_size):
        batch = accessions[start:start + batch_size]
        params = {
            "query": " OR ".join(f"(accession:{uid})" for uid in batch),
            "format": "json",
            "fields": "accession,keyword",
            "size": batch_size,
        }

        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()

        for entry in response.json().get('results', []):
            id_to_keywords[entry['primaryAccession']] = [
                kw['name'] for kw in entry.get('keywords', [])
            ]

    return id_to_keywords


# Merge the per-molecule ID collections into one set of distinct accessions.
all_uniprot_ids = set().union(*chembl_to_uniprot.values())

keywords_by_id = get_uniprot_keywords(all_uniprot_ids)

# One line per returned accession, with a placeholder when it has no keywords.
for acc, keywords in keywords_by_id.items():
    print(f"{acc}: {', '.join(keywords)}" if keywords else f"{acc}: No keywords")

P08865: 3D-structure, Acetylation, Cell membrane, Cytoplasm, Direct protein sequencing, Disease variant, Host cell receptor for virus entry, Host-virus interaction, Isopeptide bond, Membrane, Nucleus, Phosphoprotein, Proteomics identification, Receptor, Reference proteome, Repeat, Ribonucleoprotein, Ribosomal protein, Ubl conjugation
P10721: 3D-structure, Alternative splicing, ATP-binding, Cell membrane, Cytoplasm, Direct protein sequencing, Disease variant, Disulfide bond, Glycoprotein, Immunoglobulin domain, Kinase, Magnesium, Membrane, Metal-binding, Nucleotide-binding, Phosphoprotein, Proteomics identification, Proto-oncogene, Receptor, Reference proteome, Repeat, Signal, Transferase, Transmembrane, Transmembrane helix, Tyrosine-protein kinase, Ubl conjugation
P19099: 3D-structure, Disease variant, Heme, Iron, Lipid metabolism, Membrane, Metal-binding, Mitochondrion, Mitochondrion inner membrane, Monooxygenase, Oxidoreductase, Proteomics identification, Reference proteome, Steroid 