In [8]:
from PubchemAPI import *

# Case Study

Code Snippet 1: Fast Virtual Screening of Nirmatrelvir Analogs

In [9]:
# --- Resolve the compound name to CID(s), then list its 2D-similar analogs ---
# NOTE(review): the "≥90%" shown in the printed label is not set anywhere in this
# cell; presumably it is the default threshold of
# compound_fastsimilarity_2d_cid_get_cids — confirm.
CID_QUERY = "Nirmatrelvir"

# Step 1: name -> CID lookup (first entry of the helper's return value is read).
name_lookup_url = compound_name_get_cids(CID_QUERY)[0]
cid_list = get_list_text_from_url(name_lookup_url)

if not cid_list:
    print(f"No CID found for '{CID_QUERY}'")
else:
    # The whole lookup result (a list) is used as the query, not a single CID.
    NIRMATRELVIR_CID = cid_list

    # Step 2: 2D fast-similarity search seeded with the resolved CID(s).
    similarity_request = compound_fastsimilarity_2d_cid_get_cids(NIRMATRELVIR_CID)
    similar_cids = get_list_text_from_url(similarity_request)
    print(f"Query CID: {NIRMATRELVIR_CID}")
    print(f"Total Analogs (≥90%): {len(similar_cids)} | Top 5: {similar_cids[:5]}")

    # Step 3: SMILES for every analog found; only the first five are displayed.
    # (Variable spelling `smiles_smilar` is kept as-is for any later cell using it.)
    smiles_request = compound_cid_get_SMILES(similar_cids)
    smiles_smilar = get_list_text_from_url(smiles_request)
    print(f"Top 5 SMILES: {smiles_smilar[:5]}")


Query CID: ['155903259']
Total Analogs (≥90%): 724 | Top 5: ['155903259', '162396372', '162396442', '162396452', '162396458']
Top 5 SMILES: ['CC1([C@@H]2[C@H]1[C@H](N(C2)C(=O)[C@H](C(C)(C)C)NC(=O)C(F)(F)F)C(=O)N[C@@H](C[C@@H]3CCNC3=O)C#N)C', 'CC(C)[C@@H](C(=O)N1C[C@H]2[C@@H]([C@H]1C(=O)N[C@@H](C[C@@H]3CCNC3=O)C#N)C2(C)C)NC(=O)C(F)(F)F', 'CC1([C@@H]2[C@H]1[C@H](N(C2)C(=O)[C@H](C(C)(C)C)NC(=O)NC)C(=O)N[C@@H](C[C@@H]3CCNC3=O)C#N)C', 'CC1([C@@H]2[C@H]1[C@H](N(C2)C(=O)[C@H](CC3CCC3)NC(=O)C(F)(F)F)C(=O)N[C@@H](C[C@@H]4CCNC4=O)C#N)C', 'CC1([C@@H]2[C@H]1[C@H](N(C2)C(=O)[C@H](CC(F)F)NC(=O)C(F)(F)F)C(=O)N[C@@H](C[C@@H]3CCNC3=O)C#N)C']


Code Snippet 2: Chemotaxonomic Retrieval of Streptomyces Metabolites

In [10]:
STREPTOMYCES_TAX_ID = 1883
print(f"--- Identifying assays for Streptomyces (TaxID {STREPTOMYCES_TAX_ID}) ---")

# Fetch all assay IDs (AIDs) for the given Taxonomy ID
streptomyces_aids = get_list_text_from_url(taxonomy_taxid_get_aids(STREPTOMYCES_TAX_ID))

if streptomyces_aids:
    print(f"Found {len(streptomyces_aids)} AIDs associated with Streptomyces (TaxID {STREPTOMYCES_TAX_ID}).")
    print(f"First 5 AIDs: {streptomyces_aids[:5]}")

    # assay_aid_get_description hands back the request URL(s) for the description
    # endpoint (see the recorded output), not the description text itself, so the
    # printed label says "URL(s)" rather than implying the content was downloaded.
    descriptions = assay_aid_get_description(streptomyces_aids[:5])
    print("\nDescription URL(s) for first 5 AIDs:", descriptions)
else:
    print(f"Failed to retrieve AIDs for Streptomyces (TaxID {STREPTOMYCES_TAX_ID}).")


--- Identifying assays for Streptomyces (TaxID 1883) ---
Found 33 AIDs associated with Streptomyces (TaxID 1883).
First 5 AIDs: ['286595', '286596', '288804', '288805', '288806']

Descriptions for first 5 AIDs: ['https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/286595,286596,288804,288805,288806/description/xml']


Code Snippet 3: Active Compounds for EGFR (GeneID 1956)

In [11]:
EGFR_GENE_ID = "1956"

# Retrieve all AIDs associated with the gene. The lookup URL is passed through
# convert_json_get_txt before its contents are read into a list.
aids_txt_url = convert_json_get_txt(gene_geneid_get_aids(EGFR_GENE_ID)[0])
aids_list = get_list_text_from_url(aids_txt_url)

if aids_list:
    # assay_aid_get_description returns the description request URL(s), not the
    # description text (see the recorded output). Print it once, labeled as a URL;
    # the original printed the same header line twice.
    descriptions = assay_aid_get_description(aids_list[:5])
    print("Description URL(s) for first 5 AIDs:", descriptions)

    # Get active CIDs for the first AID only
    FIRST_AID = aids_list[0]
    active_cid_url = assay_aid_get_cids_active(FIRST_AID)[0]
    active_cids_list = get_list_text_from_url(active_cid_url)
    print(f"\nActive CIDs URL: {active_cid_url}")
    print(f"First 5 Active CIDs: {active_cids_list[:5]}")
else:
    print("Failed to retrieve AIDs for the GeneID.")


Descriptions for first 5 AIDs:
Descriptions for first 5 AIDs: ['https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/1433,1622,1726,1727,1729/description/xml']

Active CIDs URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/1433/cids/txt?aids_type=active
First 5 Active CIDs: ['135398510', '3038522', '5329102', '24867231', '11485656']


Code Snippet 4: Protein-Centric Bioactivity Retrieval (EGFR, UniProt P00533)

In [12]:

# ===============================
# Protein → Assay → Active Compounds Workflow
# ===============================

EGFR_ACCESSION_ID = "P00533"
print(f"--- Protein Target Resolution ({EGFR_ACCESSION_ID}) ---")

# Retrieve all AIDs linked to the protein accession (plain-text listing).
aids_list = get_list_text_from_url(protein_accession_get_aids(EGFR_ACCESSION_ID, output_format="txt")[0])

# Guard before indexing: the other cells in this notebook check for an empty
# result, and aids_list[0] would raise if nothing came back.
if aids_list:
    FIRST_AID = aids_list[0]
    print(f"Total AIDs: {len(aids_list)} | First AID: {FIRST_AID}")

    # Retrieve active compounds for the first AID
    active_cids = get_list_text_from_url(assay_aid_get_cids_active(FIRST_AID)[0])
    if active_cids:
        print(f"Active CIDs ({len(active_cids)}): {active_cids[:5]}")
    else:
        # Same text the original produced for an empty list; also covers a
        # None result, where len() would previously have raised.
        print("Active CIDs (0): None found")
else:
    print(f"Failed to retrieve AIDs for protein accession {EGFR_ACCESSION_ID}.")


--- Protein Target Resolution (P00533) ---
Total AIDs: 6329 | First AID: 1433
Active CIDs (38): ['135398510', '3038522', '5329102', '24867231', '11485656']


Code Snippet 5: Automated Cell Line Compound Retrieval (A549)

In [13]:
A549_CELL_ACC = "CVCL_0030"

# Look up the assays (AIDs) linked to the A549 cell-line accession. The lookup
# URL is passed through convert_json_get_txt before its contents are read.
cell_aids_url = convert_json_get_txt(cell_cellacc_get_aids(A549_CELL_ACC)[0])
aids_list = get_list_text_from_url(cell_aids_url)

if not aids_list:
    print("No AIDs found for the A549 cell line.")
else:
    # Only the first AID is queried for its active compounds.
    FIRST_AID = aids_list[0]
    first_aid_actives_url = assay_aid_get_cids_active(FIRST_AID)[0]
    active_cids_list = get_list_text_from_url(first_aid_actives_url)

    print(f"Total Active CIDs: {len(active_cids_list)}")
    print(f"First 5 Active CIDs: {active_cids_list[:5]}")


Total Active CIDs: 72
First 5 Active CIDs: ['1941616', '16188433', '2940337', '1048482', '6552076']


Code Snippet 6: Pathway-Centric Data Retrieval (Reactome R-HSA-70171)

In [14]:

# ==============================================================================
# Pathway → Compounds / Genes Workflow
# ==============================================================================

# Pathway accession (pwacc), written as "<source>:<id>".
# NOTE(review): R-HSA-70171 appears to be Reactome's Glycolysis pathway (the
# original comment said "cell cycle") — confirm against Reactome.
REACTOME_PWACC = "Reactome:R-HSA-70171"

print(f"--- PATHWAY AGGREGATION ({REACTOME_PWACC}) ---")

# 1. Pathway summary: only the request URL is obtained and printed here; the
#    summary content itself is not downloaded in this cell.
summary_url = pathway_pwacc_get_summary(REACTOME_PWACC)[0]
print(f"Pathway Summary URL: {summary_url}")

# 2. Retrieve associated chemical compounds (CIDs).
#    The "txt" format is requested so the URL can be handed straight to
#    get_list_text_from_url.
cids_list = get_list_text_from_url(pathway_pwacc_get_cids(REACTOME_PWACC, output_format="txt")[0])
if cids_list:
    print(f"Total CIDs: {len(cids_list)} | First 5: {cids_list[:5]}")
else:
    # Report the empty case instead of printing nothing, as the other cells do.
    print(f"No CIDs found for pathway {REACTOME_PWACC}.")

# 3. Retrieve associated GeneIDs (same "txt" request pattern as above).
geneids_list = get_list_text_from_url(pathway_pwacc_get_geneids(REACTOME_PWACC, output_format="txt")[0])
if geneids_list:
    print(f"Total GeneIDs: {len(geneids_list)} | First 5: {geneids_list[:5]}")
else:
    print(f"No GeneIDs found for pathway {REACTOME_PWACC}.")


--- PATHWAY AGGREGATION (Reactome:R-HSA-70171) ---
Pathway Summary URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/pathway/pwacc/Reactome%3AR-HSA-70171/summary/json
Total CIDs: 32 | First 5: ['223', '813', '888', '962', '1038']
Total GeneIDs: 55 | First 5: ['226', '229', '230', '669', '2023']
