# Comparison of PubChemPy vs PubChemAPI:
In this comparison, PubChemAPI is easier to use than PubChemPy when working across multiple PubChem domains (e.g., protein → assay → compound). PubChemAPI requires fewer steps and less manual handling of namespaces and endpoints, whereas PubChemPy involves more explicit domain transitions and parameter management. As a result, PubChemAPI provides a more streamlined and efficient workflow for cross-domain queries.

In [16]:
from PubchemAPI import *

# Case Study

# Code Snippet 1: Fast Virtual Screening of Nirmatrelvir Analogs

#PubchemAPI

In [17]:
# --- Snippet 1 (PubchemAPI): name -> CID, then 2D similarity search (threshold=90) ---
CID_QUERY = "Nirmatrelvir"

# Element [0] of the compound_name_get_cids() result is handed to
# get_list_text_from_url(), which yields the matching CIDs.
name_lookup_target = compound_name_get_cids(CID_QUERY)[0]
cid_list = get_list_text_from_url(name_lookup_target)

if not cid_list:
    print(f"No CID found for '{CID_QUERY}'")
else:
    # NOTE(review): the whole list (not cid_list[0]) is forwarded to the
    # similarity helper, so "Query CID" prints as a list — confirm the helper
    # is meant to receive a list before changing this.
    NIRMATRELVIR_CID = cid_list
    similarity_target = compound_fastsimilarity_2d_cid_get_cids(NIRMATRELVIR_CID, threshold=90)
    similar_cids = get_list_text_from_url(similarity_target)
    print(f"Query CID: {NIRMATRELVIR_CID}")
    print(f"Total Analogs (≥90%): {len(similar_cids)} | Top 5: {similar_cids[:5]}")


Query CID: ['155903259']
Total Analogs (≥90%): 724 | Top 5: ['155903259', '162396372', '162396442', '162396452', '162396458']


#pubchempy

In [18]:
import pubchempy as pcp

# --- Snippet 1 (PubChemPy): name -> CID, then 2D similarity search (threshold=90) ---
CID_QUERY = "Nirmatrelvir"

# Step 1: resolve the compound name in the 'name' namespace.
cid_list = pcp.get_cids(CID_QUERY, 'name')

if not cid_list:
    print(f"No CID found for '{CID_QUERY}'")
else:
    # Only the first hit is used as the similarity query.
    NIRMATRELVIR_CID = cid_list[0]
    print(f"Query CID: {NIRMATRELVIR_CID}")

    try:
        # Step 2: searchtype='fastsimilarity_2d' selects the 2D similarity
        # search; threshold=90 is the similarity cutoff passed with the request.
        analogs = pcp.get_compounds(
            NIRMATRELVIR_CID,
            namespace='cid',
            searchtype='fastsimilarity_2d',
            threshold=90,
        )

        # The results expose a .cid attribute; keep only that identifier.
        similar_cids = [compound.cid for compound in analogs]

        print(f"Total Analogs (≥90%): {len(similar_cids)} | Top 5: {similar_cids[:5]}")

    except Exception as e:
        # Top-level notebook boundary: report the failure instead of raising.
        print(f"Error during similarity search: {e}")

Query CID: 155903259
Total Analogs (≥90%): 724 | Top 5: [155903259, 162396372, 162396442, 162396452, 162396458]


# Code Snippet 2: Chemotaxonomic Retrieval of Streptomyces Metabolites

#PubchemAPI

In [19]:
# --- Snippet 2 (PubchemAPI): taxonomy ID -> assay IDs (AIDs) ---
STREPTOMYCES_TAX_ID = 1883
print(f"--- Identifying assays for Streptomyces (TaxID {STREPTOMYCES_TAX_ID}) ---")

# The taxonomy helper's result is passed straight to get_list_text_from_url(),
# which yields the AIDs linked to this Taxonomy ID.
streptomyces_aids = get_list_text_from_url(taxonomy_taxid_get_aids(STREPTOMYCES_TAX_ID))

if not streptomyces_aids:
    # A falsy result is reported as a retrieval failure.
    print(f"Failed to retrieve AIDs for Streptomyces (TaxID {STREPTOMYCES_TAX_ID}).")
else:
    print(f"Found {len(streptomyces_aids)} AIDs associated with Streptomyces (TaxID {STREPTOMYCES_TAX_ID}).")
    print(f"First 5 AIDs: {streptomyces_aids[:5]}")


--- Identifying assays for Streptomyces (TaxID 1883) ---
Found 33 AIDs associated with Streptomyces (TaxID 1883).
First 5 AIDs: ['286595', '286596', '288804', '288805', '288806']


#pubchempy

In [20]:
import pubchempy as pcp

# --- Snippet 2 (PubChemPy): taxonomy ID -> assay IDs (AIDs) ---
STREPTOMYCES_TAX_ID = 1883
print(f"--- Identifying assays for Streptomyces (TaxID {STREPTOMYCES_TAX_ID}) ---")

try:
    # The low-level pcp.request() is used so the 'taxonomy' domain, 'taxid'
    # namespace and 'aids' operation can be spelled out explicitly, with
    # plain-text output.
    response = pcp.request(
        STREPTOMYCES_TAX_ID,
        domain='taxonomy',
        namespace='taxid',
        operation='aids',
        output='TXT',
    )

    # The body is read as bytes, decoded as UTF-8, split on newlines, and
    # blank lines are dropped.
    raw_text = response.read().decode('utf-8')
    stripped_lines = (line.strip() for line in raw_text.split('\n'))
    streptomyces_aids = [entry for entry in stripped_lines if entry]

    if not streptomyces_aids:
        print(f"Failed to retrieve AIDs for Streptomyces (TaxID {STREPTOMYCES_TAX_ID}).")
    else:
        print(f"Found {len(streptomyces_aids)} AIDs associated with Streptomyces (TaxID {STREPTOMYCES_TAX_ID}).")
        print(f"First 5 AIDs: {streptomyces_aids[:5]}")

except Exception as e:
    # Top-level notebook boundary: report the failure instead of raising.
    print(f"Error retrieving data: {e}")

--- Identifying assays for Streptomyces (TaxID 1883) ---
Found 33 AIDs associated with Streptomyces (TaxID 1883).
First 5 AIDs: ['286595', '286596', '288804', '288805', '288806']


# Code Snippet 3: Active Compounds for EGFR (GeneID 1956)

#PubchemAPI

In [21]:

# --- Snippet 3 (PubchemAPI): GeneID -> AIDs -> active CIDs of the first AID ---
EGFR_GENE_ID = "1956"

# Element [0] of the gene helper's result is converted with
# convert_json_get_txt() before the AID list is fetched.
aids_txt_url = convert_json_get_txt(gene_geneid_get_aids(EGFR_GENE_ID)[0])
aids_list = get_list_text_from_url(aids_txt_url)

if not aids_list:
    print("Failed to retrieve AIDs for the GeneID.")
else:
    # Only the first assay is inspected for active compounds.
    FIRST_AID = aids_list[0]
    active_cid_url = assay_aid_get_cids_active(FIRST_AID)[0]
    active_cids_list = get_list_text_from_url(active_cid_url)
    print(f"Active CIDs URL: {active_cid_url}")
    print(f"First 5 Active CIDs: {active_cids_list[:5]}")


Active CIDs URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/1433/cids/txt?aids_type=active
First 5 Active CIDs: ['135398510', '3038522', '5329102', '24867231', '11485656']


#pubchempy

In [22]:
import pubchempy as pcp

# --- Snippet 3 (PubChemPy): GeneID -> AIDs -> active CIDs of the first AID ---
EGFR_GENE_ID = 1956
print(f"--- Retrieving AIDs for EGFR (GeneID {EGFR_GENE_ID}) ---")

# Step 1: request the AIDs associated with the GeneID (gene domain).
gene_aids_json = pcp.get_json(
    identifier=EGFR_GENE_ID,
    domain='gene',
    namespace='geneid',
    operation='aids',
)

# The AIDs are read from InformationList -> Information[0] -> AID; a falsy
# response is treated as "no AIDs".
if gene_aids_json:
    gene_info_records = gene_aids_json.get('InformationList', {}).get('Information', [{}])
    aids_list = gene_info_records[0].get('AID', [])
else:
    aids_list = []

if aids_list:
    FIRST_AID = aids_list[0]
    print(f"First AID for EGFR: {FIRST_AID}")

    # Step 2: request the CIDs of the first assay, restricted to actives
    # through aids_type='active'.
    active_cids_json = pcp.get_json(
        identifier=FIRST_AID,
        domain='assay',
        namespace='aid',
        operation='cids',
        aids_type='active',
    )
    # The raw response is echoed before the CID list is extracted.
    print(active_cids_json)

    # Same extraction path as above, reading the CID key instead of AID.
    if active_cids_json:
        assay_info_records = active_cids_json.get('InformationList', {}).get('Information', [{}])
        active_cids_list = assay_info_records[0].get('CID', [])
    else:
        active_cids_list = []

    if not active_cids_list:
        print(f"No active CIDs found for AID {FIRST_AID}")
    else:
        print(f"Retrieved {len(active_cids_list)} active CIDs for AID {FIRST_AID}")
        print(f"First 5 Active CIDs: {active_cids_list[:5]}")
else:
    print("Failed to retrieve AIDs for the GeneID.")


--- Retrieving AIDs for EGFR (GeneID 1956) ---
First AID for EGFR: 1433
{'InformationList': {'Information': [{'AID': 1433, 'CID': [135398510, 3038522, 5329102, 24867231, 11485656, 5494449, 156422, 208908, 3038525, 160355, 156414, 216239, 10113978, 6445562, 11712649, 9926791, 5291, 451705, 3062316, 123631, 11617559, 3025986, 151194, 153999, 176870, 9884685, 9874913, 5169, 11667893, 5330790, 11409972, 10138259, 24202429, 4521392, 176155, 11656518, 3081361, 16007391]}]}}
Retrieved 38 active CIDs for AID 1433
First 5 Active CIDs: [135398510, 3038522, 5329102, 24867231, 11485656]


# Code Snippet 4: Active Compounds for EGFR (Protein Accession P00533)

#PubchemAPI

In [23]:

# ===============================
# Protein → Assay → Active Compounds Workflow
# ===============================
# Snippet 4 (PubchemAPI): protein accession -> AIDs -> active CIDs of the
# first AID.

EGFR_ACCESSION_ID = "P00533"
print(f"--- Protein Target Resolution ({EGFR_ACCESSION_ID}) ---")

# Element [0] of the protein helper's result (requested with
# output_format="txt") is handed to get_list_text_from_url() to obtain the AIDs.
aids_list = get_list_text_from_url(protein_accession_get_aids(EGFR_ACCESSION_ID, output_format="txt")[0])

if aids_list:
    # Only the first assay is inspected for active compounds.
    FIRST_AID = aids_list[0]
    print(f"Total AIDs: {len(aids_list)} | First AID: {FIRST_AID}")

    # `or []` keeps len() below safe if the helper yields a falsy non-list
    # value on failure (its failure return value is not visible here).
    active_cids = get_list_text_from_url(assay_aid_get_cids_active(FIRST_AID)[0]) or []
    print(f"Active CIDs ({len(active_cids)}): {active_cids[:5] if active_cids else 'None found'}")
else:
    # Without this guard an empty lookup would raise IndexError on
    # aids_list[0]; report it the same way the other snippets do.
    print(f"No AIDs found for protein {EGFR_ACCESSION_ID}.")


--- Protein Target Resolution (P00533) ---
Total AIDs: 6329 | First AID: 1433
Active CIDs (38): ['135398510', '3038522', '5329102', '24867231', '11485656']


#pubchempy

In [24]:
import pubchempy as pcp

# ===============================
# Protein → Assay → Active Compounds Workflow
# ===============================
# Snippet 4 (PubChemPy): protein accession -> AIDs -> active CIDs of the
# first AID.

EGFR_ACCESSION_ID = "P00533"
print(f"--- Protein Target Resolution ({EGFR_ACCESSION_ID}) ---")

# Step 1: request the AIDs linked to the protein accession
# (protein domain, accession namespace, 'aids' operation).
protein_aids_json = pcp.get_json(
    identifier=EGFR_ACCESSION_ID,
    domain='protein',
    namespace='accession',
    operation='aids',
)

# The AIDs are read from InformationList -> Information[0] -> AID; a falsy
# response is treated as "no AIDs".
if protein_aids_json:
    protein_info_records = protein_aids_json.get('InformationList', {}).get('Information', [{}])
    aids_list = protein_info_records[0].get('AID', [])
else:
    aids_list = []

if aids_list:
    FIRST_AID = aids_list[0]
    print(f"Total AIDs: {len(aids_list)} | First AID: {FIRST_AID}")

    # Step 2: request the CIDs of the first assay, restricted to actives
    # through aids_type='active'.
    active_cids_json = pcp.get_json(
        identifier=FIRST_AID,
        domain='assay',
        namespace='aid',
        operation='cids',
        aids_type='active',
    )

    # Same extraction path as above, reading the CID key instead of AID.
    if active_cids_json:
        assay_info_records = active_cids_json.get('InformationList', {}).get('Information', [{}])
        active_cids = assay_info_records[0].get('CID', [])
    else:
        active_cids = []

    print(f"Active CIDs ({len(active_cids)}): {active_cids[:5] if active_cids else 'None found'}")
else:
    print(f"No AIDs found for protein {EGFR_ACCESSION_ID}.")


--- Protein Target Resolution (P00533) ---
Total AIDs: 6329 | First AID: 1433
Active CIDs (38): [135398510, 3038522, 5329102, 24867231, 11485656]


# Code Snippet 5: Automated Cell Line Compound Retrieval (A549)

#PubchemAPI

In [25]:
# --- Snippet 5 (PubchemAPI): cell-line accession -> AIDs -> active CIDs of the first AID ---
A549_CELL_ACC = "CVCL_0030"

# Element [0] of the cell helper's result is converted with
# convert_json_get_txt() and then fetched as the AID list for A549.
cell_aids_target = convert_json_get_txt(cell_cellacc_get_aids(A549_CELL_ACC)[0])
aids_list = get_list_text_from_url(cell_aids_target)

if not aids_list:
    print("No AIDs found for the A549 cell line.")
else:
    # Only the first assay is inspected for active compounds.
    FIRST_AID = aids_list[0]
    active_cids_list = get_list_text_from_url(assay_aid_get_cids_active(FIRST_AID)[0])

    print(f"Total Active CIDs: {len(active_cids_list)}")
    print(f"First 5 Active CIDs: {active_cids_list[:5]}")


Total Active CIDs: 72
First 5 Active CIDs: ['1941616', '16188433', '2940337', '1048482', '6552076']


#pubchempy

In [26]:
import pubchempy as pcp

# --- Snippet 5 (PubChemPy): cell-line accession -> AIDs -> active CIDs of the first AID ---
A549_CELL_ACC = "CVCL_0030"
print(f"--- Retrieving AIDs for A549 (Cell Accession {A549_CELL_ACC}) ---")

# Step 1: request the AIDs associated with the cell line
# (cell domain, cellacc namespace, 'aids' operation).
cell_aids_json = pcp.get_json(
    identifier=A549_CELL_ACC,
    domain='cell',
    namespace='cellacc',
    operation='aids',
)

# The AIDs are read from InformationList -> Information[0] -> AID; a falsy
# response is treated as "no AIDs".
if cell_aids_json:
    cell_info_records = cell_aids_json.get('InformationList', {}).get('Information', [{}])
    aids_list = cell_info_records[0].get('AID', [])
else:
    aids_list = []

if aids_list:
    FIRST_AID = aids_list[0]
    print(f"Total AIDs found: {len(aids_list)} | First AID: {FIRST_AID}")

    # Step 2: request the CIDs of the first assay, restricted to actives
    # through aids_type='active'.
    active_cids_json = pcp.get_json(
        identifier=FIRST_AID,
        domain='assay',
        namespace='aid',
        operation='cids',
        aids_type='active',
    )

    # Same extraction path as above, reading the CID key instead of AID.
    if active_cids_json:
        assay_info_records = active_cids_json.get('InformationList', {}).get('Information', [{}])
        active_cids_list = assay_info_records[0].get('CID', [])
    else:
        active_cids_list = []

    if not active_cids_list:
        print(f"No active CIDs found for AID {FIRST_AID}")
    else:
        print(f"Total active CIDs for AID {FIRST_AID}: {len(active_cids_list)}")
        print(f"First 5 Active CIDs: {active_cids_list[:5]}")
else:
    print("No AIDs found for A549 cell line.")


--- Retrieving AIDs for A549 (Cell Accession CVCL_0030) ---
Total AIDs found: 15449 | First AID: 1259
Total active CIDs for AID 1259: 72
First 5 Active CIDs: [1941616, 16188433, 2940337, 1048482, 6552076]


# Code Snippet 6: Pathway-Centric Data Retrieval (Reactome R-HSA-70171)

#PubchemAPI

In [27]:

# ==============================================================================
# Snippet 6 (PubchemAPI): pathway accession -> summary URL, CIDs and GeneIDs
# ==============================================================================

# Reactome pathway accession (pwacc); the pathway summary returned for this
# accession names it "Glycolysis".
REACTOME_PWACC = "Reactome:R-HSA-70171"

print(f"--- PATHWAY AGGREGATION ({REACTOME_PWACC}) ---")

# 1. Pathway summary: only the URL (element [0] of the helper result) is shown.
summary_url = pathway_pwacc_get_summary(REACTOME_PWACC)[0]
print(f"Pathway Summary URL: {summary_url}")

# 2. Compounds (CIDs) linked to the pathway. Text output is requested
#    directly so the result can go straight into get_list_text_from_url().
pathway_cids_target = pathway_pwacc_get_cids(REACTOME_PWACC, output_format="txt")[0]
cids_list = get_list_text_from_url(pathway_cids_target)
if cids_list:
    print(f"Total CIDs: {len(cids_list)} | First 5: {cids_list[:5]}")

# 3. Genes (GeneIDs) linked to the pathway, fetched the same way.
pathway_geneids_target = pathway_pwacc_get_geneids(REACTOME_PWACC, output_format="txt")[0]
geneids_list = get_list_text_from_url(pathway_geneids_target)
if geneids_list:
    print(f"Total GeneIDs: {len(geneids_list)} | First 5: {geneids_list[:5]}")


--- PATHWAY AGGREGATION (Reactome:R-HSA-70171) ---
Pathway Summary URL: https://pubchem.ncbi.nlm.nih.gov/rest/pug/pathway/pwacc/Reactome%3AR-HSA-70171/summary/json
Total CIDs: 32 | First 5: ['223', '813', '888', '962', '1038']
Total GeneIDs: 55 | First 5: ['226', '229', '230', '669', '2023']


In [28]:
import socket

import pubchempy as pcp

# Raise the default socket timeout to 60 seconds for the requests below.
# NOTE: socket.setdefaulttimeout() is process-wide, so it also applies to
# every socket created later in this session, not only to these calls.
socket.setdefaulttimeout(60)

# ======================================================================
# Pathway → Compounds & Genes Workflow using PubChemPy
# ======================================================================
# Snippet 6 (PubChemPy): pathway accession -> summary, CIDs and GeneIDs.
# Each step issues its own get_json() request against the pathway domain.

REACTOME_PWACC = "Reactome:R-HSA-70171"
print(f"--- PATHWAY AGGREGATION ({REACTOME_PWACC}) ---")

# Step 1: Retrieve the pathway summary and echo the raw JSON.
summary_json = pcp.get_json(
    identifier=REACTOME_PWACC,
    namespace='pwacc',  # Pathway accession namespace
    domain='pathway',
    operation='summary'
)
print(f"Pathway Summary JSON: {summary_json}")

# Step 2: Retrieve associated chemical compounds (CIDs).
cids_json = pcp.get_json(
    identifier=REACTOME_PWACC,
    namespace='pwacc',
    domain='pathway',
    operation='cids'
)
print(f"CIDs JSON: {cids_json}")
# CIDs are read from InformationList -> Information[0] -> CID; a falsy
# response is treated as "no CIDs".
cids_list = cids_json.get('InformationList', {}).get('Information', [{}])[0].get('CID', []) if cids_json else []
if cids_list:
    print(f"Total CIDs: {len(cids_list)} | First 5: {cids_list[:5]}")
else:
    print("No CIDs found for this pathway.")

# Step 3: Retrieve associated GeneIDs.
geneids_json = pcp.get_json(
    identifier=REACTOME_PWACC,
    namespace='pwacc',
    domain='pathway',
    operation='geneids'
)
print(f"GeneIDs JSON: {geneids_json}")
# Same extraction path as for CIDs, reading the GeneID key.
geneids_list = geneids_json.get('InformationList', {}).get('Information', [{}])[0].get('GeneID', []) if geneids_json else []
if geneids_list:
    print(f"Total GeneIDs: {len(geneids_list)} | First 5: {geneids_list[:5]}")
else:
    print("No GeneIDs found for this pathway.")


--- PATHWAY AGGREGATION (Reactome:R-HSA-70171) ---
Pathway Summary JSON: {'PathwaySummaries': {'PathwaySummary': [{'PathwayAccession': 'Reactome:R-HSA-70171', 'SourceName': 'Reactome', 'SourceID': 'R-HSA-70171', 'SourceURL': 'https://reactome.org/content/detail/R-HSA-70171', 'Name': 'Glycolysis', 'Type': 'organism_specific', 'Category': 'pathway', 'Description': 'The reactions of glycolysis (e.g., van Wijk and van Solinge 2005) convert glucose 6-phosphate to pyruvate. The entire process is cytosolic. Glucose 6-phosphate is reversibly isomerized to form fructose 6-phosphate. Phosphofructokinase 1 catalyzes the physiologically irreversible phosphorylation of fructose 6-phosphate to form fructose 1,6-bisphosphate. In six reversible reactions, fructose 1,6-bisphosphate is converted to two molecules of phosphoenolpyruvate and two molecules of NAD+ are reduced to NADH + H+. Each molecule of phosphoenolpyruvate reacts with ADP to form ATP and pyruvate in a physiologically irreversible reactio