This notebook retrieves protein bioactivity data from PubChem and ChEMBL for a given UniProt ID, shows what data each site provides, and compares and contrasts the results.



In [1]:
import requests
import argparse
import pandas as pd
import matplotlib.pyplot as plt
from chembl_webresource_client.new_client import new_client
import unittest

ModuleNotFoundError: No module named 'pandas'

Define the functions that fetch bioactivity data and GO terms from PubChem and ChEMBL, save the results to CSV, visualize the overlap, and parse command-line arguments.

In [None]:
# Module: Fetch data from PubChem
def fetch_pubchem_bioactivity(uniprot_id, timeout=30):
    """
    Fetch bioactivity data from PubChem for a given UniProt ID.

    Args:
        uniprot_id: UniProt accession interpolated into the PUG REST URL.
        timeout: Seconds to wait for PubChem before giving up.

    Returns:
        The value stored under ``Table -> Row`` in the JSON response, or an
        empty list when the request fails, the HTTP status is not 200, the
        body is not a JSON object, or those keys are absent.
    """
    # NOTE(review): the "ProteinGI/UniProtID" path segment should be confirmed
    # against the PUG REST documentation; it is kept exactly as written.
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/target/ProteinGI/UniProtID/{uniprot_id}/concise/JSON"
    try:
        # Without a timeout, requests may block forever on a stalled connection.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Stay best-effort: a network failure is reported like a non-200 status.
        return []

    if response.status_code != 200:
        return []

    try:
        payload = response.json()
    except ValueError:
        # A 200 response whose body is not valid JSON carries no usable rows.
        return []

    if not isinstance(payload, dict):
        return []
    return payload.get('Table', {}).get('Row', [])

# Module: Fetch data from ChEMBL
def fetch_chembl_bioactivity(uniprot_id):
    """
    Fetch bioactivity data from ChEMBL for a given UniProt ID.

    Looks up ChEMBL targets whose components carry the given accession and,
    using only the first match, returns its activities restricted to the
    IC50, Ki and Kd standard types. Returns an empty list when no target
    matches.
    """
    matching_targets = list(
        new_client.target.filter(target_components__accession=uniprot_id)
    )
    if matching_targets:
        # Only the first matching target is queried; any others are ignored.
        first_target_id = matching_targets[0]['target_chembl_id']
        activity_query = new_client.activity.filter(
            target_chembl_id=first_target_id,
            standard_type__in=["IC50", "Ki", "Kd"],
        )
        return list(activity_query)
    return []


# Module: Extract GO terms
def extract_chembl_go_terms(uniprot_id):
    """
    Extract GO terms from ChEMBL for a given UniProt ID.

    Returns the ``component_go_slim`` entry of the first ChEMBL target whose
    components carry the given accession, or an empty list when no target
    matches or the entry is missing.
    """
    # NOTE(review): confirm that the target resource actually exposes a
    # 'component_go_slim' key; if it does not, this always yields [].
    hits = list(new_client.target.filter(target_components__accession=uniprot_id))
    return hits[0].get('component_go_slim', []) if hits else []


def extract_pubchem_go_terms(uniprot_id, timeout=30):
    """
    Extract GO terms from PubChem for a given UniProt ID.

    Args:
        uniprot_id: UniProt accession interpolated into the PUG REST URL.
        timeout: Seconds to wait for PubChem before giving up.

    Returns:
        The value stored under ``Table -> GO_Terms`` in the JSON response, or
        an empty list when the request fails, the HTTP status is not 200, the
        body is not a JSON object, or those keys are absent.
    """
    # NOTE(review): both the URL path and the 'GO_Terms' key should be
    # confirmed against the PUG REST documentation; they are kept as written.
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/target/ProteinGI/UniProtID/{uniprot_id}/description/JSON"
    try:
        # Without a timeout, requests may block forever on a stalled connection.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Stay best-effort: a network failure is reported like a non-200 status.
        return []

    if response.status_code != 200:
        return []

    try:
        description = response.json()
    except ValueError:
        # A 200 response whose body is not valid JSON carries no usable terms.
        return []

    if not isinstance(description, dict):
        return []
    return description.get('Table', {}).get('GO_Terms', [])


# Module: Save data to CSV
def save_to_csv(data, filename):
    """
    Write ``data`` to ``filename`` as CSV.

    ``data`` is handed to ``pandas.DataFrame`` unchanged, and the resulting
    frame is written without its index column.
    """
    pd.DataFrame(data).to_csv(filename, index=False)


# Module: Visualization
def visualize_overlap(data1, data2, label1, label2, title):
    """
    Visualize overlaps between two datasets using a Venn diagram.

    Draws two fixed-size overlapping circles (the areas are not proportional
    to the set sizes) and annotates each region with the number of unique
    items it represents.

    Args:
        data1: Iterable of hashable items for the first dataset.
        data2: Iterable of hashable items for the second dataset.
        label1: Legend label for the first circle.
        label2: Legend label for the second circle.
        title: Title shown above the diagram.
    """
    set1 = set(data1)
    set2 = set(data2)
    shared = set1 & set2

    plt.figure(figsize=(6, 6))
    ax = plt.gca()
    ax.set_title(title)
    # Distinct edge colours so the legend can tell the two circles apart.
    ax.add_patch(plt.Circle((0.5, 0.5), 0.3, fill=False, edgecolor="tab:blue", label=label1))
    ax.add_patch(plt.Circle((0.7, 0.5), 0.3, fill=False, edgecolor="tab:orange", label=label2))

    # The circles span x in [0.2, 0.8] and [0.4, 1.0]; place each count at the
    # horizontal centre of its region (left only, intersection, right only).
    ax.text(0.3, 0.5, str(len(set1 - shared)), ha="center", va="center")
    ax.text(0.6, 0.5, str(len(shared)), ha="center", va="center")
    ax.text(0.9, 0.5, str(len(set2 - shared)), ha="center", va="center")

    # The default 0..1 limits clip the right circle at its edge, and an
    # unequal aspect would draw the circles as ellipses.
    ax.set_xlim(0.1, 1.1)
    ax.set_ylim(0.1, 0.9)
    ax.set_aspect("equal")
    ax.axis("off")
    ax.legend()
    plt.show()


# Module: Command-line argument parsing
def parse_args(argv=None):
    """
    Parse the command-line arguments for the comparison script.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. Passing an explicit list lets the parser be
            used programmatically, where ``sys.argv`` may hold unrelated
            arguments.

    Returns:
        An ``argparse.Namespace`` with ``uniprot_id`` (required positional)
        and ``output`` (base filename for CSV files, default ``"output"``).
    """
    parser = argparse.ArgumentParser(description="Compare bioactivity data and GO terms from PubChem and ChEMBL.")
    parser.add_argument("uniprot_id", help="UniProt ID of the target protein.")
    parser.add_argument("--output", help="Base filename for saving data to CSV.", default="output")
    # argv=None makes argparse fall back to sys.argv[1:], as before.
    return parser.parse_args(argv)



Main Execution Function

In [None]:
# Main execution function
def main():
    """
    Run the full PubChem/ChEMBL comparison for the UniProt ID given on the
    command line: fetch bioactivity and GO-term data from both sources, save
    each dataset to its own CSV file, then plot the compound overlap.
    """
    args = parse_args()
    accession = args.uniprot_id

    # Fetch data
    pubchem_bioactivity = fetch_pubchem_bioactivity(accession)
    chembl_bioactivity = fetch_chembl_bioactivity(accession)
    pubchem_go_terms = extract_pubchem_go_terms(accession)
    chembl_go_terms = extract_chembl_go_terms(accession)

    # Save results to CSV, one file per dataset
    exports = [
        (pubchem_bioactivity, f"{args.output}_pubchem_bioactivity.csv"),
        (chembl_bioactivity, f"{args.output}_chembl_bioactivity.csv"),
        (pubchem_go_terms, f"{args.output}_pubchem_go_terms.csv"),
        (chembl_go_terms, f"{args.output}_chembl_go_terms.csv"),
    ]
    for dataset, csv_path in exports:
        save_to_csv(dataset, csv_path)

    # Visualize results
    # NOTE(review): PubChem 'CID' values and ChEMBL 'molecule_chembl_id'
    # values look like different identifier schemes, so the two sets may
    # never intersect — confirm whether a cross-reference step is needed.
    pubchem_compounds = [row['CID'] for row in pubchem_bioactivity if 'CID' in row]
    chembl_compounds = [
        row['molecule_chembl_id']
        for row in chembl_bioactivity
        if 'molecule_chembl_id' in row
    ]
    visualize_overlap(
        pubchem_compounds,
        chembl_compounds,
        "PubChem",
        "ChEMBL",
        "Bioactivity Overlap",
    )


# Unit tests
class TestBioactivityComparison(unittest.TestCase):
    """Checks that each fetch/extract function returns a list for an example UniProt ID."""

    # Example UniProt ID shared by every test below.
    EXAMPLE_UNIPROT_ID = "P00734"

    def test_fetch_pubchem_bioactivity(self):
        """PubChem bioactivity lookup returns a list."""
        self.assertIsInstance(
            fetch_pubchem_bioactivity(self.EXAMPLE_UNIPROT_ID), list
        )

    def test_fetch_chembl_bioactivity(self):
        """ChEMBL bioactivity lookup returns a list."""
        self.assertIsInstance(
            fetch_chembl_bioactivity(self.EXAMPLE_UNIPROT_ID), list
        )

    def test_extract_pubchem_go_terms(self):
        """PubChem GO-term lookup returns a list."""
        self.assertIsInstance(
            extract_pubchem_go_terms(self.EXAMPLE_UNIPROT_ID), list
        )

    def test_extract_chembl_go_terms(self):
        """ChEMBL GO-term lookup returns a list."""
        self.assertIsInstance(
            extract_chembl_go_terms(self.EXAMPLE_UNIPROT_ID), list
        )


# Script entry point: run the comparison only when this file is executed
# directly, not when it is imported.
# NOTE(review): nothing in the visible code runs TestBioactivityComparison
# (there is no unittest.main() call) — confirm how the tests are meant to run.
if __name__ == "__main__":
    main()
