In [None]:
#%%appyter init
from appyter import magic
# Pass appyter a zero-argument callable that returns this notebook's globals() dict.
# NOTE(review): presumably appyter uses it to register the %%appyter cell magics and
# template fields in this namespace — confirm against the appyter documentation.
magic.init(lambda _=globals: _())

# ChEA-KG Appyter

The ChEA-KG Appyter predicts regulatory subnetworks of transcription factors (TFs) for an input gene set. The query gene sets are compared against libraries of TF target gene sets in ChEA3 to identify the most likely regulating TFs. The TFs are then connected via edges in the ChEA-KG background gene regulatory network (GRN). This network was constructed by submitting thousands of gene expression signatures from RummaGEO to ChEA3 for transcription factor enrichment analysis. The functions of enriched TFs are predicted using the Gene Set Foundation Model. Automatic summaries of each subnetwork are produced using Google Gemini.

The only required input is a gene list. Options to change other parameters, such as the subnetwork size and layout, are provided. It is recommended not to change the remaining parameters.

In [None]:
import json
import os
import re
import time
import urllib.request
import warnings

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

from google import genai

from IPython.display import display, FileLink, Markdown, HTML
import ipycytoscape as ipc

from dotenv import load_dotenv
load_dotenv()

In [None]:
%%appyter hide_code

{# Declare the three sections of the input form. Each SectionField only carries a name, a title and a subtitle; input fields attach to a section by passing its name as their `section` argument. #}
{% do SectionField(
    name='section1', 
    title = '1. Submit Your Gene List', 
    subtitle = 'Upload a text file containing your gene list -OR- copy and paste your gene list into the text box below (One gene per row). You can also try the default gene list provided.'
) %}
{% do SectionField(
    name='section2', 
    title = '2. Choose subnetwork size', 
    subtitle = 'Choose the number of top-ranked enriched TFs included in each subnetwork. The default is 10.'
) %}
{% do SectionField(
    name='section3', 
    title = '3. Choose subnetwork layout', 
    subtitle = 'Choose a Cytoscape-supported layout to view the network. The default is cose, a force-directed layout.'
) %}

In [None]:
%%appyter hide_code

{# Gene list input, shown in section1 as two tabs: 'Paste' holds a TextListField pre-filled with an example gene list, 'Upload' holds a FileField. The selected tab's field is read later through `gene_list_kind.value[0]`. #}
{% set gene_list_kind = TabField(
    name='gene_list_kind',
    label='Gene List',
    default='Paste',
    description='Paste or upload your gene list',
    required=True,
    choices={
        'Paste': [
            TextListField(
                name='gene_list_input',
                label='Gene List',
                description='Paste your gene list (One gene per row):',
                default=['MIR4454', 'RNU86', 'SNORD34', 'EEF1A1', 'RPL11', 'DCT', 'RPL37A', 'SNORD33', 'GAPDH', 'SNORD74', 'MIR4461', 'MIR4680', 'CD63', 'SNORD68', 'LDHA', 'TMSB4X', 
                'RPL27', 'SNORD108', 'MIR3191', 'RPS18', 'RPL41', 'ENO1', 'CAPG', 'RPS15A', 'SNORD79', 'FN1', 'LGALS3', 'GPNMB', 'NPM1', 'RPL7', 'RPS14', 'SNORD38A', 'RPS13', 'RPS7', 
                'ATOX1', 'PKM2', 'RPL31', 'SNORD76', 'SNORD42B', 'RPS29', 'BCYRN1', 'RPL6', 'ATP5E', 'RPS3A', 'RPL39', 'YWHAZ', 'TOMM7', 'RPS27A', 'SNORD49A', 'MIR4482-1', 'SNORD5', 
                'RPL30', 'MIR1292', 'RPL5', 'SNORD59A', 'RPS21', 'PSAP', 'RPL35A', 'RPL13AP5', 'SNORD50B', 'H2AFZ', 'SNORD27', 'PPIA', 'PRDX1', 'RPL21P28', 'RPL9', 'RPS12', 'HSP90B1', 
                'COX7C', 'RPL23', 'SNORD30', 'LDHB', 'GNB2L1', 'SGK1', 'FKBP1A', 'SNORD57', 'HSPD1', 'SNORD12', 'MIR4273', 'RPS4X', 'UQCRH', 'RPS2', 'COX5B', 'ATP5B', 'MIR4687', 
                'MIR4263', 'RPS10', 'RAN', 'MIR3687', 'PTTG1', 'SNORD87', 'ATP1A1', 'NME2', 'SNORD18A', 'RPSA', 'TUBB', 'RPL22', 'RPLP0', 'SNORD101', 'ANXA5', 'CD74', 'RPS6', 
                'MIR4691', 'GNAS', 'GSTO1', 'EIF4A1', 'NQO1', 'SNAR-G1', 'MIR4653', 'UBB', 'RPL38', 'SNORD4A', 'SNORD82', 'MIA', 'SNORD37', 'EIF3E', 'MIR4678', 'PDIA6', 'SLC25A3', 
                'PARK7', 'PGK1', 'C17orf76-AS1', 'YBX1', 'NCL', 'RPL35', 'HSP90AB1', 'MIR103B2', 'RPL26', 'DSTN', 'SNHG8', 'CALR', 'EEF1G', 'ATP5J2', 'TUBA1B', 'CSTB', 'SPP1', 'CALU', 
                'PABPC1', 'PRAME', 'LY6E', 'HNRNPA2B1', 'SNORD38B', 'SLC25A5', 'FABP5', 'MIR1915', 'SERPINF1', 'RPS8', 'DDOST', 'HINT1', 'RPL18A', 'RPS20', 'PSMA7', 'CHCHD2', 'HNRNPA1',
                 'RPL15', 'PSMA1', 'HLA-DRA', 'RPS3', 'RPL27A', 'EEF1B2', 'TXN', 'RPL4', 'SNORD16', 'PRNP', 'MDH1', 'NME1', 'CANX', 'SNORD35B', 'TBCA', 'TPI1', 'LOC645591', 'PTMA', 
                 'ATP5A1', 'CBX3', 'SDCBP', 'C1QBP', 'DBI', 'SNORD59B', 'PLA1A', 'RPL37', 'NACA', 'CDK2', 'MIR324', 'RPS9', 'ALDOA', 'COX6A1', 'RPN2', 'ATP5F1', 'ATP5G3', 'SNAR-E', 
                 'GMPR', 'SNRPD2', 'MIR4700', 'GYPC', 'CTSK', 'SHFM1', 'P4HB', 'CTSZ', 'MIR1260B', 'PCNA', 'HMGB1', 'COX7B', 'TM4SF1', 'SNORA41', 'CTSB', 'SLC20A1', 'IER3', 'ACSL3', 
                 'CKS2', 'ATP1B3', 'MIR3188', 'ZNFX1-AS1', 'COX7A2', 'MIR135A1', 'PTGES3', 'CSDE1', 'LAMP2', 'SDC3', 'AMD1', 'MIR4523', 'SPON2', 'RPL10A', 'MIR4639', 'MIR4517', 'XRCC6', 
                 'CSE1L', 'RPL8', 'MIR3653', 'MIR3190', 'SNORD54', 'RPL14', 'NBL1', 'ACTR3', 'ATP5C1', 'SNORD22', 'ATP1A1OS', 'EIF5A', 'SNORA10', 'GNG12', 'EIF3L', 'YWHAE', 'VDAC1', 
                 'CD109', 'SRP9', 'ATP5H', 'SLIRP', 'MFI2', 'RPL19', 'LPXN', 'CLIC4', 'BTF3', 'HLA-A', 'FAM167B', 'PDIA3', 'SEC61G', 'MGST1', 'TXNRD1', 'ATP5G1', 'LITAF', 'HSD17B12', 
                 'IVNS1ABP', 'SNORD31', 'NT5E', 'A2M', 'UBA52', 'POMP', 'HMGN2', 'ARL6IP5', 'HSP90AA1', 'SIRPA', 'EMP3', 'WFDC1', 'EIF3D', 'PYGB', 'SSBP1', 'SNRPB', 'RPL32', 'CCT7', 
                 'IFI6', 'MCAM', 'RPL10', 'XRCC5', 'ATP6V1E1', 'SNRPG', 'MITF', 'RPL13A', 'MIR2861', 'C3orf14', 'C14orf2', 'RPS5', 'FBXO7', 'SPARC', 'SYPL1', 'RGS10', 'SLC45A2', 'APP', 
                 'ANXA1', 'CD68', 'CCT2', 'IPO7', 'CCT4', 'HNRNPA3', 'CAP1', 'HSPE1', 'MBP', 'ACTR2', 'UCN2', 'MIR25', 'CASP1', 'EIF3I', 'SMS', 'MME', 'ARPC2', 'CDC42', 'NDUFB9', 
                 'AP1S2', 'PRDX3', 'SRPX', 'PHGDH', 'FBL', 'CTSC', 'SNORA20', 'HNRNPH1', 'PDIA4', 'EIF3H', 'SOAT1', 'VGF', 'GANAB', 'HSPA9', 'GLO1', 'PRKAR1A', 'SNRPF', 'SDHB', 'TIMP1', 
                 'PSMD6', 'BCL2A1', 'SNRPB2', 'NDUFB3', 'SNHG6', 'CORO1C', 'THOC7', 'SNX10', 'CEACAM1', 'LAPTM4B', 'SFRP1', 'ARPC3', 'G3BP1', 'COX17', 'GPM6B', 'SSR3', 'ETFB', 'MIR4665', 
                 'CCT8', 'SLC43A3', 'GJB1', 'EIF4B', 'RPL18', 'KPNA2', 'CAPZB', 'FABP7', 'NOP56', 'HNRNPK', 'ERP29', 'VAMP8', 'OAT', 'PSMB1', 'CTSH', 'NSA2', 'LGALS3BP', 'SSB', 'LUZP6', 
                 'POLE4', 'TIMP3', 'SF3B14', 'SUMO1', 'UGP2', 'PSMA2', 'PEG10', 'ERGIC3', 'ERH', 'MIR4785', 'C19orf79', 'PLP2', 'AKR1B1', 'AZIN1', 'RAB38', 'ADSL'],
                section='section1'
            ),
        ],
        'Upload': [
            FileField(
                name='gene_list_filename',
                label='Gene List File',
                default='',
                description='Upload your gene list as a text file (One gene per row).',
                section='section1'
            ),
        ],
    },
    section = 'section1',
) %}
{# Number of top-ranked TFs requested for the subnetwork (rendered into `term_limit`). `min=1` keeps the form from accepting zero or negative sizes. #}
{%
    set limit = IntField(
        name='subnetwork-size',
        label="Subnetwork Size",
        default=10,
        min=1,
        description="Choose the subnetwork size",
        section = 'section2',
    )
%}
{# Free-text name of the layout used to draw the network (rendered into `sn_layout` and passed to the widget's set_layout). NOTE(review): the value is not validated here; presumably it must be a layout name the ipycytoscape widget supports — confirm. #}
{%
    set layout = StringField(
        name='subnetwork-layout',
        label="Subnetwork Layout",
        default='cose',
        description="Choose the subnetwork layout. View available layouts here: https://blog.js.cytoscape.org/2020/05/11/layouts/",
        section = 'section3',
    )
%}

In [None]:
%%appyter code_exec

# Build `genes` from the pasted list or the uploaded file.
# Surrounding whitespace is stripped and blank rows are dropped so that empty
# strings are never submitted as gene symbols.
{%- if gene_list_kind.raw_value == 'Paste' %}
gene_list_input = {{ gene_list_kind.value[0] }}
genes = [gene.strip() for gene in gene_list_input if gene.strip()]
{%- else %}
gene_list_filename = {{ gene_list_kind.value[0] }}
# The context manager closes the file even if reading fails.
with open(gene_list_filename, 'r') as gene_list_file:
    genes = [line.strip() for line in gene_list_file if line.strip()]
{%- endif %}

# Number of top-ranked TFs to request, and the layout name for the network widget.
term_limit = {{limit}}
sn_layout = {{layout}}

## Extracting enriched subnetwork from ChEA-KG
First, we extract the enriched subnetwork from ChEA-KG programmatically. This requires two steps:

1. Add the gene list to the ChEA-KG site, where it will be accessible via a unique ID (userListId). This ID can be used to share results but is private by default.  
Input: List of newline separated genes and a description (optional)   
Returns: userListId, the unique ID associated with that gene set  
2. Retrieve the enrichment results for the submitted gene set.  
Input: userListId, plus the `min_lib` and `libraries` parameters. ChEA-KG enrichment uses the MeanRank method from ChEA3 (PMID: 31114921), which takes the average rank of a TF across all 6 primary libraries. By default, `min_lib` requires that a TF is ranked in at least 3 libraries, but it can take any value from 1 to 6. `term_limit` specifies the number of top-ranked TFs that are returned.  
Output: JSON formatted list of enriched nodes and the edges that connect them  

In [None]:
def get_cheakg_results(chea_gene_list, desc=""):
    '''
    Find the subnetwork of enriched TFs for an input gene list.

    Two requests are sent to the ChEA-KG enrichment API: the gene list is
    first registered through ``addList`` to obtain a ``userListId``, and that
    ID is then submitted together with the enrichment query.

    The number of top-ranked TFs is read from the module-level ``term_limit``.

    Parameters
    ----------
    chea_gene_list : iterable of str
        Gene symbols; they are joined with newlines before upload.
    desc : str, optional
        Description sent along with the gene list.

    Returns
    -------
    Parsed JSON body of the enrichment response, or ``None`` (after printing
    the server's reply) when either request fails or ``addList`` does not
    return a ``userListId``.
    '''
    CHEA_KG = 'https://chea-kg.maayanlab.cloud/api/enrichment'

    # (None, value) tuples make requests send these as plain multipart form fields
    payload = {
        'list': (None, "\n".join(chea_gene_list)),
        'description': (None, desc)
    }

    add_res = requests.post(f"{CHEA_KG}/addList", files=payload)
    if not add_res.ok:
        # Same failure convention as the enrichment request below
        print(add_res.text)
        return None
    try:
        user_list_id = add_res.json()['userListId']
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or did not contain the expected ID
        print(add_res.text)
        return None

    q = {
        'min_lib': 3, # minimum number of libraries that a TF must be ranked in
        'libraries': [
            {'library': "Integrated--meanRank", 'term_limit': term_limit} # edit term_limit to change number of top-ranked TFs
        ],
        'limit': 50, # controls number of edges returned - may cause issues with visualization if too large
        'userListId': user_list_id
    }

    res = requests.post(CHEA_KG, data=json.dumps(q))
    if not res.ok:
        print(res.text)
        return None
    return res.json()

def extract_nodes(cytoscape_json):
    '''
    Collect the ID and label of every node in a cytoscape JSON object.

    Returns a list of ``{'id': ..., 'label': ...}`` dicts, in the same order
    as ``cytoscape_json['nodes']``.
    '''
    return [
        {'id': element['data']['id'], 'label': element['data']['label']}
        for element in cytoscape_json['nodes']
    ]

In [None]:
# Query ChEA-KG for the enriched subnetwork of the submitted genes.
d = get_cheakg_results(genes, "")
# get_cheakg_results returns None after printing the server reply when the
# request fails; stop here with a clear message instead of a TypeError below.
if d is None:
    raise RuntimeError("ChEA-KG enrichment request failed; see the server response printed above.")
# ID/label pairs of the subnetwork nodes, reused by the summary and prediction cells.
nodes = extract_nodes(d)

## Visualizing subnetworks with Cytoscape
The ChEA-KG network visualization is built using Cytoscape.js. We can recreate this visualization in Python using `ipycytoscape`. 

In [None]:
# Cytoscape stylesheet for the network widget: one rule for nodes, one for edges.
# `data(...)` / `mapData(...)` values are resolved from each element's data fields.
style = [
    # Nodes: colours, border and label come from the node data; size is mapped from node_type
    {
        "selector": "node",
        "style": {
            "background-color": "data(color)",
            "border-color": "data(borderColor)",
            "border-width": "data(borderWidth)",
            "label": "data(label)",
            "text-valign": "center",
            "text-halign": "center",
            "width": "mapData(node_type, 0, 1, 70, 150)",
            "height": "mapData(node_type, 0, 1, 70, 150)",
        },
    },
    # Edges: bezier curves coloured from the edge data, arrow shape taken from `directed`
    {
        "selector": "edge",
        "style": {
            "curve-style": "bezier",
            "line-color": "data(lineColor)",
            "width": "3",
            "text-rotation": "autorotate",
            "text-margin-x": 0,
            "text-margin-y": 0,
            "font-size": "12px",
            "target-arrow-shape": "data(directed)",
            "target-endpoint": "outside-to-node",
            "source-endpoint": "outside-to-node",
            "target-arrow-color": "data(lineColor)",
        },
    },
]

In [None]:
# Build the interactive network widget from the ChEA-KG response.
cytoscapeobj = ipc.CytoscapeWidget() 
# Load the nodes and edges returned by get_cheakg_results
cytoscapeobj.graph.add_graph_from_json(d) 
# Apply the stylesheet defined in the previous cell
cytoscapeobj.set_style(style) 
# Layout name chosen in the input form (default 'cose')
cytoscapeobj.set_layout(name=sn_layout) 
display(cytoscapeobj)

## Enriched TFs bar chart view

## Functional summary of subnetwork TFs
The summary below is generated automatically by a Google generative language model through the Gemini API. Interpret it with caution, and verify any citations it provides.

In [None]:
# Raises KeyError if GEMINI_API_KEY is not present in the environment.
client = genai.Client(api_key=os.environ['GEMINI_API_KEY'])
MODEL = 'gemma-3n-e4b-it'

# Give the model the TF labels only (not the raw list of id/label dicts) and
# state the actual number of TFs, since the subnetwork size is configurable.
tf_labels = ", ".join(node['label'] for node in nodes)

prompt = (
    'You are a biomedical researcher who is investigating transcriptional regulatory subnetworks. '
    f'You submitted a list of genes to ChEA3 for enrichment analysis and the top {len(nodes)} '
    f'enriched transcription factors are {tf_labels}. '
    'Write five sentences summarizing the biological function of this subnetwork, based on the '
    'functions of the transcription factors in the subnetwork. Your response should only contain '
    'these five sentences. Include PubMed citations with PMIDs or DOIs.'
)

response = client.models.generate_content(
    model=MODEL, contents=prompt
)

# response.text may be None when the model returns no text part
summary_text = response.text or ""
# str.replace() treats its argument literally, so the regex has to go through re.sub.
# Runs of blank lines are collapsed to a single Markdown paragraph break.
# NOTE(review): apostrophes are still stripped as before — confirm this is intended.
res = re.sub(r"(\r?\n){2,}", "\n\n", summary_text).replace("'", "")
display(Markdown(res))


## Predicting consensus phenotype labels using KOMP2 Phenotypes
Next, to gain a better understanding of the function of our TFs, we'll make predictions about their associated phenotypes using the Gene Set Foundation Model. These predictions are made using the gene set library from the Knockout Mouse Phenotyping Program (KOMP2).

To generate consensus predictions for the network, we predict the top 10 terms for each TF, re-rank them based on score and z-score, and select the top 10 aggregated phenotypes.

In [None]:
import traceback

In [None]:
model = 'rummagene'
SOURCE = 'KOMP2_Mouse_Phenotypes_2022'
cols = ['term','score','zscore','gene']
GSFM_TRPC = 'https://gsfm.maayanlab.cloud/api/trpc'

# Rows are collected in a plain list and turned into a DataFrame once at the end,
# so predict_df always has the expected columns even when nothing is retrieved.
prediction_rows = []
for n in tqdm(nodes, desc='GSFM predictions'):
    gene = n['label']
    time.sleep(0.5)  # pause between genes so the GSFM API is not hit in a tight loop

    # Step 1: list the prediction sources available for this gene
    try:
        sources_res = requests.get(f"{GSFM_TRPC}/sources", params={
            'input': json.dumps({"model":model,"gene":gene})
        }).json()
        # De-duplicate while keeping a stable order; the same order is used to
        # build the batch request and to match its responses below.
        sources = list(dict.fromkeys(row['source'] for row in sources_res['result']['data']))
    except Exception as e:
        # Skip this gene rather than continuing with an undefined or stale response
        print(f"Error retrieving sources for {gene}: {e}")
        continue

    # Nothing to collect if the gene has no predictions from the wanted source
    if SOURCE not in sources:
        continue

    # Step 2: fetch the top predictions from every source in one batched call
    try:
        predictions_res = requests.get(
            f"{GSFM_TRPC}/{','.join('predictions' for _ in sources)}",
            params={
                'batch': '1',
                'input': json.dumps({
                    str(i): {"model":model,"source":source,"gene":gene,"offset":0,"limit":10}
                    for i, source in enumerate(sources)
                }),
            },
        ).json()
        all_predictions = dict(zip(sources, [row['result']['data'] for row in predictions_res]))
    except Exception as e:
        print(f"Error retrieving predictions for {gene}: {e}")
        traceback.print_exc()
        continue

    # Keep only the predictions from the source of interest
    for v in all_predictions.get(SOURCE) or []:
        prediction_rows.append((v['term'], round(v['proba'], 2), v['zscore'], gene))

predict_df = pd.DataFrame(prediction_rows, columns=cols)

In [None]:
# Rank every per-TF prediction by score (ties broken by z-score), highest first,
# and keep the first ten rows with a fresh 0-based index.
consensus_predictions = (
    predict_df
    .sort_values(by=['score', 'zscore'], ascending=False)
    .head(10)
    .reset_index(drop=True)
)
consensus_predictions