In [2]:
#%%appyter init
from appyter import magic
magic.init(lambda _=globals: _())

# ChEA-KG Appyter: Characterizing Enriched Transcription Factor Regulatory Subnetworks

The [ChEA-KG](https://chea-kg.maayanlab.cloud/) Appyter predicts regulatory subnetworks of transcription factors (TFs) for an input gene set. The query gene sets are compared against libraries of TF target gene sets in ChEA3 to identify the most likely regulating TFs. The TFs are then connected via edges in the ChEA-KG background gene regulatory network (GRN). This network was constructed by submitting thousands of gene expression signatures from [RummaGEO](https://rummageo.com/) to [ChEA3](https://maayanlab.cloud/chea3/) for transcription factor enrichment analysis. The functions of enriched TFs are predicted using the [Gene Set Foundation Model](https://gsfm.maayanlab.cloud/). Automatic summaries of each subnetwork are produced using an LLM.

The only required input is a gene list. Options to change other parameters such as the subnetwork size and layout are provided. It is recommended not to change the remaining parameters. 

In [3]:
# general
import json
import os
import re
import traceback

import numpy as np
import pandas as pd
import requests
from IPython.display import display, HTML, Markdown

# scatterplot
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import OrderedDict
import scanpy as sc
import anndata
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource
import glasbey
output_notebook()

# bar chart
import plotly.graph_objects as go
import time
import html

# summary
from google import genai
from dotenv import load_dotenv
load_dotenv()

# subnetwork
from dash import Dash, dcc, Input, Output, ctx, callback
import dash.html
import dash_cytoscape as cyto


import warnings
warnings.filterwarnings("ignore")

In [4]:
%%appyter hide_code

{% do SectionField(
    name='section1', 
    title = '1. Submit Your Gene List', 
    subtitle = 'Upload a text file containing your gene list -OR- copy and paste your gene list into the text box below (One gene per row). You can also try the default gene list provided.'
) %}
{% do SectionField(
    name='section2', 
    title = '2. Describe your gene list', 
    subtitle = 'Provide a short description to describe your gene set.'
) %}
{% do SectionField(
    name='section3', 
    title = '3. Choose subnetwork size', 
    subtitle = 'Choose the number of top-ranked enriched TFs included in each subnetwork. The maximum is 20 TFs. The default is 10.'
) %}
{% do SectionField(
    name='section4', 
    title = '4. Choose subnetwork layout', 
    subtitle = 'Choose a Cytoscape-supported layout to view the network. The default is cose, a force-directed layout.'
) %}

In [5]:
%%appyter hide_code

{% set gene_list_kind = TabField(
    name='gene_list_kind',
    label='Gene List',
    default='Paste',
    description='Paste or upload your gene list',
    required=True,
    choices={
        'Paste': [
            TextListField(
                name='gene_list_input',
                label='Gene List',
                description='Paste your gene list (One gene per row):',
                default=['MIR4454', 'RNU86', 'SNORD34', 'EEF1A1', 'RPL11', 'DCT', 'RPL37A', 'SNORD33', 'GAPDH', 'SNORD74', 'MIR4461', 'MIR4680', 'CD63', 'SNORD68', 'LDHA', 'TMSB4X', 
                'RPL27', 'SNORD108', 'MIR3191', 'RPS18', 'RPL41', 'ENO1', 'CAPG', 'RPS15A', 'SNORD79', 'FN1', 'LGALS3', 'GPNMB', 'NPM1', 'RPL7', 'RPS14', 'SNORD38A', 'RPS13', 'RPS7', 
                'ATOX1', 'PKM2', 'RPL31', 'SNORD76', 'SNORD42B', 'RPS29', 'BCYRN1', 'RPL6', 'ATP5E', 'RPS3A', 'RPL39', 'YWHAZ', 'TOMM7', 'RPS27A', 'SNORD49A', 'MIR4482-1', 'SNORD5', 
                'RPL30', 'MIR1292', 'RPL5', 'SNORD59A', 'RPS21', 'PSAP', 'RPL35A', 'RPL13AP5', 'SNORD50B', 'H2AFZ', 'SNORD27', 'PPIA', 'PRDX1', 'RPL21P28', 'RPL9', 'RPS12', 'HSP90B1', 
                'COX7C', 'RPL23', 'SNORD30', 'LDHB', 'GNB2L1', 'SGK1', 'FKBP1A', 'SNORD57', 'HSPD1', 'SNORD12', 'MIR4273', 'RPS4X', 'UQCRH', 'RPS2', 'COX5B', 'ATP5B', 'MIR4687', 
                'MIR4263', 'RPS10', 'RAN', 'MIR3687', 'PTTG1', 'SNORD87', 'ATP1A1', 'NME2', 'SNORD18A', 'RPSA', 'TUBB', 'RPL22', 'RPLP0', 'SNORD101', 'ANXA5', 'CD74', 'RPS6', 
                'MIR4691', 'GNAS', 'GSTO1', 'EIF4A1', 'NQO1', 'SNAR-G1', 'MIR4653', 'UBB', 'RPL38', 'SNORD4A', 'SNORD82', 'MIA', 'SNORD37', 'EIF3E', 'MIR4678', 'PDIA6', 'SLC25A3', 
                'PARK7', 'PGK1', 'C17orf76-AS1', 'YBX1', 'NCL', 'RPL35', 'HSP90AB1', 'MIR103B2', 'RPL26', 'DSTN', 'SNHG8', 'CALR', 'EEF1G', 'ATP5J2', 'TUBA1B', 'CSTB', 'SPP1', 'CALU', 
                'PABPC1', 'PRAME', 'LY6E', 'HNRNPA2B1', 'SNORD38B', 'SLC25A5', 'FABP5', 'MIR1915', 'SERPINF1', 'RPS8', 'DDOST', 'HINT1', 'RPL18A', 'RPS20', 'PSMA7', 'CHCHD2', 'HNRNPA1',
                 'RPL15', 'PSMA1', 'HLA-DRA', 'RPS3', 'RPL27A', 'EEF1B2', 'TXN', 'RPL4', 'SNORD16', 'PRNP', 'MDH1', 'NME1', 'CANX', 'SNORD35B', 'TBCA', 'TPI1', 'LOC645591', 'PTMA', 
                 'ATP5A1', 'CBX3', 'SDCBP', 'C1QBP', 'DBI', 'SNORD59B', 'PLA1A', 'RPL37', 'NACA', 'CDK2', 'MIR324', 'RPS9', 'ALDOA', 'COX6A1', 'RPN2', 'ATP5F1', 'ATP5G3', 'SNAR-E', 
                 'GMPR', 'SNRPD2', 'MIR4700', 'GYPC', 'CTSK', 'SHFM1', 'P4HB', 'CTSZ', 'MIR1260B', 'PCNA', 'HMGB1', 'COX7B', 'TM4SF1', 'SNORA41', 'CTSB', 'SLC20A1', 'IER3', 'ACSL3', 
                 'CKS2', 'ATP1B3', 'MIR3188', 'ZNFX1-AS1', 'COX7A2', 'MIR135A1', 'PTGES3', 'CSDE1', 'LAMP2', 'SDC3', 'AMD1', 'MIR4523', 'SPON2', 'RPL10A', 'MIR4639', 'MIR4517', 'XRCC6', 
                 'CSE1L', 'RPL8', 'MIR3653', 'MIR3190', 'SNORD54', 'RPL14', 'NBL1', 'ACTR3', 'ATP5C1', 'SNORD22', 'ATP1A1OS', 'EIF5A', 'SNORA10', 'GNG12', 'EIF3L', 'YWHAE', 'VDAC1', 
                 'CD109', 'SRP9', 'ATP5H', 'SLIRP', 'MFI2', 'RPL19', 'LPXN', 'CLIC4', 'BTF3', 'HLA-A', 'FAM167B', 'PDIA3', 'SEC61G', 'MGST1', 'TXNRD1', 'ATP5G1', 'LITAF', 'HSD17B12', 
                 'IVNS1ABP', 'SNORD31', 'NT5E', 'A2M', 'UBA52', 'POMP', 'HMGN2', 'ARL6IP5', 'HSP90AA1', 'SIRPA', 'EMP3', 'WFDC1', 'EIF3D', 'PYGB', 'SSBP1', 'SNRPB', 'RPL32', 'CCT7', 
                 'IFI6', 'MCAM', 'RPL10', 'XRCC5', 'ATP6V1E1', 'SNRPG', 'MITF', 'RPL13A', 'MIR2861', 'C3orf14', 'C14orf2', 'RPS5', 'FBXO7', 'SPARC', 'SYPL1', 'RGS10', 'SLC45A2', 'APP', 
                 'ANXA1', 'CD68', 'CCT2', 'IPO7', 'CCT4', 'HNRNPA3', 'CAP1', 'HSPE1', 'MBP', 'ACTR2', 'UCN2', 'MIR25', 'CASP1', 'EIF3I', 'SMS', 'MME', 'ARPC2', 'CDC42', 'NDUFB9', 
                 'AP1S2', 'PRDX3', 'SRPX', 'PHGDH', 'FBL', 'CTSC', 'SNORA20', 'HNRNPH1', 'PDIA4', 'EIF3H', 'SOAT1', 'VGF', 'GANAB', 'HSPA9', 'GLO1', 'PRKAR1A', 'SNRPF', 'SDHB', 'TIMP1', 
                 'PSMD6', 'BCL2A1', 'SNRPB2', 'NDUFB3', 'SNHG6', 'CORO1C', 'THOC7', 'SNX10', 'CEACAM1', 'LAPTM4B', 'SFRP1', 'ARPC3', 'G3BP1', 'COX17', 'GPM6B', 'SSR3', 'ETFB', 'MIR4665', 
                 'CCT8', 'SLC43A3', 'GJB1', 'EIF4B', 'RPL18', 'KPNA2', 'CAPZB', 'FABP7', 'NOP56', 'HNRNPK', 'ERP29', 'VAMP8', 'OAT', 'PSMB1', 'CTSH', 'NSA2', 'LGALS3BP', 'SSB', 'LUZP6', 
                 'POLE4', 'TIMP3', 'SF3B14', 'SUMO1', 'UGP2', 'PSMA2', 'PEG10', 'ERGIC3', 'ERH', 'MIR4785', 'C19orf79', 'PLP2', 'AKR1B1', 'AZIN1', 'RAB38', 'ADSL'],
                section='section1'
            ),
        ],
        'Upload': [
            FileField(
                name='gene_list_filename',
                label='Gene List File',
                default='',
                description='Upload your gene list as a text file (One gene per row).',
                section='section1'
            ),
        ],
    },
    section = 'section1',
) %}
{%
    set description = TextField(
        name='description',
        label="Description",
        default="Genes upregulated in the SKMEL28 melanoma cell line vs. primary melanocytes (GSE88741)",
        description="Provide a short description of your gene set",
        section = 'section2',
    )
%}
{%
    set limit = IntField(
        name='subnetwork-size',
        label="Subnetwork Size",
        default=10,
        min = 1,
        max = 20,
        description="Choose the subnetwork size",
        section = 'section3',
    )
%}
{%
    set layout = ChoiceField(
        name='subnetwork-layout',
        label="Subnetwork Layout",
        default="cose",
        description="Choose the subnetwork layout. The default is 'cose', a force-directed layout. Learn more about Cytoscape layouts here: https://blog.js.cytoscape.org/2020/05/11/layouts/",
        section = 'section4',
        choices = [
            "cose",
            "grid",
            "circle",
            "breadthfirst"
        ]
    )
%}

In [6]:
%%appyter code_exec

{%- if gene_list_kind.raw_value == 'Paste' %}
gene_list_input = {{ gene_list_kind.value[0] }}
# Strip whitespace and drop blank entries so no empty gene symbols are submitted
genes = [x.strip() for x in gene_list_input if x.strip()]
{%- else %}
gene_list_filename = {{ gene_list_kind.value[0] }}
# The context manager closes the file even if reading it fails
with open(gene_list_filename, 'r') as gene_list_file:
    genes = [line.strip() for line in gene_list_file if line.strip()]
{%- endif %}

term_limit = {{limit}}
threshold = 3 #used for ChEA3 to determine min number of libraries for meanrank
sn_layout = "{{layout.value}}"
# Map the older, longer label for the force-directed layout onto the Cytoscape layout name
if sn_layout == "force-directed (cose)":
    sn_layout = "cose"
desc = {{description}}

```python
gene_list_input = ['MIR4454', 'RNU86', 'SNORD34', 'EEF1A1', 'RPL11', 'DCT', 'RPL37A', 'SNORD33', 'GAPDH', 'SNORD74', 'MIR4461', 'MIR4680', 'CD63', 'SNORD68', 'LDHA', 'TMSB4X', 'RPL27', 'SNORD108', 'MIR3191', 'RPS18', 'RPL41', 'ENO1', 'CAPG', 'RPS15A', 'SNORD79', 'FN1', 'LGALS3', 'GPNMB', 'NPM1', 'RPL7', 'RPS14', 'SNORD38A', 'RPS13', 'RPS7', 'ATOX1', 'PKM2', 'RPL31', 'SNORD76', 'SNORD42B', 'RPS29', 'BCYRN1', 'RPL6', 'ATP5E', 'RPS3A', 'RPL39', 'YWHAZ', 'TOMM7', 'RPS27A', 'SNORD49A', 'MIR4482-1', 'SNORD5', 'RPL30', 'MIR1292', 'RPL5', 'SNORD59A', 'RPS21', 'PSAP', 'RPL35A', 'RPL13AP5', 'SNORD50B', 'H2AFZ', 'SNORD27', 'PPIA', 'PRDX1', 'RPL21P28', 'RPL9', 'RPS12', 'HSP90B1', 'COX7C', 'RPL23', 'SNORD30', 'LDHB', 'GNB2L1', 'SGK1', 'FKBP1A', 'SNORD57', 'HSPD1', 'SNORD12', 'MIR4273', 'RPS4X', 'UQCRH', 'RPS2', 'COX5B', 'ATP5B', 'MIR4687', 'MIR4263', 'RPS10', 'RAN', 'MIR3687', 'PTTG1', 'SNORD87', 'ATP1A1', 'NME2', 'SNORD18A', 'RPSA', 'TUBB', 'RPL22', 'RPLP0', 'SNORD101', 'ANXA5', 'CD74', 'RPS6', 'MIR4691', 'GNAS', 'GSTO1', 'EIF4A1', 'NQO1', 'SNAR-G1', 'MIR4653', 'UBB', 'RPL38', 'SNORD4A', 'SNORD82', 'MIA', 'SNORD37', 'EIF3E', 'MIR4678', 'PDIA6', 'SLC25A3', 'PARK7', 'PGK1', 'C17orf76-AS1', 'YBX1', 'NCL', 'RPL35', 'HSP90AB1', 'MIR103B2', 'RPL26', 'DSTN', 'SNHG8', 'CALR', 'EEF1G', 'ATP5J2', 'TUBA1B', 'CSTB', 'SPP1', 'CALU', 'PABPC1', 'PRAME', 'LY6E', 'HNRNPA2B1', 'SNORD38B', 'SLC25A5', 'FABP5', 'MIR1915', 'SERPINF1', 'RPS8', 'DDOST', 'HINT1', 'RPL18A', 'RPS20', 'PSMA7', 'CHCHD2', 'HNRNPA1', 'RPL15', 'PSMA1', 'HLA-DRA', 'RPS3', 'RPL27A', 'EEF1B2', 'TXN', 'RPL4', 'SNORD16', 'PRNP', 'MDH1', 'NME1', 'CANX', 'SNORD35B', 'TBCA', 'TPI1', 'LOC645591', 'PTMA', 'ATP5A1', 'CBX3', 'SDCBP', 'C1QBP', 'DBI', 'SNORD59B', 'PLA1A', 'RPL37', 'NACA', 'CDK2', 'MIR324', 'RPS9', 'ALDOA', 'COX6A1', 'RPN2', 'ATP5F1', 'ATP5G3', 'SNAR-E', 'GMPR', 'SNRPD2', 'MIR4700', 'GYPC', 'CTSK', 'SHFM1', 'P4HB', 'CTSZ', 'MIR1260B', 'PCNA', 'HMGB1', 'COX7B', 'TM4SF1', 'SNORA41', 'CTSB', 'SLC20A1', 'IER3', 'ACSL3', 
'CKS2', 'ATP1B3', 'MIR3188', 'ZNFX1-AS1', 'COX7A2', 'MIR135A1', 'PTGES3', 'CSDE1', 'LAMP2', 'SDC3', 'AMD1', 'MIR4523', 'SPON2', 'RPL10A', 'MIR4639', 'MIR4517', 'XRCC6', 'CSE1L', 'RPL8', 'MIR3653', 'MIR3190', 'SNORD54', 'RPL14', 'NBL1', 'ACTR3', 'ATP5C1', 'SNORD22', 'ATP1A1OS', 'EIF5A', 'SNORA10', 'GNG12', 'EIF3L', 'YWHAE', 'VDAC1', 'CD109', 'SRP9', 'ATP5H', 'SLIRP', 'MFI2', 'RPL19', 'LPXN', 'CLIC4', 'BTF3', 'HLA-A', 'FAM167B', 'PDIA3', 'SEC61G', 'MGST1', 'TXNRD1', 'ATP5G1', 'LITAF', 'HSD17B12', 'IVNS1ABP', 'SNORD31', 'NT5E', 'A2M', 'UBA52', 'POMP', 'HMGN2', 'ARL6IP5', 'HSP90AA1', 'SIRPA', 'EMP3', 'WFDC1', 'EIF3D', 'PYGB', 'SSBP1', 'SNRPB', 'RPL32', 'CCT7', 'IFI6', 'MCAM', 'RPL10', 'XRCC5', 'ATP6V1E1', 'SNRPG', 'MITF', 'RPL13A', 'MIR2861', 'C3orf14', 'C14orf2', 'RPS5', 'FBXO7', 'SPARC', 'SYPL1', 'RGS10', 'SLC45A2', 'APP', 'ANXA1', 'CD68', 'CCT2', 'IPO7', 'CCT4', 'HNRNPA3', 'CAP1', 'HSPE1', 'MBP', 'ACTR2', 'UCN2', 'MIR25', 'CASP1', 'EIF3I', 'SMS', 'MME', 'ARPC2', 'CDC42', 'NDUFB9', 'AP1S2', 'PRDX3', 'SRPX', 'PHGDH', 'FBL', 'CTSC', 'SNORA20', 'HNRNPH1', 'PDIA4', 'EIF3H', 'SOAT1', 'VGF', 'GANAB', 'HSPA9', 'GLO1', 'PRKAR1A', 'SNRPF', 'SDHB', 'TIMP1', 'PSMD6', 'BCL2A1', 'SNRPB2', 'NDUFB3', 'SNHG6', 'CORO1C', 'THOC7', 'SNX10', 'CEACAM1', 'LAPTM4B', 'SFRP1', 'ARPC3', 'G3BP1', 'COX17', 'GPM6B', 'SSR3', 'ETFB', 'MIR4665', 'CCT8', 'SLC43A3', 'GJB1', 'EIF4B', 'RPL18', 'KPNA2', 'CAPZB', 'FABP7', 'NOP56', 'HNRNPK', 'ERP29', 'VAMP8', 'OAT', 'PSMB1', 'CTSH', 'NSA2', 'LGALS3BP', 'SSB', 'LUZP6', 'POLE4', 'TIMP3', 'SF3B14', 'SUMO1', 'UGP2', 'PSMA2', 'PEG10', 'ERGIC3', 'ERH', 'MIR4785', 'C19orf79', 'PLP2', 'AKR1B1', 'AZIN1', 'RAB38', 'ADSL']
genes = [x.strip() for x in gene_list_input]
term_limit = 10
threshold = 3 #used for ChEA3 to determine min number of libraries for meanrank
sn_layout = "cose"
if sn_layout == "force-directed (cose)":
    sn_layout = "cose"
desc = '''Genes upregulated in the SKMEL28 melanoma cell line vs. primary melanocytes (GSE88741)'''
```

## Extracting enriched subnetwork from ChEA-KG
First, we extract the enriched subnetwork from ChEA-KG programmatically. This requires two steps:

**1. Add the gene list to the ChEA-KG site, where it will be accessible via a unique ID (`userListId`). <ins>This ID can be used to share results but is private by default.</ins>**  
*Input:* List of newline separated genes and a description (optional)    
*Returns:* `userListId`, the unique ID associated with that gene set   

**2. Retrieve enrichment results using the gene list ID**   
*Input:* `userListId`, `min_lib` and `libraries` parameters. ChEA-KG enrichment uses the MeanRank method from ChEA3 [(PMID31114921)](https://academic.oup.com/nar/article/47/W1/W212/5494769?login=false), which takes the average rank of a TF across all 6 primary libraries. `min_lib` requires by default that a TF is ranked in at least 3 libraries but can take any value from 1-6. `term_limit` specifies the number of top-ranked TFs that are returned    
*Output:* JSON formatted list of enriched nodes and the edges  

In [7]:
def get_cheakg_results(chea_gene_list, desc="", min_lib=3, top_n=None):
    '''
    Find the subnetwork of enriched TFs for an input gene list

    The gene list is first registered with ChEA-KG (`/addList`), which answers
    with a `userListId`; that ID is then posted to the enrichment endpoint.

    Parameters:
        chea_gene_list: iterable of gene symbol strings (uploaded newline-separated)
        desc: optional description submitted together with the gene list
        min_lib: value sent as `min_lib`, the minimum number of libraries that a
            TF must be ranked in
        top_n: value sent as `term_limit` (number of top-ranked TFs); when None
            the module-level `term_limit` is used

    Returns:
        The parsed JSON response of the enrichment request, or None when the
        gene list could not be registered or the enrichment request came back
        with an error status (the problem is printed in both cases).
    '''
    CHEA_KG = 'https://chea-kg.maayanlab.cloud/api/enrichment'

    if top_n is None:
        top_n = term_limit

    payload = {
        'list': (None, "\n".join(chea_gene_list)),
        'description': (None, desc)
    }
    try:
        response = requests.post(f"{CHEA_KG}/addList", files=payload)
        response.raise_for_status()
        user_list_id = json.loads(response.text)['userListId']
    except (requests.RequestException, ValueError, KeyError) as e:
        # Without a userListId there is nothing to query, so stop here instead
        # of failing later on an undefined variable.
        print("Error connecting to ChEA-KG: ", e)
        return None

    q = {
        'min_lib': min_lib, # minimum number of libraries that a TF must be ranked in
        'libraries': [
            {'library': "Integrated--meanRank", 'term_limit': top_n} # number of top-ranked TFs
        ],
        'limit': 50, # controls number of edges returned - may cause issues with visualization if too large
        'userListId': user_list_id
    }

    res = requests.post(CHEA_KG, data=json.dumps(q))
    if not res.ok:
        print(res.text)
        return None
    return json.loads(res.text)

def extract_nodes(cytoscape_json):
    '''
    Collect the ID and label of every node in a cytoscape object

    Returns a list of {'id': ..., 'label': ...} dicts, one per entry of
    cytoscape_json['nodes'], in the original order.
    '''
    return [
        {'id': node['data']['id'], 'label': node['data']['label']}
        for node in cytoscape_json['nodes']
    ]

In [8]:
d = get_cheakg_results(genes, desc)
if d is None:
    # get_cheakg_results returns None when the enrichment request fails; stop
    # with a readable message instead of a TypeError inside extract_nodes.
    raise RuntimeError("ChEA-KG did not return an enrichment result for this gene list.")
nodes = extract_nodes(d)
# Labels of the enriched TFs, used by the plots and the summary below
nodelist = [x['label'] for x in nodes]

## UMAP of ChEA-KG TFs
Each point in the UMAP represents a TF with at least one source relationship (n=700). Term frequency-inverse document frequency (TF-IDF) values were computed for the target TF set corresponding to each source TF, and UMAP was applied to the resulting values. The TFs are plotted based on the first two UMAP dimensions. Generally, TFs with more similar target TF sets are positioned closer together. TFs are colored by automatically identified clusters computed with the Leiden algorithm applied to the TF-IDF values. Hovering over points will display the TF and the automatically assigned cluster.

In [9]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    '''
    Build the data frame behind the UMAP scatter plot

    The values of `libdict` are vectorized with TF-IDF, then scanpy computes
    the neighbor graph, Leiden clusters and a UMAP embedding on the result.

    Parameters:
        libdict: mapping of term -> text document passed to TfidfVectorizer
        nneighbors: `n_neighbors` for sc.pp.neighbors
        mindist, spread: `min_dist` and `spread` for sc.tl.umap
        maxdf, mindf: `max_df` and `min_df` for TfidfVectorizer

    Returns:
        DataFrame with columns x, y, cluster, term and genes, with rows ordered
        by Leiden cluster.
    '''
    vectorizer = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    tfidf_matrix = vectorizer.fit_transform(libdict.values())
    adata = anndata.AnnData(tfidf_matrix)
    adata.obs.index = libdict.keys()

    # neighbor graph -> clusters -> 2D embedding (the UMAP seed is fixed at 42)
    sc.pp.neighbors(adata, n_neighbors=nneighbors)
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # reorder the observations so that rows are grouped by cluster
    terms_by_cluster = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[terms_by_cluster, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'], columns=['x', 'y'])
    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    df['genes'] = [libdict[term] for term in df['term']]

    return df

def get_scatter_colors(df):
    '''
    Assign one distinct hex color to every cluster label in df['cluster']

    Returns a dict of cluster label -> hex color string. The glasbey palette is
    generated with exactly as many colors as there are clusters, so each
    cluster gets its own color (no wrap-around after 20 clusters).
    '''
    clusters = pd.unique(df['cluster']).tolist()
    if not clusters:
        return {}
    colors = glasbey.create_palette(palette_size=len(clusters), lightness_bounds=(0,100), chroma_bounds=(50,100), as_hex=True)
    return dict(zip(clusters, colors))

def get_scatterplot(scatterdf, nodes):
    '''
    Draw the UMAP scatter plot and highlight the enriched TFs

    Parameters:
        scatterdf: DataFrame with columns x, y, cluster and term (it is copied,
            not modified)
        nodes: collection of term names to highlight; matching points are drawn
            larger, fully opaque and with a black outline

    Returns:
        The bokeh figure, with the cluster legend placed to the right.
    '''
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Compute the highlight mask once instead of scanning `nodes` per style column
    highlighted = df['term'].isin(set(nodes))
    df['lw'] = highlighted.map({True: 2, False: 0.5})
    df['la'] = highlighted.map({True: 1, False: 0})
    df['size'] = highlighted.map({True: 12, False: 5})
    df['fa'] = highlighted.map({True: 1, False: 0.5})

    tooltips = [
        ("Gene Set", "@gene_set"),
        ("Cluster", "@label")
    ]

    hover_emb = HoverTool(tooltips=tooltips)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=900,
        height=700,
        tools=tools_emb
    )

    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            sizes = df['size'],
            colors = df['color'],
            fill_alphas = df['fa'],
            label = df['cluster'],
            line_widths = df['lw'],
            line_alphas = df['la'],
            line_colors = ['#000000']*df.shape[0]
        )
    )

    # hide tick marks and tick labels (UMAP coordinates carry no units)
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt'

    plot_emb.output_backend = "svg"

    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"

    plot_emb.scatter(
        'x',
        'y',
        size = 'sizes',
        source = source,
        color = 'colors',
        fill_alpha = 'fill_alphas',
        line_alpha = 'line_alphas',
        line_width = 'line_widths',
        line_color = 'line_colors',
        legend_group = 'label'
    )

    # move the legend out of the plotting area
    plot_emb.add_layout(plot_emb.legend[0], 'right')

    return plot_emb

In [10]:
r = requests.get("https://minio.dev.maayanlab.cloud/hgrn-chear/network_target_sets.gmt")
r.raise_for_status()  # fail early rather than trying to parse an error page as GMT

### variables to store gene set data
lib_dict = OrderedDict()

for line in r.text.split("\n"):
    tokens = line.split("\t\t")
    if len(tokens) < 2:
        # blank line (e.g. the trailing newline) or a term without any targets
        continue
    term = tokens[0]
    # Deliberately NOT named `genes`: that name holds the user's gene list and
    # is still needed by later cells.
    target_tfs = [x.split(',')[0].strip() for x in tokens[1].split('\t')]
    lib_dict[term] = ' '.join(target_tfs)

## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
scatter_df = process_scatterplot(
    lib_dict,
    nneighbors=5,
    mindist=.05,
)

plot = get_scatterplot(scatter_df, nodelist)
show(plot)

**Figure 1.** *UMAP embedding of ChEA-KG source TFs. Enriched TFs for the input gene set are enlarged and outlined. Each point represents a TF with at least one source relationship. Points are colored based on their leiden cluster membership.*

## Bar Chart View of Enriched TFs
Enriched TFs are identified using the [ChEA3 API](https://maayanlab.cloud/chea3/). The bar chart below provides a more detailed view of how the TF rankings are calculated. An enrichment score is calculated for each TF against each of the six ChEA3 background libraries of TF-target sets. The TFs are ranked for each library based on their enrichment scores. The final TF rankings are based on the MeanRank, which is the average rank of each TF across all libraries.

In [11]:
def get_chea3_results(gene_set, query_name):
    '''
    Submit a gene set to the ChEA3 enrichment endpoint

    Parameters:
        gene_set: the genes to analyze (sent as `gene_set` in the JSON body)
        query_name: name for the query (sent as `query_name`)

    Returns the parsed JSON response; raises Exception when the request did
    not succeed.
    '''
    ADDLIST_URL = 'https://maayanlab.cloud/chea3/api/enrich/'
    request_body = json.dumps({'gene_set': gene_set, 'query_name': query_name})
    response = requests.post(ADDLIST_URL, data=request_body)
    if response.ok:
        # short pause after a successful call before handing back the result
        time.sleep(1)
        return json.loads(response.text)
    raise Exception('Error analyzing gene list')

# Function for displaying the individual library bar charts 
def display_charts(libs, description):
    '''
    Show one horizontal bar chart of -log10(FET p-value) per library

    Reads the module-level `results` (ChEA3 response keyed by library name) and
    `term_limit` (number of TFs per chart).

    Parameters:
        libs: iterable of library names to plot
        description: mapping of library name -> text shown under its chart
    '''
    for libname in libs:

        display(HTML(f'<h3>{libname}</h3>'))

        all_tfs = [entry['TF'] for entry in results[libname]]
        all_pvalues = [float(entry['FET p-value']) for entry in results[libname]]

        # keep the top entries and flip them so the best TF ends up at the top of the chart
        tfs = list(reversed(all_tfs[0:term_limit]))
        pvalues = list(reversed(all_pvalues[0:term_limit]))

        scores = -np.log10(pvalues)

        # pad the x range by 5% of the score spread on both sides
        lowest, highest = min(scores), max(scores)
        score_range = highest - lowest
        x_lowerbound = lowest - (score_range * 0.05)
        x_upperbound = highest + (score_range * 0.05)

        bars = go.Bar(
            name = libname,
            x = scores,
            y = tfs,
            marker = go.bar.Marker(color = 'rgb(255,127,80)'),
            orientation = 'h'
        )
        libfig = go.Figure(data = bars)
        libfig.update_layout(
            title = {
                'text':'Bar Chart of Scores based on FET p-values',
                'y': 0.87,
                'x': 0.5,
                'xanchor':'center',
                'yanchor':'top'
            },
            # \u2081\u2080 are the subscript digits "1" and "0"
            xaxis_title = '-log\u2081\u2080(FET p-value)',
            yaxis_title = 'Transcription Factors',
            font = dict(
                size = 16,
                color = 'black'
            )
        )
        libfig.update_xaxes(range = [x_lowerbound, x_upperbound])
        libfig.show()

        display(HTML(f'<h5>{description[libname]}</h5>'))
        
def indexfinder(lib_score_list, value):
    '''
    Return the 1-based position of `value` in `lib_score_list`, not counting zeros

    Zero entries that come before the match are skipped when counting.
    Returns None when `value` does not occur in the list.
    '''
    nonzero_seen = 0
    for score in lib_score_list:
        if score == value:
            return nonzero_seen + 1
        if score != 0:
            nonzero_seen += 1
    return None


In [12]:
# Run ChEA3 enrichment on the gene list held in `genes`; later cells index the
# parsed response by library name (e.g. results['Integrated--meanRank']).
results = get_chea3_results(genes, 'query')

In [13]:
# Bar color of every library shown in the stacked bar chart
c_lib_palette = {
    'ARCHS4 Coexpression': 'rgb(196, 8, 8)',
    'ENCODE ChIP-seq': 'rgb(244, 109, 67)',
    'Enrichr Queries': 'rgb(242, 172, 68)',
    'GTEx Coexpression': 'rgb(236, 252, 68)',
    'Literature ChIP-seq': 'rgb(165, 242, 162)',
    'ReMap ChIP-seq': 'rgb(92, 217, 78)',
}

# NOTE: Integrated mean/topRank are left out since those are compiled from the
# six libraries below afterwards, so no TF lists them as one of its libraries

libs_sorted = ['ARCHS4 Coexpression', 'ENCODE ChIP-seq', 'Enrichr Queries',
               'GTEx Coexpression', 'Literature ChIP-seq', 'ReMap ChIP-seq']

# One zero-filled list per library with a slot for each requested TF
# (e.g. 15 slots when the user asks for 15 TFs)
c_lib_means = {lib: [0] * term_limit for lib in libs_sorted}

mr_results = results['Integrated--meanRank']
###### NOTE: for meanRank, the TFs are already ranked by Score ######

for tf_entry in mr_results:
    # every library starts at 0 and is overwritten when the TF's 'Library'
    # field ("name,value;name,value;...") lists it
    tf_entry.update(dict.fromkeys(libs_sorted, 0))
    for lib_score in tf_entry['Library'].split(';'):
        library, value = lib_score.split(',')
        tf_entry[library] = int(value)

# meanRank entries re-sorted by their value in each individual library
sorted_by_library = {
    lib: sorted(mr_results, key = lambda entry, lib=lib: entry[lib])
    for lib in libs_sorted
}
sortedARCHS4 = sorted_by_library['ARCHS4 Coexpression']
sortedGTEx = sorted_by_library['GTEx Coexpression']
sortedEnrichr = sorted_by_library['Enrichr Queries']
sortedENCODE = sorted_by_library['ENCODE ChIP-seq']
sortedReMap = sorted_by_library['ReMap ChIP-seq']
sortedLit = sorted_by_library['Literature ChIP-seq']

# per library: the ascending list of values, used to look up positions
ranking_dict = {
    lib: [entry[lib] for entry in sorted_by_library[lib]]
    for lib in libs_sorted
}
rankedARCHS4 = ranking_dict['ARCHS4 Coexpression']
rankedENCODE = ranking_dict['ENCODE ChIP-seq']
rankedEnrichr = ranking_dict['Enrichr Queries']
rankedGTEx = ranking_dict['GTEx Coexpression']
rankedLit = ranking_dict['Literature ChIP-seq']
rankedReMap = ranking_dict['ReMap ChIP-seq']

In [14]:
# For every TF, look up its position within each library it is listed in and
# store the sum ('SumRank') and mean ('AvgRank') of those positions.
for tf_entry in mr_results:
    library_ranks = [
        int(indexfinder(ranking_dict[lib_name], int(raw_value)))
        for lib_name, raw_value in (pair.split(',') for pair in tf_entry['Library'].split(';'))
    ]
    tf_entry['SumRank'] = sum(library_ranks)
    tf_entry['AvgRank'] = tf_entry['SumRank'] / len(library_ranks)

# best (lowest) average rank first
sorted_results = sorted(mr_results, key = lambda entry: entry['AvgRank'])

In [15]:
# TFs that are listed in at least `threshold` libraries, best average rank first.
# Filtering the whole list (instead of indexing until enough TFs are found)
# cannot run past the end when fewer than `term_limit` TFs qualify.
eligible_results = [
    entry for entry in sorted_results
    if len(entry['Library'].split(';')) >= threshold
]

# Keep the top `term_limit` TFs and reverse them so the best-ranked TF is last,
# which places it at the top of the horizontal bar chart.
sorted_top_results = eligible_results[:term_limit][::-1]

# TF names in the same order as sorted_top_results
sorted_tfs = [entry.get('TF') for entry in sorted_top_results]

In [16]:
# Fill c_lib_means: for the TF at chart position `position`, each library it is
# listed in contributes a bar segment of (rank in library * AvgRank) / SumRank.
for position, tf_entry in enumerate(sorted_top_results):
    avg_rank = tf_entry['AvgRank']
    rank_sum = tf_entry['SumRank']
    for lib_score in tf_entry['Library'].split(';'):
        lib_name, raw_value = lib_score.split(',')
        lib_rank = indexfinder(ranking_dict[lib_name], int(raw_value))
        c_lib_means[lib_name][position] = float((lib_rank * avg_rank) / rank_sum)

In [17]:
# Stacked horizontal bar chart: one trace per library, one bar per TF
fig = go.Figure(data = [
    go.Bar(
        name = c_lib,
        x = c_lib_means[c_lib],
        y = sorted_tfs,
        marker = go.bar.Marker(color = c_lib_palette[c_lib]),
        orientation = 'h'
    )
    for c_lib in libs_sorted
])

# grow the figure by 10px per TF once more than 10 TFs are shown
h = 400 if term_limit <= 10 else 400 + 10 * term_limit

fig.update_layout(
    barmode = 'stack',
    xaxis_title = 'Average of Ranks Across All Libraries',
    yaxis_title = 'Transcription Factors',
    font = dict(
        size = 16,
        color = 'black'
    ),
    width = 900,
    height = h
)

# Embed the figure in an iframe via srcdoc; the HTML must be escaped because it
# is placed inside an attribute value.
html_str = fig.to_html(include_plotlyjs='cdn')
escaped_html = html.escape(html_str)

iframe = f"""
<iframe srcdoc="{escaped_html}" width="100%" height="{h}" frameborder="0"></iframe>
"""

display(HTML(iframe))

**Figure 2.** *Bar chart of enriched transcription factors. The bar chart displays the top ranked transcription factors based on their average rank across each of the TF-target libraries used in the ChEA3 enrichment calculation. Each bar displays a top ranked TF along with its average score for each library.* 

## Automatically Generated Summary of Subnetwork TFs
The summary below is automatically generated using Google's [Gemma 3n E4B](https://huggingface.co/google/gemma-3n-E4B-it-litert-preview) model. Interpret results with caution.

In [47]:
# Collect up to 10 PubMed IDs per enriched TF via the NCBI E-utilities search endpoint
PUBMED_ESEARCH_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'

ids = []
for node in nodelist:
    # `params` lets requests URL-encode the search term
    res = requests.get(PUBMED_ESEARCH_URL, params={'db': 'pubmed', 'term': node, 'retmode': 'json'})
    if res.ok:
        t = json.loads(res.text)
        ids.extend(t.get('esearchresult', {}).get('idlist', [])[:10])
    else:
        # keep going with the remaining TFs instead of aborting the whole cell
        print(f"PubMed search failed for {node} (HTTP {res.status_code}); skipping")
    time.sleep(0.3)  # pause between consecutive requests

# "[index] PMID:<id>" entries; the index is what the summary cites
idlist = ", ".join(f'[{i}] PMID:{pmid}' for i, pmid in enumerate(ids))
idlist

'[0] PMID:40559852, [1] PMID:40559598, [2] PMID:40552091, [3] PMID:40551516, [4] PMID:40549063, [5] PMID:40545768, [6] PMID:40542047, [7] PMID:40540320, [8] PMID:40539450, [9] PMID:40539046, [10] PMID:38658415, [11] PMID:34147695, [12] PMID:32295384, [13] PMID:30524950, [14] PMID:28062691, [15] PMID:27080126, [16] PMID:26238961, [17] PMID:18258292, [18] PMID:40560457, [19] PMID:40557837, [20] PMID:40556512, [21] PMID:40554764, [22] PMID:40554207, [23] PMID:40554053, [24] PMID:40552364, [25] PMID:40552293, [26] PMID:40551206, [27] PMID:40548863, [28] PMID:40560629, [29] PMID:40560338, [30] PMID:40560010, [31] PMID:40558399, [32] PMID:40558276, [33] PMID:40558274, [34] PMID:40558039, [35] PMID:40557912, [36] PMID:40555606, [37] PMID:40555459, [38] PMID:39673574, [39] PMID:39442197, [40] PMID:39303720, [41] PMID:36477413, [42] PMID:36399538, [43] PMID:36310710, [44] PMID:34604214, [45] PMID:33925358, [46] PMID:31999691, [47] PMID:30192028, [48] PMID:40425854, [49] PMID:40087331, [50] PMID

In [55]:
# The key is read from the environment so that it never appears in the notebook
API_KEY = os.environ.get('CHEAKG_GEMINI_API_KEY')

if not API_KEY:
    print("An API key is required to use this feature. API keys can be generated in Google AI Studio: https://aistudio.google.com/apikey. The model used in this Appyter (Gemma 3n E4B) is open-source and free to use.")
else:
    try:
        client = genai.Client(api_key=API_KEY)
        MODEL = 'gemma-3n-e4b-it'

        prompt = (
            f'Produce a one-paragraph summary of this list of PubMed publications {idlist}. '
            f'Your summary should explain the function of these transcription factors: {nodelist}. '
            'For each sentence, include the index number of the relevant citation. '
            'Only include the summary in your response.'
        )

        response = client.models.generate_content(
            model=MODEL, contents=prompt
        )

        # str.replace treats its argument as literal text, so the newline pattern
        # has to go through re.sub; runs of blank lines become a single space to
        # keep the summary in one paragraph without gluing sentences together.
        res = re.sub(r"(\r?\n){2,}", " ", response.text or "").replace("'", "")
        display(Markdown(res))
    except Exception:
        # Any other failure (network, quota, model error) is reported as such
        # rather than being mistaken for a missing API key.
        print("The summary could not be generated:")
        traceback.print_exc()

The provided PubMed publications explore the roles of several transcription factors – NFE2L2, THAP4, MYC, TP53, GTF3A, ZNF207, PRMT3, ETV4, HMGA1, and CEBPZ – in various cellular processes, particularly in the context of cancer development and progression [0-97].  NFE2L2 is frequently implicated in cellular stress response, inflammation, and tumorigenesis, often acting as a target for therapeutic intervention [0, 21, 40]. THAP4 plays a role in transcriptional regulation, chromatin remodeling, and cancer metastasis, sometimes acting as a tumor suppressor or oncogene depending on the context [1, 28, 40]. MYC is a well-known oncogene that regulates cell growth, proliferation, and metabolism, and is often dysregulated in various cancers [1, 30, 40]. TP53, a crucial tumor suppressor, maintains genomic stability and regulates cell cycle arrest, apoptosis, and senescence, with its inactivation frequently observed in cancer [2, 17, 36]. GTF3A is involved in transcriptional regulation, particularly in development and neuronal function, and its mutations are associated with developmental disorders and some cancers [3, 33, 36]. ZNF207 is a zinc finger protein that functions as a transcriptional repressor and is frequently altered in various cancers, contributing to tumorigenesis and metastasis [4, 16, 40]. PRMT3 is an epigenetic regulator involved in DNA methylation and histone modification, and is often aberrantly activated in cancer, promoting cell survival and proliferation [5, 11, 40]. ETV4 is a transcription factor involved in hematopoiesis and other developmental processes, and is often overexpressed or mutated in various cancers [6, 26, 40]. HMGA1 is a non-coding RNA that functions as a chromatin modifier and regulator of gene expression, with its dysregulation implicated in cancer development and progression [7, 18, 40].  
CEBPZ is a transcription factor involved in adipogenesis, inflammation, and cancer, and its dysregulation can contribute to tumorigenesis and metastasis [8, 24, 40]. These studies highlight the complex and often overlapping roles of these transcription factors in human health and disease, suggesting potential targets for therapeutic strategies [0-97].





## Subnetwork Phenotype Predictions with KOMP2
The Gene Set Foundation Model (GSFM) [(PMID:40501705)](https://pubmed.ncbi.nlm.nih.gov/40501705/) is trained to recover held-out genes from gene sets. Below, we use this tool to make predictions about the gene set membership of subnetwork TFs using the [Knockout Mouse Phenotyping Program](https://www.mousephenotype.org/) (KOMP2) Mouse Phenotypes 2022 [gene set library](https://maayanlab.cloud/Enrichr/#libraries). We report the top 10 phenotype predictions across all TFs in the network. 

To generate consensus TF predictions, we predict terms individually for each enriched TF and select the top 10 from the set of all predictions, ranked by score and z-score.

In [None]:
model = 'rummagene'
SOURCE = 'KOMP2_Mouse_Phenotypes_2022'
cols = ['term','score','zscore','gene']

GSFM_API = 'https://gsfm.maayanlab.cloud/api/trpc'
GSFM_TIMEOUT = 60  # seconds; prevents a stalled request from hanging the notebook


def _gsfm_predictions(gene, model, source, limit=10):
    """Fetch GSFM predictions for one gene from one gene set library.

    Two requests are made: one listing the sources available for `gene`, and
    one batched request for the top `limit` predictions of every source.

    Parameters:
        gene: gene symbol to query.
        model: GSFM model name sent to the API.
        source: gene set library whose predictions are returned.
        limit: number of predictions requested per source.

    Returns:
        A list of (term, score, zscore, gene) tuples, with score rounded to
        two decimals. The list is empty when either request fails, the
        response is malformed, or `source` is not available for `gene`.
    """
    try:
        resp = requests.get(
            f'{GSFM_API}/sources',
            params={'input': json.dumps({"model": model, "gene": gene})},
            timeout=GSFM_TIMEOUT,
        )
        resp.raise_for_status()
        # Sorted list (not a set) so the request order and the zip() below
        # are guaranteed to line up.
        sources = sorted({row['source'] for row in resp.json()['result']['data']})
    except Exception as e:
        print(f"Error retrieving sources: {e}")
        return []

    # Nothing to extract for this gene; also avoids a request with an empty path.
    if source not in sources:
        return []

    try:
        resp = requests.get(
            f"{GSFM_API}/{','.join('predictions' for _ in sources)}",
            params={
                'batch': '1',
                'input': json.dumps({
                    str(i): {"model": model, "source": src, "gene": gene, "offset": 0, "limit": limit}
                    for i, src in enumerate(sources)
                }),
            },
            timeout=GSFM_TIMEOUT,
        )
        resp.raise_for_status()
        all_predictions = dict(zip(sources, [row['result']['data'] for row in resp.json()]))
    except Exception as e:
        print(f"Error retrieving predictions: {e}")
        traceback.print_exc()
        return []

    return [
        (v['term'], round(v['proba'], 2), v['zscore'], gene)
        for v in all_predictions.get(source, [])
    ]


# Collect rows for every TF first and build the frame once (no concat in a loop).
entries = []
for n in nodes:
    gene = n['label']
    entries.extend(_gsfm_predictions(gene, model, SOURCE))
    time.sleep(0.5)  # pause between genes to avoid hammering the API

# Always carries the expected columns, even when no predictions were returned,
# so the ranking cell below does not fail on an empty result.
predict_df = pd.DataFrame(entries, columns=cols)

In [None]:
# Rank all per-TF predictions by score, then z-score (both descending),
# and keep the ten best as the consensus table.
ranked_predictions = predict_df.sort_values(by=['score', 'zscore'], ascending=False)
consensus_predictions = ranked_predictions.head(10).reset_index(drop=True)

In [None]:
# Render the consensus table as HTML without the index column.
consensus_table_html = consensus_predictions.to_html(index=False)
display(HTML(consensus_table_html))

**Table 1.** *Consensus TF predictions from GSFM based on the KOMP2 Mouse Phenotypes 2022 gene set library. The table shows the top 10 highest scoring predictions for all enriched TFs, along with the model-assigned score and z-score for each term. The "gene" column indicates the TF for which the high-scoring function prediction was made.*

## Visualizing Enriched Subnetwork with Cytoscape
The ChEA-KG network visualization is built using [Cytoscape.js](https://js.cytoscape.org/). We can recreate this visualization in Python using the `dash-cytoscape` [component](https://github.com/plotly/dash-cytoscape) for Dash. Be careful not to zoom out too far — it can be challenging to find the graph again!

In [None]:
# Cytoscape stylesheet: one rule for nodes and one for edges. Colours, borders,
# labels and arrow shapes are read from each element's data fields.
_node_size = "mapData(node_type, 0, 1, 70, 150)"  # same mapping for width and height

_node_rule = {
    "selector": "node",
    "style": {
        "background-color": "data(color)",
        "border-color": "data(borderColor)",
        "border-width": "data(borderWidth)",
        "label": "data(label)",
        "text-valign": "center",
        "text-halign": "center",
        "width": _node_size,
        "height": _node_size,
    },
}

_edge_rule = {
    "selector": "edge",
    "style": {
        "curve-style": "bezier",
        "line-color": "data(lineColor)",
        "width": "3",
        "text-rotation": "autorotate",
        "text-margin-x": 0,
        "text-margin-y": 0,
        "font-size": "12px",
        "target-arrow-shape": "data(directed)",
        "target-endpoint": "outside-to-node",
        "source-endpoint": "outside-to-node",
        "target-arrow-color": "data(lineColor)",
    },
}

style = [_node_rule, _edge_rule]

# CSS for auxiliary page containers (an output pane and a tab).
styles = {
    "output": {
        "overflow-y": "scroll",
        "overflow-wrap": "break-word",
        "height": "calc(100% - 25px)",
        "border": "thin lightgrey solid",
    },
    "tab": {"height": "calc(98vh - 115px)"},
}

In [None]:
# Load dash-cytoscape's extra layouts before building the graph component,
# whose layout name comes from `sn_layout`.
cyto.load_extra_layouts()
app = Dash()

# Left panel: the subnetwork itself (elements in `d`, styled by `style`).
network_view = cyto.Cytoscape(
    id='cytoscape-image-export',
    layout={'name': sn_layout},
    style={'width': '100%', 'height': '500px'},
    elements=d,
    stylesheet=style,
)
graph_panel = dash.html.Div(className='eight columns', children=[network_view])

# Right panel: PNG export button plus a placeholder for the exported image.
export_panel = dash.html.Div(className='four columns', children=[
    dash.html.Div('Download graph as PNG:'),
    dash.html.Button("Download PNG", id="btn-get-png"),
    dash.html.Div(id='image-text', children='Image data will appear here'),
])

app.layout = dash.html.Div([graph_panel, export_panel])

# Show the exported image once the Cytoscape component reports its image data
@app.callback(
    Output('image-text', 'children'),
    Input('cytoscape-image-export', 'imageData'),
)
def put_image_string(data):
    """Return an <img> built from `data`, or a placeholder string when `data` is empty."""
    return dash.html.Img(src=data) if data else "No image data yet."
# Ask the Cytoscape component for a PNG whenever the button is clicked
@app.callback(
    Output("cytoscape-image-export", "generateImage"),
    Input("btn-get-png", "n_clicks"),
)
def get_image(get_png_clicks):
    """Return a PNG 'store' request after a click; otherwise leave the property unchanged."""
    if not get_png_clicks:
        # No click registered yet: do not touch `generateImage`.
        return dash.no_update
    return {'type': 'png', 'action': 'store'}

app.run()