In [9]:
import pandas as pd

from bengrn import get_GT_db, get_perturb_gt, get_sroy_gt, BenGRN, FILEDIR

import numpy as np
import tqdm
import scanpy as sc
import os

from huggingface_hub import hf_hub_download

from scprint import scPrint
from scprint.tasks import GNInfer
from scdataloader import Preprocessor

from bengrn import BenGRN
from anndata.utils import make_index_unique

from bengrn.base import train_classifier
from bengrn import compute_genie3
from grnndata import utils as grnutils
import joblib

from grnndata import read_h5ad

from matplotlib import pyplot as plt
from scdataloader import utils as data_utils

from pyvis import network as pnx
import networkx as nx
import scipy.sparse
import gseapy as gp
from gseapy import dotplot

from scdataloader.utils import revert_to_raw

import scipy.sparse

import torch

torch.set_float32_matmul_precision("medium")

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Root directory holding the GRN .h5ad files. The cluster path stays the default;
# set the GRN_DATA_DIR environment variable to run the notebook elsewhere.
# os.path.join(..., "") guarantees the trailing separator that the plain string
# concatenation in the loading cell relies on.
LOC = os.path.join(
    os.environ.get("GRN_DATA_DIR", '/pasteur/appa/scratch/jkalfon/data/'), ""
)

# Cell type being compared; it is interpolated into the network file names.
name = "Macrophage"

In [7]:

# Load the mouse and human networks for the selected cell type.
mouse_path = LOC + f"grn_task4_mouse_{name}.h5ad"
human_path = LOC + f"grn_task4_human_{name}.h5ad"
grn_m = read_h5ad(mouse_path)
grn_h = read_h5ad(human_path)

In [37]:
# Upper-case the mouse gene symbols in place.
grn_m.var["symbol"] = grn_m.var["symbol"].str.upper()

In [8]:
# Index the mouse genes by their upper-cased symbol, then intersect with the
# human index: the two networks share ~3000 of their ~4000 genes.
grn_m.var.index = grn_m.var["symbol"].str.upper()
common = set(grn_h.var.index).intersection(grn_m.var.index)
len(common)

2978

In [45]:
# Human hub genes: sum the GRN weights over axis 0 (number / strength of
# connections per gene) and show the 30 largest totals.
human_hub_score = grn_h.grn.sum(axis=0)
human_hub_score.sort_values(ascending=False).iloc[:30]

index
FTL         67.522400
B2M         63.816372
TMSB4X      54.556316
NLN         51.200565
FTH1        43.270775
HLA-DRA     40.650234
TXNIP       38.062641
EIF1        35.565845
C1QA        25.740410
UBXN1       25.120731
LYZ         23.978209
ITGB2       23.338360
CD74        23.209900
RPS14       22.008221
GABARAP     21.861885
SOD2        21.790438
ACTB        20.383188
C1QB        20.347918
PPIA        20.202841
C1QC        19.605148
RPS27A      19.256315
SPIC        19.180021
RPS6        18.481428
ALB         18.100208
ALOX5AP     17.430592
SRGN        17.301922
DMRT2       16.415487
IFI30       16.027319
SH3BGRL3    14.680469
RPL7        13.885112
dtype: float32

In [46]:
# Same ranking, ignoring genes that are absent from the mouse network.
human_hub_score_common = grn_h.grn.sum(axis=0).loc[list(common)]
human_hub_score_common.sort_values(ascending=False).iloc[:30]

index
B2M        63.816372
TMSB4X     54.556316
FTH1       43.270775
ITGB2      23.338360
CD74       23.209900
RPS14      22.008221
SOD2       21.790438
PPIA       20.202841
C1QC       19.605148
SPIC       19.180021
RPS6       18.481428
ALB        18.100208
SRGN       17.301922
IFI30      16.027319
RPS19      13.683068
CD163      13.305556
CST3       13.139258
RPLP1      11.228999
MYL6       11.163845
PTP4A1     11.134760
S100A11    10.099115
CD3D       10.072865
VSIG4       9.813706
OLR1        9.525496
S100A4      9.370368
HMOX1       9.352107
CTSD        8.292619
XAF1        7.860425
VIM         7.456419
CXCL3       7.263704
dtype: float32

ACTB=ACTB (most expressed in B cell, motility, key linked gene is CFL1 top 10 in mouse)

B2M=B2M (serum protein found in association with the major histocompatibility complex, linked to HLA-C, top 20 in human genes)

TMSB4X = TMSB10, TMSB4X (Thymosin Beta, cell prolif, highly linked to actin and ACTB)

CD83, CD52 = CD52 CD3D, (CD74)  ( immunoglobulin superfamily of receptors, Regulate b cell function, esp CD83, CD52 less)

S100A5 = S100A6 (key cell process regulation with calcium binding, both in top 3 relation to TP53)

RPL41, RPL3 = RPL10, RPLP1 (ribosomal proteins, much more for human in top50, highly co-regulated, often together, have an important impact on mRNA content by working on splicing, mRNA stability, sequestration of TFs)

[RPS6] = RPS27A, RPS24, (RPS28) (same)

[FTL] = FTH1 (iron storage)
    MEF2C, JUNB (key TF in b cells, https://pubmed.ncbi.nlm.nih.gov/12907453/, https://pmc.ncbi.nlm.nih.gov/articles/PMC2518613/)

RGS2, RGS1, GPR183, GPX1 (G protein Receptor, signaling members, super important b cell members)

JCHAIN, ITM2C, OGT, UBE2S, IFITM1, CREM, FCER1G, CST3
    TOP2A, CRIP1, PPP1R15A, MS4A1, PPIA, PTMA

"Immunoglobulin Receptor Binding (GO:0034987)" JCHAIN
"B cell activation", "host pathogen pathways" IFITM1, FCER1G "B cell activation" MS4A1 == "virus detection" PPIA, PPP1R15A
"MHC Class II Protein Complex Binding" MS4A1, "HSPA1A" (MHC related)(PPP1R15A;PTMA)
"circadian rhythm" CST3;CREM;OGT = TOP2A

() = in the non matching genes
[] = in the top 50

can find homologs and might use some genes that are similar when others are not available or expressed

bias on the 1000 cells and which ones had which genes expressed
worked using only known human / mouse orthologs, so some relationships might not have been found

Generically, many genes with high importance in generic cellular processes, and some with high importance in B cell function.

Even more interesting, many of these genes are listed in papers about the transcriptional control dynamics of B cell function using paired single-cell ATAC-seq, ChIP-seq and RNA-seq: https://www.biorxiv.org/content/10.1101/2022.02.23.481342v2.full.pdf "Fate-resolved gene regulatory signatures of individual B lymphocytes in the early stages of Epstein-Barr Virus infection"
(PPP1R-, TOP2A, MS4A1, FCER2/FCER1G, JCHAIN, FTH1, FTL, TMSB10, TMSB4X, GPR183, (likely other members of the G protein family), numerous ribosomal subunit genes (listed as example: RPS27A) and also))

In [None]:
Interferon-induced genes are among the top hub genes in human but not in mouse

In [None]:
APOE
FCNA
SRRM2
MRC1
IGFBP7
WFDC17
CFP
ITM2B
NDFIP1
NME2
DDX5
VSIG4
ZBTB20
TUBA4A

In [None]:
IFI30 - IFITM3 Interferon induced transmembrane protein 3 and thiol reductase involved in MHC class II antigen processing


In [None]:
NLN
HLA-DRA
TXNIP
EIF1
UBXN1
LYZ
ITGB2
GABARAP
SOD2
ACTB
PPIA
SPIC
ALOX5AP
SRGN
DMRT2
SH3BGRL

In [None]:
CD27

In [57]:
# Pathway enrichment of hub genes with Enrichr (through gseapy).
# Each call tests a single gene list: point `gene_list` below at the list to test.
# Other lists that were tried with this cell:
#   ["JCHAIN", "ITM2C", "OGT", "UBE2S", "IFITM1", "CREM", "FCER1G", "CST3"]
#   ["TOP2A", "CRIP1", "PPP1R15A", "MS4A1", "PPIA", "PTMA"]

# Genes taken from the human top-30 hub table.
HUMAN_HUB_GENES = [
    "NLN",
    "HLA-DRA",
    "TXNIP",
    "EIF1",
    "UBXN1",
    "LYZ",
    "ITGB2",
    "GABARAP",
    "SOD2",
    "ACTB",
    "PPIA",
    "SPIC",
    "ALOX5AP",
    "SRGN",
    "DMRT2",
    # NOTE(review): the human hub table lists SH3BGRL3, not SH3BGRL - confirm the symbol.
    "SH3BGRL",
]

# Genes taken from the mouse top-30 hub table.
MOUSE_HUB_GENES = [
    "APOE",
    "FCNA",
    "SRRM2",
    "MRC1",
    "IGFBP7",
    "WFDC17",
    "CFP",
    "ITM2B",
    "NDFIP1",
    "NME2",
    "DDX5",
    "VSIG4",
    "ZBTB20",
    "TUBA4A",
]

enr = gp.enrichr(
    gene_list=MOUSE_HUB_GENES,
    # Note: it seems with enrichr, one can only use one gene set at a time, choose accordingly.
    # Other libraries: 'MSigDB_Hallmark_2020', 'Reactome_2022', 'Tabula_Sapiens',
    # 'WikiPathway_2023_Human', 'TF_Perturbations_Followed_by_Expression',
    # 'PPI_Hub_Proteins', 'OMIM_Disease', 'GO_Molecular_Function_2023'
    gene_sets=[
        "WikiPathway_2023_Human"
    ],
    organism="Human",  # change accordingly
    # description='pathway',
    cutoff=0.05,  # test dataset, use lower value for real case
    # NOTE(review): the background is always the human symbol list, including when the
    # mouse hub genes are tested - confirm this is intended.
    background=grn_h.var.symbol.tolist(),
)
# Display up to 20 terms whose adjusted p-value is below 0.1.
enr.res2d[enr.res2d["Adjusted P-value"] < 0.1].head(20)

Unnamed: 0,Gene_set,Term,P-value,Adjusted P-value,Old P-value,Old adjusted P-value,Odds Ratio,Combined Score,Genes
0,WikiPathway_2023_Human,Complement System WP2806,0.002909,0.042997,0,0,31.708683,185.175704,VSIG4;CFP
1,WikiPathway_2023_Human,ApoE And miR 146 In Inflammation And Atheroscl...,0.009407,0.042997,0,0,142.65,665.640291,APOE
2,WikiPathway_2023_Human,Interactions Between LOXL4 And Oxidative Stres...,0.010968,0.042997,0,0,118.854167,536.364611,IGFBP7
3,WikiPathway_2023_Human,Lipid Particles Composition WP3601,0.010968,0.042997,0,0,118.854167,536.364611,APOE
4,WikiPathway_2023_Human,Alzheimer 39 S Disease And miRNA Effects WP2059,0.013818,0.042997,0,0,13.780788,59.006095,APOE;TUBA4A
5,WikiPathway_2023_Human,Alzheimer 39 S Disease WP5124,0.013818,0.042997,0,0,13.780788,59.006095,APOE;TUBA4A
6,WikiPathway_2023_Human,Familial Hyperlipidemia Type 3 WP5110,0.014082,0.042997,0,0,89.109375,379.862471,APOE
7,WikiPathway_2023_Human,Metabolic Pathway Of LDL HDL And TG Including ...,0.015635,0.042997,0,0,79.194444,329.307681,APOE
8,WikiPathway_2023_Human,Complement Activation WP545,0.018736,0.0458,0,0,64.772727,257.620082,CFP
9,WikiPathway_2023_Human,Melatonin Metabolism And Effects WP3298,0.023371,0.051417,0,0,50.866071,191.065387,APOE


In [None]:
HUMAN
1	WikiPathway_2023_Human	Mechanoregulation And Pathology Of YAP TAZ Via...	0.001860	0.051145	0	0	38.006689	238.958204	ITGB2;ACTB
2	WikiPathway_2023_Human	Male Infertility WP4673	0.005202	0.052236	0	0	21.788462	114.578700	HLA-DRA;SOD2
3	WikiPathway_2023_Human	Acute Viral Myocarditis WP4298	0.005202	0.052236	0	0	21.788462	114.578700	ITGB2;ACTB
4	WikiPathway_2023_Human	Glucocorticoid Receptor Pathway WP2880	0.005448	0.052236	0	0	21.253283	110.784267	SRGN;ALOX5AP
5	WikiPathway_2023_Human	Selenium Micronutrient Network WP15	0.005698	0.052236	0	0	20.743590	107.193841	ALOX5AP;SOD2
Allograft Rejection	0.003408	0.047707	0	0	11.940171	67.840978	SRGN;ITGB2;HLA-DRA
0	GO_Molecular_Function_2023	K48-linked Polyubiquitin Modification-Dependen...	0.002622	0.055033	0	0	inf	inf	UBXN1
1	GO_Molecular_Function_2023	Ubiquitin Protein Ligase Binding (GO:0031625)	0.004653	0.055033	0	0	10.637405	57.124879	UBXN1;TXNIP;GABARAP
2	GO_Molecular_Function_2023	Ubiquitin-Like Protein Ligase Binding (GO:0044...	0.005693	0.055033	0	0	9.865248	50.988665	UBXN1;TXNIP;GABARAP
3	GO_Molecular_Function_2023	MHC Class II Receptor Activity (GO:0032395)	0.007848	0.056897	0	0	203.678571	987.336365	HLA-DRA
4	GO_Molecular_Function_2023	GABA Receptor Binding (GO:0050811)	0.010451	0.060615	0	0	135.761905	619.218567	GABARAP
5	GO_Molecular_Function_2023	Complement Component C3b Binding (GO:0001851)	0.015638	0.075584	0	0	81.428571	338.583502	ITGB2
6	GO_Molecular_Function_2023	Cyclosporin A Binding (GO:0016018)	0.023371	0.088566	0	0	50.866071	191.065387	PPIA
7	GO_Molecular_Function_2023	Ubiquitin Ligase-Substrate Adaptor Activity (G...	0.028495	0.088566	0	0	40.678571	144.735018	SH3BGRL
8	GO_Molecular_Function_2023	Ribosomal Small Subunit Binding (GO:0043024)	0.028495	0.088566	0	0	40.678571	144.735018	EIF1
9	GO_Molecular_Function_2023	Beta-Tubulin Binding (GO:0048487)	0.031048	0.088566	0	0	36.974026	128.382165	GABARAP

MOUSE
0	GO_Molecular_Function_2023	Amyloid-Beta Binding (GO:0001540)	0.002012	0.044950	0	0	38.564626	239.436015	APOE;ITM2B
1	GO_Molecular_Function_2023	G-quadruplex DNA Binding (GO:0051880)	0.004714	0.044950	0	0	356.812500	1911.551692	NME2
2	GO_Molecular_Function_2023	Phosphatidylcholine-Sterol O-acyltransferase A...	0.004714	0.044950	0	0	356.812500	1911.551692	APOE
3	GO_Molecular_Function_2023	C2H2 Zinc Finger Domain Binding (GO:0070742)	0.006280	0.044950	0	0	237.833333	1205.890220	SRRM2
4	GO_Molecular_Function_2023	Complement Component C3b Binding (GO:0001851)	0.009407	0.044950	0	0	142.650000	665.640291	VSIG4
5	GO_Molecular_Function_2023	Guanyl Ribonucleotide Binding (GO:0032561)	0.011047	0.044950	0	0	15.556172	70.090327	NME2;TUBA4A
6	GO_Molecular_Function_2023	WW Domain Binding (GO:0050699)	0.012526	0.044950	0	0	101.857143	446.130544	NDFIP1
7	GO_Molecular_Function_2023	Nucleoside Diphosphate Kinase Activity (GO:000...	0.012526	0.044950	0	0	101.857143	446.130544	NME2
8	GO_Molecular_Function_2023	Low-Density Lipoprotein Particle Receptor Bind...	0.014082	0.044950	0	0	89.109375	379.862471	APOE
9	GO_Molecular_Function_2023	Phosphotransferase Activity, Phosphate Group A...	0.014082	0.044950	0	0	89.109375	379.862471	NME2
10	GO_Molecular_Function_2023	mRNA Binding (GO:0003729)	0.016865	0.044950	0	0	12.363234	50.473384	SRRM2;DDX5
11	GO_Molecular_Function_2023	Nucleobase-Containing Compound Kinase Activity...	0.017187	0.044950	0	0	71.262500	289.582676	NME2
12	GO_Molecular_Function_2023	pre-mRNA Binding (GO:0036002)	0.017187	0.044950	0	0	71.262500	289.582676	DDX5
13	GO_Molecular_Function_2023	Lipoprotein Particle Receptor Binding (GO:0070...	0.021829	0.046386	0	0	54.788462	209.540593	APOE
14	GO_Molecular_Function_2023	Nuclear Androgen Receptor Binding (GO:0050681)	0.021829	0.046386	0	0	54.788462	209.540593	DDX5
15	GO_Molecular_Function_2023	R-SMAD Binding (GO:0070412)	0.021829	0.046386	0	0	54.788462	209.540593	DDX5
16	GO_Molecular_Function_2023	Promoter-Specific Chromatin Binding (GO:1990841)	0.024912	0.049824	0	0	47.466667	175.266110	DDX5
0	Reactome_2022	Metabolism Of Proteins R-HSA-392499	0.000344	0.034396	0	0	12.930719	103.122267	DDX5;IGFBP7;APOE;CFP;ITM2B;TUBA4A
1	Reactome_2022	Amyloid Fiber Formation R-HSA-977225	0.000936	0.046788	0	0	57.989796	404.429819	APOE;ITM2B
2	Reactome_2022	Chylomicron Clearance R-HSA-8964026	0.003145	0.082834	0	0	713.750000	4112.675946	APOE
3	Reactome_2022	Post-translational Protein Phosphorylation R-H...	0.003358	0.082834	0	0	29.381818	167.367878	IGFBP7;APOE
4	Reactome_2022	Regulation Of IGF Transport And Uptake By IGFB...	0.004218	0.082834	0	0	26.032258	142.355222	IGFBP7;APOE
5	Reactome_2022	Chylomicron Remodeling R-HSA-8963901	0.006280	0.082834	0	0	237.833333	1205.890220	APOE
6	Reactome_2022	Modulation By Mtb Of Host Immune System R-HSA-...	0.006280	0.082834	0	0	237.833333	1205.890220	MRC1
7	Reactome_2022	Post-translational Protein Modification R-HSA-...	0.007187	0.082834	0	0	7.567766	37.350912	DDX5;IGFBP7;APOE;CFP
8	Reactome_2022	Chylomicron Assembly R-HSA-8963888	0.007845	0.082834	0	0	178.343750	864.587514	APOE
9	Reactome_2022	mRNA Splicing - Major Pathway R-HSA-72163	0.009863	0.082834	0	0	16.536082	76.379846	SRRM2;DDX5
10	Reactome_2022	mRNA Splicing R-HSA-72172	0.010845	0.082834	0	0	15.711485	71.079400	SRRM2;DDX5
0	WikiPathway_2023_Human	Complement System WP2806	0.002909	0.042997	0	0	31.708683	185.175704	VSIG4;CFP
1	WikiPathway_2023_Human	ApoE And miR 146 In Inflammation And Atheroscl...	0.009407	0.042997	0	0	142.650000	665.640291	APOE
2	WikiPathway_2023_Human	Interactions Between LOXL4 And Oxidative Stres...	0.010968	0.042997	0	0	118.854167	536.364611	IGFBP7
3	WikiPathway_2023_Human	Lipid Particles Composition WP3601	0.010968	0.042997	0	0	118.854167	536.364611	APOE
4	WikiPathway_2023_Human	Alzheimer 39 S Disease And miRNA Effects WP2059	0.013818	0.042997	0	0	13.780788	59.006095	APOE;TUBA4A
5	WikiPathway_2023_Human	Alzheimer 39 S Disease WP5124	0.013818	0.042997	0	0	13.780788	59.006095	APOE;TUBA4A
6	WikiPathway_2023_Human	Familial Hyperlipidemia Type 3 WP5110	0.014082	0.042997	0	0	89.109375	379.862471	APOE
7	WikiPathway_2023_Human	Metabolic Pathway Of LDL HDL And TG Including ...	0.015635	0.042997	0	0	79.194444	329.307681	APOE
8	WikiPathway_2023_Human	Complement Activation WP545	0.018736	0.045800	0	0	64.772727	257.620082	CFP
9	WikiPathway_2023_Human	Melatonin Metabolism And Effects WP3298	0.023371	0.051417	0	0	50.866071	191.065387	APOE
10	WikiPathway_2023_Human	Statin Inhibition Of Cholesterol Production WP430	0.026451	0.052901	0	0	44.492188	161.616875	APOE
11	WikiPathway_2023_Human	Fatty Acids And Lipoproteins Transport In Hepa...	0.029521	0.054122	0	0	39.534722	139.266920	APOE

In [None]:
# Set of the TOP mouse genes with the largest summed GRN weight.
TOP = 100
mouse_hub_score = grn_m.grn.sum(axis=0)
set(mouse_hub_score.sort_values(ascending=False).iloc[:TOP].index)

{'ACTB',
 'ATP1A1',
 'ATP4A',
 'B2M',
 'BIRC5',
 'CCDC89',
 'CCNB2',
 'CD3D',
 'CD52',
 'CD74',
 'CFL1',
 'CHAD',
 'COLQ',
 'COX7B',
 'COX8B',
 'CRIP1',
 'CTLA2A',
 'CYP2F2',
 'DUSP11',
 'FAM120B',
 'FCMR',
 'FTH1',
 'GIMAP4',
 'GM266',
 'H2-AA',
 'H2-AB1',
 'H2-EB1',
 'H3F3B',
 'HSP90AB1',
 'JCHAIN',
 'JUNB',
 'LIN54',
 'LY6D',
 'LYPD2',
 'MEF2C',
 'MPC1',
 'MRGPRF',
 'MS4A1',
 'MT-CYTB',
 'MT-ND1',
 'MT-ND2',
 'MT-ND4',
 'NDC80',
 'NWD1',
 'OXCT1',
 'PCDHGA12',
 'PCLAF',
 'PEX11G',
 'PPIA',
 'PPP1R15A',
 'PSME1',
 'PTMA',
 'PTPRB',
 'PTPRCAP',
 'RPL10',
 'RPL13',
 'RPL13A',
 'RPL14',
 'RPL18',
 'RPL18A',
 'RPL3',
 'RPL37',
 'RPL37A',
 'RPL38',
 'RPL39',
 'RPL4',
 'RPL6',
 'RPLP0',
 'RPLP1',
 'RPS12',
 'RPS16',
 'RPS17',
 'RPS18',
 'RPS19',
 'RPS23',
 'RPS24',
 'RPS27',
 'RPS27A',
 'RPS28',
 'RPS29',
 'RPS8',
 'RPSA',
 'RRM2',
 'S100A4',
 'S100A6',
 'SELENOP',
 'SLC25A5',
 'SNRPG',
 'TAF10',
 'TIFA',
 'TMEM132E',
 'TMEM176B',
 'TMSB10',
 'TMSB4X',
 'TOMM7',
 'TOP1',
 'TOP2A',
 'UBB',


In [47]:
# Mouse hub genes: 30 largest GRN weight totals (summed over axis 0).
mouse_hub_score = grn_m.grn.sum(axis=0)
mouse_hub_score.sort_values(ascending=False).iloc[:30]

symbol
C1QC       77.519073
TMSB4X     41.802864
APOE       40.588585
FCNA       39.359177
SRRM2      33.876957
MRC1       33.289558
RPL13A     32.056759
IGFBP7     31.896709
RPL11      27.574696
FTL1       27.434568
B2M        26.620033
IFITM3     26.154570
WFDC17     25.864525
CFP        21.936113
ITM2B      20.361074
NDFIP1     20.023716
CD52       19.125843
MT-CYTB    18.294144
NME2       17.928022
MT-ND1     17.800694
MT-ND4     17.678120
RPS24      17.041389
FTH1       16.653090
CD5L       16.438406
DDX5       15.556078
VSIG4      15.311343
ALB        15.164091
CD74       15.150368
ZBTB20     15.142496
TUBA4A     14.925520
dtype: float32

In [48]:
# Same ranking, ignoring genes that are absent from the human network.
mouse_hub_score_common = grn_m.grn.sum(axis=0).loc[list(common)]
mouse_hub_score_common.sort_values(ascending=False).iloc[:30]

symbol
C1QC       77.519073
TMSB4X     41.802864
APOE       40.588585
SRRM2      33.876957
MRC1       33.289558
IGFBP7     31.896709
B2M        26.620033
IFITM3     26.154570
CFP        21.936113
CD52       19.125843
NME2       17.928022
FTH1       16.653090
CD5L       16.438406
VSIG4      15.311343
ALB        15.164091
CD74       15.150368
ZBTB20     15.142496
TUBA4A     14.925520
CTSD       14.200592
FN1        13.714249
TMEM218    12.782110
SLC25A5    12.569665
RPLP1      12.023596
FOLR2      11.515420
GRN        11.379998
HSPA8      11.121265
CXCL2      10.497538
ID3         9.898645
CLEC4F      9.694858
PRDX1       9.653316
dtype: float32

In [15]:
# Genes that are among the TOP hubs of both the mouse and the human network.
TOP = 20
hubs_m = set(grn_m.grn.sum(axis=0).sort_values(ascending=False).iloc[:TOP].index)
hubs_h = set(grn_h.grn.sum(axis=0).sort_values(ascending=False).iloc[:TOP].index)
hubs_m & hubs_h

{'B2M', 'C1QC', 'TMSB4X'}

In [16]:
# Top differential hubs: genes in the TOP hubs of one species but not of the
# other, limited to genes that exist in both networks.
TOP = 100
in_both = set(grn_h.var.index) & set(grn_m.var.index)
hubs_m = set(grn_m.grn.sum(axis=0).sort_values(ascending=False).iloc[:TOP].index)
hubs_h = set(grn_h.grn.sum(axis=0).sort_values(ascending=False).iloc[:TOP].index)
# in m not in h
print((hubs_m - hubs_h) & in_both)
# in h not in m
print((hubs_h - hubs_m) & in_both)

{'TMEM218', 'SRRM2', 'GRN', 'SLC25A5', 'CFP', 'MPEG1', 'CHMP1B', 'ARG1', 'FOLR2', 'MX1', 'CLEC4F', 'APOA1', 'IFIT3', 'NUSAP1', 'ZFP36', 'ZBTB20', 'TUBA4A', 'KIF22', 'ID3', 'CD5L', 'HSPA8', 'APOE', 'MRC1', 'LPL', 'PLTP', 'HSPA5', 'IGFBP7', 'LGALS3', 'PTPRB', 'NME2', 'CD52', 'CXCL2', 'FN1', 'PRDX1', 'IFITM3'}
{'HMGN2', 'RPS14', 'EPSTI1', 'IFI30', 'S100A11', 'NR4A3', 'OLR1', 'SPARCL1', 'HMGB2', 'EGFL7', 'RPS6', 'CXCL3', 'S100A9', 'MYL6', 'CD163', 'NAMPT', 'PTP4A1', 'CTSL', 'HSP90AB1', 'S100A4', 'SRGN', 'COL1A2', 'VIM', 'SOD2', 'ITGB2', 'CST3', 'CD3D', 'XAF1', 'SPIC', 'HMOX1', 'FOXRED2', 'P2RY13', 'CCL4', 'MARCO', 'MS4A7'}


In [None]:
{'TMEM218', 'SRRM2', 'GRN', 'SLC25A5', 'CFP', 'MPEG1', 'CHMP1B', 'ARG1', 'FOLR2', 'MX1', 'CLEC4F', 'APOA1', 'IFIT3', 'NUSAP1', 'ZFP36', 'ZBTB20', 'TUBA4A', 'KIF22', 'ID3', 'CD5L', 'HSPA8', 'APOE', 'MRC1', 'LPL', 'PLTP', 'HSPA5', 'IGFBP7', 'LGALS3', 'PTPRB', 'NME2', 'CD52', 'CXCL2', 'FN1', 'PRDX1', 'IFITM3'}

{'HMGN2', 'RPS14', 'EPSTI1', 'IFI30', 'S100A11', 'NR4A3', 'OLR1', 'SPARCL1', 'HMGB2', 'EGFL7', 'RPS6', 'CXCL3', 'S100A9', 'MYL6', 'CD163', 'NAMPT', 'PTP4A1', 'CTSL', 'HSP90AB1', 'S100A4', 'SRGN', 'COL1A2', 'VIM', 'SOD2', 'ITGB2', 'CST3', 'CD3D', 'XAF1', 'SPIC', 'HMOX1', 'FOXRED2', 'P2RY13', 'CCL4', 'MARCO', 'MS4A7'}

CXCL2 - CXCL3
CD52 - CD3D
PTP4A1 - PTPRB
HSPA8 / HSPA5 - HSP90AB1

In [None]:
TREM
PPARG
CD36, CD9

In [58]:
# Differential hubs among the TOP hubs of each species, limited to genes that
# exist in both networks.
TOP = 20
in_both = set(grn_h.var.index) & set(grn_m.var.index)
hubs_m = set(grn_m.grn.sum(axis=0).sort_values(ascending=False).iloc[:TOP].index)
hubs_h = set(grn_h.grn.sum(axis=0).sort_values(ascending=False).iloc[:TOP].index)
# in m not in h
print((hubs_m - hubs_h) & in_both)
# in h not in m
print((hubs_h - hubs_m) & in_both)

{'IGFBP7', 'NME2', 'SRRM2', 'CD52', 'APOE', 'MRC1', 'CFP', 'IFITM3'}
{'PPIA', 'ITGB2', 'FTH1', 'RPS14', 'CD74', 'SOD2'}


In [59]:
# Differential hubs among the TOP hubs of each species, limited to genes that
# exist in both networks.
TOP = 50
in_both = set(grn_h.var.index) & set(grn_m.var.index)
hubs_m = set(grn_m.grn.sum(axis=0).sort_values(ascending=False).iloc[:TOP].index)
hubs_h = set(grn_h.grn.sum(axis=0).sort_values(ascending=False).iloc[:TOP].index)
# in m not in h
print((hubs_m - hubs_h) & in_both)
# in h not in m
print((hubs_h - hubs_m) & in_both)

{'IGFBP7', 'TUBA4A', 'FOLR2', 'NME2', 'TMEM218', 'SRRM2', 'CD52', 'CD5L', 'GRN', 'CTSD', 'HSPA8', 'APOE', 'FN1', 'MRC1', 'SLC25A5', 'CFP', 'IFITM3', 'ZBTB20'}
{'PPIA', 'ITGB2', 'PTP4A1', 'CST3', 'RPS14', 'CD3D', 'SPIC', 'IFI30', 'RPS6', 'SRGN', 'RPS19', 'SOD2', 'S100A11', 'OLR1', 'MYL6', 'CD163'}


In [20]:
# We now compute eigen centrality on a sparsified network that keeps only the
# top 20 neighbors of each gene.
# The following cells read the resulting scores from `.var.centrality`.
# top_k_to_disp=0 asks for no genes to be listed, hence the empty "Top central genes" output.
TOP = 20

grnutils.get_centrality(grn_h, TOP, top_k_to_disp=0)
grnutils.get_centrality(grn_m, TOP, top_k_to_disp=0)


Top central genes: []
Top central genes: []


[]

In [21]:
# Ten most central human genes.
grn_h.var["centrality"].sort_values(ascending=False).iloc[:10]

index
TMSB4X     0.276147
B2M        0.276147
FTH1       0.276147
FTL        0.276147
HLA-DRA    0.276147
EIF1       0.276147
CD74       0.275399
C1QB       0.242367
C1QA       0.231694
PPIA       0.206782
Name: centrality, dtype: float64

In [22]:
# Ten most central mouse genes.
grn_m.var["centrality"].sort_values(ascending=False).iloc[:10]

symbol
C1QC       0.274250
APOE       0.274250
TMSB4X     0.274250
B2M        0.274249
MT-CYTB    0.274011
MT-ND1     0.274002
FTH1       0.268971
MT-ND4     0.260279
FTL1       0.252664
IFITM3     0.217691
Name: centrality, dtype: float64

In [40]:
# Genes among the TOP most central genes of one species but not of the other,
# limited to genes that exist in both networks.
TOP = 10
in_both = set(grn_h.var.index) & set(grn_m.var.index)
central_m = set(grn_m.var.centrality.sort_values(ascending=False).iloc[:TOP].index)
central_h = set(grn_h.var.centrality.sort_values(ascending=False).iloc[:TOP].index)
# in m not in h
print((central_m - central_h) & in_both)
# in h not in m
print((central_h - central_m) & in_both)

{'C1QC', 'IFITM3', 'APOE'}
{'PPIA', 'CD74'}


In [26]:
# Per-gene expression total: sum X over axis 0 and take row 0 of the result.
# NOTE(review): presumably X is sparse, so sum(0) is a 1 x n_genes matrix - confirm.
np.array(grn_m.X.sum(0))[0]

array([[18., 61., 82., ..., 67.,  1., 20.]], dtype=float32)

In [27]:
# Rank correlation between the per-gene expression totals and the GRN weights summed over axis 1.
# NOTE(review): this cell sums the GRN over axis 1, whereas the Pearson cell below and the
# hub ranking sum over axis 0 - confirm which direction is intended.
scipy.stats.spearmanr(np.array(grn_m.X.sum(0))[0], grn_m.grn.sum(1).values)

SignificanceResult(statistic=0.34178112327196214, pvalue=2.832630747887451e-174)

In [29]:
# Pearson correlation between the per-gene expression totals and the hub score
# (GRN weights summed over axis 0).
expr_total = np.array(grn_m.X.sum(0))[0]
hub_score = grn_m.grn.sum(0).values
np.corrcoef(expr_total, hub_score)[0, 1]

0.6804855772300833

In [41]:
# Genes among the TOP most central genes of one species but not of the other,
# limited to genes that exist in both networks.
TOP = 20
in_both = set(grn_h.var.index) & set(grn_m.var.index)
central_m = set(grn_m.var.centrality.sort_values(ascending=False).iloc[:TOP].index)
central_h = set(grn_h.var.centrality.sort_values(ascending=False).iloc[:TOP].index)
# in m not in h
print((central_m - central_h) & in_both)
# in h not in m
print((central_h - central_m) & in_both)

{'CD5L', 'CTSD', 'APOE', 'RPS19', 'IFITM3'}
{'PPIA', 'ITGB2', 'RPS6', 'S100A11', 'FN1', 'SOD2'}


In [42]:
# Genes among the TOP most central genes of one species but not of the other,
# limited to genes that exist in both networks.
TOP = 50
in_both = set(grn_h.var.index) & set(grn_m.var.index)
central_m = set(grn_m.var.centrality.sort_values(ascending=False).iloc[:TOP].index)
central_h = set(grn_h.var.centrality.sort_values(ascending=False).iloc[:TOP].index)
# in m not in h
print((central_m - central_h) & in_both)
# in h not in m
print((central_h - central_m) & in_both)

{'FOLR2', 'IFITM3', 'CD52', 'CLEC4F', 'GRN', 'CTSD', 'CD5L', 'APOE', 'CFP', 'TTR', 'ZBTB20'}
{'PPIA', 'RPS14', 'IFI30', 'S100A11', 'SPARCL1', 'RPS27', 'RPS6', 'LGALS1', 'CD163', 'PTP4A1', 'APOC1', 'S100A4', 'SRGN', 'SOD2', 'ITGB2', 'CST3', 'NME2', 'CCL4', 'FN1'}


# 2. Network similarity

We now look at the similarity of the two networks based on general overlap of their top K edges across their common nodes

In [43]:
K = 20


def _top_k_mask(weights, k):
    """Mark, row by row, the entries that are >= the row's k-th largest value."""
    return weights.apply(lambda row: row >= row.nlargest(k).min(), axis=1)


# Boolean top-K edge masks restricted to the genes shared by both networks.
subgrn_h = _top_k_mask(grn_h.get(common).grn, K)
subgrn_m = _top_k_mask(grn_m.get(common).grn, K)

# Mean number of top-K edges shared per gene, as a fraction of K.
shared_edges = subgrn_m & subgrn_h
shared_edges.sum(axis=1).mean() / K


0.29805238415043656

In [61]:
# For every gene in subgrn_m, count the top-K mouse connections that are NOT
# among the top-K human connections of the same gene (lower = more similar).
dissim_scores = {
    gene: (subgrn_m.loc[gene] & ~subgrn_h.loc[gene]).sum() for gene in subgrn_m.index
}

# Ascending sort: genes whose mouse and human neighbourhoods agree most come first.
sim_df = pd.Series(dissim_scores).sort_values(ascending=True)
# The header previously said "most dissimilar" although this list shows the most similar genes.
print("genes with most similar top-K connections (mouse vs human):")
print(sim_df.head(60))

# Show the most similar genes
most_similar = sim_df.head(60).index.tolist()
print(f"\nMost similar genes: {most_similar}")

genes with most dissimilar top-K connections (mouse vs human):
RPLP1        7
HSPA8        8
RPS14        8
RPS27        8
RPS19        9
HMGN2        9
HSPA1B       9
HSPA1A       9
HSPE1        9
IFIT2       10
SERBP1      10
KLF4        10
PPIA        10
IFIT1       10
COX5A       10
SEC61G      10
RAN         10
EGR1        10
RPS6        10
IER2        10
CCL4        10
NME2        10
HSPA5       10
IFIT3       10
HSP90AA1    10
HNRNPA1     10
CD74        11
REL         11
PPP1R14B    11
FOSL2       11
S100A9      11
MAF         11
S100A8      11
NR4A2       11
STAB1       11
MRPL21      11
ISG15       11
MRC1        11
SET         11
HMGB2       11
U2AF1       11
PSMB5       11
CCRL2       11
HSPD1       11
DNAJB1      11
NR4A1       11
HSPB1       11
IGF1        11
STAT1       11
DCN         11
SRGN        11
POLR2E      11
NSA2        11
ZFP36       11
NAA38       11
LYVE1       11
IFITM3      11
HEXB        11
IFITM1      11
TUBB4B      11
dtype: int64

Most similar genes: ['R

In [44]:
# For every gene in subgrn_m, count the top-K mouse connections that are NOT
# among the top-K human connections of the same gene.
dissim_scores = {
    gene: (subgrn_m.loc[gene] & ~subgrn_h.loc[gene]).sum() for gene in subgrn_m.index
}

# Sort genes by dissimilarity (highest first)
dissim_df = pd.Series(dissim_scores).sort_values(ascending=False)
print("genes with most dissimilar top-K connections (mouse vs human):")
print(dissim_df.head(60))

# Show the most dissimilar genes
most_dissimilar = dissim_df.head(60).index.tolist()
print(f"\nMost dissimilar genes: {most_dissimilar}")

genes with most dissimilar top-K connections (mouse vs human):
TSPAN6       17
SLCO2A1      17
RPP38        17
TSPAN15      17
PLXNA3       17
RANGRF       17
ANLN         17
EFHD1        17
ID4          17
MRPL44       16
JMJD6        16
AIMP2        16
PRR5         16
SPATS2L      16
RPP25L       16
ATIC         16
XRCC5        16
DNAJC9       16
TMEM107      16
ITGAE        16
TSPAN2       16
GEMIN6       16
PES1         16
RWDD2A       16
EPOR         16
PFN2         16
SCAMP3       16
ENPEP        16
ARC          16
ROR1         16
ATP2A2       16
ANGPTL4      16
ADH4         16
MAFK         16
BORCS8       16
DDX27        16
CCDC43       16
MTFMT        16
GTF2F1       16
SYAP1        16
TMEM70       16
ALDH5A1      16
EXOSC10      16
TPRKB        16
DARS2        16
MTMR11       16
RGS5         16
NUF2         16
SLC26A4      16
TACC3        16
IBA57        16
PPP1R15B     16
NFE2L3       16
RIPK4        16
GCH1         16
EEF1AKMT1    16
GJB2         16
TMEM242      16
ODF3B    

In [63]:
# Enrichment of the most dissimilar genes, using the shared genes as background.
# Other libraries: 'MSigDB_Hallmark_2020', 'Reactome_2022', 'Tabula_Sapiens',
# 'WikiPathway_2023_Human', 'TF_Perturbations_Followed_by_Expression',
# 'PPI_Hub_Proteins', 'OMIM_Disease', 'GO_Molecular_Function_2023'
enrichr_libraries = ["GO_Molecular_Function_2023"]
enr = gp.enrichr(
    gene_list=most_dissimilar,
    gene_sets=enrichr_libraries,
    organism="Human",  # change accordingly
    cutoff=0.05,  # test dataset, use lower value for real case
    background=common,
)
# Keep the terms with an adjusted p-value below 0.05 and show at most 20 of them.
is_significant = enr.res2d["Adjusted P-value"] < 0.05
enr.res2d[is_significant].head(20)

Unnamed: 0,Gene_set,Term,P-value,Adjusted P-value,Old P-value,Old adjusted P-value,Odds Ratio,Combined Score,Genes
0,GO_Molecular_Function_2023,"Hydroxymethyl-, Formyl- And Related Transferas...",0.000399,0.043522,0,0,inf,inf,MTFMT;ATIC


In [None]:
# NOTE(review): looks like an unfinished `.plot` call left over from tab-completion;
# as written it likely raises AttributeError on Run All - confirm and remove or complete.
subgrn_m.pl

# res 

- seeing CD34 first is really cool (when most dissim human / mouse)
  
# pathway analysis
## mouse vs human
- many cell cycle genes
- replication genes
- G2-M Checkpoint / Mitotic Spindle
- DNA synthesis and replication, unwinding of DNA...
- microtubule and tubulin stuff
- CDK1, MCM2, ..
## human vs mouse
- cytokine signaling
- Fluoropyrimidine, Pyrimidine and serine metabolism
- same otherwise

Key differences in B cells: this also changes how they respond to cytokines and influences how they divide and replicate.
That difference can be driven in part by differences in nucleotide metabolism, which is key for DNA synthesis and replication.

	•	Ortholog expression / enzyme activities differ: The DPD example shows mouse > human in hepatic activity. That suggests species have evolved different baseline enzyme capacities.
	•	Organ/tissue distribution of enzymes differs: The capecitabine study showed different organ/intestinal vs hepatic enzyme distributions in mice vs humans. That implies that even with the same genes, tissue‐expression context differs across species.
	•	Differences in precursor supply / flux: For serine, de novo synthesis vs uptake differs between species/tissue contexts. So B-cells in mouse may rely more on one mode (say, uptake) vs human B-cells another.
	•	Differences in regulation by signalling or metabolism: Immune cells in different species may use different metabolic wiring (e.g., nutrient transporter expression, mitochondrial vs glycolytic flux). These differences may ripple through nucleotide and serine metabolism.
	•	Evolutionary adaptation to lifespan, cell turnover, and metabolic demand: Mice have much faster cell proliferation rates, higher basal metabolic rates, shorter lifespans; this may drive differences in how strongly they rely on de novo synthesis vs salvage pathways, or how tightly pyrimidine catabolism is regulated.


In [None]:
# For every gene in subgrn_h, count the top-K human connections that are NOT
# among the top-K mouse connections of the same gene.
dissim_scores = {
    gene: (subgrn_h.loc[gene] & ~subgrn_m.loc[gene]).sum() for gene in subgrn_h.index
}

# Sort genes by dissimilarity (highest first)
dissim_df = pd.Series(dissim_scores).sort_values(ascending=False)
print("genes with most dissimilar top-K connections (human vs mouse):")
print(dissim_df.head(60))

# Show the most dissimilar genes
most_dissimilar = dissim_df.head(60).index.tolist()
print(f"\nMost dissimilar genes: {most_dissimilar}")

genes with most dissimilar top-K connections (human vs mouse):
GINS1       18
KIF22       18
UBE3A       18
MELK        18
SDC4        18
RFC3        18
GCH1        18
ACTR10      18
NSD1        18
PRR11       18
SHCBP1      18
SIVA1       18
BAZ1B       18
UBE2C       18
DMKN        18
PARPBP      18
RRM2        18
TXN2        18
CENPK       18
SPDL1       18
RAB11A      18
MCM7        18
MCM4        18
SLC25A39    18
CCNB1       18
TOP2A       18
CCNL2       18
CENPH       18
NUSAP1      18
KIF23       18
DHFR        18
SEC63       18
BRCA1       18
SNRNP27     18
GLRX5       18
ASF1B       18
GSDMD       18
SPPL2A      18
GINS2       18
RRM1        18
RRP7A       18
RACGAP1     18
FEN1        18
TYMS        18
PLK1        18
CCNB2       18
AK2         18
SUCLG1      18
CDC20       18
GMNN        18
RFC4        18
DNAJC19     18
DLGAP5      18
PAICS       18
NSRP1       18
MRPL34      18
EXOSC8      18
SPC25       18
KCNK1       18
NDUFA4L2    18
dtype: int64

Most dissimilar genes: 

In [None]:
# Genes in the dissimilarity ranking whose symbol starts with "CD".
has_cd_prefix = dissim_df.index.str.startswith("CD")
dissim_df.index[has_cd_prefix]

Index(['CDC20', 'CDK1', 'CDC16', 'CD34', 'CDCA3', 'CDKAL1', 'CDO1', 'CDK4',
       'CD200R1', 'CDC34', 'CDKN2AIPNL', 'CDK14', 'CDKN1A', 'CDC37L1', 'CDC26',
       'CD36', 'CD93', 'CD68', 'CD83', 'CD14', 'CD69', 'CD40', 'CD47', 'CDC37',
       'CD63', 'CD46', 'CD52', 'CD3D'],
      dtype='object')

In [None]:
# Top-K edge overlap restricted to shared genes whose symbol starts with "CD" + 1-4 digits.
# K, subgrn_h and subgrn_m are overwritten here, so any dissimilarity cell run
# after this one operates on these CD-gene subnetworks.
K = 5

cd_genes_h = set(common) & set(grn_h.var.index[grn_h.var.index.str.match(r"CD\d{1,4}")])
subgrn_h = grn_h.get(cd_genes_h).grn
subgrn_h = subgrn_h.apply(lambda row: row >= row.nlargest(K).min(), axis=1)

cd_genes_m = set(common) & set(grn_m.var.index[grn_m.var.index.str.match(r"CD\d{1,4}")])
subgrn_m = grn_m.get(cd_genes_m).grn
subgrn_m = subgrn_m.apply(lambda row: row >= row.nlargest(K).min(), axis=1)

# Mean number of top-K edges shared per gene, as a fraction of K.
shared_edges = subgrn_m & subgrn_h
shared_edges.sum(axis=1).mean() / K

np.float64(0.5857142857142856)

In [None]:
https://chatgpt.com/share/68ee40f1-7380-8000-b5ef-585db46704e9