<a href="https://colab.research.google.com/github/cellatlas/human/blob/master/markers/adipose/markers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install -q gget
!pip install -q git+https://github.com/sbooeshaghi/ec

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ec (setup.py) ... [?25l[?25hdone


In [5]:
import pandas as pd
import numpy as np
from ec.utils import write_markers

In [6]:
!wget -O genes.txt https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
genes_list = pd.read_csv('genes.txt', header = None)[0].values

--2023-03-18 04:32:39--  https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.box.com (caltech.box.com)... 74.112.186.144
Connecting to caltech.box.com (caltech.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:32:39--  https://caltech.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Reusing existing connection to caltech.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:32:39--  https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.app.box.com (caltech.app.box.com)... 74.112.186.144
Connecting to caltech.app.box.com (caltech.app.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 3

# Adipose

In [7]:
# Provenance of the marker table processed in this notebook.
species = "homo_sapiens"
organ = "adipose"
reference = "GRCh38"
paper_doi = "https://doi.org/10.1038/s41586-022-04518-2"
table_link = "https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-022-04518-2/MediaObjects/41586_2022_4518_MOESM4_ESM.xlsx"

# File name of the table; intentionally left out of the header record.
table_name = "41586_2022_4518_MOESM4_ESM.xlsx"

# One-record header; the fields keep the order in which they are listed here.
header = [
    dict(
        species=species,
        organ=organ,
        reference=reference,
        paper_doi=paper_doi,
        table_link=table_link,
    )
]

In [8]:
# Read the spreadsheet directly from table_link, skipping its first row.
excel = pd.read_excel(table_link, skiprows=1)

# Expose the "cluster" column under the name "celltype".
df = excel.rename({"cluster": "celltype"}, axis="columns")

In [9]:
# Boolean mask: True for rows whose gene appears in genes_list.
bidx = df["gene"].isin(genes_list)

# Report how many rows are dropped, then keep only the matching rows.
n_dropped = np.sum(~bidx)
print(f'Filtered {n_dropped} out of {len(bidx)} genes')
df = df.loc[bidx]

Filtered 372 out of 17361 genes


In [10]:
# Preview the first five rows of the filtered table.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,celltype,gene
0,CXCL14,0.0,3.699825,0.426,0.082,0.0,ASPC,CXCL14
1,NEGR1,0.0,3.668332,0.867,0.247,0.0,ASPC,NEGR1
2,DCN,0.0,3.565394,0.96,0.506,0.0,ASPC,DCN
3,LAMA2,0.0,3.417902,0.829,0.266,0.0,ASPC,LAMA2
4,APOD,0.0,3.383653,0.493,0.12,0.0,ASPC,APOD


In [11]:
# Selection thresholds.
min_mean = 100  # NOTE(review): not referenced by any filter in this cell
max_pval = 1e-10
min_lfc = 2.2
max_gene_shares = 2
max_per_celltype = 20

# Keep rows whose adjusted p-value is at most max_pval and whose
# log2 fold change is at least min_lfc.
dfc = df[(df["p_val_adj"] <= max_pval) & (df["avg_log2FC"] >= min_lfc)]

# Keep genes that occur in fewer than max_gene_shares of the remaining rows.
# The counts are computed once and reused for both the values and the mask.
# NOTE(review): this equals "shared by fewer than max_gene_shares cell types"
# only if each (gene, celltype) pair appears at most once - confirm.
gene_counts = dfc["gene"].value_counts()
non_repeat_genes = gene_counts[gene_counts < max_gene_shares].index.values

# Sort ascending by pct.1 so that tail() below picks the highest pct.1 rows.
m = dfc[dfc.gene.isin(non_repeat_genes)].sort_values('pct.1', ascending = True)

# Fail with a clear message instead of propagating a NaN sample size when
# no row survives the filters above.
if m.empty:
    raise ValueError(
        "No genes passed the marker filters; "
        "relax max_pval, min_lfc or max_gene_shares"
    )

# Every cell type gets the same number of markers: the size of the smallest
# cell type group, capped at max_per_celltype.
n_sample = int(min(m["celltype"].value_counts().min(), max_per_celltype))

# Take the n_sample rows with the highest pct.1 for each cell type.
markers = m.groupby('celltype').tail(n_sample)

# Map each cell type to the list of its selected genes.
markers_dict = markers.groupby("celltype")["gene"].apply(list).to_dict()

In [12]:
# Number of selected marker rows per cell type.
markers["celltype"].value_counts()

monocyte          12
nk_cell           12
t_cell            12
pericyte          12
neutrophil        12
b_cell            12
SMC               12
endometrium       12
dendritic_cell    12
endothelial       12
LEC               12
mast_cell         12
ASPC              12
macrophage        12
mesothelium       12
adipocyte         12
Name: celltype, dtype: int64

In [13]:
# Write the selected markers and the provenance header to markers.txt.
# The `!cat markers.txt` output further down shows the resulting layout:
# a `#`-prefixed header line, then one line per cell type with its genes.
write_markers("markers.txt", markers_dict, header)

In [14]:
# Mean pct.1 of the selected markers per cell type, shown lowest first.
mean_pct1_by_celltype = markers.groupby("celltype")["pct.1"].mean()
mean_pct1_by_celltype.sort_values()

celltype
t_cell            0.558500
monocyte          0.560417
nk_cell           0.567917
neutrophil        0.674000
pericyte          0.690917
b_cell            0.704250
SMC               0.711417
endometrium       0.745417
LEC               0.792833
dendritic_cell    0.794667
endothelial       0.795250
mast_cell         0.807750
macrophage        0.811833
ASPC              0.825917
mesothelium       0.857417
adipocyte         0.949667
Name: pct.1, dtype: float64

In [15]:
# Print the generated marker file for a visual check.
!cat markers.txt

# homo_sapiens	adipose	GRCh38	https://doi.org/10.1038/s41586-022-04518-2	https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-022-04518-2/MediaObjects/41586_2022_4518_MOESM4_ESM.xlsx
ASPC	NOVA1,COL6A3,FBN1,DCLK1,COL3A1,LAMA2,CCDC80,NEGR1,GSN,COL1A2,CFD,DCN
LEC	PGM5,RHOJ,PDE1A,MPP7,SNTG2,AC007319.1,MMRN1,STOX2,KALRN,PTPRE,TFPI,PPFIBP1
SMC	LMOD1,CTNNA3,ADGRL3,RYR2,ACTA2,MYH11,FRY,SYNPO2,SORBS2,PDE3A,RCAN2,PRKG1
adipocyte	TRHDE-AS1,TRHDE,WDPCP,GPAM,SIK2,DMD,PPARG,PLIN1,SORBS1,PDE3B,GHR,ACACB
b_cell	IKZF3,FCRL1,BCL11A,OSBPL10,SEL1L3,SIPA1L3,STRBP,BACH2,BANK1,RALGPS2,FCHSD2,AFF3
dendritic_cell	PTMA,CPVL,TMSB10,HLA-DQB1,PABPC1,HLA-DQA1,HLA-DPA1,CST3,HLA-DPB1,HLA-DRB1,HLA-DRA,CD74
endometrium	SYT1,LRFN5,CNTN4,DPP6,PGR,KCNIP4,ADAMTS19,JAZF1,SDK1,MITF,PALLD,ESR1
endothelial	EMCN,VWF,ARL15,PTPRB,MCTP1,PECAM1,RALGAPA2,SPARCL1,MAGI1,MECOM,LDB2,PTPRM
macrophage	LGMN,WWP1,HDAC9,MRC1,PDE4D,IQGAP2,MTSS1,MYO5A,RBM47,SLC9A9,RBPJ,FRMD4B
mast_cell	STX3,HPGD,IL18R1,STXBP5,TNIK,KIT,CPA3,AGAP1,SMYD

In [16]:
# Download table to have a local copy
!wget $table_link -O deg.xlsx

--2023-03-18 04:33:06--  https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-022-04518-2/MediaObjects/41586_2022_4518_MOESM4_ESM.xlsx
Resolving static-content.springer.com (static-content.springer.com)... 151.101.0.95, 151.101.64.95, 151.101.128.95, ...
Connecting to static-content.springer.com (static-content.springer.com)|151.101.0.95|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3916089 (3.7M) [application/octet-stream]
Saving to: ‘deg.xlsx’


2023-03-18 04:33:06 (46.0 MB/s) - ‘deg.xlsx’ saved [3916089/3916089]

