<a href="https://colab.research.google.com/github/cellatlas/human/blob/master/markers/liver/markers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q gget
!pip install -q git+https://github.com/sbooeshaghi/ec

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ec (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np
from ec.utils import write_markers

In [3]:
# Get valid gene names
!wget -O genes.txt https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
genes_list = pd.read_csv('genes.txt', header = None)[0].values

--2023-03-18 04:53:26--  https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.box.com (caltech.box.com)... 74.112.186.144
Connecting to caltech.box.com (caltech.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:53:27--  https://caltech.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Reusing existing connection to caltech.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:53:27--  https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.app.box.com (caltech.app.box.com)... 74.112.186.144
Connecting to caltech.app.box.com (caltech.app.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 3

# Liver

In [4]:
species = "homo_sapiens"
organ = "liver"
reference = "hg19"
paper_doi = "https://doi.org/10.1016/j.cell.2021.12.018"
table_link = "https://www.cell.com/cms/10.1016/j.cell.2021.12.018/attachment/c8933c60-a9de-4358-b669-907f776f4e87/mmc1.xlsx"

# don't include in header
table_name = "degs.xlsx"

header = [
    {
      "species": species,
      "organ": organ,
      "reference": reference,
      "paper_doi": paper_doi,
      "table_link": table_link,
    }
]

In [6]:
!wget -O degs.xlsx $table_link

--2023-03-18 04:53:57--  https://www.cell.com/cms/10.1016/j.cell.2021.12.018/attachment/c8933c60-a9de-4358-b669-907f776f4e87/mmc1.xlsx
Resolving www.cell.com (www.cell.com)... 104.18.124.114, 104.18.123.114
Connecting to www.cell.com (www.cell.com)|104.18.124.114|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2697708 (2.6M) [application/octet-stream]
Saving to: ‘degs.xlsx’


2023-03-18 04:53:59 (4.09 MB/s) - ‘degs.xlsx’ saved [2697708/2697708]



In [18]:
excel = pd.read_excel(table_name, sheet_name = None)

ct = {i: i.split(' - ')[-1] for i in excel.keys()}

# stacks the sheets together and makes a new column "cell_type" from the sheet name
df = pd.concat(
    excel, keys=list(excel.keys())
    ).reset_index(0).rename(
        columns={"level_0": "celltype_id", "Unnamed: 0": "gene"}
        )
# # rename the cell types to be human readable
df["celltype"] = df["celltype_id"].map(ct)

df['gene'] = [g.upper() for g in df['gene']]


In [19]:
bidx = df['gene'].isin(genes_list)
print(f'Filtered {np.sum(~bidx)} out of {len(bidx)} genes')
df = df[bidx]

Filtered 1253 out of 10579 genes


In [20]:
# Filter to Human celltypes
df = df[["Human" in ctid for ctid in df['celltype_id']]]

# Clean celltype id names
df['celltype_id'] = [ct.replace("Human ", "") for ct in df['celltype_id']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['celltype_id'] = [ct.replace("Human ", "") for ct in df['celltype_id']]


In [21]:
df.head()

Unnamed: 0,celltype_id,gene,proba_de,proba_not_de,bayes_factor,scale1,scale2,lfc_mean,lfc_median,lfc_std,...,lfc_max,raw_mean1,raw_mean2,non_zeros_proportion1,non_zeros_proportion2,raw_normalized_mean1,raw_normalized_mean2,score,clusters,celltype
0,Endothelial DEGs,PTPRB,1.0,0.0,18.420681,0.002781,2.3e-05,9.485378,9.563296,2.868723,...,19.975838,9.951658,0.009225,0.95801,0.007902,25.752265,0.025601,9541.317193,,Human Endothelial DEGs
2,Endothelial DEGs,CCL14,1.0,0.0,18.420681,0.001054,3e-06,9.697312,9.790769,2.470375,...,18.919907,1.691602,0.001997,0.286874,0.001845,3.88919,0.004641,8126.597945,,Human Endothelial DEGs
3,Endothelial DEGs,STAB2,0.998996,0.001004,6.902331,0.002491,2e-05,9.791159,10.056086,3.454818,...,21.830154,8.751941,0.011313,0.874735,0.009644,22.638637,0.028717,7718.700867,,Human Endothelial DEGs
5,Endothelial DEGs,OIT3,0.999598,0.000402,7.81921,0.002463,8e-06,9.428432,9.619078,2.757434,...,17.755455,3.959421,0.004673,0.789344,0.004279,9.127838,0.011509,7477.916953,,Human Endothelial DEGs
6,Endothelial DEGs,PREX2,0.999598,0.000402,7.81921,0.000692,1e-05,8.944815,9.175792,2.964142,...,18.209438,3.095977,0.003884,0.735709,0.003393,7.942995,0.011404,6230.128008,,Human Endothelial DEGs


In [22]:
min_mean = 15
max_pval = 0.05
min_lfc = 1.5
max_gene_shares = 10
max_per_celltype = 20

# filter by criteria
dfc = df.query(f"raw_normalized_mean1 >= {min_mean} & proba_not_de <= {max_pval} & lfc_mean >= {min_lfc}")

# mask out genes that are shared between max_gene_shares cell type
non_repeat_genes = dfc["gene"].value_counts()[dfc["gene"].value_counts() < max_gene_shares].index.values

m = dfc[dfc.gene.isin(non_repeat_genes)].sort_values('non_zeros_proportion1', ascending = True)

# max number to sample is equal to the min number of genes across all celltype
n_sample = min(m["celltype_id"].value_counts().min(), max_per_celltype)

# sample n_sample genes
markers = m.groupby('celltype_id').tail(10) # Set n_sample manually because some celltypes have low number of markers
markers_dict = markers.groupby("celltype_id")["gene"].apply(lambda x: list(x)).to_dict()

In [28]:
markers.celltype_id.value_counts()

T cell DEGs               10
Basophil DEGs             10
Mig. cDC DEGs             10
Monocyte DEGs             10
pDC DEGs                  10
Hepatocyte DEGs           10
Macrophage DEGs           10
Endothelial DEGs          10
Neutrophil DEGs           10
Fibroblast DEGs           10
Plasma cell DEGs          10
Cholangiocyte DEGs        10
cDC1 DEGs                 10
cDC2 DEGs                  9
B cell DEGs                7
Circ. NK NKT cell DEGs     5
Res. NK cell DEGs          5
Name: celltype_id, dtype: int64

In [29]:
write_markers("markers.txt", markers_dict, header)

In [30]:
!cat markers.txt

# homo_sapiens	liver	hg19	https://doi.org/10.1016/j.cell.2021.12.018	https://www.cell.com/cms/10.1016/j.cell.2021.12.018/attachment/c8933c60-a9de-4358-b669-907f776f4e87/mmc1.xlsx
B cell DEGs	LTB,HLA-DRB1,CD79A,HLA-DPA1,HLA-DPB1,HLA-DRA,CD74
Basophil DEGs	SERPINB1,CTSG,JUN,ANXA1,HPGDS,HPGD,VIM,CD63,CPA3,TPSAB1
Cholangiocyte DEGs	SORBS2,CTNND2,DCDC2,GLIS3,ZBTB20,FGFR2,PKHD1,NFIB,BICC1,ANXA4
Circ. NK NKT cell DEGs	FGFBP2,PRF1,GZMB,GNLY,NKG7
Endothelial DEGs	AKAP12,MS4A6A,STAB2,STAB1,ST6GAL1,PPFIBP1,DNASE1L3,LDB2,LIFR,PTPRB
Fibroblast DEGs	IGFBP7,CCBE1,ZFPM2,ZEB2,EXT1,ANKS1A,ANK3,RBMS3,RBPMS,CALD1
Hepatocyte DEGs	PDE3B,ZBTB20,CYP4F3,DST,CYP3A5,ELL2,ACSL1,CFH,ERRFI1,SORBS2
Macrophage DEGs	C1QB,HLA-DPB1,CTSS,FTL,HLA-DRA,C1QA,CD74,PSAP,CTSB,MS4A6A
Mig. cDC DEGs	HLA-DQB1,VIM,BIRC3,HLA-DRB1,HLA-DPB1,HLA-DQA1,HLA-DRA,HLA-DPA1,TXN,CD74
Monocyte DEGs	CST3,PSAP,LST1,CTSS,S100A11,FCER1G,SAT1,AIF1,FTL,FTH1
Neutrophil DEGs	CTSS,DUSP1,SAT1,S100A6,S100A11,FOS,NEAT1,S100A9,S100A8,FTH1
Plasma cell DEGs	JU

In [15]:
markers.groupby("celltype")["non_zeros_proportion1"].mean().sort_values()


celltype
Human T cell DEGs               0.770394
Human Cholangiocyte DEGs        0.854033
Human Basophil DEGs             0.858355
Human Fibroblast DEGs           0.862372
Human Endothelial DEGs          0.905081
Human B cell DEGs               0.935745
Human Neutrophil DEGs           0.936966
Human Res. NK cell DEGs         0.942188
Human Plasma cell DEGs          0.942489
Human Macrophage DEGs           0.944587
Human Circ. NK NKT cell DEGs    0.947991
Human Hepatocyte DEGs           0.962095
Human pDC DEGs                  0.964803
Human Monocyte DEGs             0.982429
Human Mig. cDC DEGs             0.984615
Human cDC2 DEGs                 0.993449
Human cDC1 DEGs                 0.998898
Name: non_zeros_proportion1, dtype: float64