<a href="https://colab.research.google.com/github/cellatlas/human/blob/master/markers/bladder/markers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q gget
!pip install -q git+https://github.com/sbooeshaghi/ec

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ec (setup.py) ... [?25l[?25hdone


In [None]:
import pandas as pd
import numpy as np
from ec.utils import write_markers

In [None]:
# Extract list of valid gene names in Ensembl release 96
!gget ref human -r "96" -ftp -w "gtf" -d

# Gunzip gtf
!gunzip /content/Homo_sapiens.GRCh38.96.gtf.gz

# Extract gene names
!tail -n +6 /content/Homo_sapiens.GRCh38.96.gtf   | cut -f 9 -d$'\t' | grep -v "transcript_id" | cut -f 6 -d" " | sed 's/"//g' | sed 's/;//'  | sort | uniq > genes.txt
genes_list = pd.read_csv('genes.txt', header = None)[0].values

Fri Mar 17 18:22:41 2023 INFO Fetching reference information for homo_sapiens from Ensembl release: 96.
http://ftp.ensembl.org/pub/release-96/gtf/homo_sapiens/Homo_sapiens.GRCh38.96.gtf.gz
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 42.4M  100 42.4M    0     0   623k      0  0:01:09  0:01:09 --:--:--  629k


# Bladder

In [None]:
# Provenance of the marker table; these values are written into the header
# record passed to write_markers further down.
species = "homo_sapiens"
organ = "bladder"
reference = ""
paper_doi = "https://doi.org/10.1681/ASN.2019040335"
table_link = "https://cdn-links.lww.com/permalink/jsn/c/jsn_30_11_2022_12_07_yu_2019040335_sdc5.xlsx"

# File name of the supplementary table; deliberately kept out of the header.
table_name = "jsn_30_11_2022_12_07_yu_2019040335_sdc5.xlsx"

# Single-element list holding one metadata record (key order matters for the
# order in which the fields are laid out).
header = [
    dict(
        species=species,
        organ=organ,
        reference=reference,
        paper_doi=paper_doi,
        table_link=table_link,
    )
]


In [None]:
# Read the supplementary spreadsheet directly from the publisher URL.
# The first spreadsheet row is skipped, so the row after it supplies the
# column names.
excel = pd.read_excel(table_link, skiprows=1)

# Renamed frame in which the "cluster" column is called "celltype";
# `excel` itself is left untouched.
df = excel.rename({"cluster": "celltype"}, axis="columns")

In [None]:
# Normalise every entry of the gene column to an upper-case string
# (each cell is passed through str() first, so non-string cells are stringified).
df['gene'] = df['gene'].map(lambda symbol: str(symbol).upper())

In [None]:
# Keep only the rows whose gene symbol occurs in the reference gene list.
# bidx is True for rows to keep; the message reports how many rows are dropped.
bidx = df['gene'].isin(genes_list)
print(f'Filtered {(~bidx).sum()} out of {bidx.size} genes')
df = df.loc[bidx]

Filtered 436 out of 6719 genes


In [None]:
# Preview the first five rows that survived the reference filter
df.head(n=5)

Unnamed: 0,p_val,avg_logFC,pct.1,pct.2,p_val_adj,celltype,gene
0,0.0,0.801753,0.994,0.625,0.0,basal cells 1,IGFBP2
2,0.0,0.657349,0.995,0.732,0.0,basal cells 1,KRT15
4,0.0,0.603922,0.949,0.549,0.0,basal cells 1,KRT5
5,0.0,0.585007,0.987,0.654,0.0,basal cells 1,GSTO1
6,0.0,0.56109,0.966,0.572,0.0,basal cells 1,LMO1


In [None]:
# Selection parameters
min_mean = 100           # defined here but not used by any statement in this cell
max_pval = 1e-10         # upper bound on the adjusted p-value
min_lfc = 1.4            # lower bound on the average log fold change
max_gene_shares = 2      # a gene is kept only if it occurs fewer times than this
max_per_celltype = 20    # cap that enters the n_sample computation below

# Rows that pass both the significance and the effect-size threshold
dfc = df[(df["p_val_adj"] <= max_pval) & (df["avg_logFC"] >= min_lfc)]

# Genes occurring in fewer than max_gene_shares of the filtered rows
# (with max_gene_shares = 2: genes that occur exactly once)
non_repeat_genes = (
    dfc["gene"]
    .value_counts()
    .loc[lambda counts: counts < max_gene_shares]
    .index
    .values
)

# Restrict to those genes and order rows by increasing pct.1
m = dfc[dfc["gene"].isin(non_repeat_genes)].sort_values(by="pct.1")

# Smallest per-celltype row count, capped at max_per_celltype.
# NOTE: n_sample is computed but not used by the selection below.
n_sample = min(m["celltype"].value_counts().min(), max_per_celltype)

# Because m is sorted by increasing pct.1, tail(10) keeps, for each celltype,
# the (up to) 10 rows with the highest pct.1. The 10 is set by hand instead of
# n_sample because some celltypes have a low number of marker genes.
markers = m.groupby("celltype").tail(10)

# Mapping celltype -> list of its selected marker genes
markers_dict = markers.groupby("celltype")["gene"].apply(list).to_dict()


In [None]:
# Number of selected marker rows per celltype
markers["celltype"].value_counts()

dendritic cells        10
T cells                10
fibroblast 3           10
smooth muscle cells    10
neurone                10
monocytes              10
endothelial cells      10
myofibroblast          10
fibroblast 2            9
fibroblast 1            4
Name: celltype, dtype: int64

In [None]:
# Mean pct.1 of the selected markers for each celltype, lowest first
(
    markers
    .groupby("celltype")["pct.1"]
    .mean()
    .sort_values()
)

celltype
dendritic cells        0.687700
T cells                0.740800
fibroblast 1           0.747000
fibroblast 3           0.763800
smooth muscle cells    0.768800
neurone                0.873600
fibroblast 2           0.915111
endothelial cells      0.973400
monocytes              0.973900
myofibroblast          0.992100
Name: pct.1, dtype: float64

In [None]:
# Hand the celltype -> genes mapping and the provenance header to
# ec.utils.write_markers, targeting markers.txt in the working directory.
# NOTE(review): the helper's exact output format is defined in the ec package,
# not in this notebook.
write_markers("markers.txt", markers_dict, header)

In [None]:
# Print markers.txt so its contents can be inspected in the notebook
!cat markers.txt

# homo_sapiens	bladder		https://doi.org/10.1681/ASN.2019040335	https://cdn-links.lww.com/permalink/jsn/c/jsn_30_11_2022_12_07_yu_2019040335_sdc5.xlsx
T cells	TRBC2,LIMD2,THY1,LTB,HCST,IFNGR1,ARHGDIB,RAC2,SHISA5,TMSB10
dendritic cells	CCL4,KLRD1,IL1B,ETV3,NAPSA,TNIP3,PLBD1,SPI1,GM2A,PIM1
endothelial cells	S100A10,FXYD6,VIM,APLP2,FKBP1A,MMRN1,FGL2,CD9,GNG11,CLDN5
fibroblast 1	KLF4,SOCS3,CEBPD,NBL1
fibroblast 2	CD34,PLXDC2,AKAP12,ALDH2,SPARCL1,ECM1,PI16,CD81,FBLN1
fibroblast 3	MATN2,OGN,CXCL12,IGF1,GPX3,MFAP2,MFAP5,DPT,CCDC80,HTRA3
monocytes	GRN,CTSC,CSF1R,PF4,CTSB,C1QC,SELENOP,C1QA,C1QB,APOE
myofibroblast	RBP4,CXCL14,CTSL,MMP2,RCN3,COL6A1,COL1A1,BGN,SERPINH1,IGFBP7
neurone	CSRP2,C2,GPM6A,CLU,SLPI,RARRES2,TIMP2,UPK3B,C3,IGFBP6
smooth muscle cells	PPP1R12A,MYH11,CNN1,CALD1,FLNA,MYLK,TPM2,CSRP1,TPM1,MYL6


In [None]:
# Download table to have a local copy
!wget $table_link -O degs.xlsx

--2023-03-17 18:34:24--  https://cdn-links.lww.com/permalink/jsn/c/jsn_30_11_2022_12_07_yu_2019040335_sdc5.xlsx
Resolving cdn-links.lww.com (cdn-links.lww.com)... 13.107.237.38, 13.107.238.38, 2620:1ec:4e:1::38, ...
Connecting to cdn-links.lww.com (cdn-links.lww.com)|13.107.237.38|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 426625 (417K) [application/vnd.openxmlformats-officedocument.spreadsheetml.sheet]
Saving to: ‘degs.xlsx’


2023-03-17 18:34:24 (5.17 MB/s) - ‘degs.xlsx’ saved [426625/426625]

