<a href="https://colab.research.google.com/github/cellatlas/human/blob/master/markers/testis/markers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q gget
!pip install -q git+https://github.com/sbooeshaghi/ec

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ec (setup.py) ... [?25l[?25hdone


In [2]:
# Download the paper's supplementary workbook (mmc2.xlsx) into the working
# directory; it is read later via `table_name`.
!wget --quiet https://ars.els-cdn.com/content/image/1-s2.0-S1534580720303993-mmc2.xlsx

In [3]:
import pandas as pd
import numpy as np
from ec.utils import write_markers

In [4]:
# Get valid gene names
!wget -O genes.txt https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
genes_list = pd.read_csv('genes.txt', header = None)[0].values

--2023-03-18 05:05:06--  https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.box.com (caltech.box.com)... 74.112.186.144
Connecting to caltech.box.com (caltech.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 05:05:06--  https://caltech.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Reusing existing connection to caltech.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 05:05:06--  https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.app.box.com (caltech.app.box.com)... 74.112.186.144
Connecting to caltech.app.box.com (caltech.app.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 3

# Testis

In [5]:
# Provenance metadata for the marker file produced by this notebook.
species = "homo_sapiens"
organ = "testis"
# NOTE(review): "Enesmbl" looks like a typo for "Ensembl". The value is
# written verbatim into the markers.txt header, so confirm nothing depends
# on the current spelling before correcting it.
reference = "GRCh37-Enesmbl75"
paper_doi = "https://doi.org/10.1016/j.devcel.2020.05.010"
table_link = "https://ars.els-cdn.com/content/image/1-s2.0-S1534580720303993-mmc2.xlsx"

# Local file name of the downloaded workbook; intentionally kept out of the header.
table_name = "1-s2.0-S1534580720303993-mmc2.xlsx"

# Single-entry list of metadata; key order is species, organ, reference,
# paper_doi, table_link.
header = [
    dict(
        species=species,
        organ=organ,
        reference=reference,
        paper_doi=paper_doi,
        table_link=table_link,
    )
]

In [6]:
# Load every sheet of the workbook at once (sheet_name=None yields a dict
# keyed by sheet name), skipping the first 4 rows of each sheet.
excel = pd.read_excel(
    table_name,
    sheet_name=None,
    skiprows=4,
)

In [7]:
# List the sheet names that were loaded from the workbook.
excel.keys()

dict_keys(['TableS1', 'A.HumanTestis-Centroids', 'B.HumanTestis-Markers', 'C.MonkeyTestis-Centroids', 'D.MonkeyTestis-Markers', 'E.MSCI Escapees'])

In [8]:
# Start from the human marker sheet, discard the unnamed columns 7-11 and
# rename "cluster" to "celltype", the column name used in the rest of the
# notebook.
unused_columns = ["Unnamed: 7", "Unnamed: 8", "Unnamed: 9", "Unnamed: 10", "Unnamed: 11"]
human_markers = excel["B.HumanTestis-Markers"]
df = human_markers.drop(columns=unused_columns).rename(columns={"cluster": "celltype"})

In [9]:
# Preview the first rows of the marker table to check the columns.
df.head()

Unnamed: 0,celltype,gene,p_val,avg_logFC,pct.1,pct.2,p_val_adj
0,Tcell,CD52,2.2525930000000002e-96,2.635621,0.738,0.005,1.0483789999999999e-91
1,Tcell,CD69,1.115023e-73,2.521717,0.646,0.008,5.189428000000001e-69
2,Tcell,B2M,1.7894819999999999e-72,2.396398,0.969,0.255,8.328428e-68
3,Tcell,CCL5,1.105175e-70,2.855041,0.492,0.003,5.143596e-66
4,Tcell,CXCR4,9.216813999999999e-63,2.380985,0.631,0.014,4.289598e-58


In [10]:
# Filter out genes not present in reference
bidx = df['gene'].isin(genes_list)
print(f'Filtered {np.sum(~bidx)} out of {len(bidx)} genes')
df = df[bidx]

Filtered 248 out of 3035 genes


In [25]:
# --- Marker-selection thresholds -------------------------------------------
# NOTE(review): min_mean is not used anywhere in the visible notebook —
# confirm whether a mean-expression filter was intended, otherwise remove it.
min_mean = 100
max_pval = 1e-10       # keep rows with p_val_adj <= max_pval
min_lfc = 1            # keep rows with avg_logFC >= min_lfc
max_gene_shares = 2    # keep genes occurring in fewer than this many rows of dfc
max_per_celltype = 20  # upper bound on the number of markers per cell type

# Keep only rows passing the significance and fold-change thresholds.
dfc = df.query(f"p_val_adj <= {max_pval} & avg_logFC >= {min_lfc}")

# Keep genes that occur fewer than max_gene_shares times among the filtered
# rows (with the default of 2: genes listed exactly once, i.e. not shared).
# The counts are computed once and reused for both the mask and the lookup.
gene_counts = dfc["gene"].value_counts()
non_repeat_genes = gene_counts[gene_counts < max_gene_shares].index.values

# Ascending sort on pct.1 so that, within every cell type, the rows with the
# highest pct.1 come last and can be picked with tail() below.
m = dfc[dfc.gene.isin(non_repeat_genes)].sort_values('pct.1', ascending = True)

# With nothing left, value_counts().min() would be NaN and the selection below
# would fail with an unrelated error; fail early with an actionable message.
if m.empty:
    raise ValueError(
        "No markers left after filtering; relax max_pval, min_lfc or max_gene_shares."
    )

# Every cell type gets the same number of markers: the size of the smallest
# cell type, capped at max_per_celltype.
n_sample = min(m["celltype"].value_counts().min(), max_per_celltype)

# Take the n_sample highest-pct.1 rows per cell type and collect the gene
# names into a {celltype: [gene, ...]} mapping.
markers = m.groupby('celltype').tail(n_sample)
markers_dict = markers.groupby("celltype")["gene"].apply(list).to_dict()


In [26]:
# Sanity check: number of selected markers per cell type.
markers["celltype"].value_counts()

f-Pericyte        20
Endothelial       20
ImmLeydig         20
m-Pericyte        20
Myoid             20
Macrophage        20
RoundSpermatid    20
Spermatocyte      20
Tcell             20
Spermatogonia     20
Elongating        20
Name: celltype, dtype: int64

In [27]:
# Write the selected markers together with the provenance header to
# markers.txt. The `!cat markers.txt` output in this notebook shows the
# result: a '#'-prefixed header line followed by one line per cell type with
# its comma-separated genes.
write_markers("markers.txt", markers_dict, header)

In [28]:
# Mean pct.1 of the selected markers for each cell type, lowest first.
(
    markers
    .groupby("celltype")["pct.1"]
    .mean()
    .sort_values()
)

celltype
f-Pericyte        0.40245
Endothelial       0.45155
ImmLeydig         0.49895
Myoid             0.57100
m-Pericyte        0.57125
Macrophage        0.59255
RoundSpermatid    0.75825
Spermatocyte      0.77230
Spermatogonia     0.85575
Tcell             0.85690
Elongating        0.95005
Name: pct.1, dtype: float64

In [29]:
# Print the generated marker file for inspection.
!cat markers.txt

# homo_sapiens	testis	GRCh37-Enesmbl75	https://doi.org/10.1016/j.devcel.2020.05.010	https://ars.els-cdn.com/content/image/1-s2.0-S1534580720303993-mmc2.xlsx
Elongating	DCUN1D1,TRIM36,LELP1,BOD1L2,CCSER2,UBA52,SMCP,ODF2,TSACC,MLF1,CRISP2,AC007557.1,AKAP4,HMGB4,PHF7,LINC00467,GPX4,TNP1,PRM1,PRM2
Endothelial	NOSTRIN,CD59,PKP4,SYNE1,EMCN,HSPG2,ATF4,CLDN5,CTNNB1,STOM,MYH9,EPAS1,PALMD,EGFL7,AQP1,CD34,IFI27,GNG11,ACTG1,VWF
ImmLeydig	FSTL1,ISLR,SNED1,SCARA5,APOD,PCOLCE,IGF2,ABCA8,PODN,FLRT2,SERPINE2,LUM,C3,H19,PRRX1,CCDC80,IGF1,DLK1,SFRP1,CFD
Macrophage	CTSB,DEK,SNHG5,IER3,FCER1A,MCL1,MNDA,IER5,LPAR6,LYZ,MS4A6A,HLA-DMA,OAZ1,RGS2,AIF1,HLA-DQB1,GPX1,HLA-DQA1,TYROBP,RPL27
Myoid	ACTN1,LAMC3,DPEP1,MFAP4,PEG3,MORF4L2,FHL2,COL15A1,VCAN,BGN,CIRBP,MATN2,TSHZ2,ABLIM1,SMOC2,TPM4,DDX17,LUC7L3,TCEAL4,AEBP1
RoundSpermatid	TMCO2,WDR74,PDCL2,CALCOCO2,EIF4G1,RNF151,AC006019.3,FAM209A,CEP152,SPACA3,CCDC168,ACTL7B,CHD5,FAM186A,ERICH2,TMEM191C,CAST,C20orf144,IQGAP2,FAM229A
Spermatocyte	DNAJC21,CCDC146,DBF4,TPR,LD

### Remove the f-Pericyte cell type because its markers are not good (it has the lowest mean `pct.1` of all cell types above)

In [30]:
# Copy every line of markers.txt that does not contain "f-Pericyte" into a
# temporary file ...
!grep -v f-Pericyte markers.txt > markers_f.txt
# ... and replace markers.txt with that filtered copy.
!mv markers_f.txt markers.txt

In [31]:
# Print the marker file again to confirm the f-Pericyte line is gone.
!cat markers.txt

# homo_sapiens	testis	GRCh37-Enesmbl75	https://doi.org/10.1016/j.devcel.2020.05.010	https://ars.els-cdn.com/content/image/1-s2.0-S1534580720303993-mmc2.xlsx
Elongating	DCUN1D1,TRIM36,LELP1,BOD1L2,CCSER2,UBA52,SMCP,ODF2,TSACC,MLF1,CRISP2,AC007557.1,AKAP4,HMGB4,PHF7,LINC00467,GPX4,TNP1,PRM1,PRM2
Endothelial	NOSTRIN,CD59,PKP4,SYNE1,EMCN,HSPG2,ATF4,CLDN5,CTNNB1,STOM,MYH9,EPAS1,PALMD,EGFL7,AQP1,CD34,IFI27,GNG11,ACTG1,VWF
ImmLeydig	FSTL1,ISLR,SNED1,SCARA5,APOD,PCOLCE,IGF2,ABCA8,PODN,FLRT2,SERPINE2,LUM,C3,H19,PRRX1,CCDC80,IGF1,DLK1,SFRP1,CFD
Macrophage	CTSB,DEK,SNHG5,IER3,FCER1A,MCL1,MNDA,IER5,LPAR6,LYZ,MS4A6A,HLA-DMA,OAZ1,RGS2,AIF1,HLA-DQB1,GPX1,HLA-DQA1,TYROBP,RPL27
Myoid	ACTN1,LAMC3,DPEP1,MFAP4,PEG3,MORF4L2,FHL2,COL15A1,VCAN,BGN,CIRBP,MATN2,TSHZ2,ABLIM1,SMOC2,TPM4,DDX17,LUC7L3,TCEAL4,AEBP1
RoundSpermatid	TMCO2,WDR74,PDCL2,CALCOCO2,EIF4G1,RNF151,AC006019.3,FAM209A,CEP152,SPACA3,CCDC168,ACTL7B,CHD5,FAM186A,ERICH2,TMEM191C,CAST,C20orf144,IQGAP2,FAM229A
Spermatocyte	DNAJC21,CCDC146,DBF4,TPR,LD