<a href="https://colab.research.google.com/github/cellatlas/human/blob/master/markers/eye/markers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q gget
!pip install -q git+https://github.com/sbooeshaghi/ec

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ec (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np
from ec.utils import write_markers

In [3]:
# Get valid gene names
!wget -O genes.txt https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
genes_list = pd.read_csv('genes.txt', header = None)[0].values

--2023-03-18 04:41:51--  https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.box.com (caltech.box.com)... 74.112.186.144
Connecting to caltech.box.com (caltech.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:41:51--  https://caltech.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Reusing existing connection to caltech.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:41:51--  https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.app.box.com (caltech.app.box.com)... 74.112.186.144
Connecting to caltech.app.box.com (caltech.app.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 3

# Eye

In [4]:
# Provenance of the marker table: organism, organ, reference label, source
# publication and the supplementary spreadsheet the markers are derived from.
species = "homo_sapiens"
organ = "eye"
reference = "hg38"
paper_doi = "https://dx.doi.org/10.1038%2Fs41467-021-25968-8"
table_link = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8478974/bin/41467_2021_25968_MOESM5_ESM.xlsx"

# File name of the spreadsheet; intentionally kept out of the header.
table_name = "41467_2021_25968_MOESM5_ESM.xlsx"

# Single-entry list of metadata that is passed to write_markers later on.
header = [
    dict(
        species=species,
        organ=organ,
        reference=reference,
        paper_doi=paper_doi,
        table_link=table_link,
    )
]

In [5]:
# Load the three per-tissue sheets of the supplementary workbook.
# Passing a list to `sheet_name` makes pandas fetch and parse the workbook a
# single time and return a {sheet name: DataFrame} dict, instead of
# downloading the same remote file once per sheet.
tissue_sheets = ["ChoroidSclera", "Cornea", "IrisCiliaryBody"]
sheets = pd.read_excel(table_link, skiprows=1, sheet_name=tissue_sheets)

# Stack the sheets in the order listed above, exposing each sheet's "cluster"
# column as "celltype".
# The cell types of the three tissues are deliberately unified (no
# tissue-name prefix on the cell type) because otherwise many genes were
# repeated.
df = pd.concat(
    [sheets[name].rename(columns={"cluster": "celltype"}) for name in tissue_sheets]
)


In [6]:
# Keep only the rows whose gene name appears in the accepted gene list and
# report how many rows were discarded.
is_valid_gene = df['gene'].isin(genes_list)
n_dropped = np.sum(~is_valid_gene)
n_total = len(is_valid_gene)
print(f'Filtered {n_dropped} out of {n_total} genes')
df = df[is_valid_gene]

Filtered 900 out of 15538 genes


In [7]:
# Preview the first five rows of the filtered table.
df.head(5)

Unnamed: 0.1,Unnamed: 0,p_val,avg_logFC,pct.1,pct.2,p_val_adj,celltype,gene
0,IGFBP5,1.917143e-260,2.740738,0.981,0.809,3.479997e-256,Fibroblasts,IGFBP5
1,MGP,8.043193e-214,1.760688,1.0,0.992,1.46e-209,Fibroblasts,MGP
2,RARRES1,2.216642e-198,2.315193,0.905,0.622,4.023649e-194,Fibroblasts,RARRES1
3,FBLN1,4.029294e-198,1.967178,0.894,0.596,7.313975e-194,Fibroblasts,FBLN1
4,C1R,3.932635e-193,1.82883,0.872,0.491,7.138518e-189,Fibroblasts,C1R


In [8]:
# Thresholds used to pick marker genes.
min_mean = 100          # NOTE(review): not used by any visible cell -- confirm before removing
max_pval = 1e-10        # upper bound on the adjusted p-value
min_lfc = 0.7           # lower bound on the average log fold change
max_gene_shares = 4     # a gene must be a marker of fewer cell types than this
max_per_celltype = 20   # cap on the number of markers kept per cell type

# filter by criteria
dfc = df.query(f"p_val_adj <= {max_pval} & avg_logFC >= {min_lfc}")

# Mask out genes that are shared between max_gene_shares or more cell types.
# Distinct cell types are counted (not rows): the tissue sheets were
# concatenated with unified cell-type names, so the same (celltype, gene) pair
# may occur once per tissue and plain row counts would overstate the sharing.
celltypes_per_gene = dfc.groupby("gene")["celltype"].nunique()
non_repeat_genes = celltypes_per_gene[celltypes_per_gene < max_gene_shares].index.values

# Sort ascending by pct.1 so the rows with the highest pct.1 come last.
m = dfc[dfc.gene.isin(non_repeat_genes)].sort_values('pct.1', ascending=True)

# Keep one row per (celltype, gene) pair -- the one with the highest pct.1 --
# so a gene cannot be listed twice for the same cell type.
m = m.drop_duplicates(subset=["celltype", "gene"], keep="last")

if m.empty:
    # Without this guard the minimum below is NaN and the failure is obscure.
    raise ValueError("No genes pass the marker filters; relax the thresholds.")

# Every cell type gets the same number of markers: the smallest number of
# candidate genes available for any cell type, capped at max_per_celltype.
n_sample = min(m["celltype"].value_counts().min(), max_per_celltype)

# Take the n_sample rows with the highest pct.1 in each cell type (the tail of
# the ascending sort); this is a top-n selection, not a random sample.
markers = m.groupby('celltype').tail(n_sample)
markers_dict = markers.groupby("celltype")["gene"].apply(list).to_dict()

In [9]:
# Number of selected marker rows per cell type.
markers["celltype"].value_counts()

COL9A1-hi ciliary body cells         8
Corneal fibroblasts                  8
Smooth muscle cells                  8
Activated T cells                    8
Melanocytes                          8
Ribosomal genes-hi fibroblasts       8
Conjunctival cells                   8
ELF3-hi corneal epithelial cells     8
TGFBI-hi corneal epithelial cells    8
WIF1-hi fibroblasts                  8
Cytotoxic T cells                    8
Ciliary body endothelial cells       8
Putative stem cells                  8
Choroid endothelial cells            8
Pigmented ciliary body cells         8
Fibroblasts                          8
Ciliary body cells                   8
MGP-hi fibroblasts                   8
CRYAA-hi ciliary body cells          8
Schwann cells                        8
MEG3-hi fibroblasts                  8
Monocytes                            8
Name: celltype, dtype: int64

In [10]:
# Mean pct.1 of the selected markers for each cell type, lowest first.
mean_pct1 = markers.groupby("celltype")["pct.1"].mean()
mean_pct1.sort_values()


celltype
COL9A1-hi ciliary body cells         0.824000
Schwann cells                        0.886750
Corneal fibroblasts                  0.927250
MEG3-hi fibroblasts                  0.927875
MGP-hi fibroblasts                   0.939500
Fibroblasts                          0.950625
Ciliary body cells                   0.957250
CRYAA-hi ciliary body cells          0.958250
Pigmented ciliary body cells         0.967000
Choroid endothelial cells            0.983125
Putative stem cells                  0.989750
Ciliary body endothelial cells       0.995500
WIF1-hi fibroblasts                  0.997250
Cytotoxic T cells                    0.998750
ELF3-hi corneal epithelial cells     0.999250
Conjunctival cells                   0.999375
TGFBI-hi corneal epithelial cells    0.999375
Melanocytes                          1.000000
Monocytes                            1.000000
Ribosomal genes-hi fibroblasts       1.000000
Smooth muscle cells                  1.000000
Activated T cells        

In [11]:
# Write the per-cell-type gene lists (markers_dict) together with the
# provenance header to markers.txt, using write_markers from ec.utils.
# NOTE(review): the exact file format is defined by ec.utils, not here.
write_markers("markers.txt", markers_dict, header)

In [12]:
# Print the generated marker file for inspection.
!cat markers.txt

# homo_sapiens	eye	hg38	https://dx.doi.org/10.1038%2Fs41467-021-25968-8	https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8478974/bin/41467_2021_25968_MOESM5_ESM.xlsx
Activated T cells	RPL21,RPS15,RPS25,TPT1,RPL34,RPL10,RPS4X,RPL13
COL9A1-hi ciliary body cells	COL9A1,PCSK1N,PKP2,OPTC,COL9A2,DAPL1,TRPM3,CPAMD8
CRYAA-hi ciliary body cells	HMGCS1,HES5,HES4,ECM1,SLN,CRYAA,CRYBB2,HSPA5
Choroid endothelial cells	IFI27,HLA-E,RPS23,RPS6,IFITM2,IFITM3,TPT1,TMSB10
Ciliary body cells	IVNS1ABP,HES1,CXCL14,ALDH1A1,ENO1,APP,S100A4,CRYAB
Ciliary body endothelial cells	CFL1,GNG11,PPIA,IFITM3,ACTB,TMSB10,TM4SF1,IFI27
Conjunctival cells	S100A11,AQP3,APOBEC3A,DENND2C,PHLDA2,ZFP36L1,CLDN4,TXN
Corneal fibroblasts	TIMP1,IFITM3,ID3,TSC22D1,SLC6A6,KRT15,DST,S100A2
Cytotoxic T cells	RPS20,RPS11,RPLP1,RPL13A,RPS23,RPL21,RPL32,RPS8
ELF3-hi corneal epithelial cells	FABP5,CLDN4,MRPL33,KRT12,ADIRF,SLC20A1,ELF3,PHLDA2
Fibroblasts	TGM2,GSN,CLU,PTGDS,EGR1,MT2A,TIMP1,IGFBP5
MEG3-hi fibroblasts	WSB1,NR4A1,MEG3,PPP1R15A,PNISR

In [13]:
# Download table to have a local copy
!wget $table_link -O deg.xlsx

--2023-03-18 04:42:03--  https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8478974/bin/41467_2021_25968_MOESM5_ESM.xlsx
Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 130.14.29.110, 2607:f220:41e:4290::110
Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|130.14.29.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1272133 (1.2M) [application/vnd.openxmlformats-officedocument.spreadsheetml.sheet]
Saving to: ‘deg.xlsx’


2023-03-18 04:42:04 (2.63 MB/s) - ‘deg.xlsx’ saved [1272133/1272133]

