<a href="https://colab.research.google.com/github/cellatlas/human/blob/master/markers/placenta/markers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q gget
!pip install -q git+https://github.com/sbooeshaghi/ec

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ec (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np
from ec.utils import write_markers

In [3]:
# Get valid gene names
!wget -O genes.txt https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
genes_list = pd.read_csv('genes.txt', header = None)[0].values

--2023-03-18 04:53:40--  https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.box.com (caltech.box.com)... 74.112.186.144
Connecting to caltech.box.com (caltech.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:53:40--  https://caltech.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Reusing existing connection to caltech.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:53:40--  https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.app.box.com (caltech.app.box.com)... 74.112.186.144
Connecting to caltech.app.box.com (caltech.app.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 3

# Placenta

In [4]:
# Provenance of the marker table: which organism/organ it describes,
# the genome reference, and where the paper and the table can be found.
species = "homo_sapiens"
organ = "placenta"
reference = "hg19"
paper_doi = "https://doi.org/10.1038/s41422-018-0066-y"
table_link = "https://static-content.springer.com/esm/art%3A10.1038%2Fs41422-018-0066-y/MediaObjects/41422_2018_66_MOESM10_ESM.xlsx"

# File name of the supplementary table; deliberately kept out of the header.
table_name = "41422_2018_66_MOESM10_ESM.xlsx"

# Single-record header later passed to write_markers.
header = [
    dict(
        species=species,
        organ=organ,
        reference=reference,
        paper_doi=paper_doi,
        table_link=table_link,
    )
]
    

In [5]:
# Read the supplementary workbook directly from its URL.
# sheet_name=None loads every sheet into a dict keyed by sheet name;
# skiprows=1 skips the first row of each sheet before parsing.
excel = pd.read_excel(
    table_link,
    sheet_name=None,
    skiprows=1,
)

In [6]:
# Show the keys of the loaded workbook dict, i.e. the sheet names that were read.
excel.keys()

dict_keys(['Marker_Genes_Table'])

In [7]:
# Take the marker-gene sheet and rename "cell type" to "celltype"
# so the column can be used with attribute access and in groupby calls.
df = excel["Marker_Genes_Table"].rename({"cell type": "celltype"}, axis="columns")


In [8]:
# Boolean mask: True for rows whose gene name is in the valid gene list.
bidx = df["gene"].isin(genes_list)
# Report how many rows are about to be dropped.
print(f'Filtered {np.sum(~bidx)} out of {len(bidx)} genes')
# Keep only the rows with a recognised gene name.
df = df.loc[bidx]

# Preview the filtered table.
df.head()

Filtered 226 out of 1641 genes


Unnamed: 0,gene,myAUC,avg_diff,power,celltype
0,INSL4,0.981,3.04551,0.962,CTB_8W_1
1,MUC15,0.977,3.23827,0.954,CTB_8W_1
2,TBX3,0.975,2.839081,0.95,CTB_8W_1
3,KRT23,0.965,2.901008,0.93,CTB_8W_1
4,SLC40A1,0.959,3.231121,0.918,CTB_8W_1


In [9]:
# --- Marker selection --------------------------------------------------------
# Picks, for every cell type, the same number of top-scoring marker genes.
#
# Inputs : df (filtered marker table with "gene", "avg_diff", "celltype" columns)
# Outputs: dfc              rows passing the avg_diff threshold
#          non_repeat_genes genes appearing in fewer than max_gene_shares cell types
#          m                dfc restricted to those genes, sorted by avg_diff ascending
#          n_sample         number of genes kept per cell type
#          markers          the selected rows
#          markers_dict     {celltype: [gene, ...]}
# Raises : ValueError if no gene survives the filters.

# NOTE(review): min_mean and max_pval are not applied by any filter in this cell;
# they are kept only so the set of threshold names stays unchanged.
min_mean = 100
max_pval = 1e-10
min_lfc = 3.52
max_gene_shares = 2
max_per_celltype = 20

# Filter by criteria: keep rows whose avg_diff reaches the threshold.
# "@min_lfc" lets pandas read the variable directly instead of splicing it into the string.
dfc = df.query("avg_diff >= @min_lfc")

# Mask out genes that are shared between max_gene_shares (or more) cell types.
# Count each gene once and reuse the result for both the mask and the lookup.
gene_counts = dfc["gene"].value_counts()
non_repeat_genes = gene_counts[gene_counts < max_gene_shares].index.values

# Ascending sort so that .tail() below returns the highest avg_diff rows.
m = dfc[dfc.gene.isin(non_repeat_genes)].sort_values('avg_diff', ascending = True)

# With nothing left, the per-cell-type minimum below would be NaN and the
# selection would fail obscurely (or silently produce no markers).
if m.empty:
    raise ValueError(
        f"No marker genes left after filtering (min_lfc={min_lfc}, "
        f"max_gene_shares={max_gene_shares}); relax the thresholds."
    )

# Max number to sample is the smallest per-cell-type gene count,
# capped at max_per_celltype, so every cell type gets equally many markers.
n_sample = min(m["celltype"].value_counts().min(), max_per_celltype)

# Take the n_sample highest-avg_diff genes of each cell type.
markers = m.groupby('celltype').tail(n_sample)
markers_dict = markers.groupby("celltype")["gene"].apply(list).to_dict()

In [10]:
# Number of candidate marker genes available per cell type (before sampling).
m["celltype"].value_counts()

Mes_2       74
Mes_1       50
Blood       48
STB_8W      44
Macro_1     42
Macro_2     25
CTB_8W_1     6
Name: celltype, dtype: int64

In [11]:
# Number of selected marker genes per cell type (after sampling).
markers["celltype"].value_counts()

CTB_8W_1    6
Macro_1     6
Macro_2     6
Mes_1       6
Mes_2       6
STB_8W      6
Blood       6
Name: celltype, dtype: int64

In [12]:
# Write the selected markers, together with the provenance header, to markers.txt
# using the write_markers helper imported from ec.utils.
write_markers("markers.txt", markers_dict, header)

In [13]:
# Print the written file to check its contents.
!cat markers.txt

# homo_sapiens	placenta	hg19	https://doi.org/10.1038/s41422-018-0066-y	https://static-content.springer.com/esm/art%3A10.1038%2Fs41422-018-0066-y/MediaObjects/41422_2018_66_MOESM10_ESM.xlsx
Blood	SLC4A1,HBG1,AHSP,ALAS2,HBM,GYPB
CTB_8W_1	OTUB2,DHRS9,GREM2,SLC1A2,LGALS16,ERVFRD-1
Macro_1	LILRB4,FPR3,SDS,HLA-DQA1,HLA-DRB5,HLA-DRB1
Macro_2	HPGDS,TMIGD3,CD5L,SCN9A,F13A1,LYVE1
Mes_1	NKG7,RAMP1,C1S,IGFBP5,C1R,SPARCL1
Mes_2	VTN,TCF21,SLIT2,EGFL6,DLK1,CXCL14
STB_8W	ANGPT2,GNGT1,PSG6,KISS1,PSG8,CRH
