<a href="https://colab.research.google.com/github/cellatlas/human/blob/master/markers/ovary/markers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q gget
!pip install -q git+https://github.com/sbooeshaghi/ec

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ec (setup.py) ... [?25l[?25hdone


In [2]:

import pandas as pd
import numpy as np
from ec.utils import write_markers

In [3]:
# Get valid gene names
# Download the gene-name list to genes.txt, then read it with no header row and
# keep the first column (label 0) as an array for the membership checks below.
!wget -O genes.txt https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
genes_list = pd.read_csv('genes.txt', header = None)[0].values

--2023-03-18 04:53:33--  https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.box.com (caltech.box.com)... 74.112.186.144
Connecting to caltech.box.com (caltech.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:53:33--  https://caltech.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Reusing existing connection to caltech.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:53:33--  https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.app.box.com (caltech.app.box.com)... 74.112.186.144
Connecting to caltech.app.box.com (caltech.app.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 3

# Ovary

In [4]:
# Provenance of the marker table: species, organ, reference label and source publication.
species = "homo_sapiens"
organ = "ovary"
reference = "hg19"
paper_doi = "https://doi.org/10.1038/s41467-020-14936-3"
table_link = "https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-020-14936-3/MediaObjects/41467_2020_14936_MOESM3_ESM.xlsx"

# File name of the supplementary table; deliberately kept out of the header.
table_name = "41467_2020_14936_MOESM3_ESM.xlsx"

# Single-entry list holding the metadata record that is written alongside the markers.
header = [
    dict(
        species=species,
        organ=organ,
        reference=reference,
        paper_doi=paper_doi,
        table_link=table_link,
    )
]

In [5]:
# Read the sheet at index 1 of the supplementary workbook, skipping its first two rows,
# then rename the seven columns to the names used by the filtering cells below.
marker_columns = ['gene', 'p_val', 'avg_logFC', 'pct.1', 'pct.2', 'p_val_adj', 'celltype']
df = pd.read_excel(table_link, sheet_name=1, skiprows=2)
df.columns = marker_columns

In [6]:
# Boolean mask: True for rows whose gene name appears in the valid gene list.
bidx = df.gene.isin(genes_list)
print(f'Filtered {np.sum(~bidx)} out of {len(bidx)} genes')

# Keep only the rows with a valid gene name and preview the result.
df = df.loc[bidx]
df.head()

Filtered 629 out of 10210 genes


Unnamed: 0,gene,p_val,avg_logFC,pct.1,pct.2,p_val_adj,celltype
0,FIGLA,0.0,2.388708,0.679,0.007,0.0,oocytes
1,KPNA7,0.0,2.33558,0.821,0.004,0.0,oocytes
2,NLRP5,0.0,2.243391,0.714,0.005,0.0,oocytes
4,ZAR1,0.0,2.085036,0.714,0.004,0.0,oocytes
5,SHD,0.0,2.069233,0.714,0.003,0.0,oocytes


In [7]:
# Marker-selection thresholds.
min_mean = 100         # NOTE(review): not used by any filter in this cell — confirm whether it is still needed
max_pval = 1e-10       # upper bound on p_val_adj
min_lfc = 1            # lower bound on avg_logFC
max_gene_shares = 2    # a gene is kept only if it occurs in fewer than this many rows
max_per_celltype = 20  # cap on the number of markers kept per cell type

# Keep rows that pass both the adjusted p-value and the log fold-change thresholds.
dfc = df.query(f"p_val_adj <= {max_pval} & avg_logFC >= {min_lfc}")

# Genes occurring in fewer than max_gene_shares of the passing rows
# (with max_gene_shares = 2 these are genes that occur exactly once).
gene_counts = dfc["gene"].value_counts()
non_repeat_genes = gene_counts[gene_counts < max_gene_shares].index.values

# Ascending sort on pct.1 so that .tail() below selects the highest-pct.1 rows per cell type.
m = dfc[dfc.gene.isin(non_repeat_genes)].sort_values('pct.1', ascending = True)

# Filter out genes not present in reference
bidx = m['gene'].isin(genes_list)
print(f'Filtered {np.sum(~bidx)} out of {len(bidx)} genes')
m_f = m[bidx]

# Everything below works on the reference-filtered frame (m_f), so the filter above
# actually takes effect on the selected markers.
# The number of genes kept per cell type is the size of the smallest cell-type group,
# capped at max_per_celltype, so every cell type ends up with the same number of markers.
n_sample = min(m_f["celltype"].value_counts().min(), max_per_celltype)

# Take the last n_sample rows of each cell type (highest pct.1 after the ascending sort)
# and collect them as {celltype: [gene, ...]}.
markers = m_f.groupby('celltype').tail(n_sample)
markers_dict = markers.groupby("celltype")["gene"].apply(list).to_dict()

Filtered 0 out of 477 genes


In [8]:
# Show how many marker genes were selected for each cell type.
markers["celltype"].value_counts()

stroma                16
oocytes               16
granulosa cells       16
endothelial cells     16
t cells               16
monocytes             16
perivascular cells    16
Name: celltype, dtype: int64

In [9]:
# Write the per-cell-type marker lists, together with the provenance header, to markers.txt.
write_markers("markers.txt", markers_dict, header)

In [10]:
# Print the written file to inspect its contents.
!cat markers.txt

# homo_sapiens	ovary	hg19	https://doi.org/10.1038/s41467-020-14936-3	https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-020-14936-3/MediaObjects/41467_2020_14936_MOESM3_ESM.xlsx
endothelial cells	PALMD,NFIB,RDX,CLEC14A,UTRN,LIFR,SPTBN1,S100A10,PRSS23,EGFL7,GSN,IFI27,VWF,TCF4,GNG11,SPARCL1
granulosa cells	TNNI3,PROK1,WNT6,KRT18,PTGDS,PRKAR2B,MIR202HG,HES1,C4orf48,BEX1,EMX2,DNAJB1,SPRR2F,WIPF3,IGFBP2,GATM
monocytes	REL,MS4A6A,SLC1A3,SGK1,HLA-DPB1,GPX1,MCL1,CST3,ARPC3,MS4A7,FGL2,CD163,HERPUD1,NEAT1,CTSB,FTL
oocytes	C6orf52,SHD,TSG101,ODC1,DLGAP5,PSMG1,PSMG4,STAG3,ZP3,KPNA7,PAIP1,EPCAM,TUBA1C,UCHL1,PDCD5,ZFAND2A
perivascular cells	CD9,MYH11,RGS5,TAGLN,ACTA2,MYL9,MTRNR2L2,DSTN,C11orf96,TIMP3,TPM2,MYL6,MT2A,ADIRF,IGFBP7,MTRNR2L12
stroma	C7,MIR503HG,GREB1,CLDN11,KCNQ1OT1,SERPINE2,CFH,HTRA1,PEG3,OGN,PDGFRA,C1S,COL1A1,MDK,TCEAL4,DCN
t cells	CST7,GZMA,CYTIP,PFN1,KLF6,ARHGDIB,CD52,CD69,STK4,SH3BGRL3,CCL5,CD2,TSC22D3,ZFP36L2,TMSB4X,BTG1
