<a href="https://colab.research.google.com/github/cellatlas/human/blob/master/markers/bone/markers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q gget
!pip install -q git+https://github.com/sbooeshaghi/ec

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ec (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np
from ec.utils import write_markers

In [3]:
# Get valid gene names
!wget -O genes.txt https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
genes_list = pd.read_csv('genes.txt', header = None)[0].values

--2023-03-18 04:37:24--  https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.box.com (caltech.box.com)... 74.112.186.144
Connecting to caltech.box.com (caltech.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:37:24--  https://caltech.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Reusing existing connection to caltech.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:37:24--  https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.app.box.com (caltech.app.box.com)... 74.112.186.144
Connecting to caltech.app.box.com (caltech.app.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 3

# Bone

In [4]:
species = "homo_sapiens"
organ = "bone"
reference = "GRCh38"
paper_doi = "https://doi.org/10.1038/s41422-021-00467-z"
table_link = "https://static-content.springer.com/esm/art%3A10.1038%2Fs41422-021-00467-z/MediaObjects/41422_2021_467_MOESM7_ESM.xlsx"

# don't include in header
table_name = "41422_2021_467_MOESM7_ESM.xlsx"

header = [
    {
      "species": species,
      "organ": organ,
      "reference": reference,
      "paper_doi": paper_doi,
      "table_link": table_link,
    }
]

In [5]:
excel = pd.read_excel(table_link, sheet_name=None)

In [6]:
excel.keys()

dict_keys(['legends', 'Degs human_limb&bone Fig. 1d', 'Degs human long bone Fig. S3b', 'Degs human calvaria Fig. 6b', 'Degs mouse_E11.5 Fig. S2c', 'Degs mouse_E15.5 Fig. S3d'])

In [7]:
df = excel['Degs human_limb&bone Fig. 1d'].rename(columns={"cluster": "celltype"})

In [8]:
# Filter out genes not present in reference
bidx = df['gene'].isin(genes_list)
print(f'Filtered {np.sum(~bidx)} out of {len(bidx)} genes')
df = df[bidx]

Filtered 23 out of 1629 genes


In [9]:
df.head()

Unnamed: 0,celltype,gene,logfoldchanges,pvals_adj
0,LBM1,H2AFZ,1.734192,0.0
1,LBM1,RANBP1,1.468134,0.0
3,LBM1,DUSP6,2.428796,0.0
4,LBM1,HMGA1,1.814476,0.0
5,LBM1,HSPD1,1.229231,0.0


In [10]:
min_mean = 100
max_pval = 1e-10
min_lfc = 1
max_gene_shares = 2
max_per_celltype = 20

# filter by criteria
dfc = df.query(f"pvals_adj <= {max_pval} & logfoldchanges >= {min_lfc}")

# mask out genes that are shared between max_gene_shares cell type
non_repeat_genes = dfc["gene"].value_counts()[dfc["gene"].value_counts() < max_gene_shares].index.values

m = dfc[dfc.gene.isin(non_repeat_genes)].sort_values('logfoldchanges', ascending = True)

# max number to sample is equal to the min number of genes across all celltype
n_sample = min(m["celltype"].value_counts().min(), max_per_celltype)

In [11]:
# sample n_sample genes
markers = m.groupby("celltype").sample(n=n_sample)
markers_dict = markers.groupby("celltype")["gene"].apply(lambda x: list(x)).to_dict()

In [12]:
markers.celltype.value_counts()

Chondrocyte         9
Chondrocyteblast    9
Endothelium         9
Epithelium1         9
Erythrocyte         9
LBM1                9
LBM2                9
LBM3                9
Marophage           9
Myocyte             9
Myoprogenitor       9
OCP                 9
Osteoprogenitor     9
PMSC                9
Schwann             9
Name: celltype, dtype: int64

In [13]:
write_markers("markers.txt", markers_dict, header)

In [14]:
markers.groupby("celltype")["logfoldchanges"].mean().sort_values()

celltype
LBM1                1.292409
OCP                 1.338806
LBM2                1.473177
LBM3                1.661435
PMSC                2.560496
Osteoprogenitor     2.945763
Chondrocyte         2.957895
Myoprogenitor       3.162138
Chondrocyteblast    3.714932
Schwann             4.370102
Erythrocyte         4.887108
Endothelium         4.902454
Myocyte             5.251905
Epithelium1         5.368410
Marophage           6.333287
Name: logfoldchanges, dtype: float64

In [15]:
!cat markers.txt

# homo_sapiens	bone	GRCh38	https://doi.org/10.1038/s41422-021-00467-z	https://static-content.springer.com/esm/art%3A10.1038%2Fs41422-021-00467-z/MediaObjects/41422_2021_467_MOESM7_ESM.xlsx
Chondrocyte	CD99,OSTF1,MEF2C,IFI27L2,PDE4DIP,TNFRSF11B,RBM3,COMP,EDIL3
Chondrocyteblast	MATN1,PRELP,EGR1,SORBS2,MIA,MATN3,COL27A1,SPATS2L,SOX6
Endothelium	RALA,HYAL2,RASGRP3,S100A16,TAX1BP3,NOSTRIN,FLT1,CAV2,MSN
Epithelium1	MAP7,ITGA6,APOE,LAMC1,DLX1,FERMT1,EPCAM,ESRP1,DST
Erythrocyte	CHPT1,RFESD,SLC22A4,ATP5IF1,UROD,GYPA,CYTOR,ADIPOR1,MT1H
LBM1	HIST1H4C,BIRC5,PCLAF,CHD7,ZWINT,SMARCC1,LIX1,CACYBP,FABP5
LBM2	BUB3,CCNB2,NUSAP1,CCNB1,UBE2C,CDK1,AURKB,TUBB4B,KPNA2
LBM3	EDNRA,NR2F2-AS1,TCF12,EDN3,PEG10,HOXB3,CDH2,EFNB2,HOXB5
Marophage	RNASE6,CYBB,DPP7,HMOX1,TPP1,RAB32,SERPINB6,LY86,IGSF6
Myocyte	TEAD4,TNNI1,CAP2,WRNIP1,TRAF4,KREMEN2,CHRNA1,TSPAN33,ATP2B1-AS1
Myoprogenitor	KCNE5,MYC,RBP1,NGFR,PAX3,ACVR2A,HOXA11-AS,CDH15,NECTIN1
OCP	CLU,LGALS1,ISLR,LIMCH1,OGN,IGFBP4,VIM,NFIA,COL5A2
Osteoprogenitor	CDH11,NR3

In [16]:
# Download table to have a local copy
!wget $table_link -O deg.xlsx

--2023-03-18 04:37:30--  https://static-content.springer.com/esm/art%3A10.1038%2Fs41422-021-00467-z/MediaObjects/41422_2021_467_MOESM7_ESM.xlsx
Resolving static-content.springer.com (static-content.springer.com)... 151.101.0.95, 151.101.64.95, 151.101.128.95, ...
Connecting to static-content.springer.com (static-content.springer.com)|151.101.0.95|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 373620 (365K) [application/octet-stream]
Saving to: ‘deg.xlsx’


2023-03-18 04:37:30 (13.8 MB/s) - ‘deg.xlsx’ saved [373620/373620]

