<a href="https://colab.research.google.com/github/cellatlas/human/blob/master/markers/heart/markers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q gget
!pip install -q git+https://github.com/sbooeshaghi/ec

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ec (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np
from ec.utils import write_markers

In [3]:
# Get valid gene names
!wget -O genes.txt https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
genes_list = pd.read_csv('genes.txt', header = None)[0].values

--2023-03-18 04:41:40--  https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.box.com (caltech.box.com)... 74.112.186.144
Connecting to caltech.box.com (caltech.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:41:40--  https://caltech.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Reusing existing connection to caltech.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:41:40--  https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.app.box.com (caltech.app.box.com)... 74.112.186.144
Connecting to caltech.app.box.com (caltech.app.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 3

# Heart

In [4]:
species = "homo_sapiens"
organ = "heart"
reference = "GRCh38"
paper_doi = "https://doi.org/10.1161/CIRCULATIONAHA.119.045401"
table_link = "https://www.ahajournals.org/action/downloadSupplement?doi=10.1161%2FCIRCULATIONAHA.119.045401&file=supplemental+tables+%282%29.xlsx"

# don't include in header
table_name = "degs.xlsx"

header = [
    {
      "species": species,
      "organ": organ,
      "reference": reference,
      "paper_doi": paper_doi,
      "table_link": table_link,
    }
]

In [7]:
!wget -O $table_name --quiet $table_link 

In [11]:
excel = pd.read_excel(table_name, sheet_name=None)

In [12]:
excel.keys()

dict_keys(['Table II QCMetrics', 'Table IV MarkerGene', 'Table V SubclusterMarkerGene', 'Table VI ChamberDiff', 'Table VII SexDiff', 'Table VIII LDScoreSensitivity'])

In [13]:
table = excel["Table IV MarkerGene"].rename(columns = {"Cell Type": "celltype_id", "Gene": "gene", })
df = table[:-1].copy()
df["celltype"] = df.celltype_id.apply(lambda x: x.split('. ')[-1] )


In [14]:
bidx = df['gene'].isin(genes_list)
print(f'Filtered {np.sum(~bidx)} out of {len(bidx)} genes')
df = df[bidx]

Filtered 157 out of 1557 genes


In [15]:
df.head()

Unnamed: 0,celltype_id,gene,Ensembl ID,Chromosome,Pct.Target,Pct.Other,avg_logFC,AUC,PPV50,Marker,celltype
0,1. Fibroblast I,DCN,ENSG00000011465,12,0.888,0.35,1.267279,0.853,0.71711,1.0,Fibroblast I
1,1. Fibroblast I,LAMA2,ENSG00000196569,6,0.993,0.808,0.851194,0.847,0.551188,1.0,Fibroblast I
2,1. Fibroblast I,NEGR1,ENSG00000172260,1,0.905,0.371,1.251984,0.842,0.709009,1.0,Fibroblast I
3,1. Fibroblast I,ACSM3,ENSG00000005187,16,0.836,0.311,1.473957,0.841,0.728646,1.0,Fibroblast I
4,1. Fibroblast I,ABCA6,ENSG00000154262,17,0.754,0.28,1.404422,0.804,0.729496,1.0,Fibroblast I


In [16]:
min_mean = 10
max_pval = 0.05
min_lfc = 1
max_gene_shares = 10
max_per_celltype = 20

# filter by criteria
dfc = df.query(f"Marker == 1.0 & avg_logFC >= {min_lfc}")


# mask out genes that are shared between max_gene_shares cell type
non_repeat_genes = dfc["gene"].value_counts()[dfc["gene"].value_counts() < max_gene_shares].index.values

m = dfc[dfc.gene.isin(non_repeat_genes)].sort_values('Pct.Target', ascending = True)

# max number to sample is equal to the min number of genes across all celltype
n_sample = min(m["celltype"].value_counts().min(), max_per_celltype)

# mask out genes that are shared between max_gene_shares cell type
non_repeat_genes = dfc["gene"].value_counts()[dfc["gene"].value_counts() < max_gene_shares].index.values

m = dfc[dfc.gene.isin(non_repeat_genes)]

# max number to sample is equal to the min number of genes across all celltype
n_sample = min(m["celltype"].value_counts().min(), max_per_celltype)

# sample n_sample genes
markers = m.groupby('celltype').tail(n_sample)
markers_dict = markers.groupby("celltype")["gene"].apply(lambda x: list(x)).to_dict()

In [17]:
markers.celltype.value_counts()

Fibroblast I                     8
Fibroblast II                    8
Atrial Cardiomyocyte             8
Ventricular Cardiomyocyte I      8
Ventricular Cardiomyocyte II     8
Pericyte                         8
Macrophage                       8
9.Endothelium I                  8
Endothelium II                   8
Adipocyte                        8
Vascular Smooth Muscle           8
Fibroblast III                   8
Ventricular Cardiomyocyte III    8
Neuronal                         8
Lymphocyte                       8
Name: celltype, dtype: int64

In [18]:
markers.groupby("celltype")["Pct.Target"].mean().sort_values()

celltype
Adipocyte                        0.437625
Macrophage                       0.518000
Atrial Cardiomyocyte             0.523375
9.Endothelium I                  0.531250
Endothelium II                   0.534625
Fibroblast III                   0.535250
Vascular Smooth Muscle           0.553750
Ventricular Cardiomyocyte III    0.563375
Lymphocyte                       0.564375
Neuronal                         0.571250
Pericyte                         0.572375
Ventricular Cardiomyocyte II     0.619625
Fibroblast II                    0.754750
Fibroblast I                     0.807875
Ventricular Cardiomyocyte I      0.868875
Name: Pct.Target, dtype: float64

In [19]:
write_markers("markers.txt", markers_dict, header)

In [20]:
!cat markers.txt

# homo_sapiens	heart	GRCh38	https://doi.org/10.1161/CIRCULATIONAHA.119.045401	https://www.ahajournals.org/action/downloadSupplement?doi=10.1161%2FCIRCULATIONAHA.119.045401&file=supplemental+tables+%282%29.xlsx
9.Endothelium I	KIAA1217,PLEKHG1,SYNE2,B2M,ADGRF5,PECAM1,DOCK9,CYYR1
Adipocyte	PECR,CIDEC,TNFAIP8,AGPAT2,G0S2,DGAT2,RETSAT,PTGER3
Atrial Cardiomyocyte	KCNJ3,PDLIM3,RALYL,PRELID2,EDNRA,CPNE5,TRIM55,NEDD4L
Endothelium II	GULP1,SLCO2A1,CD9,HMCN1,PTPRB,PRKCH,TMEM108,NRG3
Fibroblast I	DCN,NEGR1,ACSM3,ABCA6,PID1,CDH19,ABCA8,ABCA9
Fibroblast II	DCN,DCLK1,FBN1,CFD,C7,VCAN,CFH,MFAP5
Fibroblast III	BACH2,MTHFD1L,TWIST2,ZNF331,THBS1,PLPP3,ACSL4,FGF7
Lymphocyte	ARHGAP15,PARP8,CCND3,SKAP1,PTPRC,SYTL3,B2M,IKZF1
Macrophage	RNF149,MSR1,SIPA1L1,COLEC12,KCNMA1,MARCH1,GNAQ,MTSS1
Neuronal	SCN7A,MT-CO1,NCAM2,ADGRB3,SHISA9,SLC35F1,GPM6B,MT-CO3
Pericyte	EGFLAM,ADAMTS9,PDE1C,CPM,MIR4435-2HG,ABCC9,KCNAB1,CARMN
Vascular Smooth Muscle	MYH10,PDZRN4,PPP1R12A,MTHFD1L,SDK1,FN1,KCNMA1,ETV6
Ventricular Cardiomyo