<a href="https://colab.research.google.com/github/cellatlas/human/blob/master/markers/pancreas/markers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q gget
!pip install -q git+https://github.com/sbooeshaghi/ec

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ec (setup.py) ... [?25l[?25hdone


In [2]:

import pandas as pd
import numpy as np
from ec.utils import write_markers

In [3]:
!pip install -q gget

# Extract list of valid gene names in Ensembl release 96
!gget ref human -r "96" -ftp -w "gtf" -d

# Gunzip gtf
!gunzip /content/Homo_sapiens.GRCh38.96.gtf.gz

# Extract gene names
!tail -n +6 /content/Homo_sapiens.GRCh38.96.gtf   | cut -f 9 -d$'\t' | grep -v "transcript_id" | cut -f 6 -d" " | sed 's/"//g' | sed 's/;//'  | sort | uniq > genes.txt
genes_list = pd.read_csv('genes.txt', header = None)[0].values


Thu Mar 16 22:18:27 2023 INFO Fetching reference information for homo_sapiens from Ensembl release: 96.
http://ftp.ensembl.org/pub/release-96/gtf/homo_sapiens/Homo_sapiens.GRCh38.96.gtf.gz
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 42.4M  100 42.4M    0     0   756k      0  0:00:57  0:00:57 --:--:--  758k


# Pancreas

In [4]:
# Provenance of the marker table; these values end up in the header line of
# the marker file written at the end of the notebook.
species, organ, reference = "homo_sapiens", "pancreas", "hg19"
paper_doi = "https://doi.org/10.1016/j.cmet.2016.08.020"
table_link = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5069352/bin/mmc2.xlsx"

# File name of the supplementary table; intentionally not part of the header.
table_name = "mmc2.xlsx"

# A single metadata record, wrapped in a list.
header = [
    dict(
        species=species,
        organ=organ,
        reference=reference,
        paper_doi=paper_doi,
        table_link=table_link,
    )
]
  

In [5]:
# Read the supplementary workbook straight from its URL. sheet_name=None loads
# every sheet into a dict keyed by sheet name; the first 4 rows of each sheet
# are skipped.
excel = pd.read_excel(
    table_link,
    skiprows=4,
    sheet_name=None,
)

In [6]:
# Show the names of the sheets that were loaded from the workbook.
excel.keys()

dict_keys(['Overview', 'VariableGenes_Celltypes', 'ExpressedGenes_Celltypes', 'ExpressedGenes_Donors', 'ExpressedGenes_BulkSeq', 'ExpressedGenes_Donors_insilico', 'Cell-type compositions', 'Cell and Mapping statitistics'])

In [7]:
# Inspect the column labels of the expression-ranked sheet.
excel["ExpressedGenes_Celltypes"].columns

Index(['Rank', 'Unnamed: 1', 'α-cells', 'β-cells', 'γ-cells', 'δ-cells',
       'ε-cells', 'co-expression', 'unclass endocrine', 'acinar cells',
       'ductal cells', 'MHC class II', 'mast cells', 'PSCs',
       'endothelial cells', 'unclass exocrine'],
      dtype='object')

In [8]:
# Inspect the column labels of the variability-ranked sheet, for comparison.
excel["VariableGenes_Celltypes"].columns

Index(['Rank', 'Unnamed: 1', 'α-cells', 'β-cells', 'γ-cells', 'δ-cells',
       'ε-cells', 'unclass endocrine', 'acinar cells', 'ductal cells',
       'MHC class II', 'mast cells', 'PSCs', 'endothelial cells',
       'Unnamed: 14', 'all cells', 'endocrine cells', 'exocrine cells'],
      dtype='object')

In [9]:
# Number of top-ranked genes to keep per cell type.
n_top_genes = 50

# VariableGenes_Celltypes: Lists with genes ranked in descending order according to biological variation within the different cell types.
# ExpressedGenes_Celltypes: Lists with genes ranked in descending order according to magnitude of expression for the different cell types (used in Figure 2B).

# The file is sorted in descending order by most relevant genes (they did not release pvals or logfc)
# Build a long (celltype, gene) table from the expression-ranked sheet:
#   1. drop the rank/helper columns and the "co-expression" column,
#   2. keep only the first n_top_genes rows, before any cleaning, so that rows
#      further down the sheet are never touched,
#   3. strip the apostrophes from the gene names with the vectorized string
#      accessor, which passes missing cells through instead of raising,
#   4. melt to one row per (celltype, gene) pair and drop missing genes.
df = (
    excel["ExpressedGenes_Celltypes"]
    .drop(columns=["Rank", "Unnamed: 1", "co-expression"])
    .iloc[:n_top_genes]
    .apply(lambda col: col.str.replace("'", "", regex=False))
    .melt(var_name="celltype", value_name="gene")
    .dropna(subset=["gene"])
)


In [10]:
# Boolean mask: True where the gene name is present in the reference gene list.
bidx = df["gene"].isin(genes_list)

# Report how many rows are about to be removed, then keep only the matches.
n_unmatched = (~bidx).sum()
n_total = len(bidx)
print(f'Filtered {n_unmatched} out of {n_total} genes')
df = df.loc[bidx]

Filtered 24 out of 650 genes


In [11]:
# Preview the first rows of the filtered (celltype, gene) table.
df.head()

Unnamed: 0,celltype,gene
0,α-cells,GCG
1,α-cells,TTR
2,α-cells,B2M
3,α-cells,CHGB
4,α-cells,FTL


In [12]:
# Number of genes remaining per cell type after the gene-name filter.
df.celltype.value_counts()

MHC class II         50
mast cells           50
endothelial cells    50
acinar cells         49
ductal cells         49
PSCs                 49
α-cells              48
unclass endocrine    48
δ-cells              47
ε-cells              47
unclass exocrine     47
β-cells              46
γ-cells              46
Name: celltype, dtype: int64

In [13]:
# Selection thresholds. Only max_gene_shares is read by the code in this cell;
# min_mean, max_pval and min_lfc are defined but not applied here.
min_mean, max_pval, min_lfc = 10, 0.05, 1
max_gene_shares = 4

# No score-based filter is applied: every row of df is a candidate marker.
dfc = df

# Count how often each gene occurs in the table and keep the genes that occur
# fewer than max_gene_shares times.
gene_counts = dfc["gene"].value_counts()
non_repeat_genes = gene_counts[gene_counts.lt(max_gene_shares)].index.values

# Candidate rows restricted to those genes.
m = dfc.loc[dfc["gene"].isin(non_repeat_genes)]

# The number of genes taken per cell type is the size of the smallest cell type.
celltype_sizes = m["celltype"].value_counts()
n_sample = celltype_sizes.min()

In [14]:
# Number of candidate genes per cell type after removing widely shared genes.
m.celltype.value_counts()

endothelial cells    31
acinar cells         29
MHC class II         29
ductal cells         27
unclass exocrine     24
mast cells           22
ε-cells              16
PSCs                 16
α-cells              15
unclass endocrine    14
β-cells              12
δ-cells              10
γ-cells               8
Name: celltype, dtype: int64

In [15]:
# sample n_sample genes
markers = m.groupby("celltype").head(n_sample)
markers_dict = markers.groupby("celltype")["gene"].apply(lambda x: list(x)).to_dict()

In [16]:
# Check how many genes were selected for each cell type.
markers.celltype.value_counts()

α-cells              8
β-cells              8
γ-cells              8
δ-cells              8
ε-cells              8
unclass endocrine    8
acinar cells         8
ductal cells         8
MHC class II         8
mast cells           8
PSCs                 8
endothelial cells    8
unclass exocrine     8
Name: celltype, dtype: int64

In [17]:
# Write the selected markers, together with the provenance header, to
# markers.txt using the helper imported from ec.utils.
write_markers("markers.txt", markers_dict, header)

In [18]:
# Print the written marker file to inspect its contents.
!cat markers.txt

# homo_sapiens	pancreas	hg19	https://doi.org/10.1016/j.cmet.2016.08.020	https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5069352/bin/mmc2.xlsx
MHC class II	CD74,IFI30,HLA-DRA,LYZ,FCER1G,CCL22,SDS,HIST1H4C
PSCs	IGFBP7,COL1A1,SPARC,COL1A2,MMP1,SFRP2,BGN,COL3A1
acinar cells	REG1A,PRSS1,REG3A,CLPS,REG1B,SPINK1,CTRB2,MT1G
ductal cells	SPP1,LCN2,SAT1,SERPINA3,MMP7,IGFBP7,KRT19,GSTP1
endothelial cells	PLVAP,MMP1,IGFBP7,CD36,ENG,THBS1,RGCC,SERPINE1
mast cells	TPSB2,TPSD1,TPSAB1,S100A4,LTC4S,CPA3,ALOX5AP,SH3BGRL3
unclass endocrine	GCG,VTRNA1-3,MAST1,RPL3,CRYBA2,SLIRP,RBP4,SPINT2
unclass exocrine	SPP1,CRYAB,RCAN1,ANXA2,DAD1,ANXA5,ANXA1,TGFBI
α-cells	GCG,TM4SF4,CRYBA2,CHGA,GPX3,SPINT2,PEMT,ALDH1A1
β-cells	RBP4,IAPP,SCGN,DLK1,RPL3,G6PC2,BEX1,CHGA
γ-cells	TM4SF4,SCG2,SPINK1,SCGB2A1,ALDH1A1,SERPINA1,PFDN5,RPS11
δ-cells	RBP4,BEX1,SCGN,PCP4,COX7C,GPX4,RPL38,RGS2
ε-cells	GHRL,SPINK1,TM4SF4,SERPINA1,HEPACAM2,HLA-A,PEMT,TMEM176B
