<a href="https://colab.research.google.com/github/cellatlas/human/blob/master/markers/kidney/markers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q gget
!pip install -q git+https://github.com/sbooeshaghi/ec

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ec (setup.py) ... [?25l[?25hdone


In [2]:

import pandas as pd
import numpy as np
from ec.utils import write_markers

In [3]:
# Get valid gene names
!wget -O genes.txt https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
genes_list = pd.read_csv('genes.txt', header = None)[0].values

--2023-03-18 04:41:54--  https://caltech.box.com/shared/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.box.com (caltech.box.com)... 74.112.186.144
Connecting to caltech.box.com (caltech.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:41:54--  https://caltech.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Reusing existing connection to caltech.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt [following]
--2023-03-18 04:41:54--  https://caltech.app.box.com/public/static/4r5prhstoq2j1lk8l56bpwqqvfj54q2u.txt
Resolving caltech.app.box.com (caltech.app.box.com)... 74.112.186.144
Connecting to caltech.app.box.com (caltech.app.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 3

# Kidney

In [4]:
# Provenance of the marker table: species, organ, reference label and sources.
species, organ, reference = "homo_sapiens", "kidney", "GRCh38"
paper_doi = "https://doi.org/10.1681/ASN.2018020125"
table_link = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6065085/bin/ASN.2018020125SupplementaryData3.xls"

# File name of the supplementary table; deliberately left out of the header.
table_name = "ASN.2018020125SupplementaryData3.xls"

# Single-entry list of metadata; key order follows the assignments above.
header = [
    dict(
        species=species,
        organ=organ,
        reference=reference,
        paper_doi=paper_doi,
        table_link=table_link,
    )
]
    

In [5]:
# Read every sheet of the workbook; sheet_name=None yields a dict keyed by sheet name.
excel = pd.read_excel(table_link, sheet_name=None)
sheet_names = list(excel)

# Map each sheet name to the text after its last ". " separator
# (e.g. "1. PT" -> "PT").
ct = {name: name.rsplit(". ", 1)[-1] for name in sheet_names}

# Stack the sheets into one frame, move the sheet name out of the index and
# expose it as the "celltype_id" column.
stacked = pd.concat(excel, keys=sheet_names).reset_index(level=0)
df = stacked.rename(columns={"level_0": "celltype_id"})

# Add the human-readable cell type name derived from the sheet name.
df["celltype"] = df["celltype_id"].map(ct)



In [6]:
# Boolean mask: True where the row's gene appears in the valid gene names.
bidx = df["gene"].isin(genes_list)

# Report how many rows are dropped, then keep only the rows with a valid gene.
n_dropped, n_total = (~bidx).sum(), bidx.size
print(f'Filtered {n_dropped} out of {n_total} genes')
df = df[bidx]

Filtered 32 out of 2253 genes


In [7]:
# Display the first rows of df (pandas' head() defaults to five) as a quick check.
df.head()

Unnamed: 0,celltype_id,gene,p_val,avg_logFC,pct.1,pct.2,p_val_adj,celltype
0,1. PT,GPX3,6.780000000000001e-106,1.958635,0.442,0.109,1.39e-101,PT
1,1. PT,CUBN,5.4899999999999996e-136,1.821094,0.417,0.046,1.12e-131,PT
2,1. PT,CDH6,9.11e-154,1.764531,0.464,0.035,1.87e-149,PT
3,1. PT,LRP2,1.5300000000000002e-144,1.736607,0.48,0.049,3.14e-140,PT
4,1. PT,PDZK1IP1,1.01e-133,1.67329,0.431,0.038,2.06e-129,PT


In [8]:
# Selection thresholds.
min_mean = 100  # defined here but not referenced anywhere in this cell
max_pval = 1e-10
min_lfc = 1
max_gene_shares = 4
max_per_celltype = 20

# Keep rows passing both the adjusted p-value and the log fold-change cut-offs.
passes_cutoffs = (df["p_val_adj"] <= max_pval) & (df["avg_logFC"] >= min_lfc)
dfc = df[passes_cutoffs]

# Genes occurring in fewer than max_gene_shares rows of the filtered table.
gene_counts = dfc["gene"].value_counts()
non_repeat_genes = gene_counts[gene_counts < max_gene_shares].index.values

# Restrict to those genes and order the rows by increasing pct.1.
m = dfc[dfc["gene"].isin(non_repeat_genes)].sort_values(by="pct.1")

# Genes kept per cell type: the size of the smallest cell type group,
# capped at max_per_celltype.
smallest_group = m["celltype"].value_counts().min()
n_sample = min(smallest_group, max_per_celltype)

# Because rows are sorted ascending, tail() keeps the n_sample rows with the
# highest pct.1 within each cell type.
markers = m.groupby("celltype").tail(n_sample)

# {cell type: [gene, ...]} in the row order of `markers`.
markers_dict = markers.groupby("celltype")["gene"].apply(list).to_dict()


In [9]:
# Number of selected marker rows for each cell type.
markers["celltype"].value_counts()

T cells          14
Plasma1          14
Cycling cells    14
PT               14
LOH (DL)         14
LOH (AL)         14
Pericyte         14
EC               14
Plasma2          14
B cells          14
Myofibroblast    14
CD               14
Mono2            14
Fibroblast       14
Mono1            14
Mast cells       14
Name: celltype, dtype: int64

In [10]:
# Write the per-cell-type marker lists together with the metadata header to markers.txt.
# NOTE(review): write_markers is imported from ec.utils; its exact output format is not defined in this notebook.
write_markers("markers.txt", markers_dict, header)

In [11]:
# Print the generated marker file to inspect what was written.
!cat markers.txt

# homo_sapiens	kidney	GRCh38	https://doi.org/10.1681/ASN.2018020125	https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6065085/bin/ASN.2018020125SupplementaryData3.xls
B cells	IRF8,WDFY4,LCP1,FNBP1,HDAC9,HLA-DQA1,CD83,CST3,HLA-DPA1,HLA-DPB1,ACTB,HLA-DRB1,CCSER1,CD74
CD	HSD11B2,TMTC2,TRPM3,ST6GAL1,CADPS2,AQP3,CLU,COBLL1,KAZN,ATP1B1,KCNIP4,AQP2,PDE4D,MECOM
Cycling cells	UBE2C,PRC1,ARHGAP11B,ASPM,KIF20B,TPX2,NUSAP1,HMGB2,MKI67,HIST1H4C,STMN1,TOP2A,TUBB,CENPF
EC	MEIS2,EPAS1,TCF4,TIMP3,EMCN,CD59,HEG1,IFI27,ENG,PECAM1,KLF2,IGFBP5,RNASE1,IFITM3
Fibroblast	TIMP1,CALD1,NR2F2,COL6A3,BGN,SFRP1,COL1A2,DCN,COL3A1,IGFBP5,TNC,C7,COL1A1,IGFBP7
LOH (AL)	UMOD,NAALADL2,MAL,PLCB1,ATP1B1,SLC12A1,SPP1,ATP1A1,ESRRG,WFDC2,MECOM,CA12,KCNIP4,ERBB4
LOH (DL)	NTN4,IRX3,NEBL,AQP1,CRYAB,TACSTD2,AIF1L,FOXC1,ID1,MYO9A,LINC01320,BICC1,RBPMS,PKHD1
Mast cells	HSP90AA1,VIM,ZFP36,RGS1,SAMSN1,KIT,MS4A2,JUN,ADCYAP1,SRGN,FOS,CPA3,TPSB2,TPSAB1
Mono1	APOE,CD83,NAMPT,SRGN,HLA-DPA1,C1QB,CST3,PSAP,HLA-DPB1,C1QA,FTL,HLA-DRA,HLA-DRB1,CD74

In [12]:
# Average pct.1 of the selected markers within each cell type, lowest first.
mean_pct1 = markers.groupby("celltype")["pct.1"].mean()
mean_pct1.sort_values()


celltype
T cells          0.327500
Cycling cells    0.457786
PT               0.479786
LOH (DL)         0.509357
EC               0.545357
Pericyte         0.570929
Plasma1          0.629429
LOH (AL)         0.651000
CD               0.697214
Mono2            0.706500
B cells          0.725714
Myofibroblast    0.749929
Fibroblast       0.759286
Plasma2          0.777643
Mono1            0.862214
Mast cells       0.944571
Name: pct.1, dtype: float64

In [13]:
# Download table to have a local copy
!wget $table_link -O degs.xlsx

--2023-03-18 04:41:58--  https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6065085/bin/ASN.2018020125SupplementaryData3.xls
Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 130.14.29.110, 2607:f220:41e:4290::110
Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|130.14.29.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 371200 (362K) [application/vnd.ms-excel]
Saving to: ‘degs.xlsx’


2023-03-18 04:41:59 (1.14 MB/s) - ‘degs.xlsx’ saved [371200/371200]

