In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import pickle

In [2]:
# This notebook takes the scATAC-seq data from Becker et al., Nat Genet (2022), generated on the same cohort,
# and computes smoothed epithelial-fraction (purity) distributions for polyps, normal mucosa and adenocarcinomas.
#
# Inputs, all read from annot_dir:
#   - epithelial_celltypes_atac.tsv, stromal_celltypes_atac.tsv, immune_celltypes_atac.tsv (provided in the GitHub repo)
#   - hubmap_htan_metadata_atac_and_rna_final.csv (per-sample metadata)
# Outputs, all written to annot_dir:
#   - scATAC_purities.p (Python pickle, used in downstream preprocessing)
#   - scATAC_celltype_fracs.csv (per-sample cell-type fractions)

annot_dir = "../../data/scATACseq_annotations/"
metadata_path = f"{annot_dir}hubmap_htan_metadata_atac_and_rna_final.csv"

In [4]:
def _drop_last_token(name):
    """Return `name` without its final "-"-separated token."""
    return "-".join(name.split("-")[:-1])


def _sample_id(raw_name):
    """Collapse a raw `Sample` entry of an annotation table to a sample name.

    The trailing "-"-separated token is always removed; if what remains still
    contains "-D", one more trailing token is removed.
    """
    trimmed = _drop_last_token(raw_name)
    return _drop_last_token(trimmed) if "-D" in trimmed else trimmed


def _discretized_kde(values, grid, label):
    """Fit a Gaussian KDE to `values` and discretize it on `grid`.

    Parameters:
        values: 1-D collection of observations (here: epithelial fractions).
        grid:   points at which the fitted density is evaluated.
        label:  group name, used only in the error message.

    Returns:
        (kde, weights): the fitted `scipy.stats.gaussian_kde` object and the
        densities at `grid`, rescaled so that they sum to 1.

    Raises:
        ValueError: if fewer than two observations are supplied, since a KDE
        bandwidth cannot be estimated from them.
    """
    if len(values) < 2:
        raise ValueError(
            f"Need at least 2 '{label}' samples to fit a KDE, found {len(values)}"
        )
    kde = stats.gaussian_kde(values)
    density = kde.evaluate(grid)
    return kde, density / np.sum(density)


# Per-sample metadata, one row per sample name (first occurrence wins on duplicates).
meta = (
    pd.read_csv(metadata_path)
    .drop_duplicates("SampleNameOnly")
    .set_index("SampleNameOnly")
)

# Count annotated cells per sample for each cell-type compartment.
# all_annots[sample][celltype] -> number of rows (cells) in that compartment's table.
file_to_celltype = {"epithelial_celltypes_atac.tsv":"epithelial", "stromal_celltypes_atac.tsv":"stromal", "immune_celltypes_atac.tsv":"immune"}
all_annots = {}
for filename, celltype in file_to_celltype.items():
    temp_annot = pd.read_csv(annot_dir+filename, sep="\t")
    temp_annot["sample"] = [_sample_id(x) for x in temp_annot["Sample"]]

    # One pass over the table instead of re-filtering it once per sample.
    for sample, n_cells in temp_annot["sample"].value_counts().items():
        all_annots.setdefault(sample, {})[celltype] = int(n_cells)

# Sorted so the row order of sc_types (and of the exported CSV) is reproducible;
# set/dict iteration order over strings can differ from run to run.
sample_names = sorted(all_annots)
immune_frac = []
stromal_frac = []
epi_frac = []
for sample in sample_names:
    counts = all_annots[sample]
    # A compartment with no cells for this sample has no entry; treat it as zero.
    epi = counts.get("epithelial", 0)
    stromal = counts.get("stromal", 0)
    immune = counts.get("immune", 0)
    # total >= 1: a sample only exists in all_annots if at least one table contained it.
    total = epi + stromal + immune
    immune_frac.append(immune/total)
    epi_frac.append(epi/total)
    stromal_frac.append(stromal/total)
sc_types = pd.DataFrame({"immune":immune_frac, "stromal":stromal_frac, "epithelial":epi_frac}, index=sample_names)

# Attach the disease state from the metadata; samples absent from it get "NONE".
disease_by_sample = meta["DiseaseState"].to_dict()
sc_types["DiseaseState"] = [disease_by_sample.get(x, "NONE") for x in sc_types.index]

# Smooth the epithelial fraction of each disease state with a Gaussian KDE and
# discretize it on 100 evenly spaced purity values from 0.01 to 1.
# NOTE(review): only samples labelled "Unaffected" feed the mucosa distribution; the
# DiseaseState counts displayed later in this notebook also list a "Normal" label —
# confirm that leaving those samples out is intended.
purity_grid = np.linspace(0.01, 1, 100)
epithelial = sc_types["epithelial"]
polyp_kde, polyp_discrete = _discretized_kde(
    epithelial[sc_types["DiseaseState"]=="Polyp"], purity_grid, "Polyp")
unaffected_kde, unaffected_discrete = _discretized_kde(
    epithelial[sc_types["DiseaseState"]=="Unaffected"], purity_grid, "Unaffected")
adca_kde, adca_discrete = _discretized_kde(
    epithelial[sc_types["DiseaseState"]=="Adenocarcinoma"], purity_grid, "Adenocarcinoma")

# Map each label to one of the three distributions; several labels deliberately
# share the same array (presumably these are the sample-type labels used by the
# downstream preprocessing — TODO confirm).
purity_dict = {"Mucosa":unaffected_discrete, "Benign":polyp_discrete, "Dysplasia":polyp_discrete, "AdCa":adca_discrete, "Carcinoma":adca_discrete, "Adenoma":polyp_discrete}

In [6]:
# Number of samples per disease state (rows of sc_types grouped by the DiseaseState column).
sc_types.value_counts(subset="DiseaseState")

DiseaseState
Polyp             48
Unaffected        18
Normal             8
Adenocarcinoma     6
Name: count, dtype: int64

In [7]:
# Persist the discretized purity distributions. The context manager guarantees the
# file handle is flushed and closed, even if pickling raises.
with open(annot_dir+"scATAC_purities.p", "wb") as purity_file:
    pickle.dump(purity_dict, purity_file)

# Also save the per-sample table (sc_types) as CSV in the same folder.
sc_types.to_csv(annot_dir+"scATAC_celltype_fracs.csv")