In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import binom
from tqdm import tqdm #optional! for timing code execution
import pickle
from ppVAF_utils import *

# Colour list of matplotlib's qualitative "Set2" colormap (presumably used by
# plotting cells outside this view).
set2_colors = plt.get_cmap("Set2").colors
# Font type 42 embeds TrueType fonts in PDF output, which keeps text editable
# in vector-graphics editors.
plt.rcParams['pdf.fonttype'] = 42

In [3]:
# Root directory where the processed mutation data (MAF files) are stored.
# If using the Zenodo data, point this at the base Zenodo directory (the one
# that contains the wes, wgs, etc. subfolders).
# WARNING: these MAFs will be overwritten with an added "is_clonal" column!
data_dir = ""

# Both HTAN datasets ("HTAN_WGS", "HTAN_WES") are processed automatically and
# are required to run this notebook.
# Names of EXTERNAL datasets to process; each must match the prefix of a
# "<name>_filtered_ppVAFs.maf" file in data_dir.
# The external datasets ("PUTH" and "SCORT") are not included with this repo;
# you will have to generate them yourself.
external_dataset_names = ["PUTH", "SCORT"]

# Directory where the simulation data and the table of clonal counts and
# polyclonal calls are saved.
# Defaults to a path inside the repo, relative to the current working directory.
output_dir = "../../data/clonal_count_estimation/"

# Directory holding the scATAC-seq derived annotations (purity distributions).
annot_dir = "../../data/scATACseq_annotations/"
# SECURITY: unpickling can execute arbitrary code, so only point annot_dir at
# files you trust. The context manager closes the file handle deterministically.
with open(annot_dir + "scATAC_purities.p", "rb") as purity_file:
    purity_dict = pickle.load(purity_file)

In [4]:
# Load the filtered HTAN mutation tables (tab-separated MAF files), one per assay.
filtered_maf_WGS = pd.read_csv(data_dir + "HTAN_WGS_filtered_ppVAFs.maf", sep="\t")
filtered_maf_WES = pd.read_csv(data_dir + "HTAN_WES_filtered_ppVAFs.maf", sep="\t")

# Sample barcode -> stage lookup across both assays. The WES mapping is merged
# last, so it wins for barcodes that appear in both tables.
sample_to_stage = {
    **dict(zip(filtered_maf_WGS["Tumor_Sample_Barcode"], filtered_maf_WGS["Stage"])),
    **dict(zip(filtered_maf_WES["Tumor_Sample_Barcode"], filtered_maf_WES["Stage"])),
}

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# Keep only single-nucleotide variants (Variant_Type == "SNP") for each assay.
only_SNVs_WGS = filtered_maf_WGS.loc[lambda maf: maf["Variant_Type"] == "SNP"]
only_SNVs_WES = filtered_maf_WES.loc[lambda maf: maf["Variant_Type"] == "SNP"]

# SNVs from "Mucosa" samples; these are later passed to add_count_clonal as the
# `true_subclonal` reference set.
true_subclonal_WGS = only_SNVs_WGS.loc[lambda maf: maf["Stage"] == "Mucosa"]
true_subclonal_WES = only_SNVs_WES.loc[lambda maf: maf["Stage"] == "Mucosa"]

In [6]:
#FUNCTIONS

def simulate_clonal_muts(n_samples, coverage_fn, purity_fn):
    """Simulate read counts for clonal mutations.

    Parameters
    ----------
    n_samples : int
        Number of mutations to simulate before filtering.
    coverage_fn : callable
        Zero-argument function returning one total read depth per call.
    purity_fn : callable
        Zero-argument function returning one purity per call.

    Returns
    -------
    pandas.DataFrame
        Columns ``t_alt_count``, ``t_ref_count``, ``t_depth`` and ``vaf`` for
        the simulated mutations with depth >= 10, at least 2 alt reads and
        VAF >= 0.01 (so the result can hold fewer than ``n_samples`` rows).
    """
    # All depths are drawn first, then all purities (this fixes the RNG call order).
    depths = np.array([coverage_fn() for _ in range(n_samples)])
    tumor_fractions = np.array([purity_fn() for _ in range(n_samples)])

    # The expected VAF is half the purity; alt reads are binomial in the depth.
    alt_reads = binom.rvs(depths, tumor_fractions / 2)

    simulated = pd.DataFrame(
        {
            "t_alt_count": alt_reads,
            "t_ref_count": depths - alt_reads,
            "t_depth": depths,
        }
    )
    simulated["vaf"] = simulated["t_alt_count"] / simulated["t_depth"]
    return filter_maf_depth(simulated, min_total=10, min_alt=2, min_vaf=0.01)

def filter_maf_depth(maf, min_total=0, min_alt=0, min_vaf=0):
    """Return the rows of ``maf`` that meet all three minimum thresholds.

    A row is kept when ``t_depth >= min_total``, ``t_alt_count >= min_alt``
    and ``vaf >= min_vaf``. The input frame is not modified; the result is a
    new frame that keeps the original index labels.
    """
    passes_all = (
        (maf["t_depth"] >= min_total)
        & (maf["t_alt_count"] >= min_alt)
        & (maf["vaf"] >= min_vaf)
    )
    return maf.loc[passes_all].copy()

def simulate_clonal_ppVAFs(n_samples, coverage_fn, purity_fn, thresholds):
    """Simulate clonal mutations and annotate them via ``add_ccfs_count_clonal``.

    Parameters
    ----------
    n_samples : int
        Number of mutations to simulate before depth filtering.
    coverage_fn, purity_fn : callable
        Zero-argument samplers forwarded to ``simulate_clonal_muts``.
    thresholds : list of float
        Forwarded unchanged to ``add_ccfs_count_clonal``.

    Returns
    -------
    pandas.DataFrame
        The first element returned by ``add_ccfs_count_clonal`` for the
        simulated mutations.
    """
    simulated_clonal = simulate_clonal_muts(n_samples, coverage_fn, purity_fn)

    # Constant annotation columns are written once, up front, so they also
    # exist when no simulated mutation survives the depth filters.
    # NOTE(review): "Stage" is always "Benign", even when the caller samples
    # purities with purity_fn_AdCa; confirm this is intended.
    simulated_clonal["Tumor_Sample_Barcode"] = "clonal_sim"
    simulated_clonal["Stage"] = "Benign"
    simulated_clonal["Variant_Type"] = "SNP"

    # Every simulated mutation sits on one of two copies.
    total_cn = 2
    mutant_cn = 1

    n_muts = len(simulated_clonal)
    # One 100 x 1000 slice per mutation. The shape has to match what
    # estimate_ccf_purity returns; axis meaning is defined in ppVAF_utils.
    simulated_clonal_mat = np.zeros((100, 1000, n_muts))

    read_counts = zip(
        simulated_clonal["t_alt_count"].to_numpy(),
        simulated_clonal["t_depth"].to_numpy(),
    )
    for i, (alt_count, depth) in enumerate(tqdm(read_counts, total=n_muts)):
        simulated_clonal_mat[:, :, i] = estimate_ccf_purity(total_cn, mutant_cn, alt_count, depth)

    # Only the annotated frame is needed; the other two return values are discarded.
    simulated_clonal, _, _ = add_ccfs_count_clonal(
        simulated_clonal_mat, simulated_clonal, purity_dict, thresholds
    )
    return simulated_clonal

In [7]:
# WGS SNVs from polyp samples (stage "Benign" or "Dysplasia"); their read depths
# form the empirical coverage distribution sampled below.
only_polyps_WGS = only_SNVs_WGS[only_SNVs_WGS["Stage"].isin(["Benign", "Dysplasia"])]


def coverage_fn_polyp_WGS():
    """Return one read depth drawn at random from the WGS polyp SNVs."""
    sampled_depth = np.random.choice(only_polyps_WGS["t_depth"])
    return int(sampled_depth)

# WES SNVs from polyp samples (stage "Benign" or "Dysplasia"); their read depths
# form the empirical coverage distribution sampled below.
only_polyps_WES = only_SNVs_WES[only_SNVs_WES["Stage"].isin(["Benign", "Dysplasia"])]


def coverage_fn_polyp_WES():
    """Return one read depth drawn at random from the WES polyp SNVs."""
    sampled_depth = np.random.choice(only_polyps_WES["t_depth"])
    return int(sampled_depth)

def _draw_purity(stage):
    """Draw one purity from the 100-point grid 0.01..1 weighted by ``purity_dict[stage]``."""
    purity_grid = np.linspace(0.01, 1, 100)
    return np.random.choice(purity_grid, p=purity_dict[stage])


def purity_fn_polyp():
    """Sample a purity using the "Benign" weights in ``purity_dict``."""
    return _draw_purity("Benign")


def purity_fn_AdCa():
    """Sample a purity using the "AdCa" weights in ``purity_dict``."""
    return _draw_purity("AdCa")

# Threshold values passed through simulate_clonal_ppVAFs to add_ccfs_count_clonal.
# Presumably each one yields a "clonal_cont_<threshold>" column (the 0.8 column
# is the one used for calling below); confirm against ppVAF_utils.
thresholds = [0.6, 0.7, 0.8, 0.9, 0.95]

In [None]:
# Simulate 10,000 clonal mutations per assay, drawing depths from that assay's
# polyp SNVs and purities from the "Benign" distribution.
simulated_clonal_WGS = simulate_clonal_ppVAFs(
    n_samples=10_000,
    coverage_fn=coverage_fn_polyp_WGS,
    purity_fn=purity_fn_polyp,
    thresholds=thresholds,
)
simulated_clonal_WES = simulate_clonal_ppVAFs(
    n_samples=10_000,
    coverage_fn=coverage_fn_polyp_WES,
    purity_fn=purity_fn_polyp,
    thresholds=thresholds,
)

In [None]:
# Write both simulated tables to output_dir (without the index) so that later
# runs can reload them instead of repeating the simulation.
for csv_name, simulated_table in (
    ("simulated_clonal_WGS.csv", simulated_clonal_WGS),
    ("simulated_clonal_WES.csv", simulated_clonal_WES),
):
    simulated_table.to_csv(output_dir + csv_name, index=False)

In [6]:
# Reload the simulated tables from output_dir; the WGS file is read first.
simulated_clonal_WGS, simulated_clonal_WES = (
    pd.read_csv(output_dir + csv_name)
    for csv_name in ("simulated_clonal_WGS.csv", "simulated_clonal_WES.csv")
)

In [7]:
# Column passed to add_count_clonal as the posterior column for clonal calling.
clonal_col = "clonal_cont_0.8"
# Fifth add_count_clonal argument: 63 for WGS, and 63 scaled by 0.02 for WES.
# NOTE(review): the meaning of these constants is not documented in this
# notebook; confirm against ppVAF_utils.add_count_clonal.
wgs_count_param = 63
wes_count_param = 63 * 0.02

# Calls derived from SNVs only (first return value).
poly_calls_WGS, _ = add_count_clonal(
    only_SNVs_WGS, clonal_col, simulated_clonal_WGS, true_subclonal_WGS, wgs_count_param
)
poly_calls_WES, _ = add_count_clonal(
    only_SNVs_WES, clonal_col, simulated_clonal_WES, true_subclonal_WES, wes_count_param
)

# Annotated full MAFs (second return value), computed from all variant types.
_, WGS_clonal_save = add_count_clonal(
    filtered_maf_WGS, clonal_col, simulated_clonal_WGS, true_subclonal_WGS, wgs_count_param
)
_, WES_clonal_save = add_count_clonal(
    filtered_maf_WES, clonal_col, simulated_clonal_WES, true_subclonal_WES, wes_count_param
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maf["init_clonal"] = maf[posterior_colname] > initial_cutoff
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maf["total_muts"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  only_sample["final_clonal"] = only_sample[posterior_colname] > optimal_cutoff


In [8]:
# Write the annotated MAFs next to the inputs as tab-separated files without
# the index; the "_TEMPFORCLONAL" suffix keeps the source MAFs untouched.
for maf_name, annotated_maf in (
    ("HTAN_WGS_filtered_ppVAFs_TEMPFORCLONAL.maf", WGS_clonal_save),
    ("HTAN_WES_filtered_ppVAFs_TEMPFORCLONAL.maf", WES_clonal_save),
):
    annotated_maf.to_csv(data_dir + maf_name, index=False, sep="\t")

In [9]:
# Flag which assay produced each row of calls before merging.
poly_calls_WES["has_WES"] = True
poly_calls_WGS["has_WGS"] = True

# Outer join on the index keeps every sample profiled by either assay;
# overlapping column names receive the "_WES" / "_WGS" suffixes.
all_clonal = poly_calls_WES.join(poly_calls_WGS, rsuffix="_WGS", lsuffix="_WES", how="outer")

# After the outer join each flag is either True or missing, so notna() gives
# the same values as fillna(False) while producing a real bool column and
# avoiding pandas' object-downcasting FutureWarning.
all_clonal["has_WGS"] = all_clonal["has_WGS"].notna()
all_clonal["has_WES"] = all_clonal["has_WES"].notna()

# Sample barcode -> stage lookup across both assays (WES entries win on overlap).
sample_to_stage = dict(zip(filtered_maf_WGS["Tumor_Sample_Barcode"], filtered_maf_WGS["Stage"]))
sample_to_stage.update(dict(zip(filtered_maf_WES["Tumor_Sample_Barcode"], filtered_maf_WES["Stage"])))
all_clonal["stage"] = [sample_to_stage[x] for x in all_clonal.index]

# Sample barcode -> patient lookup, built the same way.
sample_to_patient = dict(zip(filtered_maf_WGS["Tumor_Sample_Barcode"], filtered_maf_WGS["Patient"]))
sample_to_patient.update(dict(zip(filtered_maf_WES["Tumor_Sample_Barcode"], filtered_maf_WES["Patient"])))
all_clonal["patient"] = [sample_to_patient[x] for x in all_clonal.index]

# Use the WGS call whenever the sample has WGS data, otherwise the WES call.
# Vectorised selection replaces two per-row .iloc lookups per sample; .tolist()
# lets pandas infer the column dtype from plain Python values.
all_clonal["is_poly"] = np.where(
    all_clonal["has_WGS"], all_clonal["is_poly_WGS"], all_clonal["is_poly_WES"]
).tolist()

# The per-assay stage columns are redundant with the unified "stage" column.
all_clonal = all_clonal.drop(columns=["stage_WES", "stage_WGS"])

In [10]:
# Save the merged WES/WGS call table; the index is written out as the first column.
clonal_table_path = output_dir + "clonal_SNVs_WES_WGS.csv"
all_clonal.to_csv(clonal_table_path)

In [8]:
# For each external dataset: simulate clonal mutations with that dataset's own
# coverage, call polyps/mucosa/cancer samples, and write one CSV per dataset.
for dataset in external_dataset_names:
    filtered_maf = pd.read_csv(data_dir+dataset+"_filtered_ppVAFs.maf", sep="\t")
    only_SNVs = filtered_maf[filtered_maf["Variant_Type"]=="SNP"]

    # Independent copies: judging by the warnings it emits, add_count_clonal
    # assigns columns on the frame it receives, which raises
    # SettingWithCopyWarning when that frame is a slice.
    only_polyps = only_SNVs[only_SNVs["Stage"].isin(["Benign", "Dysplasia", "Adenoma"])].copy()
    only_mucosa = only_SNVs[only_SNVs["Stage"].isin(["Mucosa"])].copy()
    only_cancer = only_SNVs[only_SNVs["Stage"].isin(["Carcinoma", "AdCa"])].copy()

    def coverage_fn_polyp():
        """Return one read depth drawn at random from this dataset's polyp SNVs."""
        return int(np.random.choice(only_polyps["t_depth"]))

    def coverage_fn_AdCa():
        """Return one read depth drawn at random from this dataset's cancer SNVs."""
        return int(np.random.choice(only_cancer["t_depth"]))

    poly_to_concat = []

    # NOTE(review): every call below uses the HTAN WGS mucosa SNVs
    # (true_subclonal_WGS) together with the WES-scaled constant 63*0.02;
    # confirm that this pairing is intended for the external datasets.

    # Polyp and mucosa samples share the polyp-based simulation. An empty polyp
    # group has no depths to sample from, so it is skipped instead of crashing
    # inside np.random.choice.
    if len(only_polyps) > 0:
        simulated_clonal = simulate_clonal_ppVAFs(10000, coverage_fn_polyp, purity_fn_polyp, thresholds)
        for stage_maf in (only_polyps, only_mucosa):
            if len(stage_maf) > 0:
                poly_calls, _ = add_count_clonal(stage_maf, "clonal_cont_0.8", simulated_clonal, true_subclonal_WGS, 63*0.02)
                poly_to_concat.append(poly_calls)

    # Cancer samples get their own simulation (cancer depths, "AdCa" purities).
    if len(only_cancer) > 0:
        simulated_clonal = simulate_clonal_ppVAFs(10000, coverage_fn_AdCa, purity_fn_AdCa, thresholds)
        poly_calls, _ = add_count_clonal(only_cancer, "clonal_cont_0.8", simulated_clonal, true_subclonal_WGS, 63*0.02)
        poly_to_concat.append(poly_calls)

    # pd.concat raises on an empty list, so skip datasets without any usable SNVs.
    if not poly_to_concat:
        print("No polyp, mucosa or cancer SNVs found for " + dataset + "; skipping.")
        continue

    poly_calls = pd.concat(poly_to_concat)

    # Dataset-local lookups, named so they do not overwrite the HTAN-wide
    # sample_to_stage / sample_to_patient dictionaries used by other cells.
    dataset_sample_to_stage = dict(zip(filtered_maf["Tumor_Sample_Barcode"], filtered_maf["Stage"]))
    dataset_sample_to_patient = dict(zip(filtered_maf["Tumor_Sample_Barcode"], filtered_maf["Patient"]))

    poly_calls["stage"] = [dataset_sample_to_stage[x] for x in poly_calls.index]
    poly_calls["patient"] = [dataset_sample_to_patient[x] for x in poly_calls.index]
    poly_calls.to_csv(output_dir+dataset+"_clonal_SNVs.csv")
    

100%|██████████| 9985/9985 [04:15<00:00, 39.06it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maf["init_clonal"] = maf[posterior_colname] > initial_cutoff
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maf["total_muts"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  only_sample["final_clonal"] = only_sample[posterior_colname] > optimal