In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import pickle

In [None]:
# PURPOSE: this notebook takes the ppVAF posteriors, then filters and annotates the WGS and WES mafs
# with ppVAF point-estimate columns.
# It also outputs a .csv file (clonal_noshared_WES_WGS_polycalls.csv) containing the expected clonal
# SNV counts and the polyclonal calls.
# It will probably require a lot of memory to run (>100GB)

all_patients_to_process = ["A001", "A002", "A015", "A014", "F001", "G001"]

# Directory containing the .npy unnormalized ppVAF probabilities and the per-patient maf files for both
# WES and WGS data, as calculated by calculate_ppVAF_posteriors.py.
# Files are looked up as input_dir + [[PATIENT]] + [[WES_npy_postfix]] and
# input_dir + [[PATIENT]] + [[WES_maf_postfix]] for the WES data, and with the corresponding
# [[WGS_npy_postfix]] / [[WGS_maf_postfix]] for the WGS data. input_dir must end with a path separator
# because the paths are built by plain string concatenation.
input_dir = "/path/to/input/root/dir/"

WES_npy_postfix = "_ppVAFgivenPurity_noprior_WES.npy"
WES_maf_postfix = "_muts_WES.maf"
WGS_npy_postfix = "_ppVAFgivenPurity_noprior_WGS.npy"
WGS_maf_postfix = "_muts_WGS.maf"

# Directory to save the new mafs with ppVAFs in a new "purity_ccf" column.
# It must mirror the structure of the zenodo directory, i.e. the subdirectories "wes", "wgs", and
# "wgs_wes" must already exist, since the data is saved into them.
# Output files are inputs to most downstream plotting scripts and are provided in the zenodo distribution.
# Like input_dir, output_dir must end with a path separator.
output_dir = "/path/to/output/root/dir/"

In [None]:
annot_dir = "../../data/scATACseq_annotations/"
# Per-stage purity distributions, keyed by the values of the maf "Stage" column.
# NOTE(review): pickle.load can execute arbitrary code; only point this at a trusted file.
# The context manager closes the file handle, which the bare open() call never did.
with open(annot_dir+"scATAC_purities.p", "rb") as purity_file:
    purity_dict = pickle.load(purity_file)

In [None]:
#FUNCTIONS

def purity_dist_normalize(prob_mat, maf, purity_dict):
    """Weight the probabilities by the stage-specific purity distribution and renormalize.

    prob_mat is indexed as [purity, ccf, mutation]; the last axis lines up with the rows of maf.
    For every stage in maf["Stage"], the slices of the matching mutations are multiplied along the
    first axis by purity_dict[stage]. Each mutation's 2-D slice is then divided by its total so
    that it sums to 1.

    The weighting is written back into prob_mat (the argument is modified in place); the
    normalized result is returned as a new array.

    Raises AssertionError("invalid stage") if a stage in maf has no entry in purity_dict.
    """
    stage_column = maf["Stage"]
    for stage in list(set(stage_column)):
        if stage not in purity_dict:
            raise AssertionError("invalid stage")
        stage_cols = np.nonzero((stage_column == stage).tolist())[0]
        # one weight per purity grid point, broadcast over the ccf and mutation axes
        purity_weights = purity_dict[stage].reshape((-1, 1, 1))
        prob_mat[:, :, stage_cols] = prob_mat[:, :, stage_cols] * purity_weights
    per_mutation_total = np.sum(prob_mat, axis=(0,1)).reshape(1, 1, -1)
    return np.divide(prob_mat, per_mutation_total)

def get_ccfs_clonality_many(probs):
    """Point estimates, half-max bounds and clonality labels for many mutations at once.

    probs is indexed as [ccf grid point, mutation]. For every mutation (column) this returns
    (ccfs, clonality, ccf_lower, ccf_upper), each with one entry per mutation:

    * ccfs      -- grid index of the column maximum, divided by the grid size
    * ccf_lower -- grid point just below the range where the column exceeds half its maximum
                   (clamped to at least 1), divided by the grid size
    * ccf_upper -- grid point just above that range (clamped to at most the grid size),
                   divided by the grid size
    * clonality -- one of "CLONAL+", "CLONAL-", "SUBCLONAL+", "SUBCLONAL-"

    The label thresholds use the fixed row indices 850 and 500.
    NOTE(review): those indices presumably assume a 1000-point grid (0.85 and 0.5) -- confirm.
    """
    num_ccf_grid = np.shape(probs)[0]

    ccfs = np.argmax(probs, axis=0)
    ccf_half_max = probs > (np.max(probs, axis=0).reshape((1, -1)) / 2)
    # replace the boolean mask by the grid index where it is set and NaN elsewhere,
    # so nanmin/nanmax give the first/last grid index above half-max
    ccf_half_max = np.where(ccf_half_max==0, np.nan, np.arange(num_ccf_grid).reshape(-1,1))

    ccf_lower = np.maximum(np.nanmin(ccf_half_max, axis=0) - 1, 1) # closest ccf value before half-max range (within 0-1 range)
    ccf_upper = np.minimum(np.nanmax(ccf_half_max, axis=0) + 1, num_ccf_grid) # closest ccf value after half-max range (within 0-1 range)

    ccf_lower = ccf_lower / num_ccf_grid
    ccf_upper = ccf_upper / num_ccf_grid

    ccfs = ccfs/num_ccf_grid

    high_mass = np.sum(probs[850:, :], axis=0)
    low_mass = np.sum(probs[:500, :], axis=0)

    clonality = np.array(["SUBCLONAL-"] * np.shape(probs)[1])
    clonal_condition = np.logical_and(ccfs >= .9, high_mass>=0.75)
    clonality = np.where(clonal_condition, "CLONAL+", clonality)
    clonal_condition = np.logical_and(ccf_lower < .5, low_mass>=0.75)
    clonality = np.where(clonal_condition, "SUBCLONAL+", clonality)
    clonal_condition = np.logical_and(ccfs >= .9, np.logical_or(high_mass>=0.5, ccf_lower>=0.5))
    # Every "CLONAL+" mutation also satisfies the weaker "CLONAL-" condition. Without this guard
    # the final assignment overwrote all of them, so "CLONAL+" could never be returned.
    clonal_condition = np.logical_and(clonal_condition, clonality != "CLONAL+")
    clonality = np.where(clonal_condition, "CLONAL-", clonality)
    return (ccfs, clonality, ccf_lower, ccf_upper)

def get_CCF_MAP(prob_mat, maf, ccf_col="purity_ccf", clonal_col="purity_clonal", bounds_prefix="purity"):
    """Add point-estimate columns to maf and return the marginalized probabilities.

    prob_mat is summed over its first axis, and the result is passed to
    get_ccfs_clonality_many. The four arrays it returns are written into maf (in place) as the
    columns ccf_col, clonal_col, bounds_prefix+"_lower" and bounds_prefix+"_upper".

    Returns the marginalized array (prob_mat summed over axis 0).
    """
    marginalized = np.sum(prob_mat, axis=0)
    estimates = get_ccfs_clonality_many(marginalized)
    # column names in the same order as the tuple returned by get_ccfs_clonality_many
    target_columns = (ccf_col, clonal_col, bounds_prefix+"_lower", bounds_prefix+"_upper")
    for column_name, values in zip(target_columns, estimates):
        maf[column_name] = values
    return marginalized

def expected_count_clonal(prob_mat, maf, clonal_threshold=0.95, filter_maf=None):
    """Per-sample expected number of clonal mutations.

    prob_mat is indexed as [purity, ccf, mutation], with the last axis lined up with the rows of
    maf. The probability mass at ccf grid indices >= int(clonal_threshold * grid size) is summed
    for every mutation, and those masses are then summed (NaNs ignored) over the mutations of
    each sample in maf["Tumor_Sample_Barcode"].

    filter_maf, when given, is a collection of row positions; only mutations whose position is
    in it are counted.

    Returns a DataFrame with the columns "sample", "exp_clonal" and "CI_clonal", where
    "CI_clonal" is the number of counted mutations whose "purity_upper" equals 1.
    """
    grid_size = np.shape(prob_mat)[1]
    first_clonal_idx = int(clonal_threshold*grid_size)

    clonal_mass = np.sum(prob_mat[:, first_clonal_idx:, :], axis=1)

    barcode_column = maf["Tumor_Sample_Barcode"]
    allowed_rows = None if filter_maf is None else set(filter_maf)

    sample_names = list(set(barcode_column))
    exp_counts = []
    ci_counts = []
    for name in sample_names:
        rows = np.nonzero((barcode_column == name).tolist())[0]
        if allowed_rows is not None:
            rows = list(set(rows).intersection(allowed_rows))
        exp_counts.append(np.nansum(clonal_mass[:, rows]))
        ci_counts.append(np.nansum(maf.iloc[rows]["purity_upper"]==1))
    return pd.DataFrame({"sample":sample_names, "exp_clonal":exp_counts, "CI_clonal":ci_counts})

def add_ccfs_count_clonal(prob_mat, maf, purity_dict, patient=None):
    """Annotate a maf with ppVAF point estimates and count expected clonal SNVs per sample.

    prob_mat is indexed as [purity, ccf, mutation], with the last axis lined up with the rows of
    maf. Samples (maf["Tumor_Sample_Barcode"]) are processed one at a time, in order of first
    appearance in maf: the sample's slice of prob_mat is purity-weighted and normalized
    (purity_dist_normalize), marginalized over purity and turned into point estimates
    (get_CCF_MAP), and its expected clonal count is computed (expected_count_clonal) over SNVs
    with t_depth >= 10, t_alt_count >= 2 and vaf >= 0.01.

    patient is the value written to the "patient" column of the clonal-count table. When it is
    None it is derived from each sample barcode with sample_to_patient, so the function no longer
    depends on a notebook-level `patient` variable being set by the calling loop.

    Returns (maf_save, clonal, all_marg):
    * maf_save -- the annotated rows, grouped by sample (None if maf has no samples)
    * clonal   -- the concatenated per-sample clonal-count tables (None if maf has no samples)
    * all_marg -- array of shape (n mutations, ccf grid size) with the marginalized
                  probabilities, rows in the same order as maf_save
    """
    barcodes = maf["Tumor_Sample_Barcode"]
    all_marg = np.zeros((np.shape(prob_mat)[2], np.shape(prob_mat)[1]))
    snv_bases = ["A", "T", "C", "G"]
    maf_parts = []
    clonal_parts = []
    start_idx = 0
    # pd.unique keeps first-appearance order. Iterating over set(...) made the row order of the
    # outputs depend on string hashing, which changes between kernel restarts.
    for sample in pd.unique(barcodes):
        is_sample = np.flatnonzero((barcodes == sample).to_numpy())
        # explicit copy: get_CCF_MAP adds columns to this frame
        new_maf = maf.iloc[is_sample].copy()
        n_muts = len(new_maf)
        # indexing with an integer array copies, so the caller's prob_mat is left untouched
        new_mat = purity_dist_normalize(prob_mat[:, :, is_sample], new_maf, purity_dict)
        marg = get_CCF_MAP(new_mat, new_maf)
        all_marg[start_idx:start_idx+n_muts, :] = np.transpose(marg)
        start_idx += n_muts

        # only single-base substitutions with enough read support enter the clonal count
        filter_maf = np.isin(new_maf["Reference_Allele"], snv_bases)
        filter_maf &= np.isin(new_maf["Tumor_Seq_Allele2"], snv_bases)
        filter_maf &= (new_maf["t_depth"] >= 10).to_numpy()
        filter_maf &= (new_maf["t_alt_count"] >= 2).to_numpy()
        filter_maf &= (new_maf["vaf"] >= 0.01).to_numpy()
        clonal_add = expected_count_clonal(new_mat, new_maf, filter_maf=np.flatnonzero(filter_maf))
        clonal_add["patient"] = patient if patient is not None else sample_to_patient(sample)

        maf_parts.append(new_maf)
        clonal_parts.append(clonal_add)

    # concatenate once instead of growing the frames inside the loop
    maf_save = pd.concat(maf_parts, ignore_index=True) if maf_parts else None
    clonal = pd.concat(clonal_parts) if clonal_parts else None
    return maf_save, clonal, all_marg

def filter_maf_depth(maf, min_total=0, min_alt=0, min_vaf=0):
    """Keep the rows of maf that meet all three read-support minimums.

    A row is kept when t_depth >= min_total, t_alt_count >= min_alt and vaf >= min_vaf.
    Returns the filtered DataFrame; maf itself is not modified.
    """
    enough_depth = maf["t_depth"] >= min_total
    enough_alt = maf["t_alt_count"] >= min_alt
    enough_vaf = maf["vaf"] >= min_vaf
    return maf[enough_depth & enough_alt & enough_vaf]

def sample_to_patient(sample):
    """Map a sample barcode to its patient ID.

    Barcodes starting with "A" map to their first four characters; any other barcode maps to its
    first character followed by "001".
    """
    return sample[:4] if sample[0] == "A" else sample[:1] + "001"

In [None]:
# For every patient and assay: load the unnormalized posteriors and the maf, annotate the maf with
# ppVAF point estimates, save the annotated maf and the purity-marginalized probabilities, and
# collect the per-sample expected clonal counts.
# Each entry: (assay label, output subdirectory, input npy postfix, input maf postfix,
#              postfix of the saved marginalized .npy).
# The saved .npy names differ between assays ("ppVAF..." vs "CCF..."); they are kept as they were.
assay_settings = (
    ("WES", "wes/", WES_npy_postfix, WES_maf_postfix, "_ppVAFmarginalizedPurity_WES.npy"),
    ("WGS", "wgs/", WGS_npy_postfix, WGS_maf_postfix, "_CCFmarginalizedPurity_WGS.npy"),
)
clonal_tables = {"WES": [], "WGS": []}

for patient in all_patients_to_process:
    print(patient)
    for assay, subdir, npy_postfix, maf_postfix, marg_postfix in assay_settings:
        prob_mat = np.load(input_dir+patient+npy_postfix)
        maf = pd.read_csv(input_dir+patient+maf_postfix)

        maf_save, clonal_add, marg = add_ccfs_count_clonal(prob_mat, maf, purity_dict)
        if maf_save is not None:
            maf_save.to_csv(output_dir+subdir+patient+"_muts_"+assay+"_ppVAFs.csv", index=False)
            # The WES path used to be output_dir+patient+"wes/"+..., which put the patient ID
            # in front of the subdirectory; both assays now save into their subdirectory.
            np.save(output_dir+subdir+patient+marg_postfix, marg)
        if clonal_add is not None:
            clonal_tables[assay].append(clonal_add)

        # drop the reference so the large array can be freed before the next one is loaded
        prob_mat = None

clonal_WES = pd.concat(clonal_tables["WES"]) if clonal_tables["WES"] else None
clonal_WGS = pd.concat(clonal_tables["WGS"]) if clonal_tables["WGS"] else None

In [None]:
def _read_patient_mafs(subdir, assay):
    """Read and concatenate the per-patient "<patient>_muts_<assay>_ppVAFs.csv" files in output_dir+subdir.

    Patients without a file are skipped with a printed warning instead of being silently ignored.
    Raises FileNotFoundError if no patient has a file.
    """
    frames = []
    for patient_id in all_patients_to_process:
        path = output_dir+subdir+patient_id+"_muts_"+assay+"_ppVAFs.csv"
        try:
            frames.append(pd.read_csv(path))
        except FileNotFoundError:
            print("WARNING: no annotated maf found, skipping: "+path)
    if not frames:
        raise FileNotFoundError("no "+assay+" annotated mafs found under "+output_dir+subdir)
    # concatenate once; the per-file row index is kept
    return pd.concat(frames)

combined_maf_WES = _read_patient_mafs("wes/", "WES")
combined_maf_WGS = _read_patient_mafs("wgs/", "WGS")

# Optional: also save the unfiltered combined mafs.
#combined_maf_WGS.to_csv(output_dir+"wgs/"+"combined_noshared_muts_WGS.csv", index=False)
#combined_maf_WES.to_csv(output_dir+"wes/"+"combined_noshared_muts_WES.csv", index=False)

# same read-support thresholds as used for the expected clonal counts
filtered_maf_WGS = filter_maf_depth(combined_maf_WGS, min_total=10, min_alt=2, min_vaf=0.01)
filtered_maf_WES = filter_maf_depth(combined_maf_WES, min_total=10, min_alt=2, min_vaf=0.01)

# drop mutations whose ppVAF point estimate is 0
filtered_maf_WGS = filtered_maf_WGS[filtered_maf_WGS["purity_ccf"] > 0]
filtered_maf_WES = filtered_maf_WES[filtered_maf_WES["purity_ccf"] > 0]
filtered_maf_WGS.to_csv(output_dir+"wgs/"+"combined_noshared_FILTERED_muts_WGS.maf", index=False, sep="\t")
filtered_maf_WES.to_csv(output_dir+"wes/"+"combined_noshared_FILTERED_muts_WES.maf", index=False, sep="\t")

In [None]:
# Combine the WES and WGS expected clonal counts into one table indexed by sample.
merge_WES = clonal_WES.set_index("sample").assign(has_WES=True)
merge_WGS = clonal_WGS.set_index("sample").assign(has_WGS=True)

# Outer join keeps samples that have only one assay. The overlapping WES columns get the "_WES"
# suffix, while the WGS columns keep their plain names ("exp_clonal", "CI_clonal").
all_clonal = merge_WES[["exp_clonal", "CI_clonal", "has_WES"]].join(merge_WGS, lsuffix="_WES", how="outer")
all_clonal["patient"] = [sample_to_patient(x) for x in all_clonal.index]

# After the outer join the flags are True or missing. Comparing against True yields real boolean
# columns without relying on fillna() downcasting an object column, which pandas has deprecated.
all_clonal["has_WGS"] = all_clonal["has_WGS"].eq(True)
all_clonal["has_WES"] = all_clonal["has_WES"].eq(True)

In [None]:
# Look up each sample's stage from the combined mafs. The WES maf is applied second, so its
# stage wins for a sample that is present in both.
sample_to_stage = {}
for combined_maf in (combined_maf_WGS, combined_maf_WES):
    sample_to_stage.update(zip(combined_maf["Tumor_Sample_Barcode"], combined_maf["Stage"]))
all_clonal["stage"] = list(map(sample_to_stage.__getitem__, all_clonal.index))

In [None]:
# Expected-clonal-count cutoffs below which a sample is called polyclonal.
# NOTE(review): the origin of 27 + 36 and of the 0.01 WES scaling is not shown here -- confirm.
WGS_cutoff = 27 + 36
WES_cutoff = WGS_cutoff * 0.01

# Samples with WGS data are called on the WGS count ("exp_clonal"); all others on the WES count
# ("exp_clonal_WES"). The WES branch used to read from the undefined name `expected_clonal`,
# which raised NameError for any sample without WGS data.
has_wgs = all_clonal["has_WGS"].astype(bool).to_numpy()
all_clonal["is_poly"] = np.where(
    has_wgs,
    (all_clonal["exp_clonal"] < WGS_cutoff).to_numpy(),
    (all_clonal["exp_clonal_WES"] < WES_cutoff).to_numpy(),
)


In [None]:
# Write the per-sample expected clonal counts and polyclonal calls; the sample index is kept.
polycalls_path = output_dir+"wgs_wes/"+"clonal_noshared_WES_WGS_polycalls.csv"
all_clonal.to_csv(polycalls_path)