In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import pickle
from ppVAF_utils import *

#pd.options.mode.copy_on_write = True

In [2]:
# Purpose: take the ppVAF posteriors, filter them, and annotate the WGS and WES
# MAFs with a column of ppVAF point estimates.
# Expect a large memory footprint when running this notebook (>100GB).

# Patients whose WES and WGS data are processed below.
all_patients_to_process = [
    "A001",
    "A002",
    "A015",
    "A014",
    "F001",
    "G001",
]

# Directory holding, for every patient, the unnormalized ppVAF probability
# arrays (.npy) and the per-patient MAF files written by
# calculate_ppVAF_posteriors.py. File names are <patient> followed by one of
# the postfixes below, e.g. <patient> + WES_npy_postfix and
# <patient> + WES_maf_postfix for the WES data (likewise for WGS).
# Example: input_dir = "/path/to/input/root/dir/"
input_dir = ""

# File-name postfixes appended to the patient ID.
WES_npy_postfix = "_ppVAFgivenPurity_noprior_WES.npy"
WGS_npy_postfix = "_ppVAFgivenPurity_noprior_WGS.npy"
WES_maf_postfix = "_muts_WES.maf"
WGS_maf_postfix = "_muts_WGS.maf"

# Directory receiving the new MAFs that carry ppVAFs in a new "ppVAF" column.
# It must mirror the structure of the zenodo directory (subdirectories "wes",
# "wgs", and "wgs_wes" in which data will be saved). The output files feed most
# downstream plotting scripts and are provided in the zenodo distribution.
# Example: output_dir = "/path/to/output/root/dir/"
output_dir = f"{input_dir}filtered_final/"

In [3]:
# Location of the scATAC-seq annotation files, relative to this notebook.
annot_dir = "../../data/scATACseq_annotations/"

# Load the pickled purity object that is later passed to add_ppVAFs.
# The file is opened in a `with` block so the handle is closed right after
# loading instead of being left open for the life of the kernel.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only point this at a trusted copy of scATAC_purities.p.
with open(annot_dir+"scATAC_purities.p", "rb") as purity_file:
    purity_dict = pickle.load(purity_file)

In [4]:
# Threshold values handed to add_ppVAFs (also reused for the external cohorts).
# NOTE(review): presumably posterior-probability cutoffs — confirm in ppVAF_utils.
thresholds = [0.6, 0.7, 0.8, 0.9, 0.95]

# One entry per sequencing data type: (label used in output file names,
# postfix of the .npy probability array, postfix of the input MAF).
# Driving the loop from this table replaces the previously copy-pasted WES and
# WGS branches, so both data types always go through identical steps.
seq_type_inputs = [
    ("WES", WES_npy_postfix, WES_maf_postfix),
    ("WGS", WGS_npy_postfix, WGS_maf_postfix),
]

# Genes whose full marginals are exported for every patient / data type.
marginal_genes = ["KRAS", "APC"]

for patient in all_patients_to_process:
    print(patient)
    for seq_type, npy_postfix, maf_postfix in seq_type_inputs:
        prob_mat = np.load(input_dir+patient+npy_postfix)
        maf = pd.read_csv(input_dir+patient+maf_postfix)

        maf_save, marg = add_ppVAFs(prob_mat, maf, purity_dict, thresholds)
        # add_ppVAFs may hand back None for the MAF; nothing is written then,
        # and the combining step later skips patients without a ppVAF file.
        if maf_save is not None:
            # The annotated per-patient MAF goes to input_dir; the per-gene
            # marginals go to output_dir.
            maf_save.to_csv(input_dir+patient+"_muts_"+seq_type+"_ppVAFs.maf", sep="\t", index=False)
            for gene in marginal_genes:
                extract_full_marginal(maf_save, marg, gene).to_csv(
                    output_dir+patient+"_"+gene+"_marginals_"+seq_type+".csv", index=False)

        # Drop the reference to the probability array before loading the next
        # one so two of these large arrays are not held at the same time.
        prob_mat = None

A001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maf[ccf_col] = CCFs
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maf[bounds_prefix+"_lower"] = lower
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maf[bounds_prefix+"_upper"] = upper
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = valu

A002
A015


  exec(code_obj, self.user_global_ns, self.user_ns)


A014
F001
G001


In [5]:
def _combine_patient_mafs(seq_type):
    """Concatenate the per-patient ppVAF-annotated MAFs for one data type.

    Reads ``input_dir + patient + "_muts_<seq_type>_ppVAFs.maf"`` (tab
    separated) for every patient in ``all_patients_to_process``. Patients
    without such a file are skipped and reported via ``print``.

    Parameters
    ----------
    seq_type : str
        Data-type label used in the file names ("WES" or "WGS").

    Returns
    -------
    pandas.DataFrame
        All per-patient tables stacked in patient order.

    Raises
    ------
    FileNotFoundError
        If no patient has a file for this data type, so there is nothing to
        combine (previously this surfaced later as an AttributeError on None).
    """
    patient_mafs = []
    for patient in all_patients_to_process:
        maf_path = input_dir+patient+"_muts_"+seq_type+"_ppVAFs.maf"
        try:
            patient_mafs.append(pd.read_csv(maf_path, sep="\t"))
        except FileNotFoundError:
            # Still best-effort, but no longer silent: say which file is absent.
            print("No "+seq_type+" ppVAF maf found for "+patient+" ("+maf_path+"); skipping")
    if not patient_mafs:
        raise FileNotFoundError(
            "No per-patient "+seq_type+" ppVAF mafs were found in '"+input_dir+"'")
    # Concatenate once at the end instead of re-copying the growing table on
    # every iteration.
    return pd.concat(patient_mafs)


combined_maf_WES = _combine_patient_mafs("WES")
combined_maf_WGS = _combine_patient_mafs("WGS")

# Write the cohort-wide tables only after both have been assembled successfully.
combined_maf_WGS.to_csv(output_dir+"HTAN_WGS_filtered_ppVAFs.maf", index=False, sep="\t")
combined_maf_WES.to_csv(output_dir+"HTAN_WES_filtered_ppVAFs.maf", index=False, sep="\t")

In [6]:
# LOCATION OF THE MAF FILES FOR THE TWO EXTERNAL COHORTS (NOT PROVIDED) -- edit maf_dir to match your system.
# This notebook computes ppVAFs for each mutation in these mafs using the scATAC data from our cohort.
# Those steps will take some time to run and will require a nontrivial amount of memory (64 GB is sufficient).
maf_dir = "/oak/stanford/groups/ccurtis2/users/rschenck/FAP_project/other_cohorts/"

# --- First external cohort (PKU file; lookup is named PUTH) ---
Li_FAP_WES_loc = maf_dir+"PKU/PKU_samples.consensus.filtered.mpileups_filtered.ccfs.ccfs_noprobs.maf"
PKU_maf = pd.read_csv(Li_FAP_WES_loc, sep="\t")
# Lookup from sample barcode to stage, built from the MAF's own columns.
PUTH_sample_to_stage = {
    barcode: stage
    for barcode, stage in zip(PKU_maf["Tumor_Sample_Barcode"], PKU_maf["Stage"])
}
# These two samples are explicitly assigned the "Mucosa" stage.
PUTH_sample_to_stage.update({'FAP4_LI_N1': "Mucosa", 'FAP4_LI_N2': "Mucosa"})

# --- Second external cohort (BCI file; lookup is named SCORT) ---
Cross_sporadic_WES_loc = maf_dir+"BCI/Sporadic.consensus.filtered.mpileups_filtered.ccfs.ccfs_noprobs.maf"
BCI_maf = pd.read_csv(Cross_sporadic_WES_loc, sep="\t")
SCORT_sample_to_stage = {
    barcode: stage
    for barcode, stage in zip(BCI_maf["Tumor_Sample_Barcode"], BCI_maf["Stage"])
}

In [7]:
# EXTERNAL COHORTS IF YOU HAVE THEM (SCORT AND PUTH)

# One entry per external cohort: (name used in the file names, lookup from
# sample barcode to stage, number of leading characters dropped from
# Tumor_Sample_Barcode before the lookup).
# NOTE(review): the offsets 5 and 6 presumably strip a "PUTH_" / "SCORT_"
# prefix from the barcode — confirm against the input MAFs.
external_cohorts = [
    ("PUTH", PUTH_sample_to_stage, 5),
    ("SCORT", SCORT_sample_to_stage, 6),
]

for cohort, sample_to_stage, n_prefix_chars in external_cohorts:
    prob_mat = np.load(input_dir+cohort+"_ppVAFgivenPurity_noprior.npy")
    maf = pd.read_csv(input_dir+cohort+"_muts.maf")
    # Attach the stage of every mutation's sample; a barcode missing from the
    # lookup raises KeyError rather than being filled in silently.
    maf["Stage"] = [sample_to_stage[barcode[n_prefix_chars:]] for barcode in maf["Tumor_Sample_Barcode"]]
    maf_save, marg = add_ppVAFs(prob_mat, maf, purity_dict, thresholds)
    # Same guard as in the per-patient processing: add_ppVAFs may return None
    # for the MAF, in which case there is nothing to write for this cohort.
    if maf_save is not None:
        maf_save.to_csv(output_dir+cohort+"_filtered_ppVAFs.maf", sep="\t", index=False)
    else:
        print("add_ppVAFs returned no MAF for "+cohort+"; nothing written")