In [1]:
import pyranges as pr
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# root directory where unfiltered mutation data are stored
# if using the zenodo data, this should point to the location of the base zenodo directory 
data_dir = ""
input_dir = data_dir + "unfiltered/"
save_dir = data_dir + "ppVAF_temp/"

maf_names = ['HTAN_WES_WGS', "PUTH", "SCORT"]

# path to /data directory in github repo where blacklist region info is stored; default relative path given
repo_data_dir = "../../data/"

In [3]:
padding = 1e5

removed_regions_loc = repo_data_dir+"resource/"+"hg38.UCSC.centromere.telomere.encode.bed"
removed_regions = pd.read_csv(removed_regions_loc, sep="\t", names=["chrom", "start_pos", "end_pos", "length", "source", "reason"])
removed_regions["is_centromere"] = ~np.isin(removed_regions["reason"], ['High Signal Region', 'Low Mappability','telomere'])
removed_regions.loc[removed_regions["is_centromere"], "start_pos"] -= padding
removed_regions.loc[removed_regions["is_centromere"], "end_pos"] += padding
removed_regions["length"] = removed_regions["end_pos"] - removed_regions["start_pos"]

removed_ranges = removed_regions.rename(columns={"chrom":"Chromosome", "start_pos":"Start", "end_pos":"End"})[["Chromosome", "Start", "End"]]
removed_ranges["Strand"] = "+"
removed_ranges = pr.PyRanges(removed_ranges)

In [4]:
def make_ID(components):
    return '_'.join([str(x) for x in components])

def check_if_male(patient):
    if patient in ["A001", "G001", "PUTH_FAP5", "SCORT_A02", "SCORT_A03", "SCORT_A06", "SCORT_A07", "SCORT_A08", "SCORT_C06", "SCORT_C07", "SCORT_C08"]:
        return True
    else:
        return False

all_mafs = []
for maf_name in maf_names:
    to_add = pd.read_csv(input_dir+maf_name+"_unfiltered.maf", sep="\t")
    to_add["Mut_ID"] = to_add[['Chromosome', 'Start_Position', 'Reference_Allele', 'Tumor_Seq_Allele2']].agg(make_ID, axis=1).tolist()
    to_add["vaf"] = to_add["t_alt_count"]/to_add["t_depth"]
    to_add["is_male"] = [check_if_male(x) for x in to_add["Patient"]]
    to_add = to_add[~np.logical_and(~to_add["is_male"], to_add["Chromosome"]=="chrY")]
    all_mafs.append(to_add)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
def filter_blacklist(maf):
    ranges_cols = ["Chromosome", "Start_Position", "End_Position", "Strand", "Mut_ID"]
    maf_ranges = maf[ranges_cols].rename(columns={"Start_Position":"Start", "End_Position":"End"})
    maf_ranges = pr.PyRanges(maf_ranges)
    to_remove = list(set(maf_ranges.overlap(removed_ranges).Mut_ID))
    filtered_maf = maf[~np.isin(maf["Mut_ID"], to_remove)]
    return filtered_maf

filtered_mafs = []
for maf in all_mafs:
    filtered_mafs.append(filter_blacklist(maf))

In [6]:
for i,save_name in enumerate(maf_names):
    if save_name == "HTAN_WES_WGS":
        continue
    to_save = filtered_mafs[i]
    to_save.to_csv(save_dir+save_name+"_filtered.maf", sep="\t", index=False)

In [7]:
driver_list = pd.read_csv(repo_data_dir+"resource/"+"PanCanDrivers_COADREAD_Cell2018.csv")
coad_drivers = driver_list[driver_list["Cancer"]=="COADREAD"]

In [8]:
# Removes mutations shared between samples in the HTAN cohort ONLY
# Also adds lesion stage annotations from Table S2
# Should not be run on other cohorts, particularly those with multiregion sequencing

assert "HTAN_WES_WGS" in maf_names

non_silent = ["Missense_Mutation", "Nonsense_Mutation", "Nonstop_Mutation", "Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", "Frame_Shift_Ins"]

HTAN_maf = filtered_mafs[maf_names.index("HTAN_WES_WGS")]

HTAN_maf["Driver"] = np.logical_and(np.isin(HTAN_maf["Hugo_Symbol"], coad_drivers["Gene"]), np.isin(HTAN_maf["Variant_Classification"], non_silent))
to_shared = HTAN_maf[~HTAN_maf["Driver"]]
to_shared.drop_duplicates(subset=["Mut_ID", "Tumor_Sample_Barcode"], inplace=True)
shared = to_shared.value_counts("Mut_ID")
shared = shared[shared > 1]
HTAN_maf_noshared = HTAN_maf[~np.isin(HTAN_maf["Mut_ID"], shared.index)]
print("Shared mutations removed:", str(len(HTAN_maf)-len(HTAN_maf_noshared)))  

sample_metadata = pd.read_csv(repo_data_dir+"Table_S2.csv", index_col=0)
sample_to_stage = dict(zip(sample_metadata.index, sample_metadata["Stage"]))
HTAN_maf["Stage"] = [sample_to_stage[x] for x in HTAN_maf["Tumor_Sample_Barcode"]]

HTAN_WGS = HTAN_maf[HTAN_maf["Method"]=="WG"]
HTAN_WES = HTAN_maf[HTAN_maf["Method"]=="WE"]

HTAN_WGS.to_csv(save_dir+"HTAN_WGS_filtered.maf", sep="\t", index=False)
HTAN_WES.to_csv(save_dir+"HTAN_WES_filtered.maf", sep="\t", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  HTAN_maf["Driver"] = np.logical_and(np.isin(HTAN_maf["Hugo_Symbol"], coad_drivers["Gene"]), np.isin(HTAN_maf["Variant_Classification"], non_silent))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Shared mutations removed: 32872


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  HTAN_maf["Stage"] = [sample_to_stage[x] for x in HTAN_maf["Tumor_Sample_Barcode"]]
