In [1]:
import pyranges as pr
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# root directory where unfiltered mutation data are stored
# if using the zenodo data, this should point to the location of the base zenodo directory 
data_dir = ""

# filtered mafs (filename suffix "_filtered.maf") must be present in input_dir
input_dir = data_dir + "ppVAF_temp/"
save_dir = data_dir + "ppVAF_temp/"

# names of datasets which you want to process- should match maf and copy number file names.
# external datasets ("PUTH" and "SCORT") not included with our repo- you will have to generate these yourself

# if you are processing the HTAN WES cohort, the WGS calls must also be present under name "HTAN_WGS". the WGS calls 
# are used for the HTAN WES ppVAF calculations.
maf_names = ['HTAN_WGS', "HTAN_WES", "PUTH", "SCORT"]

# directory containing the .tsv FACETS CN merged and filtered files for each dataset. Default repo location given.
CN_dir = "../../data/copy_number/"

In [3]:
ranges_cols = ["Chromosome", "Start_Position", "End_Position", "Strand", "Mut_ID"]
CN_cols = ["chrom", "loc_start", "loc_end", "tcn_em", "lcn_em"]

CN_mafs = []
for i,save_name in enumerate(maf_names):
    maf = pd.read_csv(input_dir+save_name+"_filtered.maf", sep="\t")
    CNs = pd.read_csv(CN_dir+save_name+"_CN_filtered_merged.tsv", sep="\t")
    if save_name == "HTAN_WES":
        WGS_CNs = pd.read_csv(CN_dir+"HTAN_WGS_CN_filtered_merged.tsv", sep="\t")
        WGS_samples = list(set(WGS_CNs["sample_id"]))
    maf_samples = list(set(maf["Tumor_Sample_Barcode"]))
    to_concat = []
    for sample in maf_samples:
        only_sample = maf[maf["Tumor_Sample_Barcode"] == sample]
        sample_ranges = only_sample[ranges_cols].rename(columns={"Start_Position":"Start", "End_Position":"End"})
        sample_ranges = pr.PyRanges(sample_ranges)
        if sample not in list(set(CNs["sample_id"])):
            print(sample, "not found in copy number")
            continue
        
        if save_name == "HTAN_WES" and sample in WGS_samples:
            CN_sample = WGS_CNs[WGS_CNs["sample_id"]==sample]
        else:
            CN_sample = CNs[CNs["sample_id"]==sample]
        
        CN_ranges = CN_sample[CN_cols].rename(columns={"loc_start":"Start", "loc_end":"End", "chrom":"Chromosome"})
        CN_ranges["Strand"] = "+"
        CN_ranges = pr.PyRanges(CN_ranges)
        
        CN_intersect = sample_ranges.join(CN_ranges).df
        
        duplicates = CN_intersect.value_counts("Mut_ID")
        duplicates = duplicates[duplicates > 1]

        if len(duplicates) > 0:
            CN_intersect["start_intersect"] = np.maximum(CN_intersect["Start"], CN_intersect["Start_b"])
            CN_intersect["end_intersect"] = np.minimum(CN_intersect["End"], CN_intersect["End_b"])
            CN_intersect["len_intersect"] = CN_intersect["end_intersect"] - CN_intersect["start_intersect"]
            deduplicated = []
            for mut in duplicates.index:
                both_listings = CN_intersect[CN_intersect["Mut_ID"]==mut]
                to_add = pd.DataFrame(both_listings.iloc[0]).transpose()
                longest_segment = both_listings.sort_values("len_intersect", ascending=False).iloc[0]
                to_add["tcn_em"] = longest_segment["tcn_em"]
                to_add["lcn_em"] = longest_segment["lcn_em"]
                deduplicated.append(to_add)

            deduplicated = pd.concat(deduplicated)
            CN_intersect = CN_intersect[~np.isin(CN_intersect["Mut_ID"], duplicates.index)]
            CN_intersect = pd.concat([CN_intersect, deduplicated])
            
            print("Resolved breakpoint within mutation")
        mutid_to_tcn = dict(zip(CN_intersect["Mut_ID"], CN_intersect["tcn_em"]))
        mutid_to_lcn = dict(zip(CN_intersect["Mut_ID"], CN_intersect["lcn_em"]))
        only_sample["tcn"] = [mutid_to_tcn[only_sample.iloc[i]["Mut_ID"]] if only_sample.iloc[i]["Chromosome"] != "chrY" else 1 for i in range(len(only_sample))]
        only_sample["lcn"] = [mutid_to_lcn[only_sample.iloc[i]["Mut_ID"]] if only_sample.iloc[i]["Chromosome"] != "chrY" else 0 for i in range(len(only_sample))]
        to_concat.append(only_sample)
    CN_mafs.append(pd.concat(to_concat))
        

  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  only_sample["tcn"] = [mutid_to_tcn[only_sample.iloc[i]["Mut_ID"]] if only_sample.iloc[i]["Chromosome"] != "chrY" else 1 for i in range(len(only_sample))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  only_sample["lcn"] = [mutid_to_lcn[only_sample.iloc[i]["Mut_ID"]] if only_sample.iloc[i]["Chromosome"] != "chrY" else 0 for i in range(len(only_sample))]


In [4]:
for i,save_name in enumerate(maf_names):
    to_save = CN_mafs[i]
    to_save = to_save[to_save["tcn"] != 0]
    to_save.to_csv(save_dir+save_name+"_filtered_CNs.maf", sep="\t", index=False)