In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Root directory where the processed mutation data (the *_filtered_ppVAFs.maf files) are stored.
# If using the Zenodo data, this should point to the location of the base Zenodo directory.
# File names are appended to this value by plain string concatenation, so it must end
# with a path separator.
# TODO: set this to the local data location before running the notebook.
data_dir = "./"

# Directory where the simulation data and the table with clonal counts and polyclonal
# calls are saved. Defaults to the path in the repo relative to the current working directory.
output_dir = "../data/clonal_count_estimation/"

# Paths to the gene-level copy number calls for the WGS and WES datasets
# (HTAN_WGS_gene_CNs.tsv and HTAN_WES_gene_CNs.tsv).
WGS_CN_loc = "../data/copy_number/gene_CN_calls/HTAN_WGS_gene_CNs.tsv"
WES_CN_loc = "../data/copy_number/gene_CN_calls/HTAN_WES_gene_CNs.tsv"

# Path to the driver gene list.
driver_loc = "../data/resource/PanCanDrivers_Cell2018.csv"

# Path to the genome doubling info.
WGD_loc = "../data/copy_number/genome_doubling/HTAN_WGS_doubled.tsv"

In [3]:
# Genome-doubling table, indexed by its first column.
WGD_info = pd.read_csv(WGD_loc, sep="\t", index_col=0)

# Driver gene table (the first three lines of the CSV are skipped), followed by
# the subset of rows whose "Cancer" value is COADREAD.
driver_list = pd.read_csv(driver_loc, skiprows=3)
is_coadread = driver_list["Cancer"] == "COADREAD"
coad_drivers = driver_list[is_coadread]

In [4]:
# Table of clonal SNV counts / polyclonal calls, plus the subset of rows flagged as having WGS.
clonal_calls = pd.read_csv(output_dir + "clonal_SNVs_WES_WGS.csv", index_col=0)
clonal_WGS = clonal_calls[clonal_calls["has_WGS"]]

# Mutation calls: all WGS rows are kept; WES rows are kept only for sample barcodes
# that do not appear in the WGS table.
WGS_maf = pd.read_csv(data_dir + "HTAN_WGS_filtered_ppVAFs.maf", sep="\t")
WES_maf = pd.read_csv(data_dir + "HTAN_WES_filtered_ppVAFs.maf", sep="\t")
wgs_maf_samples = list(set(WGS_maf["Tumor_Sample_Barcode"]))
wes_maf_not_in_wgs = np.logical_not(np.isin(WES_maf["Tumor_Sample_Barcode"], wgs_maf_samples))
WES_maf = WES_maf[wes_maf_not_in_wgs]
combined_maf = pd.concat([WGS_maf, WES_maf])

# Gene-level copy number calls, de-duplicated the same way: WES rows are dropped for
# any sample_id present in the WGS table, then the two tables are stacked (WES first).
CN_calls = pd.read_csv(WGS_CN_loc, sep="\t")
WES_CNs = pd.read_csv(WES_CN_loc, sep="\t")
wgs_cn_samples = list(set(CN_calls["sample_id"]))
wes_cn_not_in_wgs = np.logical_not(np.isin(WES_CNs["sample_id"], wgs_cn_samples))
WES_CNs = WES_CNs[wes_cn_not_in_wgs]
CN_calls = pd.concat([WES_CNs, CN_calls])

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# Fraction of WGS samples with is_poly == True within each polyp stage.
# `.get(True, 0)` is used rather than `.loc[True]` so that a stage in which no sample
# is polyclonal reports 0 instead of raising a KeyError.
frac_benign_poly = clonal_WGS[clonal_WGS["stage"]=="Benign"].value_counts("is_poly", normalize=True)
print("Fraction of benign polyps that are polyclonal:", frac_benign_poly.get(True, 0))

frac_dys_poly = clonal_WGS[clonal_WGS["stage"]=="Dysplasia"].value_counts("is_poly", normalize=True)
print("Fraction of dysplastic polyps that are polyclonal:", frac_dys_poly.get(True, 0))

Fraction of benign polyps that are polyclonal: 0.4
Fraction of dysplastic polyps that are polyclonal: 0.2777777777777778


In [6]:
# --- Mutation-level drivers -------------------------------------------------
# Rows of the combined MAF flagged in the boolean "Driver" column, and the APC / KRAS subsets.
only_drivers = combined_maf[combined_maf["Driver"]]
only_APC_drivers = only_drivers[only_drivers["Hugo_Symbol"]=="APC"]
only_KRAS_drivers = only_drivers[only_drivers["Hugo_Symbol"]=="KRAS"]

# --- Copy-number-level drivers ----------------------------------------------
# Keep CN calls in COADREAD driver genes that are either non-NORMAL or LOH, then drop
# genome-doubled samples and chrX genes.
CN_drivers = CN_calls[np.isin(CN_calls["gene_name"], coad_drivers["Gene"])]
CN_drivers = CN_drivers[(CN_drivers["CN_call"] != "NORMAL") | (CN_drivers["LOH_call"] == 'LOH')]
CN_drivers = CN_drivers[~np.isin(CN_drivers["sample_id"], WGD_info[WGD_info["genome_doubled"]].index)]
CN_drivers = CN_drivers[CN_drivers["gene_chrom"] != "chrX"]
# For APC only LOH calls are counted.
APC_CN = CN_drivers[(CN_drivers["gene_name"] == "APC") & (CN_drivers["LOH_call"] == 'LOH')]

# --- Per-sample flags ---------------------------------------------------------
# Sample identifiers carrying any driver / an APC driver / a KRAS driver.
total_driver_set = list(set(only_drivers["Tumor_Sample_Barcode"]).union(set(CN_drivers["sample_id"])))
APC_driver_set = list(set(only_APC_drivers["Tumor_Sample_Barcode"]).union(set(APC_CN["sample_id"])))
# Built once here; previously this list was rebuilt for every row of clonal_calls.
KRAS_driver_set = list(set(only_KRAS_drivers["Tumor_Sample_Barcode"]))

# Vectorised membership tests against the clonal_calls index replace the per-row
# `x in <list>` scans.
clonal_calls["has_driver"] = clonal_calls.index.isin(total_driver_set)
clonal_calls["has_APC_driver"] = clonal_calls.index.isin(APC_driver_set)
clonal_calls["has_KRAS_driver"] = clonal_calls.index.isin(KRAS_driver_set)
clonal_calls["has_both_drivers"] = clonal_calls["has_APC_driver"] & clonal_calls["has_KRAS_driver"]


In [7]:
# Fraction of samples with has_driver == True, per stage.
# `.get(True, 0)` is used rather than `.loc[True]` so that a stage in which no sample
# carries a driver reports 0 instead of raising a KeyError.
frac_mucosa_driver = clonal_calls[clonal_calls["stage"]=="Mucosa"].value_counts("has_driver", normalize=True)
print("Fraction of normal samples that have a driver:", frac_mucosa_driver.get(True, 0))
frac_benign_driver = clonal_calls[clonal_calls["stage"]=="Benign"].value_counts("has_driver", normalize=True)
print("Fraction of benign polyps that have a driver:", frac_benign_driver.get(True, 0))
frac_dys_driver = clonal_calls[clonal_calls["stage"]=="Dysplasia"].value_counts("has_driver", normalize=True)
print("Fraction of dysplastic polyps that have a driver:", frac_dys_driver.get(True, 0))


Fraction of normal samples that have a driver: 0.10344827586206896
Fraction of benign polyps that have a driver: 0.9428571428571428
Fraction of dysplastic polyps that have a driver: 0.9473684210526315


In [8]:
# Fraction of samples with has_APC_driver == True, per stage.
# `.get(True, 0)` is used rather than `.loc[True]` so that a stage in which no sample
# carries an APC driver reports 0 instead of raising a KeyError.
frac_mucosa_APC = clonal_calls[clonal_calls["stage"]=="Mucosa"].value_counts("has_APC_driver", normalize=True)
print("Fraction of normal samples that have a driver in APC:", frac_mucosa_APC.get(True, 0))
frac_benign_APC = clonal_calls[clonal_calls["stage"]=="Benign"].value_counts("has_APC_driver", normalize=True)
print("Fraction of benign polyps that have a driver in APC:", frac_benign_APC.get(True, 0))
frac_dys_APC = clonal_calls[clonal_calls["stage"]=="Dysplasia"].value_counts("has_APC_driver", normalize=True)
print("Fraction of dysplastic polyps that have a driver in APC:", frac_dys_APC.get(True, 0))


Fraction of normal samples that have a driver in APC: 0.06896551724137931
Fraction of benign polyps that have a driver in APC: 0.8285714285714286
Fraction of dysplastic polyps that have a driver in APC: 0.8245614035087719


In [9]:
# Fraction of samples with has_both_drivers == True (APC and KRAS), per stage.
# All three stages use `.get(True, 0)`, so a stage with no such sample prints 0; previously
# only the mucosa lookup was guarded and the other two could raise a KeyError.
frac_mucosa_APC_KRAS = clonal_calls[clonal_calls["stage"]=="Mucosa"].value_counts("has_both_drivers", normalize=True)
print("Fraction of normal samples that have a driver in APC and KRAS:", frac_mucosa_APC_KRAS.get(True, 0))
frac_benign_APC_KRAS = clonal_calls[clonal_calls["stage"]=="Benign"].value_counts("has_both_drivers", normalize=True)
print("Fraction of benign polyps that have a driver in APC and KRAS:", frac_benign_APC_KRAS.get(True, 0))
frac_dys_APC_KRAS = clonal_calls[clonal_calls["stage"]=="Dysplasia"].value_counts("has_both_drivers", normalize=True)
print("Fraction of dysplastic polyps that have a driver in APC and KRAS:", frac_dys_APC_KRAS.get(True, 0))

Fraction of normal samples that have a driver in APC and KRAS: 0
Fraction of benign polyps that have a driver in APC and KRAS: 0.17142857142857143
Fraction of dysplastic polyps that have a driver in APC and KRAS: 0.3508771929824561


In [12]:
# Largest "final_clonal_WGS" value among the WGS rows whose stage is Mucosa.
mucosa_clonals = clonal_WGS.loc[clonal_WGS["stage"] == "Mucosa", "final_clonal_WGS"]
max_mucosal_clonal_SNVs = mucosa_clonals.max()
print("Maximum number of clonal SNVs in mucosa:", max_mucosal_clonal_SNVs)

Maximum number of clonal SNVs in mucosa: 14.0


In [20]:
# Largest "final_clonal_WGS" value among the WGS rows whose stage is Benign or Dysplasia.
is_polyp = clonal_WGS["stage"].isin(["Benign", "Dysplasia"])
polyp_clonals = clonal_WGS.loc[is_polyp, "final_clonal_WGS"]
max_polyp_clonal_SNVs = polyp_clonals.max()
print("Maximum number of clonal SNVs in polyps:", max_polyp_clonal_SNVs)

Maximum number of clonal SNVs in polyps: 3882.0
