In [None]:
from __future__ import print_function
import os.path
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, '../../JKBio/')
import CCLF_processing
from IPython.display import Image, display, HTML

# Import requirements for making CNV plots
from matplotlib import pyplot as plt
import cnvlib

# Import requirements for interactive features
import qgrid
import ipywidgets as widgets
from ipywidgets import interact, interact_manual, interactive
import re # used for regex

# Extra options
qgrid.set_defaults(remote_js=True, precision=3, export_mode=True)
# qgrid.set_grid_option('maxVisibleRows', 10)

In [None]:
# Using Phil's code for card view so I can have a dropdown in a static HTML. This is in the next three chunks.


In [None]:
%%javascript

// Load jQuery so the `$` helper used below is available in the exported report.
// The URL carries an explicit https:// scheme: a protocol-relative URL
// ("//host/...") resolves against file:// when the static HTML is opened from disk.
var script = document.createElement('script');
script.type = 'text/javascript';
script.src = 'https://code.jquery.com/jquery-3.3.1.min.js';
document.head.appendChild(script);

/**
 * Show the card currently selected in a dropdown and hide all of its siblings.
 *
 * @param {string} id_prefix  Shared prefix: cards carry it as their CSS class,
 *                            and the dropdown has the id "<id_prefix>-select".
 *                            Each option's value is the id of the card to show.
 */
function updateVisibleCard(id_prefix) {
    // Hide every card belonging to this dropdown...
    $("." + id_prefix).hide();
    // ...then reveal the one whose id equals the selected option's value.
    var selectedCardId = $("#" + id_prefix + "-select").children("option:selected").val();
    $("#" + selectedCardId).show();
}

// Expose the function globally so inline onchange="updateVisibleCard(...)"
// attributes can call it.
window.updateVisibleCard = updateVisibleCard;

In [None]:
import IPython
from IPython import get_ipython
def _get_html(obj):
    """Get the HTML representation of an object"""
    # TODO: use displaypub to make this more general
    ip = get_ipython()
    png_rep = ip.display_formatter.formatters['image/png'](obj)

    if png_rep is not None:
        #if isinstance(obj, plt.Figure):
        #    plt.close(obj)  # keep from displaying twice
        return ('<img src="data:image/png;'
                'base64,{0}">'.format(png_rep))
    else:
        return "<p> {0} </p>".format(str(obj))

    rep = ip.display_formatter.formatters['text/html'](obj)

    if rep is not None:
        return rep
    elif hasattr(obj, '_repr_html_'):
        return obj._repr_html_()

In [None]:
import random
import string

def random_id(n):
    """Return a string of `n` randomly chosen ASCII letters (upper and lower case)."""
    alphabet = string.ascii_uppercase + string.ascii_lowercase
    letters = []
    for _ in range(n):
        letters.append(random.choice(alphabet))
    return ''.join(letters)

def _select_html(id_prefix, labels):
    select_tmpl = "<select id=\"{id_prefix}-select\" onchange=\"updateVisibleCard('{id_prefix}')\">{options}</select>"
    option_tmpl = "<option value=\"{id_prefix}-{i}\" {selected}>{label}</option>"
    options = "".join([option_tmpl.format(selected={True: "selected", False: ""}[i == 0],
                                          i=i, label=label, id_prefix=id_prefix) for i, label in enumerate(labels)])
    return select_tmpl.format(**locals())

def _cards_html(id_prefix, images):
    div_tmpl = "<div class=\"{id_prefix}\" id=\"{id_prefix}-{i}\" {style}>{inner_html}</div>"
    img_divs = [div_tmpl.format(inner_html=_get_html(image), 
                                style={False: "style=\"display: none\"", True: ""}[i == 0],
                                id_prefix=id_prefix, 
                                i=i) for i, image in enumerate(images)]
    return "".join(img_divs)

def _select_card_html(labels, images):
    assert len(labels) == len(images)
    id_prefix = random_id(8)
    select_html = _select_html(id_prefix, labels)
    cards_html = _cards_html(id_prefix, images)
    return select_html + cards_html

def display_card_view(labels, images):
    """Return an IPython HTML object holding a dropdown plus one card per image.

    `labels` supplies the dropdown entries and `images` the objects rendered
    inside the cards; both must have the same length.
    """
    markup = _select_card_html(labels, images)
    return IPython.display.HTML(markup)
    
#IPython.display.HTML("""
#<select id="zzz-select">
#<option value="zzz-1" selected>google</option>
#<option value="zzz-2">bird</option>
#</select>
#<div style='background-color:red; width: 100px'>aaa<div class='zzz' id='zzz-1'>"""
#                     +_get_html(i1)+"</div><div class='zzz' id='zzz-2' style='display: none'>"+_get_html(i2)+"</div>")

In [None]:
# Remember the working directory at the time this cell runs.
# NOTE(review): `cwd` is not referenced by any other cell visible here - confirm it is still needed.
cwd = os.getcwd()
# print(cwd)

In [None]:
# path is the participant-specific path
# It is laid out as .../<disease>/<participant>/ : the participant and disease names are
# taken from its last two components, and gsutil wildcards are appended directly to it
# (e.g. {path}**.png), so the trailing "/" matters.
# path = "gs://cclf_results/targeted/neekesh_201912/Wilms_Tumor/PEDS204/"
path = "gs://cclf_results/targeted/neekesh_TEST/Wilms_Tumor/PEDS196/"

In [None]:
# Derive the display names from the last two components of the bucket path
# (.../<disease>/<participant>/), replacing underscores with spaces.
# Trailing slashes are stripped first so the result is the same whether or not
# `path` ends in "/"; without this, a missing slash would shift both names up one level.
participant_dir = path.rstrip("/")
participant = str(os.path.basename(participant_dir)).replace("_", " ")
disease = str(os.path.basename(os.path.dirname(participant_dir))).replace("_", " ")

***
***

# Report for participant: {{participant}} ({{disease}})

In [None]:
def read_in_tables(filepaths, index_col = None):
    """Read every tab-separated file in `filepaths` with pandas.

    Args:
        filepaths: iterable of paths to TSV files.
        index_col: forwarded to ``pd.read_csv`` as the index column.

    Returns:
        list of DataFrames, in the same order as `filepaths`.
    """
    tables = []
    for table_path in filepaths:
        tables.append(pd.read_csv(table_path, sep='\t', index_col = index_col))
    return tables

In [None]:
%%capture
# A list of file paths for the selected participant
filepaths = ! gsutil ls -r {path}**

# Get all the table filepaths in the bucket
table_filepaths = ! gsutil ls -r {path}*.txt # check: will this search recursively for all .txt files?
to_add = ! gsutil ls -r {path}**.tsv
table_filepaths += to_add

# Get all the png filepaths in the bucket
img_filepaths = ! gsutil ls -r {path}**.png

# Copy all the pngs in the bucket to a tmp folder
# TODO: need to delete the files afterwards
tempdir='../temp/cclfreport/'
imagetempdir='../temp/cclfreport/images/'
! gsutil cp -r {path}**.png {imagetempdir} # copy images from google bucket to local temp folder
# local_img_filepaths = ! ls {imagetempdir}*.png
local_img_filepaths = [imagetempdir + os.path.basename(i) for i in img_filepaths]

# TODO: currently not exactly the external ID - it's the name of the image itself. # UPDATE: fixed!
# Dictionary mapping external IDs to their respective horizontal CN plot GS links
ext_id_to_GS_img = {os.path.basename(link).split("_wes_copy_number_map")[0].split("_copy_number_map")[0]:link for link in img_filepaths}

# Directory mapping external IDs to the local filepaths for the horizontal CN plots
ext_id_to_local_img = {os.path.basename(link).split("_wes_copy_number_map")[0].split("_copy_number_map")[0]:link for link in local_img_filepaths}

In [None]:
def make_interactive_table(filepath, cols_to_include = None, index_col_name = None, forcefit = False):
    """Display a pandas DataFrame as an interactive qgrid table.

    The input frame is never modified: indexing and column selection are
    applied to derived frames, so the same DataFrame can be passed again.

    Args:
        filepath: the DataFrame to display (the name is historical; a path is not accepted).
        cols_to_include: optional list of columns to show. If it contains
            'keep', only rows where the 'keep' column equals True are shown.
        index_col_name: optional column to use as the table index; ignored
            when it is not one of the frame's columns.
        forcefit: forwarded to qgrid as the 'forceFitColumns' grid option.

    Raises:
        Exception: if `filepath` is not a pandas DataFrame.
        KeyError: if `cols_to_include` names a column that is not present.
    """
    if not isinstance(filepath, pd.DataFrame):
        raise Exception("The function expected a pandas dataframe as input, but got: ", str(type(filepath)))
    data = filepath

    # Set the index if it is a column in the data table.
    # set_index returns a new frame here, so the caller's DataFrame keeps its columns.
    if index_col_name in data.columns.tolist():
        data = data.set_index(index_col_name, drop=True)

    # Subset the data to include the specified columns, if any passed in
    if cols_to_include is not None:
        data = data[cols_to_include]
        if 'keep' in cols_to_include:
            data = data.loc[data['keep'] == True]

    # Create and display interactive table
    qgrid_widget = qgrid.show_grid(data, show_toolbar=False, grid_options = {'forceFitColumns': forcefit, 'defaultColumnWidth': 150, 'enableColumnReorder':True}, precision=3)
    display(qgrid_widget)
    print("\n")

***
***

# Sample information and identifiers
This section details the external IDs for all the samples we discovered when searching the existing targeted probe data and WES data.

In [None]:
# TODO: add whether we have processed data for each external ID. Currently, I don't track the mutation data for the normal samples... because we don't call them separately.

In [None]:
# Sample information and identifiers
# List the two per-participant ID tables in the bucket; each `!` assignment captures
# the command's output lines as a list.
all_external_ids = ! gsutil ls -r {path}**all_external_ids.tsv
all_failed_external_ids = ! gsutil ls -r {path}**all_failed_external_ids.tsv

# Read in the tables
# read_in_tables returns one DataFrame per listed path, indexed by the external_id column.
# NOTE(review): the listings are passed on unfiltered - presumably a gsutil error line
# would make the read fail; confirm whether that should be handled here.
all_external_ids_df = read_in_tables(all_external_ids, index_col = "external_id")
all_failed_external_ids_df = read_in_tables(all_failed_external_ids, index_col = "external_id")

## Table: all external IDs & associated metadata
The below table is sortable and filterable. You can triple-click on the cells in the table if you want to copy the contents, like if you wanted to copy the link to the file in the Google storage console.

In [None]:
# Metadata table: one row per external ID, taken from the first listed ID table.
summary_df = all_external_ids_df[0]

# Every row gets the same bucket folder and the matching Cloud Console link.
external_ids_dir = os.path.dirname(all_external_ids[0])
summary_df['filepath'] = str(external_ids_dir)
summary_df['link'] = str(re.sub('gs://', 'https://console.cloud.google.com/storage/browser/', external_ids_dir))

# Summary lines shown above the table.
n_passed_ids = summary_df.shape[0]
print("We found a total of", n_passed_ids, "external IDs that passed the depth of coverage QC.")
print("Do note that some of these samples may not have been processed yet, so this report may not have CN plots or mutation information for all the external IDs listed below.")

# Interactive table, showing at most 10 rows at a time.
summary_grid_options = {'forceFitColumns': False,
                        'enableColumnReorder':False,
                        'maxVisibleRows':min(10, summary_df.shape[0])}
qgrid_widget = qgrid.show_grid(summary_df, show_toolbar=False, grid_options = summary_grid_options)
display(qgrid_widget)

In [None]:
# NOTE(review): this only echoes the value used for qgrid's 'maxVisibleRows' option -
# looks like a leftover debugging cell; confirm whether it belongs in the report.
min(10, summary_df.shape[0])

In [None]:
# Static (non-interactive) rendering of the full summary table.
# NOTE(review): the same frame is shown with qgrid earlier in the notebook - confirm this duplicate is intended.
display(summary_df)

## Samples that failed the depth of coverage QC
This summary details the external IDs of all samples that failed the depth of coverage QC in the targeted probe pipeline. The depth of coverage QC in the targeted probe pipeline requires that the average gene-level or interval-level coverage is >=50x.

In [None]:
# Summarize the external IDs that FAILED the depth of coverage QC
failed_qc = False

# Make a table with metadata for each failed external ID
failed_df = all_failed_external_ids_df[0]

# Print some summary information
print("We found a total of", failed_df.shape[0],"external IDs that failed the depth of coverage QC.")

# Both columns are derived from the failed-ID table's own location
# (previously 'filepath' was taken from the passed-ID table while 'link' was not).
failed_ids_dir = os.path.dirname(all_failed_external_ids[0])
failed_df['filepath'] = str(failed_ids_dir)
failed_df['link'] = str(re.sub('gs://', 'https://console.cloud.google.com/storage/browser/', failed_ids_dir))

# Display an interactive table (only when there is at least one failed sample)
if failed_df.shape[0] > 0:
    qgrid_widget = qgrid.show_grid(failed_df, show_toolbar=False, grid_options = {'forceFitColumns': True, 'enableColumnReorder':True})
    display(qgrid_widget)


***
***

# Copy number data

In [None]:
# Get the CN tables from the Google storage bucket and create list of paths
# Each `!` assignment captures the command's output lines as a list.
tsca_cn = ! gsutil ls -r {path}**TSCA_copy_number.tsv
twist_cn = ! gsutil ls -r {path}**TWIST_copy_number.tsv
wes_cn = ! gsutil ls -r {path}**WES_copy_number.tsv
all_cn_tables = wes_cn + tsca_cn + twist_cn

# Create dictionary with filepaths as keys and pandas DF as the values
# Lines containing 'CommandException' are skipped - presumably the message gsutil
# prints when a pattern matches nothing.
tsca_cn_dict = {f:pd.read_csv(f, sep="\t", index_col = False) for f in tsca_cn if 'CommandException' not in f}
twist_cn_dict = {f:pd.read_csv(f, sep="\t", index_col = False) for f in twist_cn if 'CommandException' not in f}
wes_cn_dict = {f:pd.read_csv(f, sep="\t", index_col = False) for f in wes_cn if 'CommandException' not in f}

# CN columns to display in tables
cn_col_names = ['Sample', 'condition','Chromosome', 'Start', 'End','Segment_Mean', 'Segment_Call', 'Num_Probes']

# Temporary directory for CN tables
tabletempdir = '../temp/cclfreport/tables/'

# Local filepaths for the CN tables
# NOTE(review): built from the unfiltered listing, so any 'CommandException' line is included too.
local_cn_filepaths = [tabletempdir + os.path.basename(i) for i in all_cn_tables]

In [None]:
# Create single seg file containing all samples from targeted and WES

# Placeholder for a dict of dicts: {participant_id: {external_id: seg file}}
# (left empty by this cell)
seg_file_dict = dict()
seg_columns = ["external_id","Chromosome", "Start", "End", "Num_Probes", "Segment_Mean"]

# Read in tables and merge together into one giant seg file
seg_df = pd.concat((pd.read_csv(f,sep="\t", index_col = None) for f in all_cn_tables if 'CommandException' not in f), sort=True)

# Convert to properly-formatted seg file and
# convert the relative copy ratio to the log2(relative copy ratio).
# .copy() makes the column selection an independent frame, so the assignment below
# cannot trigger pandas' SettingWithCopyWarning.
seg_df = seg_df.loc[:,seg_columns].copy()
seg_df["Segment_Mean"] = np.log2(seg_df["Segment_Mean"])

# Write table to a seg file (TSV format); create the output folder first so that
# a fresh checkout does not fail on the missing directory.
os.makedirs(tabletempdir, exist_ok=True)
tmp_path = tabletempdir + participant + ".tsv"
seg_df.to_csv(tmp_path, sep = "\t", index = False)

In [None]:
def sort_list(list1, list2):
    """Return the items of `list1` ordered by the matching values in `list2`.

    The (list2 value, list1 item) pairs are sorted as tuples, so items whose
    `list2` values are equal are ordered by the items themselves.
    """
    keyed_items = list(zip(list2, list1))
    keyed_items.sort()
    return [item for _sort_key, item in keyed_items]

In [None]:
# Convert to .cns format using CNVkit
converted = ! cnvkit.py import-seg {tmp_path} -d {tabletempdir}

# Extract .cns filepaths
cns_files = []
# TODO: fix pattern so that has backslash before .cns. Otherwise, can end with cns instead of requiring .cns!
pattern = re.compile(r'{}.*.cns'.format(tabletempdir))
for file in converted:
    cns_files += [re.search(pattern,file)[0]]
    
cns_ext_ids = [os.path.basename(path)[:-4] for path in cns_files]
cns_id_file_dict = dict(zip(cns_ext_ids, cns_files))
cns_file_id_dict = dict(zip(cns_files, cns_ext_ids))
    
# Sort the cns files so normals are at end
cns_normal_status = [summary_df.loc[ext_id, 'is_normal'] for ext_id in cns_ext_ids]
sorted_cns_ext_ids = sort_list(cns_ext_ids, cns_normal_status)
cns_files = [cns_id_file_dict[id] for id in sorted_cns_ext_ids]

# Create list of TSCA .cns filepaths
# cns_tsca_ids = set(summary_df[summary_df.loc[:,"dataset"] == "TSCA"].loc[:,"external_id"])
# cns_tsca = [tabletempdir+ext_id+".cns" for ext_id in cns_tsca_ids if ext_id in set(seg_df.loc[:,"external_id"])]
cns_tsca_ids = set(summary_df.index[summary_df['dataset'] == 'TSCA'].tolist())
cns_tsca = [f for f in cns_files if cns_file_id_dict[f] in cns_tsca_ids]


# Create list of TWIST .cns filepaths
# cns_twist_ids = set(summary_df[summary_df.loc[:,"dataset"] == "TWIST"].loc[:,"external_id"])
# cns_twist = [tabletempdir+ext_id+".cns" for ext_id in cns_twist_ids if ext_id in set(seg_df.loc[:,"external_id"])]
cns_twist_ids = set(summary_df.index[summary_df['dataset'] == 'TWIST'].tolist())
cns_twist = [f for f in cns_files if cns_file_id_dict[f] in cns_twist_ids]


# Creat list of all targeted .cns filepaths
# cns_targeted_ids = set(summary_df[summary_df.loc[:,"is_targeted"] == True].loc[:,"external_id"])
# cns_targeted = [tabletempdir+ext_id+".cns" for ext_id in cns_targeted_ids if ext_id in set(seg_df.loc[:,"external_id"])]
cns_targeted_ids = set(summary_df.index[summary_df['is_targeted'] == True].tolist())
cns_targeted = [f for f in cns_files if cns_file_id_dict[f] in cns_twist_ids.union(cns_tsca_ids)]


# Create list of WES .cns filepaths
# cns_wes_ids = set(summary_df[summary_df.loc[:,"dataset"] == "WES"].loc[:,"external_id"])
# cns_wes = [tabletempdir+ext_id+".cns" for ext_id in cns_wes_ids if ext_id in set(seg_df.loc[:,"external_id"])]
cns_wes_ids = set(summary_df.index[summary_df['dataset'] == 'WES'].tolist())
cns_wes = [f for f in cns_files if cns_file_id_dict[f] in cns_wes_ids]


## Copy number heat maps
Select the plot to display from the dropdown menu. There are plots for targeted samples alone, WES samples alone, and all samples combined. A desaturated version of the CN heatmap for all samples combined is included as well. These CN heatmaps were constructed using log2(Segment_Mean) values from the seg file. 

To look at any one sample in more detail, you can look either at the corresponding horizontal CN plot in the next section titled "Copy number horizontal plots" or at the CN seg file itself (see either the tables below or the TSV available at the link specified in the "Sample information and identifiers" section). The Segment_Mean value in the seg file itself is not transformed.

In [None]:
# Paths of the heatmap PNGs to show in the dropdown.
# NOTE(review): the cell that renders the heatmaps resets this list again - this initialisation looks redundant.
cn_heatmap_img_paths = []

In [None]:
def make_cn_heat_map(cns_files, participant, disease, sample_source = "targeted", title = None, desaturated = False):
    """Draw a CNVkit copy number heatmap for the given .cns files and save it as a PNG.

    Args:
        cns_files: paths of the .cns segment files, one heatmap row each.
        participant, disease, sample_source: used in the default title and
            (participant and sample_source) in the output file name.
        title: optional title overriding the default one.
        desaturated: draw the desaturated variant of the heatmap.

    The image is written to the working directory as
    "<participant>_<sample_source>_desat_<desaturated>.png". Matplotlib's global
    font size and figure size settings are changed as a side effect.
    """
    segments = [cnvlib.read(cns_path) for cns_path in cns_files]

    # Big fonts, and a figure that grows with the number of samples (3 per row, at least 6).
    plt.rcParams["font.size"] = 32
    plt.rcParams['figure.figsize'] = (32,max(6,len(segments)*3))

    use_desaturated = bool(desaturated)
    ax = cnvlib.do_heatmap(segments, do_desaturate=use_desaturated)

    if title is not None:
        ax.set_title(title)
    elif use_desaturated:
        ax.set_title("Desaturated copy number heatmap for {} {} ({}) samples".format(sample_source, participant, disease))
    else:
        ax.set_title("Copy number heatmap for {} {} ({}) samples".format(sample_source, participant, disease))

    plt.tight_layout()

    # Save the file locally so it can be shown through display_card_view
    name_parts = [participant, sample_source, "desat", str(desaturated)]
    outpath = "_".join(name_parts) + ".png"
    plt.savefig(outpath, bbox_inches='tight')

In [None]:
%%capture
cn_heatmap_img_paths = []
cns_list = [cns_files, cns_wes, cns_targeted, cns_twist, cns_tsca]
cns_type = ["all", "WES","targeted","TWIST","TSCA"]
cns_dict = dict(zip(cns_type, cns_list))
for key,val in cns_dict.items():
    desat = False
    make_cn_heat_map(cns_files = val, participant = participant, disease = disease, sample_source = key, desaturated = False)
#     # Make a desaturated version in addition when plotting all samples
#     if key == cns_files:
#         desat = True
#         make_cn_heat_map(cns_files = key, participant = participant, disease = disease, 
#                          sample_source = val, desaturated = False)

    cn_heatmap_img_paths += ["_".join([participant, key, "desat", str(desat)]) + ".png"]

**Note**: The copy number heatmaps will display all available samples given your selection from the dropdown menu. If there are no samples available, the heatmap will be a grey box.

In [None]:
# Dropdown of heatmaps: one entry per sample grouping in cns_type, each showing the PNG
# at the same position in cn_heatmap_img_paths.
display_card_view(labels = cns_type, images = [Image(i) for i in cn_heatmap_img_paths])

## Copy number horizontal plots

Select the copy number plot you would like to display from the dropdown menu. The dropdown menu includes CN plots from both targeted probe (TSCA and TWIST) and WES data. The source of the data will be displayed on the title of the image. You can also refer to the table of all external IDs that maps each external ID to the source of the data (see "Sample information and identifiers").

The dropdown menu also includes a merged version of the horizontal copy number maps. This PNG file contains all the horizontal CN plots for the participant in a single place for ease of quick comparison.

In [None]:
# TODO: the WES samples have the sample ID instead of the external ID, and I don't include the sample ID metadata anywhere in the report.
# extID_source_disease = [id+"("+summary_df[summary_df.loc[:,"external_id"] == id].loc[:,"dataset"]+", " + summary_df[summary_df.loc[:,"external_id"] == id].loc[:,"disease"]+")" for id in list(image_names)]

In [None]:
# Using Phil's code for dropdown menu in static HTML (display_card_view)
# Labels and images are both derived from the same dictionary so they always have the
# same length and order, even if two image files map to the same external ID.
image_names = list(ext_id_to_local_img.keys())
horizontal_plots = [Image(ext_id_to_local_img[name]) for name in image_names]

display_card_view(labels = image_names, images = horizontal_plots)

## Targeted CN table (seg files)
The targeted copy number table contains the seg files for all samples that underwent targeted sequencing. The segment mean in the table below represents the relative copy number. The relative copy number was calculated following this [GATK Somatic CNV calling tutorial](https://gatkforums.broadinstitute.org/gatk/discussion/9143/how-to-call-somatic-copy-number-variants-using-gatk4-cnv). The PON used when calculating the CNV information was created using all the normals from the same sequencing batch. This usually ranges from 8-11 normals for a batch of ~40 tumor samples.

Note that while the horizontal CN plots display the relative copy number, the CN heatmaps display the log2(relative copy number). No pseudocount was included because none of the relative copy number values are ever exactly 0.

In [None]:
def get_gs_file_meta(filepath):
    """Return key information about a gs:// file path.

    Returns:
        dict with
          "participant": name of the folder that directly contains the file,
          "filepath":    the path as given,
          "link":        the path with 'gs://' replaced by the Cloud Console browser URL.
    """
    containing_dir = os.path.dirname(filepath)
    console_url = re.sub('gs://', 'https://console.cloud.google.com/storage/browser/', filepath)

    meta = {}
    meta["participant"] = str(os.path.basename(containing_dir))
    meta["filepath"] = filepath
    meta["link"] = str(console_url)
    return meta

In [None]:
def display_table(file, file_table_dict = None, cols_to_use = None, index_col = None, forcefit = False):
    """Print a short header for a table and show it as an interactive table.

    Args:
        file: either a key of `file_table_dict` (the table's gs:// path) or a
            pandas DataFrame to display directly.
        file_table_dict: mapping of file path -> DataFrame, used when `file` is a path.
        cols_to_use: optional list of columns to display. If the table cannot be
            shown with these columns, the missing columns are reported and
            the table is shown unfiltered instead.
        index_col: column to use as the table index.
        forcefit: forwarded to make_interactive_table.

    Raises:
        Exception: if `file` is neither a DataFrame nor a key of `file_table_dict`.
    """
    if isinstance(file, pd.DataFrame):
        df = file
    elif isinstance(file, str) and file_table_dict is not None and file in file_table_dict:
        # Get the TSV from the dict
        df = file_table_dict[file]

        # Print key information about the file
        print("Participant: ", str(os.path.basename(os.path.dirname(file))))
        print("\nFilepath: "+ file)
        print("\nLink:", str(re.sub('gs://', 'https://console.cloud.google.com/storage/browser/', file)))
    else:
        raise Exception("The function expected a pandas dataframe as input, but got a string that wasn't a path to a table: ", str(file))

    print("\nTo examine data for a particular external ID, filter the 'external_id' column.")

    # Make and display an interactive table
    try:
        make_interactive_table(df, cols_to_include = cols_to_use, index_col_name = index_col, forcefit=forcefit)
    except Exception:
        # Without a column selection there is nothing to relax, so a retry would
        # fail the same way: let the original error surface.
        if cols_to_use is None:
            raise

        print("These are the columns in common between cols_to_use and the df: ", set(cols_to_use).intersection(set(df.columns.tolist())))
        print("These are the columns in common in cols_to_use that aren't in the df: ", set(cols_to_use)- set(df.columns.tolist()))

        print("\nAt least one of this participant's mutation file is in a different format from the output of the newest pipeline. This data may be old, and have different column names. No filtering is performed on the displayed table, but you can add additional filters if desired.")
        make_interactive_table(df, cols_to_include = None, index_col_name = index_col, forcefit=forcefit)


In [None]:
# Participant: {{tsca_meta['participant']}}

# Filepath: {{tsca_meta['filepath']}}

# # TODO: get rid of extra apostrophes
# Link: {{tsca_meta['link']}}
    
# To examine data for a particular external ID, filter the 'external_id' column.

In [None]:
## TODO: delete / start work
# Scratch cell: try to combine the first TSCA and first TWIST copy number tables.
# A table counts as available only if gsutil listed something and that listing
# is not a 'CommandException' line.
tsca_cn_available = len(tsca_cn) > 0 and 'CommandException' not in tsca_cn[0]
twist_cn_available = len(twist_cn) > 0 and 'CommandException' not in twist_cn[0]

if tsca_cn_available:
    tmp = pd.read_csv(tsca_cn[0], sep="\t", index_col = 'external_id')

if twist_cn_available:
    tmp2 = pd.read_csv(twist_cn[0], sep="\t", index_col = 'external_id')

# Only attempt the merge when both inputs exist, instead of relying on a bare
# `except` to hide the resulting NameError.
merged = None
if tsca_cn_available and twist_cn_available:
    try:
        merged = pd.merge(tmp, tmp2, how='left')
    except Exception:
        # Best effort: an incompatible pair of tables leaves `merged` as None.
        merged = None

In [None]:
# Show the first listed TSCA copy number table, restricted to the CN display columns.
# NOTE(review): only tsca_cn is shown under this "targeted" heading - confirm whether the TWIST table should be shown too.
display_table(file = tsca_cn[0], file_table_dict = tsca_cn_dict, cols_to_use = cn_col_names, index_col = "external_id")

## WES CN table  (seg files)
The WES copy number table contains the seg files for all samples that underwent WES sequencing. The segment mean in the table below represents the relative copy number, as calculated using __(TODO:)__

Note that while the horizontal CN plots display the relative copy number, the CN heatmaps display the log2(relative copy number). No pseudocount was included because none of the relative copy number values are ever exactly 0.

In [None]:
# Show the first listed WES copy number table, restricted to the CN display columns.
display_table(file = wes_cn[0],file_table_dict = wes_cn_dict, cols_to_use = cn_col_names, index_col = "external_id")

***
***

# Mutation data

In [None]:
# # Check difference between lists including duplicates
# from collections import Counter 
  
# # initializing lists 
# test_list1 = mut_col_names_targeted
# test_list2 = list(set(mut_col_names_targeted))
  
# # printing original lists 
# print("The original list 1 : " + str(test_list1)) 
# print("The original list 2 : " + str(test_list2)) 
  
# # Using collections.Counter() 
# # Difference of list including duplicates 
# res = list((Counter(test_list1) - Counter(test_list2)).elements()) 
  
# # print result 
# print("The list after performing the subtraction : " + str(res))

In [None]:
# Get the mutation TSVs
# Each `!` assignment captures the command's output lines as a list.
tsca_mut = ! gsutil ls -r {path}**TSCA_mutation.tsv
twist_mut = ! gsutil ls -r {path}**TWIST_mutation.tsv
wes_mut = ! gsutil ls -r {path}**WES_mutation.tsv

# Create dictionary with filepaths as keys and pandas DF as the values
# Lines containing 'CommandException' are skipped - presumably the message gsutil
# prints when a pattern matches nothing.
tsca_mut_dict = {f:pd.read_csv(f, sep="\t", index_col = False) for f in tsca_mut if 'CommandException' not in f}
twist_mut_dict = {f:pd.read_csv(f, sep="\t", index_col = False) for f in twist_mut if 'CommandException' not in f}
wes_mut_dict = {f:pd.read_csv(f, sep="\t", index_col = False) for f in wes_mut if 'CommandException' not in f}

# Mutation TSV columns to display in tables
# The three lists below share the same core columns and differ only in whether
# 'data source' is included and where 'condition' is placed.
# mut_col_names_targeted = mut_col_names_wes plus 'data source' and 'condition'
# NOTE(review): mut_col_names_wes already contains 'condition'; the targeted list only moves it to the front.
mut_col_names_targeted = ['data source', 'condition', 'Hugo_Symbol', 'Protein_Change','Variant_Classification', 'Variant_Type', 'tumor_f', 't_alt_count', 't_ref_count', 'COSMIC_total_alterations_in_gene',
                 'CGC_Tumor_Types_Somatic', 'CGC_Tumor_Types_Germline',
                 'Matched_Norm_Sample_Barcode','Chromosome', 'Start_position', 'End_position','Genome_Change','keep']

# mut_col_names = mut_col_names_wes plus 'data source'
mut_col_names = ['data source', 'Hugo_Symbol', 'Protein_Change','Variant_Classification', 'Variant_Type', 'tumor_f', 't_alt_count', 't_ref_count', 'COSMIC_total_alterations_in_gene',
                 'CGC_Tumor_Types_Somatic', 'CGC_Tumor_Types_Germline',
                 'Matched_Norm_Sample_Barcode','condition','Chromosome', 'Start_position', 'End_position','Genome_Change','keep']

# Columns shown for WES mutation tables (no 'data source' column)
mut_col_names_wes = [ 'Hugo_Symbol', 'Protein_Change','Variant_Classification', 'Variant_Type', 'tumor_f', 't_alt_count', 't_ref_count', 'COSMIC_total_alterations_in_gene',
                 'CGC_Tumor_Types_Somatic', 'CGC_Tumor_Types_Germline',
                 'Matched_Norm_Sample_Barcode','condition','Chromosome', 'Start_position', 'End_position','Genome_Change','keep']

Below are interactive tables containing *select* mutation information from the targeted probe data and the WES data. If there were multiple external IDs in either dataset, they have been combined into one table. The external_id column can be used to filter the data so that only the mutations for a single external ID are displayed.

Note that this report only includes samples from the targeted data that pass the depth of coverage QC. Samples that did not pass this QC are not included in this report, and their data is not included in the Google bucket. A list of the samples that failed this QC is included earlier in this document (see the section "Samples that failed the depth of coverage QC").

Also, note that the below tables have been filtered such that the keep column equals True. What this means is that only the variants that passed the filtering steps in the pipeline are included in the tables below. However, the raw mutation TSVs included in the Google bucket contain all the variants regardless of whether keep is True or False if you are interested in that information.

Generally speaking, if you are looking for more detailed information about why a mutation you expected to see was filtered out or if you want to get access to all of the columns available in the mutation TSV rather than the ones selected here, you can download the raw mutation TSV from the Google bucket. The full TSV contains some boolean columns that, when combined, explain the logic behind whether a variant passes our filters. 

For targeted and WES data, we use the filters built into Mutect1 (GATK v3) and Mutect2 (GATK v4) together with additional filtering logic. For example, we rescue known TCGA hotspots and COSMIC mutations even if Mutect1 or Mutect2 would have filtered these out.

<!-- TODO: I think I should probably add more information about the filters here? I asked Neekesh, and he doesn't think this is necessary. -->

## Targeted mutation table
The targeted mutation table contains select columns from the MAF files for each sample that underwent targeted sequencing. The PON used was generated using all the normals we had at the time for the targeted sequencing technology. For example, if we had 45 normals from all the TWIST sequencing so far we would have a PON with 45 normals. This differs from the PON used for CNVs.

In [None]:
# make_interactive_table(pd.read_csv(tsca_mut[0], sep = "\t", index_col = False), cols_to_include = mut_col_names_targeted, index_col_name = 'external_id')

In [None]:
# Show the first listed TSCA mutation table with the targeted column selection.
# NOTE(review): twist_mut is not displayed under this "targeted" heading - confirm whether it should be.
display_table(file = tsca_mut[0],file_table_dict = tsca_mut_dict, cols_to_use = mut_col_names_targeted, index_col = "external_id")

## WES mutation table

In [None]:
# Show the first listed WES mutation table with the WES column selection.
display_table(file = wes_mut[0],file_table_dict = wes_mut_dict, cols_to_use = mut_col_names_wes, index_col = "external_id",  forcefit=False)

## Summary information about mutations
The goal here is to answer a series of common questions. For example:

1. How many of the samples have a given gene + protein change?
2. For the samples that share a given gene + protein change, what are the allele fractions?
3. Do we see any gene + protein changes that are not present in all samples? Any changes that are present in cell lines but not the primary tumor?

It's harder to answer questions comparing T vs N because we use the N samples in the variant filtering steps (built into Mutect1/2, additional scripts after calling with these GATK tools).
It's also tricky to compare between WES and targeted sequencing samples that are older. These samples don't have consistent column names, and some may not even have the protein change reported at all.

In [None]:
# Columns needed to compare gene + protein changes across samples
columns_of_interest = ['Protein_Change', 'external_id', 'data source', 'keep', 'Hugo_Symbol']

# Load the first listed TSCA mutation table and keep only those columns
tsca_mutations_all = pd.read_csv(tsca_mut[0], sep = "\t", index_col = False)
mut_summary = tsca_mutations_all[columns_of_interest]

# Restrict to the rows whose 'keep' column equals True
if 'keep' in columns_of_interest:
    passed_filters = mut_summary['keep'] == True
    mut_summary = mut_summary.loc[passed_filters]

mut_summary
# display_table(file = mut_summary,  forcefit=False)

In [None]:
# TODO: Now we should delete all the data from the local temporary directories
# aka everything except for the folders in: tempdir (tempdir='../temp/cclfreport/') or even just in '../temp/' if we're truly done for the moment.

# TODO: maybe include summary info about the mutation file, like number of mutations per sample? IDK. this would be better if could select dropdowns, like summary stats on either per gene or per external ID or the combo