In [None]:
from __future__ import print_function
import os.path
# import os
import dalmatian as dm
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, '../../JKBio/')
import TerraFunction as terra
import CCLF_processing
%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
from IPython.display import Image, display, HTML
import ipdb

In [None]:
import qgrid # interactive tables
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import gcsfs # to be able to read in files from GCS in Python
import re # used for regex

# # Extra options
# pd.options.display.max_rows = 30
# pd.options.display.max_columns = 25
# Notebook-wide qgrid default: set the 'maxVisibleRows' grid option to 10 for every table shown below.
qgrid.set_grid_option('maxVisibleRows', 10)

# # Show all code cells outputs
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = 'all'

In [None]:
# Remember the directory the notebook was started from; later cells call
# os.chdir(cwd) to come back here after temporarily changing directory.
cwd = os.getcwd()
print(cwd)

***
***

# Pretty report generation
After grabbing and making all of the files we want for a given participant (e.g. PEDS182), we want to make a pretty, interactive report. This will be similar to a README except that we will directly embed tables and images. This involves using Jupyter widgets to create dropdown menus and the like. Here are the main functionalities I'd like:

1. kable-like tables that are interactive: sorting, filtering, typing in text or numbers to search, (ability to download sorted/filtered table as a CSV?)
2. ability to quickly go to any image in the directory. I want this so that the user can quickly look through the copy number maps (horizontal plots). Ideally, I'd like to be able to select which one(s) I'd like to view. This could be useful if they want to see two or more at once (i.e. to compare two treatment conditions).

## Automate generation of separate Jupyter notebook for each participant
To do this, we will use Papermill. Papermill automates notebook to notebook generation, and also executes the generated notebook. We may also want to convert the generated notebook to HTML. We can use *nbconvert* for this operation (see https://github.com/jupyter/nbconvert).

## Note: I may want to show the conversion between gs:// and https://console.cloud.google.com/storage/browser/ so that people who are not comfortable with using terminal will be able to easily browse and download the data in the Google bucket. I just need to make sure we won't have privacy issues (we shouldn't, right?)

In [None]:
# path would be the participant-specific path OR the path to the directory for the full analysis
path = "gs://cclf_results/targeted/neekesh_201912/" 
# a list of file paths for the selected participant
filepaths = ! gsutil ls -r {path}**

# get all the tables in the bucket
table_filepaths = ! gsutil ls -r {path}*.txt # check: will this search recursively for all .txt files?
to_add = ! gsutil ls -r {path}**.tsv
table_filepaths += to_add
# get all the pngs in the bucket
img_filepaths = ! gsutil ls -r {path}**.png

# copy all the pngs in the bucket to a tmp folder
# TODO: need to delete the files afterwards
tempdir='../temp/cclfreport/images/'
! gsutil cp -r {path}**.png {tempdir} # copy images from google bucket to local temp folder
# local_img_filepaths = ! ls {tempdir}*.png
local_img_filepaths = ['../temp/cclfreport/images/'+os.path.basename(i) for i in img_filepaths]
print(local_img_filepaths)
os.chdir(tempdir)
# local_img_file_names = ! ls **.png # list of all pngs in tempdir
local_img_file_names = [os.path.basename(i) for i in local_img_filepaths]
os.chdir(cwd)

In [None]:
# TO DELETE
# Sanity check: the first five local paths and the first five bare file names
# should line up entry by entry.
for preview in (local_img_filepaths[:5], local_img_file_names[:5]):
    display(preview)

In [None]:
def _load_table(filepath, cols_to_include=None, first_col_as_index=False):
    """Read a tab-separated table and optionally subset it.

    Parameters
    ----------
    filepath : str
        Path of a single tab-separated file readable by ``pd.read_csv``.
    cols_to_include : list of str, optional
        Columns to keep. When the list contains ``'keep'``, only rows whose
        ``keep`` value equals True are retained.
    first_col_as_index : bool
        When True, the first column of the file becomes the index before any
        column subsetting happens.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    TypeError
        If ``filepath`` is not a string.
    KeyError
        If a requested column is absent from the table (raised by pandas).
    """
    if not isinstance(filepath, str):
        raise TypeError("The function expected a str filepath as input, but got: " + str(type(filepath)))
    data = pd.read_csv(filepath, sep='\t')
    if first_col_as_index:
        data = data.set_index(data.columns[0], drop=True)
    if cols_to_include is not None:
        # subset the data
        data = data[cols_to_include]
        if 'keep' in cols_to_include:
            # `== True` (rather than plain boolean indexing) also copes with a
            # `keep` column that is not a clean boolean dtype.
            data = data.loc[data['keep'] == True]
    return data


def _show_grid(data):
    """Render ``data`` as a qgrid widget with fixed-width columns, followed by a blank line."""
    qgrid_widget = qgrid.show_grid(data, show_toolbar=False, grid_options = {'forceFitColumns': False,
    'defaultColumnWidth': 150})
    display(qgrid_widget)
    print("\n")


def make_interactive_table(filepath, cols_to_include = None): # assuming single filepath
    """Display one TSV as an interactive table, using its first column as the index.

    See ``_load_table`` for the meaning of ``cols_to_include`` and the errors raised.
    """
    _show_grid(_load_table(filepath, cols_to_include, first_col_as_index=True))


def make_interactive_table_orig(filepath, cols_to_include = None): # assuming single filepath
    """Display one TSV as an interactive table, keeping the default integer index.

    See ``_load_table`` for the meaning of ``cols_to_include`` and the errors raised.
    """
    _show_grid(_load_table(filepath, cols_to_include, first_col_as_index=False))
    
# def tmp(filepaths): # assuming single filepath
#     for filepath in filepaths:
#         print("Table: "+filepath)
#         data = pd.read_csv(filepath, sep='\t')
#         qgrid_widget = qgrid.show_grid(data, show_toolbar=False, grid_options = {'forceFitColumns': False,
#         'defaultColumnWidth': 150})
#         display(qgrid_widget)
#         print("\n")

# Sample information and identifiers
This section details the external IDs for all the samples we discovered when searching the existing targeted probe data and WES data.

In [None]:
# List, anywhere under `path`, the tables of external IDs (`all_external_ids.tsv`)
# and the tables of external IDs that failed QC (`all_failed_external_ids.tsv`).
# Each capture is a list of output lines, one object path per line.
all_external_ids = ! gsutil ls -r {path}**all_external_ids.tsv
all_failed_external_ids = ! gsutil ls -r {path}**all_failed_external_ids.tsv
# check: should I make them interactive??

## Table: all external IDs & associated metadata
The table below is sortable and filterable. Double-click a cell to copy its contents, for example to copy the link to the file in the Google storage console.

In [None]:
# Instead of one interactive table per participant, combine every participant's
# table into a single frame with extra columns saying where each row came from.
# This makes for less waiting and clicking overall.

def _read_with_provenance(filepath):
    """Read one external-ID TSV and tag every row with its origin.

    The participant is taken from the name of the folder containing the file and
    the disease from that folder's parent; `filepath` holds the containing folder
    and `link` is the same folder as a Google Cloud console URL.
    """
    participant_dir = os.path.dirname(filepath)
    table = pd.read_csv(filepath, sep='\t')
    table['participant'] = str(os.path.basename(participant_dir))
    table['disease'] = str(os.path.basename(os.path.dirname(participant_dir)))
    table['filepath'] = str(participant_dir)
    table['link'] = str(re.sub('gs://', 'https://console.cloud.google.com/storage/browser/', participant_dir))
    return table

# The captured listing can contain lines that are not object paths (e.g. a gsutil
# message when nothing matched); keep only the files we asked for.
external_id_files = [f for f in all_external_ids if f.endswith('all_external_ids.tsv')]
if not external_id_files:
    raise ValueError("No all_external_ids.tsv files were found; cannot build the external ID table.")

# Read everything first and concatenate once, rather than growing the frame file by file.
df1 = pd.concat([_read_with_provenance(f) for f in external_id_files], ignore_index=True)
df1 = df1.set_index('participant', drop=True)

# print some summary information
print("We found a total of", df1.shape[0],"external IDs that passed the depth of coverage QC.")

# allow for filtering
qgrid_widget = qgrid.show_grid(df1, show_toolbar=False, grid_options = {'forceFitColumns': False})
display(qgrid_widget)

## Samples that failed the depth of coverage QC
This summary details all the external IDs of each sample that failed the depth of coverage QC in the targeted probe pipeline. The depth of coverage QC in the targeted probe pipeline requires that the average gene-level or interval-level coverage is >=50x.

The summary also lists the participants for which no samples failed the depth of coverage QC.

In [None]:
# Summarise, per participant, which external IDs failed the depth of coverage QC.
# Participants whose failed-ID table has no rows are collected and listed at the end.
no_failed = []
for filepath in all_failed_external_ids:
    failed_df = pd.read_csv(filepath, sep='\t')
    # the participant is the name of the folder that contains the file
    participant_name = str(os.path.basename(os.path.dirname(filepath)))
    n_failed = failed_df.shape[0]
    if n_failed == 0:
        no_failed.append(participant_name)
        continue
    if n_failed == 1:
        print("There was", str(n_failed), "failed sample for participant", participant_name,":")
    else:
        print("There were", str(n_failed), "failed samples for participant", participant_name,":")
    # List the failed IDs (first column of the table) for one OR several failures;
    # previously the IDs were only shown when exactly one sample had failed.
    display(sorted(failed_df.iloc[:,0].tolist()))

# Only print the heading when there is at least one such participant.
if no_failed:
    print("There were no failed samples for participant(s):")
    display(sorted(no_failed))

# Copy number data

## Copy number heat maps
There are two plots in this section, one for CN data from the targeted probe data and a second for CN data from WES data. To look at any one sample in more detail, you can look either at the corresponding horizontal CN plot in the next section titled "Copy number horizontal plots" or look at the CN table (see either the tables below or the TSV available at the link specified in the "Sample information and identifiers" section).

These tables are searchable and filterable.

### Targeted CN heat map

In [None]:
# create a heat map specific to the samples requested (either per participant basis or per list of participants) using the plotSomaticCNV workflow in Terra
# Steps: create a new sample set with the appropriate samples, submit a new job, wait for it to finish, copy the picture to the temp dir (and add it to the list of local files), then display it here

### WES CN heat map

In [None]:
# what's the best way to create a CN heat map for the WES samples? create just using the segmented CN tsv I pull in from Terra? create new workflow?

## Copy number horizontal plots

Select the copy number plot you would like to display from the dropdown menu. The dropdown menu includes CN plots from both targeted probe (TSCA and TWIST) and WES data. The source of the data will be displayed on the title of the image. You can also refer to the table of all external IDs that maps each external ID to the source of the data (see "Sample information and identifiers").

The dropdown menu also includes merged copy number maps for each participant. This PNG file contains all the horizontal CN plots for a given participant in a single place for ease of quick comparison.

**check:** can I add a linked reference to this table so that they can quickly jump there? Might be best to just make it its own section so that it shows up in the TOC.

<!-- Note that to get nice dropdown menu names, I'm changing directories for now. There's probably a better way to do this. -->

In [None]:
# Make the local image folder (tempdir) the working directory before the image picker cell runs.
os.chdir(tempdir)

In [None]:
# TODO: this isn't the most helpful format / layout currently. Make it possible to select by WES vs Targeted, and select by participant as well. I don't currently know how to do this.
# select image to display from dropdown menu
# Each option is a (label, value) pair: the label is the bare file name and the
# value is an absolute path. The local paths are relative to the notebook's
# starting directory (`cwd`), so anchoring them there keeps every selection
# working whatever the current working directory is when the dropdown changes.
image_options = [(os.path.basename(p), os.path.normpath(os.path.join(cwd, p)))
                 for p in local_img_filepaths]

@interact
def show_images(file=image_options):
    """Display the PNG chosen in the dropdown.

    `file` is the path of the image to show; only its base name is printed.
    """
    print("File name:", os.path.basename(file))
    display(Image(file))

In [None]:
## must change back to the main directory
# (`cwd` holds the directory recorded with os.getcwd() near the top of the notebook)
os.chdir(cwd)

In [None]:
# get the CN tables from the Google storage bucket
tsca_cn = ! gsutil ls -r {path}**copy_number.tsv
wes_cn = ! gsutil ls -r {path}**wes_copy_number.tsv

## Targeted CN table
Select from the dropdown menu to get the targeted CN table for each participant.

In [None]:
# Dropdown over the targeted CN tables; picking an entry re-renders the table below it.
@interact
def show_tables(file=tsca_cn):
    """Print where the selected CN table lives, then show its CN columns interactively."""
    participant_dir = os.path.dirname(file)
    console_url = re.sub('gs://', 'https://console.cloud.google.com/storage/browser/', file)
    print("Participant: ", str(os.path.basename(participant_dir)))
    print("Filepath: "+ file)
    print("Link:", str(console_url))
    make_interactive_table_orig(
        file,
        cols_to_include = [
            'external_id', 'Sample', 'condition',
            'Chromosome', 'Start', 'End',
            'Segment_Mean', 'Segment_Call', 'Num_Probes',
        ],
    )
    
    
    

## WES CN table
Select from the dropdown menu to get the WES CN table for each participant, when available. The TSV will contain the data for all the different external IDs.

In [None]:
# this code allows for the display of interactive tables with a dropdown menu to switch between participants
@interact
def show_tables(file=wes_cn):
    """Print where the selected WES CN table lives, then show its CN columns interactively.

    The table is rendered once per selection (it used to be rendered twice).
    """
    cn_col_names = ['external_id', 'Sample', 'condition','Chromosome', 'Start', 'End','Segment_Mean', 'Segment_Call', 'Num_Probes']
    print("Participant: ", str(os.path.basename(os.path.dirname(file))))
    print("Filepath: "+ file)
    print("Link:", str(re.sub('gs://', 'https://console.cloud.google.com/storage/browser/', file)))
    make_interactive_table_orig(file, cols_to_include = cn_col_names)
    

## Maybe think about including mutations found in targeted that WEREN'T found in WES. Or, alternatively, just plot a venn diagram. I can only do this for samples where we have both WES and Targeted data - this shouldn't be difficult to figure out.

# Mutation data

Below are interactive tables containing *select* mutation information from the targeted probe data and the WES data. If there were multiple external IDs in either dataset, they have been combined into one table. The external_id column can be used to filter the data so that only the mutations for a single external ID are displayed.

Note that this report only includes samples from the targeted data that pass the depth of coverage QC. Samples that did not pass this QC are not included in this report, and their data is not included in the Google bucket. A list of the samples that failed this QC is included earlier in this document (see the section "Samples that failed the depth of coverage QC").

Also, note that the below tables have been filtered such that the keep column equals True. What this means is that only the variants that passed the filtering steps in the pipeline are included in the tables below. However, the raw mutation TSVs included in the Google bucket contain all the variants regardless of whether keep is True or False if you are interested in that information. This TSV will also contain columns explaining why a mutation was removed during filtration.

Generally speaking, if you are looking for more detailed information about why a mutation you expected to see was filtered out or if you want to get access to all of the columns available in the mutation TSV rather than the ones selected here, you can download the raw mutation TSV from the Google bucket.

In [None]:
# List the mutation tables anywhere under `path`: names ending in `mutation.tsv`
# feed the targeted dropdown, names ending in `wes_mutations.tsv` feed the WES one.
# NOTE(review): the two patterns differ in singular/plural ("mutation" vs
# "mutations"), so neither listing picks up the other's files -- confirm these
# are the real file names in the bucket.
tsca_mut = ! gsutil ls -r {path}**mutation.tsv
wes_mut = ! gsutil ls -r {path}**wes_mutations.tsv

## Targeted mutation table

In [None]:
# Dropdown over the targeted mutation tables; picking an entry re-renders the table below it.
@interact
def show_tables(file=tsca_mut):
    """Print where the selected mutation table lives, then show selected columns interactively.

    'keep' is among the requested columns, so the table helper only shows rows
    whose keep value is True.
    """
    participant_dir = os.path.dirname(file)
    console_url = re.sub('gs://', 'https://console.cloud.google.com/storage/browser/', file)
    print("Participant: ", str(os.path.basename(participant_dir)))
    print("Filepath: "+ file)
    print("Link:", str(console_url))
    make_interactive_table_orig(
        file,
        cols_to_include = [
            'external_id', 'Genome_Change', 'Protein_Change',
            'Variant_Classification', 'Variant_Type',
            'tumor_f', 't_alt_count', 't_ref_count',
            'COSMIC_total_alterations_in_gene',
            'CGC_Tumor_Types_Somatic', 'CGC_Tumor_Types_Germline',
            'Hugo_Symbol', 'Matched_Norm_Sample_Barcode', 'condition',
            'Chromosome', 'Start_position', 'End_position', 'keep',
        ],
    )
    

## WES mutation table

In [None]:
# this code allows for the display of interactive tables with a dropdown menu to switch between participants
@interact
def show_tables(file=wes_mut):
    """Print where the selected WES mutation table lives, then show it interactively.

    The curated column subset is tried first. If the file lacks any of those
    columns (pandas raises KeyError), the whole table is shown unfiltered
    instead. Any other error is left to propagate so real problems are not
    hidden behind the "old format" message.
    """
    mut_col_names = ['external_id', 'Genome_Change', 'Protein_Change','Variant_Classification', 'Variant_Type', 'tumor_f', 't_alt_count', 't_ref_count', 'COSMIC_total_alterations_in_gene',
                     'CGC_Tumor_Types_Somatic', 'CGC_Tumor_Types_Germline',
                     'Hugo_Symbol','Matched_Norm_Sample_Barcode','condition','Chromosome', 'Start_position', 'End_position','keep']
    print("Participant: ", str(os.path.basename(os.path.dirname(file))))
    print("Filepath: "+ file)
    print("Link:", str(re.sub('gs://', 'https://console.cloud.google.com/storage/browser/', file)))
    try:
        make_interactive_table_orig(file, cols_to_include = mut_col_names)
    except KeyError:
        print("\nThis participant's mutation file is in a different format from the output of the newest pipeline. This data may be old.")
        make_interactive_table_orig(file)
