# Produce CCLF report with all information for each specified cell line

## Acquire / produce all the data for mutations and copy number
Pull from CCLF_WES and the most recently updated TSCA workspace. We are currently transitioning to CCLF_targeted.

In [1]:
from __future__ import print_function
import os.path
# import os
import dalmatian as dm
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, '../../JKBio/')
import TerraFunction as terra
import CCLF_processing
%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
from IPython.display import Image, display, HTML
import ipdb

In [2]:
## widgets
# !pip install -U -q ipywidgets
# !jupyter nbextension enable --py widgetsnbextension

## qgrid for interactive plots
# !pip install qgrid
# !jupyter nbextension enable --py --sys-prefix qgrid

In [2]:
import qgrid # interactive tables
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import gcsfs # to be able to read in files from GCS in Python

# # Extra options
# pd.options.display.max_rows = 30
# pd.options.display.max_columns = 25
# Set qgrid's 'maxVisibleRows' grid option to 10 (applied through qgrid's
# module-level option setter rather than per table).
qgrid.set_grid_option('maxVisibleRows', 10)

# # Show all code cells outputs
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# Remember the directory the notebook starts in; a later cell uses `cwd`
# to change back to it.
cwd = os.path.abspath(os.curdir)
print(cwd)

/Users/gmiller/Documents/Work/GitHub/ccle_processing/ccle_tasks


In [4]:
# Samples listed under the "both" group.
specificSamples_both = [
    "CCLF_PEDS1012",
    "PEDS172",
    "PEDS182",
    "PEDS196",
    "PEDS204",
]
# Samples listed under the "only WES" group.
specificSamples_onlyWES = [
    "PEDS012",
    "PEDS018",
    "PEDS110",
    "PEDS117",
]
# Full sample list: the "both" group followed by the "only WES" group.
specificSamples = list(specificSamples_both)
specificSamples.extend(specificSamples_onlyWES)

In [5]:
# Path to the sample/disease info CSV; it is handed to
# CCLF_processing.getReport as `specificlist_disease`.
# The relative location is resolved to an absolute path at run time instead of
# hardcoding one user's home directory. When the notebook is run from
# ccle_processing/ccle_tasks this resolves to the same file as before.
df = os.path.abspath('../../ccle_processing/ccle_tasks/data/kim_sept/kim_sample_disease_info.csv')

In [None]:
# Gather all the existing files (currently a single-participant run written
# to the "test" results folder).
CCLF_processing.getReport(
    datadir="gs://cclf_results/targeted/test/",
    specificlist=["PEDS172"],
    specificlist_disease=df,
)
# Variant covering every sample in `specificSamples`:
# CCLF_processing.getReport(datadir = "gs://cclf_results/targeted/kim_sept_6/", specificlist = specificSamples, specificlist_disease=df)

We want to create heat-map-style copy number plots for each participant. For each participant, we want all the culture conditions, the primary tissue, and the matched normal that exist to be shown side by side.

We might have to make separate CN heat maps for TSCA vs. WES samples, because we can't create a sample set containing both: they are in separate workspaces... or at least I think this is problematic. But maybe there's a workaround.

* step 1: create sample set for each participant (add each sample_id to a sample set list?)
   
* step 2: create submission for each participant to generate the CN heat map
    + Terra.waitForSubmission needed before step 3
    + try/except style?
* step 3: copy the image from the workspace into the output location

In [None]:
# create heat map style copy number plots for each participant
# want to have all the culture conditions, primary tissue, matched normal that exist side by side

# step 1: create sample set for each participant (add each sample_id to a sample set list?)
# step 2: create submission for each participant to generate the CN heat map
# - Terra.waitForSubmission needed before step 3
# - try/except style?
# step 3: copy the image from the workspace into the output location

In [None]:
# ! gsutil -m rm -r 'gs://cclf_results/targeted/kim_sept_2/'


***
***

# Pretty report generation
After grabbing and making all of the files we want for a given participant (e.g. PEDS182), we want to make a pretty, interactive report. This will be similar to a README except that we will directly embed tables and images. This involves using Jupyter widgets to create dropdown menus and the like. Here are the main functionalities I'd like:

1. kable-like tables that are interactive: sorting, filtering, typing in text or numbers to search, (ability to download sorted/filtered table as a CSV?)
2. ability to quickly go to any image in the directory. I want this so that the user can quickly look through the copy number maps (horizontal plots). Ideally, I'd like to be able to select which one(s) I'd like to view. This could be useful if they want to see two or more at once (i.e. to compare two treatment conditions).

## Automate generation of separate Jupyter notebook for each participant
To do this, we will use Papermill. Papermill automates notebook to notebook generation, and also executes the generated notebook. We may also want to convert the generated notebook to HTML. We can use *nbconvert* for this operation (see https://github.com/jupyter/nbconvert).

In [55]:
# path would be the participant-specific path
path = "gs://cclf_results/targeted/kim_sept_6/Alveolar_Rhabdomyosarcoma/PEDS172/" 
# a list of file paths for the selected participant
filepaths = ! gsutil ls -r {path}**

# get all the tables in the bucket
table_filepaths = ! gsutil ls -r {path}*.txt # check: will this search recursively for all .txt files?
to_add = ! gsutil ls -r {path}*.tsv
table_filepaths += to_add
# get all the pngs in the bucket
img_filepaths = ! gsutil ls -r {path}*.png

# copy all the pngs in the bucket to a tmp folder
tempdir='./temp/cclfreport/images/'
! gsutil cp -r {path}*.png {tempdir} # copy images from google bucket to local temp folder
local_img_filepaths = ! ls {tempdir}*.png
os.chdir(tempdir)
local_img_file_names = ! ls *.png # list of all pngs in tempdir
os.chdir('../../../')

Copying gs://cclf_results/targeted/kim_sept_6/Alveolar_Rhabdomyosarcoma/PEDS172/merged_copy_number_map.png...
- [1 files][  1.2 MiB/  1.2 MiB]                                                
Operation completed over 1 objects/1.2 MiB.                                      


In [57]:
print(local_img_filepaths)
print(local_img_file_names)

['./temp/cclfreport/images/merged_copy_number_map.png', './temp/cclfreport/images/test.png']
['merged_copy_number_map.png', 'test.png']


In [7]:
def make_interactive_table(filepath): # a single path, or a list whose first entry is used
    """Show one table as an interactive qgrid widget.

    Parameters
    ----------
    filepath : str or list of str
        Location of the table. The notebook passes the list captured from a
        `gsutil ls` command; only its first entry is displayed. A plain string
        is used as the path itself.

    Returns
    -------
    None
        The widget is rendered with `display`; nothing is returned.
    """
    # Normalise the argument to a list of non-empty candidate paths. A bare
    # string must not be indexed directly: `filepath[0]` would then be its
    # first character rather than the path.
    if filepath is None:
        candidates = []
    elif isinstance(filepath, str):
        candidates = [filepath]
    else:
        candidates = list(filepath)
    candidates = [p for p in candidates if p]

    if not candidates:
        # Nothing to show: report it and carry on instead of raising IndexError,
        # so a loop over several tables can continue with the next one.
        print("Table: no file found, skipping")
        print("\n")
        return

    table_path = candidates[0]
    print("Table: "+table_path)
    data = pd.read_table(table_path)
    qgrid_widget = qgrid.show_grid(data, show_toolbar=True, grid_options = {'forceFitColumns': False,
    'defaultColumnWidth': 150})
    display(qgrid_widget)
    print("\n")

In [8]:
tsca_cn = ! gsutil ls -r {path}*copy_number.tsv
wes_cn = ! gsutil ls -r {path}*wes_copy_number.tsv

In [9]:
# Render the TSCA copy-number table first, then the WES one.
for cn_table_paths in (tsca_cn, wes_cn):
    make_interactive_table(cn_table_paths)

Table: gs://cclf_results/targeted/kim_sept_6/Alveolar_Rhabdomyosarcoma/PEDS172/copy_number.tsv


QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…



Table: gs://cclf_results/targeted/kim_sept_6/Alveolar_Rhabdomyosarcoma/PEDS172/wes_copy_number.tsv


QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…





In [10]:
# List the mutation tables under `path`; each variable holds the lines printed
# by the `gsutil ls` command.
# The two patterns do not overlap: the first ends in "mutation.tsv" (singular),
# the second in "wes_mutations.tsv" (plural).
tsca_mut = ! gsutil ls -r {path}*mutation.tsv
wes_mut = ! gsutil ls -r {path}*wes_mutations.tsv

In [11]:
# Render the TSCA mutation table first, then the WES one.
for mut_table_paths in (tsca_mut, wes_mut):
    make_interactive_table(mut_table_paths)

Table: gs://cclf_results/targeted/kim_sept_6/Alveolar_Rhabdomyosarcoma/PEDS172/mutation.tsv


QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…



Table: gs://cclf_results/targeted/kim_sept_6/Alveolar_Rhabdomyosarcoma/PEDS172/wes_mutations.tsv


QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…





In [None]:
# data = pd.read_table(table_filepaths[4])
# qgrid_widget = qgrid.show_grid(data, show_toolbar=True)
# qgrid_widget

In [14]:
# select table to display from dropdown menu
## the interactive qgrid tables don't work inside `interact`, unfortunately,
## so the table is shown with the regular DataFrame display instead.
@interact
def show_tables(file=table_filepaths):
    """Read the table picked in the dropdown and display it.

    `file` is one entry of `table_filepaths`; `interact` builds the dropdown
    from that list and calls this function with the selected path.
    """
    print(file)
    data = pd.read_table(file)
    display(data)

interactive(children=(Dropdown(description='file', options=('gs://cclf_results/targeted/kim_sept_6/Alveolar_Rh…

In [None]:
## reading in image from GCS
# method: https://pypi.org/project/fs-gcsfs/
from fs_gcsfs import GCSFS
# Bound to its own name: calling this object `gcsfs` would shadow the `gcsfs`
# module imported near the top of the notebook.
cclf_bucket_fs = GCSFS(bucket_name="cclf_results")
# cclf_bucket_fs.fix_storage() # see https://fs-gcsfs.readthedocs.io/en/latest/#limitations
# cclf_bucket_fs.tree()

# with open("/targeted/kim_sept_6/Alveolar_Rhabdomyosarcoma/PEDS172/PEDS172T_PF_AR5_p7_sample_statistics.txt") as f:
#     df = pd.read_csv(f)
    
# method: https://gcsfs.readthedocs.io/en/latest/
# fs = gcsfs.GCSFileSystem(project='my-google-project')
# fs.ls('my-bucket')
# with fs.open('my-bucket/my-file.txt', 'rb') as f:
#     df = pd.read_csv(f)
#         display(f)

# @interact
# def show_images(file=filepaths): # can Image work with gcsfs/GCS file paths? It doesn't look like it.
#     print(file)
#     display(Image(file))

## Copy number plots
Select the copy number plot you would like to display from the dropdown menu.

<!-- Note that to get nice dropdown menu names, I'm changing directories for now. There's probably a better way to do this. -->

In [66]:
# Move into the local image folder so the image dropdown in the next cell can
# refer to the pngs by bare file name. This changes the working directory for
# the whole kernel until it is changed back.
os.chdir(tempdir)

In [67]:
# select image to display from dropdown menu
@interact
def show_images(file=local_img_file_names): # Image is given a local file; it does not appear to work with GCS paths
    """Display the local copy of the image picked in the dropdown.

    `file` is a bare file name from `local_img_file_names`; `interact` calls
    this function again every time the selection changes.
    """
    print(file)
    # Anchor the path at the notebook's start directory (`cwd`). The callback
    # runs whenever the dropdown changes, which can be after the working
    # directory has been switched back, and a bare file name only resolves
    # while the process is inside `tempdir`.
    display(Image(filename=os.path.join(cwd, tempdir, file)))

interactive(children=(Dropdown(description='file', options=('merged_copy_number_map.png', 'test.png'), value='…

In [70]:
## must change back to the main directory
# `cwd` is the directory recorded with os.getcwd() near the top of the notebook;
# later cells that use relative paths depend on being back there.
os.chdir(cwd)

In [None]:
# fdir = '/Users/gmiller/Documents/Pictures/'

# @interact
# def show_images(file=os.listdir(fdir)):
#     print(fdir+file)
#     display(Image(fdir+file))