In [None]:
# Notebook setup: imports, local module paths, IPython extensions and the Taiga client.
# The __future__ import has to stay the first statement of the cell.
from __future__ import print_function
import os.path
import dalmatian as dm
import pandas as pd
import sys
# Put the parent directory on the import path so CCLE_postp_function can be imported.
sys.path.insert(0, '../')
from CCLE_postp_function import CreateDatasetWithNewCellLines
# Put the JKBio checkout on the import path (presumably where datanalytics and
# TerraFunction live -- TODO confirm the relative location).
sys.path.insert(0, '../../JKBio/')
import datanalytics as da 
import TerraFunction as terra
# autoreload mode 2: re-import edited modules before each cell execution.
%load_ext autoreload
%autoreload 2
# Provides the %%R cell magic used by the post-processing cells.
%load_ext rpy2.ipython
from taigapy import TaigaClient
# Client used later in the notebook for tc.create_dataset / tc.update_dataset.
tc = TaigaClient()

In [None]:
# Run configuration: everything a user is expected to edit lives in this cell.

# Terra namespace / workspace handed to dm.WorkspaceManager.
refnamespace, refworkspace = "broad-firecloud-ccle", "DepMap_hg38_RNAseq"

# Not read by any other cell visible in this notebook.
source = "Unknown"

# Name of the Terra sample set created for, and processed by, this run.
samplesetname = "james_rna"

# Bucket / folder arguments passed to terra.UploadFromFolder.
gcpfolder, prefix = 'jkobject', 'rnaseq_james/'

# Release tag used in output file names and Taiga descriptions; mirrors the sample set name.
release = samplesetname

In [None]:
# Handle on the reference Terra workspace; every config/submission call below goes through it.
refwm = dm.WorkspaceManager(refnamespace, refworkspace)

# Generate sample set from new samples

In [None]:
# Register the new samples from `gcpfolder`/`prefix` in the workspace under the sample set
# `samplesetname`. NOTE(review): presumably sep='@' is the delimiter used to parse sample
# names out of the file names -- confirm against TerraFunction.UploadFromFolder.
terra.UploadFromFolder(gcpfolder, prefix, refwm, samplesetname=samplesetname, sep='@')

# Run the pipeline

### If the samples were submitted as BAMs

In [None]:
# Fetch the samtofastq method configuration from the workspace and display it for inspection.
samtofastq = refwm.get_config("samtofastq_v1-0_BETA_cfg")
samtofastq

In [None]:
# Point the workflow's BAM/CRAM input at each sample's `WES_bam` attribute.
# The expression has to go into the config's 'inputs' mapping (same layout as the
# fusion config edited further down); a top-level key would not change the workflow input.
samtofastq['inputs']['samtofastq_workflow.samtofastq.input_bam_cram'] = 'this.WES_bam'
# Push the edited configuration back to the workspace before submitting.
refwm.update_config(samtofastq)
# Run the workflow on every sample of the set (`this.samples` expands the sample set).
submission_id = refwm.create_submission(samtofastq['name'], samplesetname, 'sample_set', expression='this.samples')


In [None]:
# Wait on the samtofastq submission launched above. The helper lives in the `terra`
# module, and the id returned by create_submission was stored in `submission_id`.
terra.wait_for_submission(refwm, submission_id)

### Otherwise, if they were submitted as FASTQs

In [None]:
# Fetch the STAR method configuration from the workspace and display it for inspection.
star = refwm.get_config("star_v1-0_BETA_cfg")
star

In [None]:
# Write the (here unmodified) STAR config back, then submit it on every sample of the set.
refwm.update_config(star)
submission_id = refwm.create_submission(star['name'], samplesetname,'sample_set',expression='this.samples')

In [None]:
# Wait on the STAR submission before launching RSEM.
terra.wait_for_submission(refwm, submission_id)

In [None]:
# Fetch the RSEM method configuration from the workspace and display it for inspection.
rsem = refwm.get_config("rsem_v1-0_BETA_cfg")
rsem

In [None]:
# Write the (here unmodified) RSEM config back, then submit it on every sample of the set.
# The id is kept in `submission_id1`, which later cells wait on.
refwm.update_config(rsem)
submission_id1 = refwm.create_submission(rsem['name'], samplesetname,'sample_set',expression='this.samples')

In [None]:
# Wait on the RSEM submission.
terra.wait_for_submission(refwm, submission_id1)

In [None]:
# Fetch the STAR-Fusion method configuration from the workspace and display it for inspection.
fusion = refwm.get_config("hg38_STAR_fusion") 
fusion

In [None]:
# Point the fusion workflow's BAM input at each sample's `WES_bam` attribute,
# save the edited config and submit it on every sample of the set.
# The id is kept in `submission_id2`; the fusion post-processing section waits on it.
fusion['inputs']['trinity_cleaned.samtofastq.input_bam'] = 'this.WES_bam'
refwm.update_config(fusion)
submission_id2 = refwm.create_submission(fusion['name'], samplesetname,'sample_set',expression='this.samples')

In [None]:
# Make sure the RSEM submission is done before aggregating its results.
# The helper is only available through the `terra` module (a bare name raises NameError).
terra.wait_for_submission(refwm, submission_id1)

In [None]:
# Fetch the RSEM aggregation method configuration and display it for inspection.
aggregate = refwm.get_config("rsem_aggregate_results_v1-0_BETA_cfg")
aggregate

In [None]:
# Submit the aggregation on the sample set itself (no etype/expression is passed here,
# unlike the per-sample submissions above). Reuses the name `submission_id1`.
submission_id1 = refwm.create_submission(aggregate['name'], samplesetname)

## Expression post processing

In [None]:
# Wait on the RSEM aggregation submission before reading its outputs.
terra.wait_for_submission(refwm, submission_id1)

In [None]:
%%R
# Source the shared R setup script (presumably loads the libraries and annotation
# objects the R cells below rely on, e.g. magrittr/dplyr/ggplot2 -- TODO confirm).
# NOTE(review): this path is '../JKBio/...' while the Python setup cell adds
# '../../JKBio/' to sys.path -- confirm which relative location is correct.
source('../JKBio/gkugener/RScripts/load_libraries_and_annotations.R')

In [None]:
%%R
# Source the project's R post-processing helpers from the notebook's working directory.
source('CCLE_postp_function.R')
# Release tag used in the output file names written by the R cells.
# NOTE(review): duplicates the Python-side `release` value; keep the two in sync.
release <- "james_rna"

In [None]:
# Row of the sample-set table for this run; its attributes are read by the next cell.
res = refwm.get_sample_sets().loc[samplesetname]
res

In [None]:
rsem_genes_expected_count = res['rsem_genes_expected_count']
rsem_genes_tpm = res['rsem_genes_tpm']
rsem_transcripts_tpm = res['rsem_transcripts_tpm']
! gsutil cp $rsem_genes_expected_count "temp/expression/expectedcount.txt.gz" & gsutil cp $rsem_genes_tpm "temp/expression/genestpm.txt.gz" & gsutil cp $rsem_transcripts_tpm "temp/expression/transcripttpm.txt.gz"

In [None]:
%%R
# These files are downloaded from FireCloud/Terra
# Local paths of the three matrices copied by the gsutil cell; read by the next cell.
download_paths <- list(
  tpm_genes='temp/expression/genestpm.txt.gz',
  tpm_transcripts='temp/expression/transcripttpm.txt.gz',
  counts_genes='temp/expression/expectedcount.txt.gz')

In [None]:
%%R
# Load the three downloaded matrices with the project readers
# (readTranscripts / readCounts / readTPM are not defined in this notebook --
# presumably they come from CCLE_postp_function.R).
tpm_transcripts = readTranscripts(download_paths$tpm_transcripts)
counts_genes = readCounts(download_paths$counts_genes)
tpm_genes = readTPM(download_paths$tpm_genes)

%%R
# Fix the colnames (for cases where there are mixed values (CCLE_name and DepMap_IDs))
# `%<>%` pipes the column names through renameFunction and assigns the result back in place.
# renameFunction is not defined in this notebook (presumably from CCLE_postp_function.R).
colnames(counts_genes) %<>% renameFunction(.)
colnames(tpm_genes) %<>% renameFunction(.)
colnames(tpm_transcripts) %<>% renameFunction(.)

### data exploration and QC

In [None]:
%%R 
# Quick QC
# We are looking for samples with a worrying amount of zeros
# Cut-off on the number of zero-TPM genes per sample, used by the plot and filter cells below.
zero_threshold <- 39000

# Per-column count of entries equal to 0, over columns 3 onward of tpm_genes
# (the first two are presumably the gene_id / transcript_id(s) columns -- TODO confirm).
number_zeros <- apply(tpm_genes[,3:ncol(tpm_genes)], 2, FUN = function(x) length(x[x == 0]))
# Same counts as a data frame: CL = column (sample) name, nz = number of zeros.
nzdf <- data.frame(CL=names(number_zeros), nz=number_zeros, stringsAsFactors = F)

In [None]:
%%R
# Plot of the samples that fail
# Histogram of zero counts per sample, with a dashed line at the threshold and a label
# on every sample whose count is above it.
plot <- ggplot(nzdf, aes(nz)) +
  geom_histogram(bins = 100, color='black', fill='white') +
  geom_vline(xintercept = zero_threshold, linetype=2) +
  geom_label_repel(data = nzdf %>% filter(nz > zero_threshold), aes(x=nz, y=0, label=CL), size=5, fill=rgb(1,1,1,0.5))

# Saved as a PDF so the next (Python) cell can display it.
ggsave(plot, filename ='temp/ggplot.pdf', width=20, height = 20)

In [None]:
# Display the QC plot written by the previous R cell.
from wand.image import Image as WImage
img = WImage(filename='temp/ggplot.pdf')
img

In [None]:
%%R
# Names of the samples whose zero count is strictly below the threshold,
# ordered by decreasing zero count.
pass <- number_zeros[order(-number_zeros)] %>% .[. < zero_threshold] %>% names()

# These samples failed
# (every tpm_genes column that is neither in `pass` nor one of the two identifier columns)
setdiff(colnames(tpm_genes), pass) %>% .[!(. %in% c('gene_id', 'transcript_id(s)'))]

# Keep only the identifier columns plus the passing samples; `%<>%` assigns the result back.
counts_genes %<>% dplyr::select(c("gene_id","transcript_id(s)", pass))
tpm_genes %<>% dplyr::select(c("gene_id","transcript_id(s)", pass))
tpm_transcripts %<>% dplyr::select("transcript_id", "gene_id", pass)

In [None]:
%%R
# Show the names of the samples that passed the zero-count QC.
pass

## Save files for taiga

In [None]:
%%R
# Write the three expression tables as tab-separated files under temp/,
# named expression.<release>.<kind>.tsv, without row names or quoting.
write.table(counts_genes,
            file = paste0('temp/expression.', release, '.counts.tsv'),
            quote = FALSE, row.names = FALSE, sep = '\t')

write.table(tpm_genes,
            file = paste0('temp/expression.', release, '.tpm.tsv'),
            quote = FALSE, row.names = FALSE, sep = '\t')

write.table(tpm_transcripts,
            file = paste0('temp/expression.', release, '.transcripts.tsv'),
            quote = FALSE, row.names = FALSE, sep = '\t')

In [None]:
! mv 

# validation

In [None]:
# Map each expression file of this release to the Taiga format it is uploaded as.
upload_file_path_dict = {
    '../data/james_rna/expression.' + release + suffix: 'TableTSV'
    for suffix in ('.transcripts.tsv', '.counts.tsv', '.tpm.tsv')
}

In [None]:

# First-time upload: create a Taiga dataset named after the release from the three
# expression files. (The cell below updates an already existing dataset instead.)
tc.create_dataset(dataset_name=release,
    dataset_description='Here is the rna seq data processed for James McFarland by Jeremie Kalfon using the CCLE processing pipeline. Some Cell lines have been flagged as containing many zero valued gene. Else the processing went ok.',
    upload_file_path_dict=upload_file_path_dict, folder_id='public')

In [None]:
# Upload the same files to the existing dataset "james-rna-8479".
# NOTE(review): force_remove=True presumably drops files of the previous version
# that are not re-uploaded -- confirm against taigapy before running.
tc.update_dataset(dataset_permaname="james-rna-8479",
                     upload_file_path_dict=upload_file_path_dict,
                 dataset_description="Updating to "+release,
                 force_remove=True)


## Fusion post processing

In [None]:
# Wait on the STAR-Fusion submission (`submission_id2`). The workspace manager is passed
# first, as in every other terra.wait_for_submission call in this notebook.
terra.wait_for_submission(refwm, submission_id2)

In [None]:
# Fetch the fusion aggregation method configuration and display it.
# Note: this rebinds `aggregate`, previously the RSEM aggregation config.
aggregate = refwm.get_config('Aggregate_Fusion_Calls')
aggregate

In [None]:
# Write the config back and submit the aggregation on the 'All_samples' entity
# (not on `samplesetname`). Rebinds `submission_id2` to the aggregation submission.
refwm.update_config(aggregate)
submission_id2 = refwm.create_submission(aggregate['name'], 'All_samples')

In [None]:
# Wait on the fusion aggregation submission. The workspace manager is passed first,
# as in every other terra.wait_for_submission call in this notebook.
terra.wait_for_submission(refwm, submission_id2)

In [None]:
# Keep the aggregated fusion file path in `aggregated`: the following cell's
# `gsutil cp $aggregated ...` reads this variable, which was otherwise never defined.
aggregated = refwm.get_sample_sets().loc['All_samples']['fusions_star']
aggregated

In [None]:
# Copy the aggregated fusion calls locally. `$aggregated` is expanded by IPython from the
# Python variable `aggregated`, which must hold the source path (the `fusions_star`
# attribute of the 'All_samples' set looked up in the previous cell).
! gsutil cp $aggregated "temp/fusion.Fusions.aggregated.tsv"

### Overview

This document contains the code used to generate the unfiltered and filtered versions of the fusion datasets for the release. The bottom of the document also contains some comparisons between the release fusion dataset, CCLE2 fusion calls, and the translocation data from CCLE2.

In [None]:
%%R
# Re-source the R post-processing helpers and point at the aggregated fusion file
# downloaded by the gsutil cell above.
source("CCLE_postp_function.R")
filepath <- 'temp/fusion.Fusions.aggregated.tsv'

## Generate filtered fusion table

Release: `r release`

We want to apply filters to the fusion table to reduce the number of artifacts in the dataset. Specifically, we filter the following:

* Remove fusions involving mitochondrial chromosomes, or HLA genes, or immunoglobulin genes
* Remove red herring fusions (from STAR-Fusion annotations column)
* Remove fusions that are recurrent in CCLE (>= 25 samples)
* Remove fusions with (SpliceType="INCL_NON_REF_SPLICE" and LargeAnchorSupport="No" and FFPM < 0.1)
* Remove fusions with FFPM < 0.05 (STAR-Fusion suggests using 0.1, but looking at the translocation data, this looks like it might be too aggressive)

In [None]:
%%R
# Read the aggregated calls, then apply the filters listed above.
# read_fusions / filter_fusions are not defined in this notebook
# (presumably from CCLE_postp_function.R).
unfiltered_fusions <- read_fusions(filepath)
filtered_fusions <- filter_fusions(unfiltered_fusions)

In [None]:
%%R
# Save the files (to be uploaded to taiga): unfiltered and filtered fusion tables,
# tab-separated, without quoting or row names.
write.table(unfiltered_fusions,
            file = paste0('temp/fusions.', release, '.unfiltered.tsv'),
            row.names = FALSE, quote = FALSE, sep = '\t')

write.table(filtered_fusions,
            file = paste0('temp/fusions.', release, '.filtered.tsv'),
            row.names = FALSE, quote = FALSE, sep = '\t')

In [None]:
from taigapy import TaigaClient

# Fresh Taiga client, then upload the filtered and unfiltered fusion tables of this
# release as a new version of the "depmap-fusions-7990" dataset.
# NOTE(review): the target is a DepMap-wide permaname while the data is the `release`
# sample set, and force_remove=True is set -- confirm this is the intended dataset.
tc = TaigaClient()
tc.update_dataset(
    dataset_permaname="depmap-fusions-7990",
    upload_file_path_dict={
        'temp/fusions.' + release + suffix: 'TableTSV'
        for suffix in ('.filtered.tsv', '.unfiltered.tsv')
    },
    dataset_description="Updating to " + release,
    force_remove=True,
)


# If you want to merge here instead of on Terra:

In [None]:
res = refwm.get_samples().loc['ibm_ACH-001616']
genes_fusion = res['fusion_predictions_abridged']
rsem_genes_transcripts = res['rsem_isoforms']
rsem_genes_expected_count = res['rsem_genes']

! gsutil cp $rsem_genes_expected_count "temp/expression.genes.results" && gsutil cp $rsem_genes_transcripts "temp/expression.transcripts.results" && gsutil cp $genes_fusion "temp/expression.fusion.tsv"