In [None]:
from __future__ import print_function
import os.path
import dalmatian as dm
import pandas as pd
import gzip
import sys
from src.CCLE_postp_function import *
sys.path.insert(0, '../JKBio/')
import Datanalytics as da 
import TerraFunction as terra
from Helper import * 
%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
from IPython.display import Image,display
from taigapy import TaigaClient
tc = TaigaClient()
import numpy as np
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from bokeh.plotting import *
from bokeh.models import HoverTool
output_notebook()
from collections import OrderedDict
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
replace = {'T': 'Tumor', 'N': 'Normal', 'm': 'Unknown', 'L': 'Unknown'}

In [None]:
# --- Release identifiers -----------------------------------------------------
# Current release and the two preceding ones (the older names are used further
# down to fetch previous datasets for validation and public subsetting).
samplesetname, prevname, prevprevname = "19Q4", "19Q3", "19Q2"
release = samplesetname

# --- Taiga virtual datasets targeted by AddToVirtual -------------------------
virtual_public = 'public-19q4-93d9'
virtual_dmc = 'dmc-19q4-d5f3'
virtual_internal = 'internal-19q4-fb11'

# --- Terra workspaces --------------------------------------------------------
workspace1 = "broad-genomics-delivery/Getz_IBM_CellLines_RNASeqData"
workspace2 = "broad-firecloud-ccle/CCLE_DepMap_RNAseq"
workspace3 = "broad-genomics-delivery/CCLE_DepMap_RNAseq"
workspace4 = "broad-genomics-delivery/Cancer_Cell_Line_Factory_CCLF_RNAseq"
workspace5 = "nci-mimoun-bi-org/CCLF_RNA_2_0"
refworkspace = "broad-firecloud-ccle/DepMap_hg38_RNAseq"

# Source tag paired with workspace1..workspace5 (same position, same number).
source1, source2, source3, source4, source5 = "ibm", "ccle", "ccle", "cclf", "cclf"

# Google sheet read later for the embargo / blacklist columns.
sheeturl = "https://docs.google.com/spreadsheets/d/115TUgA1t_mD32SnWAGpW9OKmJ2W5WYAOs3SuSdedpX4"

In [None]:
%%R
# R-side configuration for the post-processing cells.
genome_version = "hg38"
# Release label used in the output file names written from R.
# NOTE(review): duplicated from the Python config cell; keep both in sync.
release = "19Q4"
hg38_cyto_band_reference = "../JKBio/data/hg38_cytoband.gz"

# Generate sample set from new samples

In [None]:
# Mapping handed to createDatasetWithNewCellLines through its `extract` argument.
extract_to_change = dict(ref_bams='RNAseq_bam')

In [None]:
# One dalmatian WorkspaceManager per source workspace, created in order.
wm1, wm2, wm3 = [
    dm.WorkspaceManager(workspace_name)
    for workspace_name in (workspace1, workspace2, workspace3)
]
# wm4 = dm.WorkspaceManager(workspace4)

# Manager for the reference workspace where the workflows are launched.
refwm = dm.WorkspaceManager(refworkspace)

In [None]:
# Build the sample set for this release from the three source workspaces.
# dry_run=True is passed through to the helper as in the original call.
newsamples = createDatasetWithNewCellLines(
    refwm,
    samplesetname,
    wmfroms=[wm1, wm2, wm3],
    sources=[source1, source2, source3],
    gsfolderto='gs://ccle_bams/rna/',
    match='ACH',
    participantslicepos=10,
    accept_unknowntypes=True,
    extract=extract_to_change,
    dry_run=True,
)

## Check that we have all the cell lines we expect for this release
This involves comparing to the list in the Google sheet "Cell Line Profiling Status."

In [None]:
# this function may not work - it hasn't been tested
# Compare the new samples against the curated "Cell Line Profiling Status" sheet.
url = 'https://docs.google.com/spreadsheets/d/1qus-9TKzqzwUMNWp8S1QP4s4-3SsMo2vuQRZrNXf7ag/edit?ts=5db85e27#gid=0&fvid=1627883727'

# `newsamples` is the variable produced by createDatasetWithNewCellLines above;
# the previous `newsample` name was never defined and raised a NameError.
compareToCuratedGS(url, sample = newsamples[0], samplesetname = samplesetname, colname = 'RNA New to internal')

### Manually resolving missing files for 19Q4
These 4 missing cell lines (['ACH-001016', 'ACH-001098', 'ACH-001113', 'ACH-001194']) are actually in the broad-genomics-delivery/CCLE_DepMap_RNAseq workspace. However, their individual_alias column does not contain a DepMap ID, so we cannot find the data automatically. We are currently discussing re-patienting the samples in BSP so that they carry DepMap IDs. For now, however, I'm going to add these 4 samples manually. Here are the steps:
- get the required metadata columns, but manually curate their new sample ID with appropriate tacked on number
- copy their bam (and bai?) files into the correct ccle_bams subdirectory
- download the full sample entity
- append this new sample to the full sample entity
- upload the new sample entity
- update the 19Q4 sample set with these 4 samples

Note: I added these 4 lines manually.

In [None]:
# Launch the samtofastq config on every sample of the release set, then wait on it.
submission_id = refwm.create_submission(
    "samtofastq_v1-0_BETA_cfg",
    samplesetname,
    'sample_set',
    expression='this.samples',
)
terra.waitForSubmission(refworkspace, submission_id)

In [None]:
# Launch the STAR config on every sample of the release set, then wait on it.
submission_id = refwm.create_submission(
    "star_v1-0_BETA_cfg",
    samplesetname,
    'sample_set',
    expression='this.samples',
)
terra.waitForSubmission(refworkspace, submission_id)

In [None]:
# Launch the RSEM config on every sample of the release set, then wait on it.
submission_id1 = refwm.create_submission(
    "rsem_v1-0_BETA_cfg",
    samplesetname,
    'sample_set',
    expression='this.samples',
)
terra.waitForSubmission(refworkspace, submission_id1)

In [None]:
# Launch the STAR-fusion config; it is not waited on here (see the fusion section).
submission_id2 = refwm.create_submission(
    "hg38_STAR_fusion",
    samplesetname,
    'sample_set',
    expression='this.samples',
)

In [None]:
# Launch the germline SNP/indel config; waited on later together with the RSEM aggregation.
submission_id_snp = refwm.create_submission(
    "rnaseq-germline-snps-indels",
    samplesetname,
    'sample_set',
    expression='this.samples',
)

In [None]:
# Update the aggregate sample set with the samples of this release's sample set.
# Fixed typo: the workspace variable is `refworkspace` (`refworspace` raised a NameError).
updateAllSampleSet(refworkspace, samplesetname)

In [None]:
# Launch the RSEM aggregation on the 'All_samples' entity.
# Note: this reuses (overwrites) `submission_id1` from the per-sample RSEM run.
submission_id1 = refwm.create_submission(
    "rsem_aggregate_results",
    'All_samples',
)

In [None]:
# Wait on both the RSEM aggregation and the germline SNP submissions.
pending_submissions = [submission_id1, submission_id_snp]
terra.waitForSubmission(refworkspace, pending_submissions)

### Save the workflow configurations used

In [None]:
# Save the workspace's workflow configurations under data/<release>/RNAconfig.
config_prefix = 'data/' + samplesetname + '/RNAconfig'
terra.saveConfigs(refworkspace, config_prefix)

## Expression post processing

In [None]:
res = refwm.get_sample_sets().loc['All_samples']
rsem_genes_expected_count = res['rsem_genes_expected_count']
rsem_genes_tpm = res['rsem_genes_tpm']
rsem_transcripts_tpm = res['rsem_transcripts_tpm']
! gsutil cp $rsem_genes_expected_count "temp/expression.expectedcount" & gsutil cp $rsem_genes_tpm "temp/expression.genes.tpm" & gsutil cp $rsem_transcripts_tpm "temp/expression.transcript.tpm"

In [None]:
# Identifier columns that must keep their names; every other column is a sample
# column and is shortened to the text before its first underscore.
# The original list was missing a comma, so "transcript_id" "transcript_id(s)"
# silently merged into one string and both id columns were renamed to "transcript".
id_columns = ['gene_id', "transcript_id", "transcript_id(s)"]

# NOTE: each file is read as gzip and rewritten in place as plain TSV, so this
# cell cannot be re-run without downloading the files again.
for val in ["temp/expression.expectedcount","temp/expression.transcript.tpm","temp/expression.genes.tpm"]:
    file = pd.read_csv(val, compression='gzip', header=0, sep='\t', quotechar='"', error_bad_lines=False)
    file = removeDuplicates(file)
    file.columns = [i if i in id_columns else i.split('_')[0] for i in file.columns]
    file.set_index('gene_id').to_csv(val,sep='\t',)

In [None]:
%%R
library('taigr')
source('../gkugener/RScripts/load_libraries_and_annotations.R')
source('src/CCLE_postp_function.R')
library('cdsomics')

In [None]:
%%R
# These files are downloaded from FireCloud/Terra
# (written under temp/ by the Python download and clean-up cells).
download_paths <- list()
download_paths$tpm_genes <- 'temp/expression.genes.tpm'
download_paths$tpm_transcripts <- 'temp/expression.transcript.tpm'
download_paths$counts_genes <- 'temp/expression.expectedcount'

In [None]:
%%R
# Load the three downloaded tables with the project readers.
tpm_transcripts <- readTranscripts(download_paths[['tpm_transcripts']])
counts_genes <- readCounts(download_paths[['counts_genes']])
tpm_genes <- readTPM(download_paths[['tpm_genes']])

In [None]:
%%R
head(tpm_transcripts)

In [None]:
%%R
# Fix the colnames (for cases where there are mixed values (CCLE_name and DepMap_IDs))
# Each table's column names are passed through renameFunction and written back.
colnames(counts_genes) <- renameFunction(colnames(counts_genes))
colnames(tpm_genes) <- renameFunction(colnames(tpm_genes))
colnames(tpm_transcripts) <- renameFunction(colnames(tpm_transcripts))

### data exploration and QC

In [None]:
%%R 
# Quick QC
# We are looking for samples with a worrying amount of zeros
zero_threshold <- 39000
# Count zeros over the sample columns only.
# The previous selector `2:ncol(tpm_genes)-1` is parsed by R as
# `(2:ncol(tpm_genes)) - 1`, i.e. columns 1..(n-1): it included the identifier
# columns and skipped the last sample, which then always ended up in `failed`.
id_cols <- c('gene_id', 'transcript_id(s)')
sample_cols <- setdiff(colnames(tpm_genes), id_cols)
number_zeros <- colSums(as.data.frame(tpm_genes)[, sample_cols, drop = FALSE] == 0)
# One row per sample: CL = column name, nz = number of zero entries.
nzdf <- data.frame(CL=names(number_zeros), nz=number_zeros, stringsAsFactors = F)

In [None]:
%%R
# Sort samples from most to fewest zeros, keep those strictly below the
# threshold, and record their names as the passing set.
descending <- order(-number_zeros)
number_zeros <- number_zeros[descending]
below_threshold <- number_zeros < zero_threshold
number_zeros <- number_zeros[below_threshold]
pass <- names(number_zeros)

In [None]:
%%R
counts_genes

In [None]:
%%R
# These samples failed
# i.e. every tpm_genes column that is neither in `pass` nor an identifier column.
failed <- setdiff(colnames(tpm_genes), pass)
failed <- failed[!(failed %in% c('gene_id', 'transcript_id(s)'))]

In [None]:
%%R
# Keep the identifier columns plus the samples that passed QC.
counts_keep <- c("gene_id", "transcript_id(s)", pass)
counts_genes <- dplyr::select(counts_genes, counts_keep)

In [None]:
%%R
failed

In [None]:
%%R
# Rename the 'transcript_id' column to 'transcript_id(s)' so the transcript
# table uses the same identifier column name as the gene tables.
transcript_colnames <- colnames(tpm_transcripts)
transcript_colnames[transcript_colnames == 'transcript_id'] <- 'transcript_id(s)'
colnames(tpm_transcripts) <- transcript_colnames

In [None]:
%%R
# Restrict both TPM tables to their identifier columns plus the passing samples.
tpm_genes <- dplyr::select(tpm_genes, c("gene_id", "transcript_id(s)", pass))
tpm_transcripts <- dplyr::select(tpm_transcripts, c("transcript_id(s)", "gene_id", pass))

In [None]:
%%R
# Plot of the samples that fail
# Histogram of per-sample zero counts, with the cut-off drawn as a dashed line.
# Labels use `>=` so they cover exactly the samples left out of `pass`
# (which keeps nz < zero_threshold); `>` skipped a sample sitting on the threshold.
plot <- ggplot(nzdf, aes(nz)) +
  geom_histogram(bins = 100, color='black', fill='white') +
  geom_vline(xintercept = zero_threshold, linetype=2) +
  geom_label_repel(data = nzdf %>% filter(nz >= zero_threshold), aes(x=nz, y=0, label=CL), size=5, fill=rgb(1,1,1,0.5))

# Saved to disk so the following Python cell can display the image.
ggsave(plot, filename ='temp/ggplot.png', width=20, height = 20)

In [None]:
display(Image(filename='temp/ggplot.png'))

### Allie post processing 

In [None]:
%%R
print(colnames(tpm_genes)[1:10])
# Gene ids with everything from the first '.' onwards removed.
tpm_gene_ids <- gsub("\\..*", "", tpm_genes$gene_id)
# Keep only the first row seen for each stripped id.
tpm_gene_is_dup <- duplicated(tpm_gene_ids)
if (any(tpm_gene_is_dup)) {
  print("Duplicated ensembl ids")
  print(sum(tpm_gene_is_dup))
  tpm_genes <- tpm_genes[!tpm_gene_is_dup, ]
}

In [None]:
%%R
# Fetch the gencode annotation file from taiga.
annotations <- download.raw.from.taiga(
  data.name = 'gene-annotations-e737',
  data.version = 1,
  data.file = 'gencode.v29.annotation')
# Protein-coding subset first (it needs the unprocessed tpm_genes), then the full table.
tpm_protein_coding <- prepare_depmap_TPM_for_taiga(
  tpm_genes, log_transform = TRUE, just_protein_coding = TRUE, gencode_annotations = annotations)
tpm_genes <- prepare_depmap_TPM_for_taiga(
  tpm_genes, log_transform = TRUE, just_protein_coding = FALSE, gencode_annotations = annotations)

In [None]:
%%R
### transcripts
print(colnames(tpm_transcripts)[1:10])
# Transcript ids with everything from the first '.' onwards removed.
tpm_transcript_id <- gsub("\\..*", "", tpm_transcripts$`transcript_id(s)`)
# Keep only the first row seen for each stripped id.
tpm_transcript_is_dup <- duplicated(tpm_transcript_id)
if (any(tpm_transcript_is_dup)) {
  print("Duplicated transcript ids")
  print(sum(tpm_transcript_is_dup))
  tpm_transcripts <- tpm_transcripts[!tpm_transcript_is_dup, ]
}
# Always pass `tpm_transcripts`: the previous code passed `tpm_transcript`, which
# was only created inside the `if` and so did not exist when no duplicates were found.
tpm_transcripts <- prepare_depmap_transcripts_for_taiga(tpm_transcripts, gencode_annotations = annotations)

In [None]:
%%R
### counts
print(colnames(counts_genes)[1:10])
# Gene ids with everything from the first '.' onwards removed.
counts_gene_ids <- gsub("\\..*", "", counts_genes$gene_id)
# Keep only the first row seen for each stripped id.
counts_gene_is_dup <- duplicated(counts_gene_ids)
if (any(counts_gene_is_dup)) {
  print("Duplicate ensembl ids")
  print(sum(counts_gene_is_dup))
  counts_genes <- counts_genes[!counts_gene_is_dup, ]
}
# NOTE(review): called without log_transform / just_protein_coding, so the
# helper's defaults apply to the counts; confirm that is intended.
counts_genes <- prepare_depmap_TPM_for_taiga(counts_genes, gencode_annotations = annotations)

In [None]:
%%R
# Write each processed table to temp/expression.<release>.<label>.tsv
# (tab-separated, unquoted), in the same order as before.
release_tables <- list(
  counts = counts_genes,
  genes = tpm_genes,
  protein_coding = tpm_protein_coding,
  transcripts = tpm_transcripts)
for (table_label in names(release_tables)) {
  write.table(
    release_tables[[table_label]],
    file = paste0('temp/expression.', release, '.', table_label, '.tsv'),
    sep = '\t', quote = F)
}

## Loading

In [None]:
def _read_release_table(label):
    """Read temp/expression.<release>.<label>.tsv with its first column as index."""
    return pd.read_csv('temp/expression.' + release + '.' + label + '.tsv', sep='\t', index_col=0)


# Load the four tables written by the R cells into pandas.
counts_genes = _read_release_table('counts')
tpm_genes = _read_release_table('genes')
tpm_proteincoding = _read_release_table('protein_coding')
tpm_transcripts = _read_release_table('transcripts')

In [None]:
# Remove the 'gene_id' column and index the table by its 'transcript' column.
# `drop('gene_id')` without an axis looks for a *row* labelled 'gene_id';
# `columns=` targets the column, as the chained set_index on a column implies.
# NOTE(review): assumes 'gene_id' is a column of the table written by R; confirm.
tpm_transcripts = tpm_transcripts.drop(columns='gene_id').set_index('transcript')

In [None]:
# Overwrite the transcripts TSV with the re-indexed table.
transcripts_path = 'temp/expression.' + release + '.transcripts.tsv'
tpm_transcripts.to_csv(transcripts_path, sep='\t')

In [None]:
tpm_transcripts

In [None]:
# Drop the 'transcript_id' column. Uses the `columns=` keyword because the
# positional axis argument (`drop(label, 1)`) was removed in pandas 2.0.
tpm_transcripts = tpm_transcripts.drop(columns='transcript_id')

## validation

In [None]:
# Fetch the previous release's four tables from taiga for comparison.
prev_dataset = 'depmap-expression-87f8'
prev_prefix = "expression." + prevname
prevcounts = tc.get(name=prev_dataset, file=prev_prefix + '.counts')
prevgenes = tc.get(name=prev_dataset, file=prev_prefix + '.genes')
prevtranscripts = tc.get(name=prev_dataset, file=prev_prefix + '.transcripts')
prevproteincoding = tc.get(name=prev_dataset, file=prev_prefix + '.protein')

In [None]:
# Column labels of the new counts table and of the previous release's counts.
new1 = set(counts_genes.columns.values.tolist())
# `prev` was read before being defined (NameError on a fresh kernel); it is
# built from `prevcounts`, the previous-release counterpart of `counts_genes`.
prev = set(prevcounts.columns.values.tolist())

In [None]:
# Column labels of the new gene-level and transcript-level TPM tables.
new2 = set(tpm_genes.columns.tolist())
new3 = set(tpm_transcripts.columns.tolist())

In [None]:
# Sizes of the previous/new label sets and their overlaps, followed by the
# labels gained and lost between the previous release and the new TPM table.
kept_in_tpm = prev & new2
shared_by_all_new = new1 & new2 & new3
gained = new2 - prev
lost = prev - new2
print(len(prev), len(kept_in_tpm), len(new1), len(shared_by_all_new), gained, lost)

In [None]:
# Sample annotation table from taiga; its "DepMap_ID" and "disease" columns are
# used by the validation cells below.
metadata = tc.get(name='depmap-a0ab', file='sample_info')

In [None]:
metadata

In [None]:
# finding train and test set
# train = ACH-prefixed labels present in both releases, test = ACH-prefixed labels new in this one
trainame = [val for val in new1 & prev if val.startswith('ACH')]
testname = [val for val in new1 - prev if val.startswith('ACH')]

#looking at the 2000 most variable genes in the two sets
genetolookfor = 2000
# Row-wise variance over the training columns only.
gene_var = counts_genes[trainame].var(axis=1).values
print(len(gene_var))
# Row positions of the `genetolookfor` largest variances (ascending order).
sorting = np.argsort(gene_var)[-genetolookfor:]

In [None]:
# New lines that have no row in the sample annotation table.
registered_ids = set(metadata["DepMap_ID"].values.tolist())
unregistered = set(testname) - registered_ids
unregistered

In [None]:
len(counts_genes['ACH-001767']) - np.count_nonzero(counts_genes['ACH-001767'].values)

In [None]:
# creating and reordering train and test sets
# Build each column list once, in the table's own column order, and use the same
# list for the matrix and its labels. Indexing a DataFrame with a `set` has no
# guaranteed order and is rejected by pandas >= 2.0.
train_keep = set(trainame) - unregistered
test_keep = set(testname) - unregistered
traincols = [col for col in counts_genes.columns if col in train_keep]
testcols = [col for col in counts_genes.columns if col in test_keep]

# Rows are restricted to the most variable genes (`sorting`), then transposed so
# that each row of the matrix is one cell line.
traindata = counts_genes[traincols].values[sorting].T
trainlabels = [metadata[metadata["DepMap_ID"]==val]["disease"].values[0] for val in traincols]

testdata = counts_genes[testcols].values[sorting].T
testlabels = [metadata[metadata["DepMap_ID"]==val]["disease"].values[0] for val in testcols]

In [None]:
traindata.shape

In [None]:
# learn KNN classifier to the metadata diseases
# Fit on the lines shared with the previous release, predict the disease of the new ones.
neigh = KNeighborsClassifier(n_neighbors=5)
neigh = neigh.fit(traindata, trainlabels)
predicted = neigh.predict(testdata)

In [None]:
predicted[1]

In [None]:
# Inputs for the embedding plot: labels, colour codes and the stacked matrix.
labels = trainlabels + testlabels
# 0 for every training line, then hand-entered codes (1 or 2) for the new lines.
# NOTE(review): the hard-coded list has 11 entries and must match len(testlabels).
colors = [0] * len(trainlabels) + [1,2,2,2,2,1,2,2,2,1,2]
data = np.vstack((traindata, testdata))

In [None]:
# plot them with TSNE, highlight the points that failed and show colors for diseases
# Keyword arguments are required: scikit-learn >= 1.0 rejects positional
# arguments after n_components, so `TSNE(2, 10)` no longer runs.
# random_state is fixed so the embedding is reproducible on re-run.
dimred = TSNE(n_components=2, perplexity=10, random_state=0).fit_transform(data)

In [None]:
scatter(dimred, labels=labels,colors=colors, radi=1.9)

## Save files for taiga

In [None]:
! ls temp/expression.*

In [None]:
# Upload the four release-level expression files (transcripts, genes, counts,
# protein coding) as a new version of the taiga dataset "depmap-expression-87f8",
# each declared as a tab-separated numeric matrix.
# NOTE(review): the description below is filed under "# RNAseq" but talks about
# copy-number (CN) calls; it looks copied from a CN release and should be
# checked before the next upload.
tc.update_dataset(dataset_permaname="depmap-expression-87f8",
                 upload_file_path_dict={'temp/expression.'+release+'.transcripts.tsv': 'NumericMatrixTSV',
                                       'temp/expression.'+release+'.genes.tsv': 'NumericMatrixTSV',
                                       'temp/expression.'+release+'.counts.tsv': 'NumericMatrixTSV',
                                       'temp/expression.'+release+'.protein_coding.tsv': 'NumericMatrixTSV'},
                  dataset_description=
"""
# RNAseq

Combined segment and gene-level CN calls from Broad WES, Sanger WES, and Broad SNP. Relative CN, not log2 transformed.

PORTAL TEAM SHOULD NOT USE THIS: There are lines here that should not make it even to internal. Must use subsetted dataset instead. These data will not make it on the portal starting 19Q1. With the DMC portal, there is new cell line release prioritization as to which lines can be included, so a new taiga dataset will be created containing CN for the portal.

version 1-8: guillaume releases
version 9: 19Q3 release
version 10:  adding missing samples in Terra merge files
version 11: 19Q4 new release.

Adding 93 new cell lines. 
Some cells lines have been removed because they:

 - had too many 0 values = ["ACH-001388" "ACH-001577" "ACH-001767" "ACH-002463"] 

Some cells lines have been flagged as:

 - having too many..
 
version 21:
    uploading as matrices 
 
transcriptions (Transcripts rpkm):

genes (gene rpkm):
__Rows__:
__Columns__:
Counts (gene counts):
__Rows__:
__Columns__:
Gene level CN data:
__Rows__:
__Columns__:
 DepMap cell line IDs
 gene names in the format HGNC\_symbol (Entrez\_ID)
DepMap\_ID, Chromosome, Start, End, Num\_Probes, Segment\_Mean
""")

In [None]:
# Index labels of the protein-coding TPM table from two releases back
# (pinned to version 16 of the internal dataset); used for the public subset.
prevprev_proteincoding = tc.get(
    name='depmap-rnaseq-expression-data-363a',
    file='CCLE_depMap_' + prevprevname + '_TPM_ProteinCoding',
    version=16,
)
prevprevcells = set(prevprev_proteincoding.index.tolist())

In [None]:
# Load the seventh worksheet (index 6) of the tracking spreadsheet as a DataFrame.
gsheets = sheets.get(sheeturl).sheets[6].to_frame()
# Non-empty entries of each list column. `dropna()` removes every missing value;
# the previous `i is not np.nan` identity test only caught the numpy singleton
# and let other NaN objects through.
rna_ibm_embargo = gsheets['RNAseq_IBM_embargo'].dropna().tolist()
rna_dmc_embargo = gsheets['RNAseq_DMC_embargo'].dropna().tolist()
blacklist = gsheets['blacklist'].dropna().tolist()

In [None]:
blacklist

### internal

In [None]:
## Internal release: drop blacklisted lines from every matrix and write the files.
## The filtered tables replace the originals, so later sections start from them.
internal_prefix = 'temp/internal_' + release

# Gene-level tables are filtered on their index labels.
counts_genes = counts_genes[~counts_genes.index.isin(blacklist)]
print(len(counts_genes))
counts_genes.to_csv(internal_prefix + '_counts', sep='\t')

tpm_genes = tpm_genes[~tpm_genes.index.isin(blacklist)]
print(len(tpm_genes))
tpm_genes.to_csv(internal_prefix + '_tpm', sep='\t')

tpm_proteincoding = tpm_proteincoding[~tpm_proteincoding.index.isin(blacklist)]
print(len(tpm_proteincoding))
tpm_proteincoding.to_csv(internal_prefix + '_proteincoding_tpm', sep='\t')

# The transcript table is filtered on its column labels instead.
internal_transcript_columns = [i for i in tpm_transcripts.columns if i not in blacklist]
tpm_transcripts = tpm_transcripts[internal_transcript_columns]
print(len(tpm_transcripts.columns))
tpm_transcripts.to_csv(internal_prefix + '_transcripts_tpm', sep='\t')

In [None]:
# Upload the four internal files as a new version of the internal taiga dataset.
# All four are written with sep='\t' by the cell above, so all four are declared
# as 'NumericMatrixTSV' (three were previously declared 'NumericMatrixCSV', which
# does not match the tab-separated files and differs from the DMC/public uploads).
tc.update_dataset(dataset_permaname="depmap-rnaseq-expression-data-363a",
                 upload_file_path_dict={'temp/internal_'+release+'_counts': 'NumericMatrixTSV',
                                       'temp/internal_'+release+'_tpm': 'NumericMatrixTSV',
                                       'temp/internal_'+release+'_proteincoding_tpm': 'NumericMatrixTSV',
                                       'temp/internal_'+release+'_transcripts_tpm': 'NumericMatrixTSV'},
                  dataset_description=
"""
# INTERNAL RNA

* Version 1-3 Internal 18Q1*

All CCLE cell lines with RNAseq data.

Original data source:

`/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_depMap_18Q1_RNAseq_reads_20180201.gct`
`/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_depMap_18Q1_RNAseq_RPKM_20180201.gct`

Version 2 of RPKM data are log2-transformed with a noisy floor at -3 (-3 + N(0, 0.1))

* Version 4-6 Internal 18Q2*

Original data source:

`/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_depMap_18q2_RNAseq_reads_20180420.gct` `/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_depMap_18q2_RNAseq_RPKM_20180420.gct`

RPKM data are log2-transformed with a noisy floor at -3 (-3 + N(0, 0.1)). Reads file unaltered aside from formatting row / column names.

* Version 7 Internal 18Q2*

Includes a matrix with genes filtered by HGNC protein-coding gene locus group.

* Version 8-10 Internal 18Q3*

use version 10

Original data source:

`/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_depMap_18q3_RNAseq_reads_20180716.gct` `/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_depMap_18q3_RNAseq_RPKM_20180716.gct`

RPKM data are log2-transformed with a noisy floor at -3 (-3 + N(0, 0.1)). Reads file unaltered aside from formatting row / column names.

The ProteinCoding datasets contain just protein coding genes and are based on protein coding annotations from https://www.genenames.org/cgi-bin/download (locus_group == "protein-coding gene")

Rows: are Broad (Arxspan) cell line IDs.

Columns: In the complete RPKM and read datasets column names are HGNC_symbol (Ensembl_ID), while in the ProteinCoding RPKM and read datasets column names are HGNC_symbol (Entrez_ID)

version 9 updates names, and slightly different RPKM values due to randomly added noisy floor (using a seed of 4)

version 10 removes duplicate gene names from the protein coding datasets

* Version 11-12 Internal 18Q4*

18Q4 transcript level data is found in version 14. (In versions 1-13 transcript data contains only gene level not transcript level data)

changing to TPM expression

Original data source:

`/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_DepMap_18Q4_rsem_genes_tpm_20181029.txt` `/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_DepMap_18Q4_rsem_transcripts_tpm_20181029.gct` `/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_DepMap_18Q4_RNAseq_reads_20181029.gct` `/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_DepMap_18Q4_RNAseq_RPKM_20181029.gct`

TPM data is the primary expression data now. It is log2-transformed with a pseudo count of 1 added. The TPM data contains 4 cell lines not included in the RPKM data.

RPKM data are log2-transformed with a pseudo count of 1 added. RPKM values are no longer thresholded.

Reads/transcript files unaltered aside from formatting row / column names.

The ProteinCoding datasets contain just protein coding genes and are based on protein coding annotations from https://www.genenames.org/cgi-bin/download (locus_group == "protein-coding gene")

Rows: are DepMap (Arxspan) cell line IDs

Columns: In the complete TPM, TPM transcripts, RPKM and read datasets column names are HGNC_symbol (Ensembl_ID), while in the ProteinCoding TPM dataset column names are HGNC_symbol (Entrez_ID)

* Version 13-15 Internal 19Q1*

Version 15 contains the correct data sets for 19Q1 - 2 cell lines are removed

Version 14 contains the correct transcript level data for 18Q4

* Version 16 Internal 19Q2*

* Version 17 Internal 19Q3*

* Version 18 Internal 19Q4

Adding 93 new cell lines - Blacklisted
Some cells lines have been removed because they:

 - had too many 0 values = ["ACH-001388" "ACH-001577" "ACH-001767" "ACH-002463"] 

Some cells lines have been flagged as:

 - None
* Version 19 Internal 19Q4
removing blacklisted

* Version 20 Internal 19Q4
removing blacklisted in transcripts

* Version 21 Internal 19Q4
uploading as matrices 

data is aligned to hg38

read count data is created using RSEM, which, when a read maps to multiple places, splits the counts between genes, with the weight based on the likelihood that it came from one gene or the other, so counts data may not be integers

TPM data is log2-transformed with a pseudo count of 1 added. log2(X+1)

Reads/transcript files unaltered aside from formatting row / column names.

The ProteinCoding datasets contain just protein coding genes and are based on protein coding annotations from https://www.genenames.org/cgi-bin/download (locus_group == "protein-coding gene")

Rows: are DepMap cell line IDs Mapping between Broad IDs and CCLE IDs can be done using a R or python package

To install R implementation: options(repos = c("https://iwww.broadinstitute.org/~datasci/R-packages", "https://cran.cnr.berkeley.edu")) install.packages('celllinemapr')

To install python implementation: pip install https://intranet.broadinstitute.org/~datasci/python-packages/cell_line_mapper-0.1.9.tar.gz)

Columns: In the complete TPM and read counts datasets column names are HGNC_symbol (Ensembl_ID), while in the ProteinCoding TPM dataset column names are HGNC_symbol (Entrez_ID). In the TPM transcript dataset column names are HGNC_symbol (Transcript_ID) - the HGNC symbols are not unique, use the transcript IDs for unique identifiers.
""")

In [None]:
# (virtual file name, uploaded file name) pairs for the internal release.
internal_virtual_files = [
    ('CCLE_RNAseq_reads', 'internal_' + release + '_counts'),
    ('CCLE_expression_full', 'internal_' + release + '_tpm'),
    ('CCLE_expression', 'internal_' + release + '_proteincoding_tpm'),
    ('CCLE_RNAseq_transcripts', 'internal_' + release + '_transcripts_tpm'),
]

# Register the same files in 'depmap-a0ab' and then in the internal virtual dataset.
# Each call gets its own copy of the list, as in the two original calls.
for virtual_target in ('depmap-a0ab', virtual_internal):
    AddToVirtual(virtual_target, "depmap-rnaseq-expression-data-363a", files=list(internal_virtual_files))

### IBM

Same as the internal release.

### DMC

In [None]:
## DMC release: drop the IBM-embargoed lines from the current tables and write the files.
## The filtered tables replace the originals, so the public section starts from them.
dmc_prefix = 'temp/dmc_' + release

# Gene-level tables are filtered on their index labels.
counts_genes = counts_genes[~counts_genes.index.isin(rna_ibm_embargo)]
print(len(counts_genes))
counts_genes.to_csv(dmc_prefix + '_counts', sep='\t')

tpm_genes = tpm_genes[~tpm_genes.index.isin(rna_ibm_embargo)]
print(len(tpm_genes))
tpm_genes.to_csv(dmc_prefix + '_tpm', sep='\t')

tpm_proteincoding = tpm_proteincoding[~tpm_proteincoding.index.isin(rna_ibm_embargo)]
print(len(tpm_proteincoding))
tpm_proteincoding.to_csv(dmc_prefix + '_proteincoding_tpm', sep='\t')

# The transcript table is filtered on its column labels instead.
dmc_transcript_columns = [i for i in tpm_transcripts.columns if i not in rna_ibm_embargo]
tpm_transcripts = tpm_transcripts[dmc_transcript_columns]
print(len(tpm_transcripts.columns))
tpm_transcripts.to_csv(dmc_prefix + '_transcripts_tpm', sep='\t')

In [None]:
# Upload the four DMC files written above as a new version of the DMC taiga
# dataset, each declared as a tab-separated numeric matrix. The description
# string carries the dataset's version history and format notes.
tc.update_dataset(dataset_permaname="depmap-rnaseq-expression-data-80ef",
                 upload_file_path_dict={'temp/dmc_'+release+'_counts': 'NumericMatrixTSV',
                                       'temp/dmc_'+release+'_tpm': 'NumericMatrixTSV',
                                       'temp/dmc_'+release+'_proteincoding_tpm': 'NumericMatrixTSV',
                                       'temp/dmc_'+release+'_transcripts_tpm': 'NumericMatrixTSV'},
                  dataset_description=
"""
# DMC RNA

* Version 1-3 DMC 19Q1*

version 3 contains the correct data for 19Q1

version 2 contains correct TPM transcript data (in version 1 transcript data contains only gene level not transcript level data)

* Version 4 DMC 19Q2*

* Version 5 DMC 19Q3*

* Version 6 DMC 19Q4*

Adding 93 new cell lines - Blacklisted - IBM
Some cells lines have been removed because they:

 - had too many 0 values = ["ACH-001388" "ACH-001577" "ACH-001767" "ACH-002463"] 

Some cells lines have been flagged as:

 - None

* Version 7 DMC 19Q4

removing blacklisted

* Version 8 DMC 19Q4

removing blacklisted in transcripts

* Version 9 DMC 19Q4

uploading as numeric matrix

data is aligned to hg38

read count data is created using RSEM, which, when a read maps to multiple places, splits the counts between genes, with the weight based on the likelihood that it came from one gene or the other, so counts data may not be integers

TPM data is log2-transformed with a pseudo count of 1 added.

Reads/transcript files unaltered aside from formatting row / column names.

The ProteinCoding datasets contain just protein coding genes and are based on protein coding annotations from https://www.genenames.org/cgi-bin/download (locus_group == "protein-coding gene")

Rows: are DepMap cell line IDs

Columns: In the complete TPM and read counts datasets column names are HGNC_symbol (Ensembl_ID), while in the ProteinCoding TPM dataset column names are HGNC_symbol (Entrez_ID). In the TPM transcript dataset column names are HGNC_symbol (Transcript_ID) - the HGNC symbols are not unique, use the transcript IDs for unique identifiers.
""")
# Register the uploaded DMC files in the DMC virtual dataset under their virtual names.
dmc_virtual_files = [
    ('CCLE_RNAseq_reads', 'dmc_' + release + '_counts'),
    ('CCLE_expression_full', 'dmc_' + release + '_tpm'),
    ('CCLE_expression', 'dmc_' + release + '_proteincoding_tpm'),
    ('CCLE_RNAseq_transcripts', 'dmc_' + release + '_transcripts_tpm'),
]
AddToVirtual(virtual_dmc, "depmap-rnaseq-expression-data-80ef", files=dmc_virtual_files)

### public

In [None]:
prevprevcells

In [None]:
## Public release: keep only the lines listed in `prevprevcells`, then write the
## files with the DMC-embargoed lines removed.
## The kept tables replace the originals; the embargo filter is applied only to
## what is written to disk.
public_prefix = 'temp/public_' + release

# Gene-level tables are filtered on their index labels.
counts_genes = counts_genes[counts_genes.index.isin(prevprevcells)]
print(len(counts_genes))
counts_genes[~counts_genes.index.isin(rna_dmc_embargo)].to_csv(public_prefix + '_counts', sep='\t')

tpm_genes = tpm_genes[tpm_genes.index.isin(prevprevcells)]
print(len(tpm_genes))
tpm_genes[~tpm_genes.index.isin(rna_dmc_embargo)].to_csv(public_prefix + '_tpm', sep='\t')

tpm_proteincoding = tpm_proteincoding[tpm_proteincoding.index.isin(prevprevcells)]
print(len(tpm_proteincoding))
tpm_proteincoding[~tpm_proteincoding.index.isin(rna_dmc_embargo)].to_csv(public_prefix + '_proteincoding_tpm', sep='\t')

# The transcript table is filtered on its column labels. The allowed labels are
# built once as a set; the previous code rebuilt `list(prevprevcells) + ['gene_id']`
# for every column and scanned it linearly.
public_transcript_labels = set(prevprevcells) | {'gene_id'}
tpm_transcripts = tpm_transcripts[[i for i in tpm_transcripts.columns if i in public_transcript_labels]]
print(len(tpm_transcripts.columns))
tpm_transcripts[[i for i in tpm_transcripts.columns if i not in rna_dmc_embargo]].to_csv(public_prefix + '_transcripts_tpm', sep='\t')

In [None]:
# Upload the four public files written above as a new version of the public
# taiga dataset, each declared as a tab-separated numeric matrix. The description
# string carries the dataset's version history and format notes.
tc.update_dataset(dataset_permaname="depmap-rnaseq-expression-data-ccd0",
                 upload_file_path_dict={'temp/public_'+release+'_counts': 'NumericMatrixTSV',
                                       'temp/public_'+release+'_tpm': 'NumericMatrixTSV',
                                       'temp/public_'+release+'_proteincoding_tpm': 'NumericMatrixTSV',
                                       'temp/public_'+release+'_transcripts_tpm': 'NumericMatrixTSV'},
                  dataset_description=
"""
# PUBLIC RNA

* Version 1-2 Public 18Q1*

Original source (`CCLE_DepMap_18Q1_RNAseq_reads_20180214.gct`, `CCLE_DepMap_18Q1_RNAseq_RPKM_20180214.gct`) downloaded from portals.broadinstitute.org/ccle
RPKM file is log2(RPKM) with a "noisy floor" around -3 (-3 + N(0, 0.1))

* Version 3-5 Public 18Q2*

gene expression data (RNAseq for1,076 cell lines, including data for 28 newly released cell lines)

original source: (`/xchip/ccle_dist/public/DepMap_18Q2/CCLE_DepMap_18Q2_RNAseq_RPKM_20180502.gct`, `/xchip/ccle_dist/public/DepMap_18Q2/CCLE_DepMap_18Q2_RNAseq_reads_20180502.gct`)
* Version 6-7 Public 18Q3*

gene expression data (80 newly released cell lines)

Original data source:

`/xchip/ccle_dist/public/DepMap_18Q3/CCLE_DepMap_18q3_RNAseq_reads_20180718.gct`
`/xchip/ccle_dist/public/DepMap_18Q3/CCLE_DepMap_18q3_RNAseq_RPKM_20180718.gct`

RPKM data are log2-transformed with a noisy floor at -3 (-3 + N(0, 0.1)). Reads file unaltered aside from formatting row / column names.

The ProteinCoding datasets contain just protein coding genes and are based on protein coding annotations from https://www.genenames.org/cgi-bin/download (locus_group == "protein-coding gene")

Rows: are Broad (Arxspan) cell line IDs. Mapping between Broad IDs and CCLE IDs can be done using a R or python package

To install R implementation: options(repos = c("https://iwww.broadinstitute.org/~datasci/R-packages", "https://cran.cnr.berkeley.edu")) install.packages('celllinemapr')

To install python implementation: pip install https://intranet.broadinstitute.org/~datasci/python-packages/cell_line_mapper-0.1.9.tar.gz)

Columns: In the complete RPKM and read datasets column names are HGNC_symbol (Ensembl_ID), while in the ProteinCoding RPKM and read datasets column names are HGNC_symbol (Entrez_ID)

version 7 removes duplicate gene names from the protein coding datasets

* Version 8-9 Public 18Q4*

_ 18Q4 transcript level data is found in version 11. (In versions 8-9 transcript data contains only gene level not transcript level data)

changing to TPM expression

Original data source:

`/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_DepMap_18Q4_rsem_genes_tpm_20181029.txt`
`/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_DepMap_18Q4_rsem_transcripts_tpm_20181029.gct`
`/xchip/ccle_dist/public/DepMap_18Q4/CCLE_DepMap_18q4_RNAseq_reads_20181029.gct`

`/xchip/ccle_dist/public/DepMap_18Q4/CCLE_DepMap_18q4_RNAseq_RPKM_20181029.gct`

TPM data is subsetted to just public cell lines using the cell line found in the RPKM dataset.

TPM data is the primary expression data now. It is log2-transformed with a pseudo count of 1 added

RPKM data are log2-transformed with a pseudo count of 1 added. RPKM values are no longer thresholded.

Reads/transcript files unaltered aside from formatting row / column names.

The ProteinCoding datasets contain just protein coding genes and are based on protein coding annotations from https://www.genenames.org/cgi-bin/download (locus_group == "protein-coding gene")

Rows: are DepMap (Arxspan) cell line IDs

Columns: In the complete TPM, TPM transcripts, RPKM and read datasets column names are HGNC_symbol (Ensembl_ID), while in the ProteinCoding TPM and read datasets column names are HGNC_symbol (Entrez_ID)

* Version 10-12 Public 19Q1*

version 12 contains the correct data for 19Q1

version 11 contains the correct transcript level data for 19Q1 and 18Q4

* Version 13 Public 19Q2*

* Version 14 Public 19Q3*

* Version 15 Public 19Q4*
Adding 93 new cell lines - Blacklisted - IBM - DMC
Some cells lines have been removed because they:

 - had too many 0 values = ["ACH-001388" "ACH-001577" "ACH-001767" "ACH-002463"] 

Some cells lines have been flagged as:

 - None
 
* Version 16 Public 19Q4*
removing unauthorized cell lines


data is hg38 aligned

read count data is created using RSEM, which, when a read maps to multiple places, splits the counts between genes, with the weight based on the likelihood that it came from one gene or the other, so counts data may not be integers

TPM data is log2-transformed with a pseudo count of 1 added. log2(X+1)

Reads/transcript files unaltered aside from formatting row / column names.

The ProteinCoding datasets contain just protein coding genes and are based on protein coding annotations from https://www.genenames.org/cgi-bin/download (locus_group == "protein-coding gene")

Rows: are DepMap cell line IDs

Columns: In the complete TPM and read counts datasets column names are HGNC_symbol (Ensembl_ID), while in the ProteinCoding TPM dataset column names are HGNC_symbol (Entrez_ID). In the TPM transcript dataset column names are HGNC_symbol (Transcript_ID) - the HGNC symbols are not unique, use the transcript IDs for unique identifiers.
""")
# Register the four public expression files of "depmap-rnaseq-expression-data-ccd0" in the
# public virtual dataset.
# NOTE(review): each tuple looks like (file name in the virtual dataset, file name in the
# source dataset) — confirm against the AddToVirtual helper.
AddToVirtual(virtual_public, "depmap-rnaseq-expression-data-ccd0", files=[('CCLE_RNAseq_reads', 'public_'+release+'_counts'),('CCLE_expression_full', 'public_'+release+'_tpm'),('CCLE_expression', 'public_'+release+'_proteincoding_tpm'),('CCLE_RNAseq_transcripts', 'public_'+release+'_transcripts_tpm')])

## Fusion post processing

In [None]:
# Wait on the Terra submission `submission_id2` in the reference workspace.
# NOTE(review): within this section `submission_id2` is only assigned in the NEXT cell, so on a
# fresh kernel this cell depends on a value set earlier in the notebook (not visible here) —
# confirm it is defined before this point or run the next cell first.
terra.waitForSubmission(refworkspace, submission_id2)

In [None]:
# Submit the "Aggregate_Fusion_Calls" configuration on the 'All_samples' entity of the
# reference workspace, then wait on that submission before reading its output.
submission_id2 = refwm.create_submission("Aggregate_Fusion_Calls", 'All_samples')
terra.waitForSubmission(refworkspace, submission_id2)

In [None]:
# Value of the 'fusions_star' attribute of the 'All_samples' sample set.
# presumably a gs:// path to the aggregated fusion table (it is handed to `gsutil cp` in the next cell).
aggregated = refwm.get_sample_sets().loc['All_samples']['fusions_star']

In [None]:
# Copy the aggregated fusion file locally; `$aggregated` is expanded by IPython from the Python variable.
! gsutil cp $aggregated "temp/expression.fusion.tsv"

In [None]:
# Normalise the sample identifiers of the aggregated fusion table and, when several
# versions of the same line exist, keep only the later one. The file is rewritten in place.
fusions = pd.read_csv('temp/expression.fusion.tsv', sep='\t')

# 1) keep only what precedes the first '.' in each identifier
fusions['DepMap_ID'] = [name.split('.')[0] for name in fusions.DepMap_ID]

# 2) strip a leading source prefix ("dm_", "ccle_", "ibm_") when there is one
unprefixed = []
for name in fusions.DepMap_ID:
    pieces = name.split('_')
    unprefixed.append(pieces[1] if pieces[0] in ['dm','ccle','ibm'] else name)
fusions['DepMap_ID'] = unprefixed

# 3) walk the sorted unique identifiers pairwise: when the following identifier ends in a
#    one-character version suffix greater than 1 and shares the same base name, the
#    preceding identifier is scheduled for removal
a = sorted(set(fusions['DepMap_ID']))
todrop = []
for earlier, later in zip(a, a[1:]):
    later_parts = later.split('_')
    if len(later_parts[-1]) == 1 and int(later_parts[-1]) > 1 and later_parts[0] == earlier.split('_')[0]:
        todrop.append(earlier)
        print(earlier)
        print(later_parts)

# 4) drop the superseded rows, then reduce every identifier to its base name
fusions = fusions.set_index('DepMap_ID').drop(todrop).reset_index()
fusions['DepMap_ID'] = [name.split('_')[0] for name in fusions.DepMap_ID]

# the index is written as the first (unnamed) column, as before
fusions.to_csv('temp/expression.fusion.tsv',sep='\t')

### Overview

This document contains the code used to generate the unfiltered and filtered versions of the fusion datasets for the release. The bottom of the document also contains some comparisons between the release fusion dataset, CCLE2 fusion calls, and the translocation data from CCLE2.

In [None]:
%%R
source('../gkugener/RScripts/load_libraries_and_annotations.R')
source("src/CCLE_postp_function.R")
library('cdsomics')

## Generate filtered fusion table

Release: `r release`

We want to apply filters to the fusion table to reduce the number of artifacts in the dataset. Specifically, we filter the following:

* Remove fusions involving mitochondrial chromosomes, or HLA genes, or immunoglobulin genes
* Remove red herring fusions (from STAR-Fusion annotations column)
* Remove fusions that are recurrent in CCLE (>= 25 samples)
* Remove fusions with (SpliceType="INCL_NON_REF_SPLICE" and LargeAnchorSupport="No" and FFPM < 0.1)
* Remove fusions with FFPM < 0.05 (STAR-Fusion suggests using 0.1, but looking at the translocation data, this looks like it might be too aggressive)

In [None]:
%%R
# Load the cleaned fusion table written by the preceding Python cell and derive the
# filtered table from it. readFusions / filterFusions are not defined in this notebook
# (presumably they come from the R files sourced above — confirm).
unfiltered_fusions <- readFusions('temp/expression.fusion.tsv')
filtered_fusions <- filterFusions(unfiltered_fusions)

# Reformat both tables for upload with the same helper.
filtered_fusions <- prepare_depmap_fusion_data_for_taiga(filtered_fusions)
unfiltered_fusions <- prepare_depmap_fusion_data_for_taiga(unfiltered_fusions)

# Drop the first column of each table.
# NOTE(review): presumably this is the unnamed index column that pandas wrote into the TSV —
# confirm that prepare_depmap_fusion_data_for_taiga keeps it in first position.
filtered_fusions <- filtered_fusions[,2:ncol(filtered_fusions)]
unfiltered_fusions <- unfiltered_fusions[,2:ncol(unfiltered_fusions)]

In [None]:
%%R
# Persist both fusion tables under temp/ as tab-separated files without quoting or
# row names; these are the files uploaded to taiga further down.
unfiltered_path <- paste0('temp/fusions.',release, '.unfiltered.tsv')
filtered_path <- paste0('temp/fusions.', release, '.filtered.tsv')

write.table(unfiltered_fusions, file = unfiltered_path, sep = '\t', quote = FALSE, row.names = FALSE)
write.table(filtered_fusions, file = filtered_path, sep = '\t', quote = FALSE, row.names = FALSE)

# Validation

## Validation Protocol:

To validate fusions, list all cell lines with known fusions (e.g. Ewing sarcoma lines) and check, for each new cell line in this set of known-fusion cells, whether the expected fusion is present. The quality of the fusion calls can be validated this way.

In [None]:
# Compare the cell lines found in the unfiltered fusion file with `prev`
# (`prev` is defined outside this section): prints lines missing from the new file,
# then lines only present in the new file.
# NOTE(review): the identifier is taken from the part after the first '_' (first 10
# characters); this raises IndexError if DepMap_ID contains no underscore — confirm the
# format produced by the R preparation step.
df = pd.read_csv('temp/fusions.'+release+'.unfiltered.tsv', sep='\t')
new = {name.split('_')[1][:10] for name in set(df["DepMap_ID"].tolist())}
missing_from_new = prev - new
only_in_new = new-prev
print(missing_from_new, only_in_new)

In [None]:
# Load the two tables written by the R cell; `unfiltered` and `filtered` are progressively
# narrowed (and re-assigned) by the Internal / DMC / Public cells below, so re-run this cell
# before re-running any of them.
unfiltered = pd.read_csv('temp/fusions.'+release+'.unfiltered.tsv', sep='\t')
filtered = pd.read_csv('temp/fusions.'+ release+ '.filtered.tsv',sep='\t')

In [None]:
# Fetch the previous release's filtered / unfiltered fusion files from the
# 'depmap-fusions-7990' Taiga dataset (file names are built from `prevname`).
prevfiltered = tc.get(name='depmap-fusions-7990', file="fusions."+prevname+".filtered")
prevunfiltered = tc.get(name='depmap-fusions-7990', file='fusions.'+prevname+'.unfiltered')

# Uploading to Taiga

In [None]:
# Upload the release's filtered and unfiltered fusion TSVs to the "depmap-fusions-7990"
# Taiga dataset, together with the description below.
# NOTE(review): the "Internal 19Q4" part of the description talks about copy ratio, CN plots
# and "Gene level CN data", and leaves its Rows/Columns entries empty — it looks like template
# text from a copy-number upload rather than a description of fusion data. Review before publishing.
tc.update_dataset(dataset_permaname="depmap-fusions-7990",
                     upload_file_path_dict={'temp/fusions.'+release+'.filtered.tsv': 'TableTSV',
                                        'temp/fusions.'+release+'.unfiltered.tsv': 'TableTSV'},
                 dataset_description="""
# Fusions

filtered and unfiltered fusion files from Broad RNAseq data mapped to hg38

PORTAL TEAM SHOULD NOT USE THIS: There are lines here that should not make it even to internal. Must use subsetted dataset instead. These data will not make it on the portal starting 19Q1. With the DMC portal, there is new cell line release prioritization as to which lines can be included, so a new taiga dataset will be created containing CN for the portal.

version 1-4: guillaume releases
version 5: 19Q3 release
version 6:  adding missing samples in Terra merge files
version 7: 19Q4 new release.

## ** Internal 19Q4****

Adding 17 new cell lines, 3 reprioritized cell lines. log2(COPY RATIO+1). 
Some cells lines have been flagged as:

 - having bad looking copy ration plots = 
 - Genes having a similar CN value accross all []

transcriptoins (Transcripts rpkm):

genes (gene rpkm):
__Rows__:
__Columns__:
Counts (gene counts):
__Rows__:
__Columns__:
Gene level CN data:
__Rows__:
__Columns__:
 DepMap cell line IDs
 gene names in the format HGNC\_symbol (Entrez\_ID)
DepMap\_ID, Chromosome, Start, End, Num\_Probes, Segment\_Mean
 """)

## Internal

In [None]:
# Display the filtered fusion table (visual sanity check before subsetting).
filtered

In [None]:
## Internal dataset: drop every row whose DepMap_ID is in `blacklist`, then write both
## tables (with their index) as tab-separated files under temp/.
## `filtered` / `unfiltered` are re-assigned here, so later cells start from this subset.
internal_filtered_rows = ~filtered.DepMap_ID.isin(blacklist)
filtered = filtered.loc[internal_filtered_rows]
filtered.to_csv('temp/filtered_fusions_'+release, sep='\t')

internal_unfiltered_rows = ~unfiltered.DepMap_ID.isin(blacklist)
unfiltered = unfiltered.loc[internal_unfiltered_rows]
unfiltered.to_csv('temp/unfiltered_fusions_'+release, sep='\t')

In [None]:
# Upload the internal filtered / unfiltered fusion files written by the previous cell to the
# "gene-fusions-8b7a" Taiga dataset, with the description below.
# NOTE(review): the version history in the description stops at "Version 8 Internal 19Q3";
# there is no entry for the current release — confirm whether one should be added.
tc.update_dataset(dataset_permaname="gene-fusions-8b7a",
                 upload_file_path_dict={'temp/filtered_fusions_'+release: 'TableTSV',
                                       'temp/unfiltered_fusions_'+release: 'TableTSV'},
                  dataset_description=
"""
# Internal Fusions

Description: Gene fusions derived from RNAseq data.

Rows: cell lines

Original Raw Data: Generated by Mahmoud Ghandi on April 25, 2017. Can be found at xchip_ccle_dist/broad_only/unpublished_Novartis_data/RNAseq/fusions.txt

Version 3: added a column containing the Broad_ID

* Version 4-5 Internal 19Q1*

version 5 contains the correct data for 19Q1

Description: Gene fusions derived from RNAseq data.

Rows: cell lines

* Version 6 Internal 19Q2*

* Version 7*

Josh D added "common_fusion_matrix".

Binary matrix of the most common gene fusions (those where the two involved genes are fused in at least 5 cell lines) with no additional filtering. Use at your own risk.

Description: Gene fusions derived from RNAseq data.

Rows: cell lines

* Version 8 Internal 19Q3*

Unfiltered data contains all output fusions, while the filtered data uses the following filters:

Removing fusion involving mitochondrial chromosomes or HLA genes
Removed common false positive fusions (red herring annotations as described in the STAR-Fusion docs)
Recurrent fusions observed in CCLE across cell lines (in 25 or more samples)
Removed fusions where SpliceType='INCL_NON_REF_SPLICE' and LargeAnchorSupport='NO_LDAS' and FFPM < 0.1
FFPM < 0.05
Columns:

LeftGene and RightGene separated by an ampersand ("&").

Rows:

DepMap_IDs
""")
# Register the uploaded internal fusion files in the two virtual datasets.
# Each tuple pairs a virtual file name with a file name in "gene-fusions-8b7a". The pairing
# was swapped before (the *_unfiltered virtual file pointed at the filtered upload and
# 'CCLE_fusions' at the unfiltered one); each virtual name now points at the matching file.
AddToVirtual('depmap-a0ab', "gene-fusions-8b7a", files=[('CCLE_fusions_unfiltered', 'unfiltered_fusions_'+release),('CCLE_fusions', 'filtered_fusions_'+release)])

AddToVirtual(virtual_internal, "gene-fusions-8b7a", files=[('CCLE_fusions_unfiltered', 'unfiltered_fusions_'+release),('CCLE_fusions', 'filtered_fusions_'+release)])

## IBM

same as internal

## DMC

In [None]:
## DMC dataset: additionally drop every row whose DepMap_ID is in `rna_ibm_embargo`.
## The tables were already re-assigned by the Internal cell, so the filtering is cumulative.
## Both tables are written (with their index) as tab-separated files under temp/.
dmc_filtered_rows = ~filtered.DepMap_ID.isin(rna_ibm_embargo)
filtered = filtered.loc[dmc_filtered_rows]
filtered.to_csv('temp/filtered_fusions_'+release, sep='\t')

dmc_unfiltered_rows = ~unfiltered.DepMap_ID.isin(rna_ibm_embargo)
unfiltered = unfiltered.loc[dmc_unfiltered_rows]
unfiltered.to_csv('temp/unfiltered_fusions_'+release, sep='\t')

In [None]:
# Upload the DMC filtered / unfiltered fusion files written by the previous cell to the
# "gene-fusions-375f" Taiga dataset, with the description below.
# NOTE(review): the description ends mid-sentence ("Removing fusion involving mitoch") and its
# version history stops at "Version 4 DMC 19Q3" — it looks truncated; review before publishing.
tc.update_dataset(dataset_permaname="gene-fusions-375f",
                 upload_file_path_dict={'temp/filtered_fusions_'+release: 'TableTSV',
                                       'temp/unfiltered_fusions_'+release: 'TableTSV'},
                  dataset_description=
"""
# DMC Fusions

* Version 1-2 DMC 19Q1*

version 2 contains the correct data for 19Q1

Description: Gene fusions derived from RNAseq data.

Rows: cell lines

Unfiltered data contains all output fusions, while the filtered data uses the filters suggested by the star fusion docs. These filters are:
- FFPM > 0.1 -  a cutoff of 0.1 means&nbsp;at least 1 fusion-supporting RNAseq fragment per 10M total reads
- Remove known false positives, such as GTEx recurrent fusions and certain paralogs
- Genes that are next to each other
- Fusions with mitochondrial breakpoints

* Version 3 DMC 19Q2*

* Version 4 DMC 19Q3*

Description: Gene fusions derived from RNAseq data.

Rows: cell lines

Unfiltered data contains all output fusions. Filtered data was generated by performing the following:

Removing fusion involving mitoch
""")
# Register the uploaded DMC fusion files in the DMC virtual dataset.
# The pairing was swapped before ('CCLE_fusions_unfiltered' pointed at the filtered upload and
# 'CCLE_fusions' at the unfiltered one); each virtual name now points at the matching file.
AddToVirtual(virtual_dmc, "gene-fusions-375f", files=[('CCLE_fusions_unfiltered', 'unfiltered_fusions_'+release),('CCLE_fusions', 'filtered_fusions_'+release)])

## Public

In [None]:
# Number of rows left in the unfiltered table before the public subsetting below.
len(unfiltered)

In [None]:
## Public dataset: keep only rows whose DepMap_ID is in `prevprevcells` (re-assigning the
## tables), then write them without the rows listed in `rna_dmc_embargo`.
## The embargo removal is applied only to what is written; the variables keep those rows.
public_filtered_rows = filtered.DepMap_ID.isin(prevprevcells)
filtered = filtered.loc[public_filtered_rows]
filtered.loc[~filtered.DepMap_ID.isin(rna_dmc_embargo)].to_csv('temp/filtered_fusions_'+release, sep='\t')

public_unfiltered_rows = unfiltered.DepMap_ID.isin(prevprevcells)
unfiltered = unfiltered.loc[public_unfiltered_rows]
unfiltered.loc[~unfiltered.DepMap_ID.isin(rna_dmc_embargo)].to_csv('temp/unfiltered_fusions_'+release, sep='\t')

In [None]:
# Upload the public filtered / unfiltered fusion files written by the previous cell to the
# "gene-fusions-6212" Taiga dataset, with the description below.
# NOTE(review): the version history in the description stops at "Version 6 Public 19Q3";
# there is no entry for the current release — confirm whether one should be added.
tc.update_dataset(dataset_permaname="gene-fusions-6212",
                 upload_file_path_dict={'temp/filtered_fusions_'+release: 'TableTSV',
                                       'temp/unfiltered_fusions_'+release: 'TableTSV'},
                  dataset_description=
"""
# PUBLIC Fusions

* Version 1 Public 2017 data*

Description: Gene fusions derived from RNAseq data.

Rows: cell lines, ID contained in the column Broad_ID

Original Raw Data: Generated by Mahmoud Ghandi on April 25, 2017. Can be found at xchip_ccle_dist/broad_only/unpublished_Novartis_data/RNAseq/fusions.txt

* Version 2-3 Public 19Q1*

version 3 contains the correct data for 19Q1

* Version 4-5 Public 19Q2*

in version 5 formatting of the columns is improved

* Version 6 Public 19Q3*

Description: Gene fusions derived from RNAseq data.

Rows: cell lines, IDs contained in the column DepMap_ID

Unfiltered data contains all output fusions, while the filtered data uses the filters suggested by the star fusion docs. These filters are:
- FFPM > 0.1 -  a cutoff of 0.1 means&nbsp;at least 1 fusion-supporting RNAseq fragment per 10M total reads
- Remove known false positives, such as GTEx recurrent fusions and certain paralogs
- Genes that are next to each other
- Fusions with mitochondrial breakpoints

""")
# Register the uploaded public fusion files in the public virtual dataset.
# The pairing was swapped before ('CCLE_fusions_unfiltered' pointed at the filtered upload and
# 'CCLE_fusions' at the unfiltered one); each virtual name now points at the matching file.
AddToVirtual(virtual_public, "gene-fusions-6212", files=[('CCLE_fusions_unfiltered', 'unfiltered_fusions_'+release),('CCLE_fusions', 'filtered_fusions_'+release)])

# [Additional] If you want to merge here instead of on Terra:

In [None]:
# Display `newsamples` (built by createDatasetWithNewCellLines at the top of the notebook).
newsamples

In [None]:
prevsamplesets = ['CCLE_19Q3interim',samplesetname]
samples = []
for i in prevsamplesets:
    samples.extend(refwm.get_sample_sets().loc[i].samples)
res = []
terrasamp = refwm.get_samples()
for i, sample in enumerate(samples):
    res.append(terrasamp.loc[sample])
    genes_fusion = res[i]['fusion_predictions_abridged']
    rsem_genes_transcripts = res[i]['rsem_isoforms']
    rsem_genes_expected_count = res[i]['rsem_genes']
    ! gsutil cp $rsem_genes_expected_count 'temp/' && gsutil cp $rsem_genes_transcripts 'temp/' && gsutil cp $genes_fusion 'temp/'

In [None]:
# Read the aggregated outputs recorded on the 'DM19Q2_PATHS_CORRECTED_V2' sample set and copy
# them into temp/ (the fusion file is saved as temp/expression.fusion.tsv, overwriting any
# existing copy).
# NOTE(review): the sample-set name is hard-coded to a 19Q2 set — confirm it is still the
# intended "main" set for the current release.
mainres = refwm.get_sample_sets().loc['DM19Q2_PATHS_CORRECTED_V2']
maingenes_fusion = mainres['fusions_star']
mainrsem_genes_tpm = mainres['rsem_genes_tpm']
mainrsem_genes_transcripts = mainres['rsem_transcripts_tpm']
mainrsem_genes_expected_count = mainres['rsem_genes_expected_count']
# `$name` below is expanded by IPython from the Python variables assigned above.
! gsutil cp $mainrsem_genes_expected_count "temp/" && gsutil cp $mainrsem_genes_transcripts "temp/" && gsutil cp $maingenes_fusion "temp/expression.fusion.tsv" && gsutil cp $mainrsem_genes_tpm "temp/"

In [None]:
# Display the 'rsem_genes_expected_count' attribute of the main sample set.
mainres['rsem_genes_expected_count']

In [None]:
# Hand the per-sample records (`res`) and the main sample-set record to the RSEM merge helper.
# presumably it appends the per-sample RSEM files to the main files downloaded above — confirm in the helper.
addSamplesRSEMToMain(res,mainres)

In [None]:
# Build the local paths (temp/<file name>) of the per-sample abridged fusion files copied
# earlier, then pass them with the main fusion table to addToMainFusion.
genes_fusion = []
for val in res:
    remote_path = val['fusion_predictions_abridged']
    file_name = remote_path.split('/')[-1]
    genes_fusion.append('temp/'+file_name)
addToMainFusion(genes_fusion,'temp/expression.fusion.tsv')