Loading the necessary packages

In [None]:
from __future__ import print_function

import asyncio
import re
from collections import Counter

import dalmatian as dm
import pandas as pd
import seaborn as sns
from bokeh.plotting import output_notebook
from gsheets import Sheets
from taigapy import TaigaClient

from depmapomics import tracker, loading, fusion
from depmapomics import postprocess_expression as postrna
from depmapomics.qc import utils as myQC
from genepy import terra
from genepy.utils import helper as h
from genepy import rna

# Reload edited modules automatically before each cell execution,
# so changes to the imported project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Taiga client used further down by the tc.get(...) and tc.update_dataset(...) calls.
tc = TaigaClient()
# Configure bokeh to render its plots inside the notebook.
output_notebook()

immutable parameters (user specific)

In [None]:
# GENERAL PARAMS

# isCCLE gates the CCLE-specific cells; doCleanup gates the cells that delete files.
isCCLE = True
doCleanup = True
# Name of the Terra sample set for this run; also reused as the release name in output paths.
samplesetname="21Q2"
release = samplesetname

## cutoff date passed as `maxage` to loading.loadRNA
## (used to decide whether a sample was already loaded in a previous release)
maxage = '2020-11-01'

## genomic annotations (GENCODE v35)
gencode = 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_35/gencode.v35.annotation.gff3.gz'
# NOTE(review): presumably maps sample-type codes to labels; not used in the cells visible here -- confirm.
replace = {'T': 'Tumor', 'N': 'Normal', 'm': 'Unknown', 'L': 'Unknown'}

# Terra workflow names
RNAmethods = ['']

# fusion data: name of the sample-id column and of the recurrence-count column in the fusion table
fusionSamplecol = "DepMap_ID"
fusionCountCol = "CCLE_count"

## Ensembl archive server (version 102), passed to postrna.generateGeneNames
ensemblserver = "http://nov2020.archive.ensembl.org/biomart" 
datatype = 'rna'

# USER SPECIFIC

# Paths handed to gsheets' Sheets.from_files (client secret and token storage).
my_id = '~/.client_secret.json'
mystorage_id = "~/.storage.json"

## do the first steps of https://medium.com/craftsmenltd/from-csv-to-google-sheet-using-python-ef097cb014f9
## to obtain the credentials file below
creds = '../.credentials.json'
sheets = Sheets.from_files(my_id, mystorage_id)

## lines that have issues (passed as `toraise` to loading.loadRNA)
toraise = ["ACH-001195"]
# Sample ids that failed QC in earlier releases; they are appended to this release's failed list.
previousQCfail = ['CDS-12DTEw', 'CDS-9hv1zM', 'CDS-A6GSeQ', 'CDS-aWlMRt', 'CDS-B1ywOH', 'CDS-BixxtG', 'CDS-DRM3l2', 'CDS-jOlYT4', 'CDS-KMhiT9', 'CDS-M6mnMA', 'CDS-pYwECX', 'CDS-v6E624', 'CDS-vxTqNJ', 'CDS-YxtmkI',"CDS-fk564T","CDS-kU30H5","CDS-G0F5f5","CDS-ABH0uZ"]

# Sample attribute columns whose values (and files) are deleted by the cleanup cell.
colstoclean = ['fastq1', 'fastq2','recalibrated_bam','recalibrated_bam_index']

# CCLE SPECIFIC

## old GP storage buckets
workspace2="broad-firecloud-ccle/CCLE_DepMap_RNAseq"
workspace4="broad-genomics-delivery/Cancer_Cell_Line_Factory_CCLF_RNAseq"
workspace5="nci-mimoun-bi-org/CCLF_RNA_2_0"
workspace3="broad-genomics-delivery/CCLE_DepMap_RNAseq"
workspace1="broad-genomics-delivery/Getz_IBM_CellLines_RNASeqData"

## current GP buckets (the only ones passed to loading.loadRNA in this notebook)
workspace6="terra-broad-cancer-prod/CCLE_DepMap_RNAseq"
workspace7="terra-broad-cancer-prod/Getz_IBM_CellLines_RNASeqData"

## and their corresponding sample sources (sourceN belongs to workspaceN)
source1="ibm"
source2="ccle"
source3="ccle"
source4="cclf"
source5="cclf"

source6="ccle"
source7="ibm"

## our working workspace (reference)
refworkspace="broad-firecloud-ccle/DepMap_hg38_RNAseq"

## info/metadata google spreadsheets (info about cell lines)
refsheet_url = "https://docs.google.com/spreadsheets/d/1Pgb5fIClGnErEqzxpU7qqX6ULpGTDjvzWwDN8XUJKIY"
privacy_release_url = "https://docs.google.com/spreadsheets/d/115TUgA1t_mD32SnWAGpW9OKmJ2W5WYAOs3SuSdedpX4"
depmap_pv = "https://docs.google.com/spreadsheets/d/1uqCOos-T9EMQU7y2ZUw4Nm84opU5fIT1y7jet1vnScE"
depmap_taiga = "arxspan-cell-line-export-f808"

## name of the sample tracker sheet
sampletrackername='ccle sample tracker'
## values we need to rename from the GP workspaces
extract_to_change = {'from_arxspan_id': 'participant'}

## things to match to from the GP workspaces
match = ['ACH-','CDS-']

## taiga files to test against: key in `files` -> file name in the previous Taiga release
tocompare = {"genes_expected_count":"CCLE_RNAseq_reads", "genes_tpm":"CCLE_expression_full", "proteincoding_genes_tpm":"CCLE_expression"}


# Generate sample set from new samples

we retrieve all the samples we can find from the GP workspaces

__CCLE specific__

In [None]:
if isCCLE:
    print("loading new RNAseq data")
    # Gather the loader options first so the call itself stays readable.
    rna_loading_options = dict(
        workspaces=[workspace6, workspace7],
        sources=["ccle", "ibm"],
        maxage=maxage,
        baits='polyA',
        stype="rna",
        toraise=toraise,
    )
    samples = loading.loadRNA(samplesetname, **rna_loading_options)

In [None]:
if isCCLE:
    print("uploading samples to the tracker and Terra")
    # Push the newly loaded samples to the sample tracker sheet and to the Terra reference workspace.
    # `refworkspace` has to be passed by keyword: a positional argument after keyword
    # arguments (as originally written) is a SyntaxError and the cell could not run.
    # NOTE(review): assumes the parameter is named `refworkspace` in loading.update -- confirm.
    loading.update(samples, samplesetname, stype="rna", bucket="",
          refworkspace=refworkspace,
          name_col="index", values=['legacy_bam_filepath', 'legacy_bai_filepath'],
          filetypes=['bam', 'bai'],
          my_id=my_id,
          mystorage_id=mystorage_id,
          creds=creds,
          sampletrackername=sampletrackername, refsheet_url=refsheet_url)

# run the pipeline

We use Dalmatian to send requests to Terra, where we run a set of 6 tasks to generate the expression/fusion dataset:

We use the GTEx pipeline ([https://github.com/broadinstitute/gtex-pipeline/blob/v9/TOPMed_RNAseq_pipeline.md](https://github.com/broadinstitute/gtex-pipeline/blob/v9/TOPMed_RNAseq_pipeline.md)).

To generate the expression dataset, run the following tasks on all samples that you need, in this order:



*   samtofastq_v1-0_BETA_cfg 

    (broadinstitute_gtex/samtofastq_v1-0_BETA Snapshot ID: 5)

*   star_v1-0_BETA_cfg

(broadinstitute_gtex/star_v1-0_BETA Snapshot ID: 7)



*   rsem_v1-0_BETA_cfg 

    (broadinstitute_gtex/rsem_v1-0_BETA Snapshot ID: 4)

*   rsem_aggregate_results_v1-0_BETA_cfg (broadinstitute_gtex/rsem_aggregate_results_v1-0_BETA Snapshot ID: 3)

The outputs to be downloaded will be saved under the sample set that you ran. The outputs we use for the release are:



*   rsem_genes_expected_count
*   rsem_genes_tpm
*   rsem_transcripts_tpm

**Make sure that you delete the intermediate files. These files are quite large, so they cost a lot to store. To delete them, you can either write a task that deletes them or use `gsutil rm`.**


##### Fusions {#fusions}

We use STAR-Fusion [https://github.com/STAR-Fusion/STAR-Fusion/wiki](https://github.com/STAR-Fusion/STAR-Fusion/wiki). The fusions are generated by running the following tasks



*   hg38_STAR_fusion (gkugener/STAR_fusion Snapshot ID: 14)
*   Aggregate_Fusion_Calls (gkugener/Aggregate_files_set Snapshot ID: 2)

The outputs to be downloaded will be saved under the sample set you ran. The outputs we use for the release are: 



*   fusions_star

This task uses the same samtofastq_v1-0_BETA_cfg task as in the expression pipeline, although in the current implementation, this task will be run twice. It might be worth combining the expression and fusion calling into a single workflow. This task also contains a flag that lets you specify whether you want to delete the intermediates (fastqs).

There are several other tasks in this workspace. In brief:



*   Tasks prefixed with **EXPENSIVE** or **CHEAP** are identical to their non-prefixed version, except that they specify different memory, disk space, etc. parameters. These versions can be used when samples fail the normal version of the task due to memory errors.
*   The following tasks are part of the GTEx pipeline but we do not use them (we use RSEM exclusively): markduplicates_v1-0_BETA_cfg (broadinstitute_gtex/markduplicates_v1-0_BETA Snapshot ID: 2), rnaseqc2_v1-0_BETA_cfg (broadinstitute_gtex/rnaseqc2_v1-0_BETA Snapshot ID: 2)
*   **ExonUsage_hg38_fixed** (gkugener/ExonUsage_fixed Snapshot ID: 1): this task calculates exon usage ratios. The non-fixed version contains a bug in the script that is not able to handle chromosome values prefixed with ‘chr’. The ‘fixed’ version resolves this issue.
*   **AggregateExonUsageRObj_hg38** (ccle_mg/AggregateExonUsageRObj Snapshot ID: 2): combines the exon usage ratios into matrices that are saved in an R object.

### cleaning workspaces

In [None]:
if doCleanup:
    print("cleaning workspaces")
    # Ask Terra for the list of heavy files in the reference workspace, then delete them in parallel.
    torm = asyncio.run(terra.deleteHeavyFiles(refworkspace))
    h.parrun(['gsutil rm '+i for i in torm], cores=8)
    # The original call ended with a bare positional `everythingFor` after `dryrun=False`,
    # which is a SyntaxError (and the name is not defined in this notebook).
    # It is dropped here so the function falls back to its own default.
    # NOTE(review): if specific samples must be targeted, pass them as `everythingFor=[...]`.
    terra.removeFromFailedWorkflows(refworkspace, dryrun=False)

## On Terra

In [None]:
# TODO: update with latest workspace parameters from our repo

In [None]:
print("running Terra pipeline")

# The workspace manager is needed from this cell onward; create it here so the
# notebook runs top to bottom on a fresh kernel (it was previously only defined
# in the fusion section, far below its first use).
refwm = dm.WorkspaceManager(refworkspace)

# Submit the RNA pipeline on every sample of this release's sample set and wait for completion.
submission_id = refwm.create_submission("RNA_pipeline", samplesetname,'sample_set',expression='this.samples')
asyncio.run(terra.waitForSubmission(refworkspace, submission_id))

In [None]:
# Submit the aggregation workflow for the entity named 'all', then block until it is done.
submission_id = refwm.create_submission("RNA_aggregate", 'all')
aggregate_wait = terra.waitForSubmission(refworkspace, submission_id)
asyncio.run(aggregate_wait)

## On Local

### Save the workflow configurations used

In [None]:
# Keep a copy of the workspace configuration under the release folder.
terra.saveWorkspace(refworkspace, 'data/{}/RNAconfig/'.format(samplesetname))

### Load QC files and generate a QC report

In [None]:
print("load QC and generate QC report")
# Names of the samples that belong to this release's Terra sample set.
samplesinset = [sample['entityName'] for sample in refwm.get_entities('sample_set').loc[samplesetname].samples]

# Write the QC output under the release folder (data/<release>/rna_qcs/), consistent with
# where the workflow configuration is saved (data/<release>/RNAconfig/). The original path
# lacked the separator after the release name and produced "data/<release>rna_qcs/".
qcs, lowqual, failed = myQC.plot_rnaseqc_results(refworkspace, samplesinset, output_path="data/"+samplesetname+"/rna_qcs/")

# Only this release's failures are printed; the failures known from previous releases are appended afterwards.
failed = failed.index.tolist()
print('you want to copy that up top, to save it for next time',failed)
failed.extend(previousQCfail)

### solve QC fails when possible

In [None]:
rename = postrna.solveQC(ccle_refsamples, failed)
%store rnafailed

### Remove some data files to save money

In [None]:
if doCleanup:
    print("cleaninp up data")
    res = refwm.get_samples()
    # For every column listed in colstoclean, delete the attribute and its files.
    for val in colstoclean:
        hound_free_manager = refwm.disable_hound()
        column_values = res[val]
        hound_free_manager.delete_entity_attributes('sample', column_values, delete_files=True)

### Expression post processing

#### generating gene names

In [None]:
print("generating gene names")
# generateGeneNames returns three objects; unpack them into the names used by the cells below.
gene_name_tables = postrna.generateGeneNames(ensemble_server=ensemblserver, cached=True)
gene_rename, protcod_rename, ensembltohgnc = gene_name_tables

#### loading the files

In [None]:
print("loading files")
def rn(r):
    """Build the renaming mapping for the given sample names.

    Starts from tracker.removeOlderVersions(...) applied to `r`, then, for every
    key that has an entry in the notebook-level `rename` mapping (produced by
    the QC-solving cell), re-keys the value under rename[key].

    Args:
        r: the names forwarded as `names` to tracker.removeOlderVersions.

    Returns:
        The renaming dict, with failed keys swapped for their replacement.
    """
    renaming = tracker.removeOlderVersions(names=r, refsamples=refwm.get_samples(), arxspan_id="arxspan_id", version="version")
    # if we have a replaceable failed version in our dataset
    # Iterate over a snapshot of the keys: popping/inserting while iterating the
    # live dict raises "RuntimeError: dictionary keys changed during iteration".
    for k in list(renaming):
        if k in rename:
            renaming[rename[k]] = renaming.pop(k)
    return renaming
# Load the aggregated RSEM outputs of the "all" sample set, renaming samples through rn.
rsem_filenames = ["transcripts_tpm", "genes_tpm", "genes_expected_count", "transcripts_expected_count"]
files, renaming = postrna.loadFromRSEMaggregate(
    refwm,
    ccle_refsamples,
    renameFunc=rn,
    filenames=rsem_filenames,
    sampleset="all",
)

#### renaming the files

In [None]:
print("renaming files")
# gene level
# NOTE(review): these keys carry an 'rsem_' prefix while later cells index `files`
# without it (e.g. files['genes_tpm']) -- confirm what postrna expects.
gene_level_files = ['rsem_genes_expected_count', 'rsem_genes_tpm']
files = postrna.subsetGenes(
    files,
    gene_rename,
    filenames=list(gene_level_files),
    drop="transcript_id",
    index="gene_id",
)

# Only the protein-coding rows of the Ensembl-to-HGNC table are handed to the extraction step.
protein_coding = ensembltohgnc[ensembltohgnc.gene_biotype == 'protein_coding']
files = postrna.extractProtCod(files, protein_coding, protcod_rename, filenames=list(gene_level_files))

In [None]:
# transcript level: same subsetting as for genes, but indexed by transcript and dropping the gene column
transcript_level_files = ['rsem_transcripts_expected_count', 'rsem_transcripts_tpm']
files = postrna.subsetGenes(
    files,
    gene_rename,
    filenames=transcript_level_files,
    drop="gene_id",
    index="transcript_id",
)

### validation

In [None]:
if isCCLE:
    print("doing validation")
    # Compare the new gene-level counts with those of the previous Taiga release.
    prevcounts = tc.get(name='depmap-a0ab', file='CCLE_RNAseq_reads')
    # columns are compared here: symmetric difference between previous and new
    nonoverlap = set(prevcounts.columns) ^ set(files['genes_expected_count'].columns)
    print("number of non overlaping genes:")
    print(len(nonoverlap))
    # have we lost any samples compared to last release?
    lost = set(prevcounts.index) - set(files['genes_expected_count'].index)
    # The set holds index entries (samples), so the message now says samples
    # (it previously read "lost genes", contradicting the comment above).
    print("samples lost compared to the previous release:")
    print(lost)
    # do we have samples that are missanotated compared to previous releases (replicate level)
    #notindataset, missannotated, unmatched = findMissAnnotatedReplicates(replevel, prevcounts, renaming)
    #for k,v in unmatched.items():
    #    if ccle_refsamples.loc[k].arxspan_id!=v:
    #        print(k,v)
    # do we have samples that are missanotated compared to previous releases (sample level)
    unmatched = rna.getDifferencesFromCorrelations(files['genes_expected_count'] ,prevcounts, minsimi=0.95)
    print("differences in correlations against the previous release")
    print(unmatched)
    # Is it because of  duplicate version?
    print('do we see it as a duplicate in the tracker?')
    rnasamples = ccle_refsamples[ccle_refsamples.datatype=='rna']
    # print how many RNA entries the tracker holds for each flagged line
    for i,val in unmatched:
        print(len(rnasamples[rnasamples.arxspan_id==i]))

### ssGSEA using R's console and GSVA

In [None]:
print("doing ssGSEA")
# postrna.ssGSEA is run through asyncio on the gene-level TPM table.
ssgsea_job = postrna.ssGSEA(files['genes_tpm'])
enrichments = asyncio.run(ssgsea_job)

### Save files for taiga

In [None]:
if isCCLE:
    #CCLE_expression, CCLE_expression_full, ,
    print("comparing to previous release")
    #h.compareDfs(files["rsem_transcripts_tpm"], tc.get(name='depmap-a0ab', file='CCLE_RNAseq_transcripts'))
    #h.compareDfs(files["rsem_transcripts_expected_count"], tc.get(name='depmap-a0ab', file='CCLE_expression_transcripts_expected_count'))
    # h.compareDfs(enrichments, tc.get(name='depmap-a0ab', file='CCLE_fusions_unfiltered'))
    # Each entry of tocompare pairs a key of `files` with a file name of the previous Taiga release.
    for key in tocompare:
        val = tocompare[key]
        current_df = files[key]
        previous_df = tc.get(name='depmap-a0ab', file=val)
        _, omissmatchCols, _, omissmatchInds, newNAs, new0s = h.compareDfs(current_df, previous_df)
        print(key)
        # Stop as soon as any of the four comparison results is not 0.
        assert omissmatchCols == 0
        assert omissmatchInds == 0
        assert newNAs == 0
        assert new0s == 0

In [None]:
print("saving files")
# ssGSEA enrichments go to temp/; the expression tables are written by postrna.saveFiles.
enrichments.to_csv('temp/gene_sets_{}_all.csv'.format(samplesetname))
postrna.saveFiles(files, release)

### update tracker

In [None]:
if isCCLE:
    print("updating the tracker")
    # Re-read the tracker sheet so the update is applied to its latest state.
    ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)
    # The original call was missing a comma after the `lowqual=` argument and placed
    # positional arguments after keyword ones; both are SyntaxErrors. The leading
    # arguments are now passed positionally, in the order they were written.
    # NOTE(review): confirm this order against the signature of postrna.updateTracker.
    postrna.updateTracker(
        refworkspace,
        set(renaming.keys()) - set(['transcript_id(s)']),  # selected
        lowqual[lowqual.sum(1) > 3].index.tolist(),  # lowqual: more than 3 flagged QC columns
        ccle_refsamples,
        samplesinset,
        sheetname=sampletrackername,
        sheetcreds=creds,
    )

In [None]:
#filenames = os.listdir("temp/expression_$release*
#files = {}
#for val in filenames:
#    #h.compareDfs(val, tc.get(name='depmap-a0ab', file=k))
#    files[val.replace('temp/expression_'+release, 'rsem')
#          [:-4]] = pd.read_csv(val, index_col=0)

## Fusion post processing

In [None]:
refwm = dm.WorkspaceManager(refworkspace)
# Column names imposed on the aggregated fusion table; its first row is skipped on read.
fusion_columns = [
    fusionSamplecol, 'FusionName', 'JunctionReadCount', 'SpanningFragCount',
    'SpliceType', 'LeftGene', 'LeftBreakpoint', 'RightGene', 'RightBreakpoint',
    'LargeAnchorSupport', 'FFPM', 'LeftBreakDinuc', 'LeftBreakEntropy',
    'RightBreakDinuc', 'RightBreakEntropy', 'annots',
]
fusions_path = refwm.get_sample_sets().loc['all']['fusions_star']
fusions = pd.read_csv(fusions_path, names=fusion_columns, skiprows=1, sep='\t')
# Keep only the part of each sample identifier that precedes the first '.'.
fusions[fusionSamplecol] = [sample_id.split('.')[0] for sample_id in fusions[fusionSamplecol]]
print(len(fusions))
print(fusions[fusionSamplecol][:10])

In [None]:
# Apply the gene renaming to both partner columns (right first, then left).
for gene_column in ("RightGene", "LeftGene"):
    fusions[gene_column] = fusion.renameFusionGene(fusions[gene_column])

In [None]:
# Rewrite gene labels of the form "A^B" as "A (B)"; labels without '^' are left untouched.
caret_pattern = re.compile(r'([^\^]+)\^(.*)$')
gene_columns = ['LeftGene', 'RightGene']
fusions[gene_columns] = fusions[gene_columns].applymap(lambda gene: caret_pattern.sub(r'\1 (\2)', gene))

In [None]:
# Snapshot of the fusion table before older sample versions are removed.
fusions.to_csv('temp/fusions_withReplicates_{}.csv'.format(release), index=False)

In [None]:
#fusions = pd.read_csv('temp/fusions_withReplicates_'+release+'.csv')

### Generate filtered fusion table

We want to apply filters to the fusion table to reduce the number of artifacts in the dataset. Specifically, we filter the following:

* Remove fusions involving mitochondrial chromosomes, or HLA genes, or immunoglobulin genes
* Remove red herring fusions (from STAR-Fusion annotations column)
* Remove fusions that are recurrent in CCLE (>= 25 samples; note that the code below passes `maxfreq=0.1`, i.e. 10% of samples)
* Remove fusions with (SpliceType="INCL_NON_REF_SPLICE" and LargeAnchorSupport="NO_LDAS" and FFPM < 0.1)
* Remove fusions with FFPM < 0.05 (STAR-Fusion suggests using 0.1, but looking at the translocation data, this looks like it might be too aggressive)


In [None]:
if isCCLE:
    renaming = tracker.removeOlderVersions(
        names=set(fusions[fusionSamplecol]),
        refsamples=refwm.get_samples(),
        arxspan_id="arxspan_id",
        version="version",
    )
    # Keep only the rows whose sample is a key of `renaming`, then apply the renaming to the sample column.
    kept_rows = fusions[fusionSamplecol].isin(renaming.keys())
    fusions = fusions[kept_rows].replace({fusionSamplecol: renaming}).reset_index(drop=True)

In [None]:
# Count, for every row, how many rows share the same (left, right) breakpoint pair.
# The pair key is built with a vectorized concatenation instead of a Python-level
# iterrows() loop, and is kept in its own variable so the count column is never
# used as scratch space for the intermediate string key.
breakpoint_pairs = fusions["LeftBreakpoint"] + "_" + fusions["RightBreakpoint"]
counts = Counter(breakpoint_pairs.tolist())
fusions[fusionCountCol] = breakpoint_pairs.map(counts)
sns.kdeplot(fusions[fusionCountCol] )

In [None]:
filtered = fusion.filterFusions(fusions, maxfreq=0.1)
# Number of distinct samples in the unfiltered table. The column name comes from the
# `fusionSamplecol` variable; the original indexed with the literal string
# 'fusionSamplecol', which is not a column and raises a KeyError.
len(set(fusions[fusionSamplecol]))

In [None]:
ccle_refsamples = sheets.get(refsheet_url).sheets[0].to_frame(index_col=0)
# Tracker entries whose primary disease is 'normal'.
is_normal = ccle_refsamples['primary_disease'] == 'normal'
normals = ccle_refsamples[is_normal].index.tolist()
#fusions = fusions[~fusions[fusionSamplecol].isin(normals)]
#filtered = filtered[~filtered[fusionSamplecol].isin(normals)]

In [None]:
prev = tc.get(name='depmap-a0ab', file='CCLE_fusions_unfiltered')

# Samples gained / lost relative to the previous release.
print('new')
current_samples = set(fusions[fusionSamplecol])
previous_samples = set(prev[fusionSamplecol])
print(current_samples - previous_samples)

print('removed')
print(previous_samples - current_samples)

# Work on copies so the helper key columns never end up in `prev` or `fusions`.
print("changes in fusion names")
pf = prev.copy()
f = fusions.copy()
pf["id"] = pf[fusionSamplecol]+"_"+pf["FusionName"]
f["id"] = f[fusionSamplecol]+"_"+f["FusionName"]
previous_only_ids = ~pf.id.isin(f.id.tolist())
print(len(set(pf[previous_only_ids][fusionSamplecol])))

# Same comparison, with the junction read count added to the key.
print("changes in junction readd counts")
f["sid"] = f[fusionSamplecol]+"_"+f["FusionName"] + "_"+ f["JunctionReadCount"].astype(str)
pf["sid"] = pf[fusionSamplecol]+"_"+pf["FusionName"] + "_"+ pf["JunctionReadCount"].astype(str)
previous_only_sids = ~pf.sid.isin(f.sid.tolist())
print(len(set(pf[previous_only_sids][fusionSamplecol])))

# Cross-check the fusion samples against the samples of the expression table.
print("in fusion, not in rna")
print(set(fusions[fusionSamplecol]) - set(files['proteincoding_genes_tpm'].index))
print('in depmap, not in fusions')
print(set(files['proteincoding_genes_tpm'].index) - set(fusions[fusionSamplecol]))

In [None]:
# Drop the helper "id" column when present. The comparison cell adds "id" only to
# copies of the tables, so the column may not exist here; errors="ignore" also
# keeps this cell safe to re-run.
filtered = filtered.drop(columns="id", errors="ignore")
fusions = fusions.drop(columns="id", errors="ignore")

In [None]:
# Write the unfiltered and filtered fusion tables for this release.
fusions.to_csv('temp/fusions_{}.csv'.format(release), index=False)
filtered.to_csv('temp/filtered_fusions_{}.csv'.format(release), index=False)

In [None]:
# Read back the two tables that were just written for this release.
fusions = pd.read_csv('temp/fusions_{}.csv'.format(release))
filtered = pd.read_csv('temp/filtered_fusions_{}.csv'.format(release))

### Uploading to Taiga

In [None]:
# Upload the three fusion tables of this release as a new version of the Taiga fusion dataset.
# The description is a raw string so that the "/!\" marker is not read as an
# (invalid) escape sequence; it also now names the fusion data instead of "CN data".
tc.update_dataset(dataset_permaname="fusions-95c9",
                  changes_description="new "+samplesetname+" release!",
                  upload_files=[
                    {
                        "path": 'temp/fusions_'+release+'.csv',
                        "format": "TableCSV",
                        "encoding": "utf-8"
                    },
                    {
                        "path": 'temp/filtered_fusions_'+release+'.csv',
                        "format": "TableCSV",
                        "encoding": "utf-8"
                    },
                    {
                        "path": "temp/fusions_withReplicates_"+release+".csv",
                        "format": "TableCSV",
                        "encoding": "utf-8"
                    },
                 ],
                 dataset_description=r"""
# Fusions

PORTAL TEAM SHOULD NOT USE THIS: There are lines here that should not make it even to internal.

/!\ This is the most up to date version of the CCLE fusion data.

## Annotations

Description: Gene fusions derived from RNAseq data.

Rows: cell lines, IDs contained in the column DepMap_ID

Unfiltered data contains all output fusions, while the filtered data uses the filters suggested by the star fusion docs. These filters are:
- FFPM > 0.1 -  a cutoff of 0.1 means&nbsp;at least 1 fusion-supporting RNAseq fragment per 10M total reads
- Remove known false positives, such as GTEx recurrent fusions and certain paralogs
- Genes that are next to each other
- Fusions with mitochondrial breakpoints
- Removing fusion involving mitochondrial chromosomes or HLA genes
- Removed common false positive fusions (red herring annotations as described in the STAR-Fusion docs)
- Recurrent fusions observed in CCLE across cell lines (in more than 10% of our samples)
- Removed fusions where SpliceType='INCL_NON_REF_SPLICE' and LargeAnchorSupport='NO_LDAS' and FFPM < 0.1
- FFPM < 0.05
""")

In [None]:
# Upload the expression matrices and the ssGSEA gene-set table of this release to Taiga.
# The nine expression files share the same path pattern and format, so their upload
# entries are generated from the list of suffixes (same order as before).
expression_suffixes = [
    "proteincoding_tpm_logp1",
    "transcripts_tpm_logp1",
    "genes_tpm_logp1",
    "genes_tpm",
    "transcripts_tpm",
    "proteincoding_tpm",
    "transcripts_expectedcount",
    "proteincoding_expectedcount",
    "genes_expectedcount",
]
expression_uploads = [
    {
        "path": "temp/expression_"+samplesetname+"_"+suffix+".csv",
        "format": "NumericMatrixCSV",
        "encoding": "utf-8"
    }
    for suffix in expression_suffixes
]
expression_uploads.append(
    {
        "path": 'temp/gene_sets_'+samplesetname+'_all.csv',
        "format": "NumericMatrixCSV",
        "encoding": "utf-8"
    }
)

# The description is a raw string: "/!\ " and "\_" are not valid escape sequences in a
# regular string literal. The text itself is unchanged.
# NOTE(review): the annotation section looks copied from the copy-number dataset
# ("Gene level CN data", "Num_Probes, Segment_Mean") -- to be rewritten by someone who
# knows the intended row/column descriptions.
tc.update_dataset(changes_description="new "+samplesetname+" release!",
                dataset_permaname="expression-d035",
                upload_files=expression_uploads,
                upload_async=False,
                add_all_existing_files=True,
                dataset_description=
r"""
# RNAseq

PORTAL TEAM SHOULD NOT USE THIS: There are lines here that should not make it even to internal.

/!\ This is the most up to date version of the CCLE RNA data.

## Annotations:

transcriptions (Transcripts rpkm):

genes (gene rpkm):
__Rows__:
__Columns__:
Counts (gene counts):
__Rows__:
__Columns__:
Gene level CN data:
__Rows__:
__Columns__:
 DepMap cell line IDs
 gene names in the format HGNC\_symbol (Entrez\_ID)
DepMap\_ID, Chromosome, Start, End, Num\_Probes, Segment\_Mean
""")