# Loading the necessary packages

In [None]:
from __future__ import print_function

from depmapomics import constants
from depmapomics import env_config

from depmapomics import fusions, expressions, dm_omics
from genepy import terra
from genepy.utils import helper as h

import dalmatian as dm

from bokeh.plotting import output_notebook

# Load IPython's autoreload extension and use mode 2, which re-imports all
# modules before each cell runs, so edits to the project packages are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Configure bokeh so that plots are rendered inline in the notebook output.
output_notebook()

In [None]:
# Run-level switches read by the cells below.
# isCCLE: when True, the notebook adds samples to the Terra workspace via
# depmap_omics_upload and uses the dm_omics post-processing entry points;
# when False, it falls back to expressions.postProcessing / fusions.postProcessing.
isCCLE = True
# doCleanup: when True, the final cell deletes files from the RNA workspace
# (gsutil rm) and removes failed workflows. Left off by default.
doCleanup = False

# Loading new data

Currently, sequencing data for DepMap is generated by the Genomics Platform (GP) at the Broad, which deposits it into several different Terra workspaces. Therefore, the first step of this pipeline is to look at these workspaces and:

 - identify new samples by looking at the bam files and comparing them with bams we have already onboarded
 - remove duplicates and ones with broken file paths
 - map files to profiles in Gumbo, if possible
 - onboard new samples and new versions of old cell lines if we find any

#### The following two cells scan the delivery workspaces and add new samples to Gumbo. They are currently commented out while this step is being reworked to run regularly off-cycle.

In [None]:
# Currently working on running this step off-cycle
# if isCCLE:
#     from depmap_omics_upload import loading
#     print("loading new RNAseq data")
#     rnasamples, unmapped = loading.loadFromMultipleWorkspaces(RNAWORKSPACES, EXTRACT_DEFAULTS["sm_id"], "pdo_sample", "rna")

In [None]:
# Currently working on running this step off-cycle
# if isCCLE:
#     # write samples to Sequencing table, copy bam files to internal storage bucket:
#     rnasamples, cmds = loading.addSamplesToGumbo(rnasamples, 'rna', RNA_GCS_PATH)

#### All RNAseq sequencingIDs in Gumbo that are not yet in the RNAseq Terra workspace are considered "new" for the current release. Here we add them to the Terra processing workspace as a sample set.

In [None]:
# Internal (CCLE) runs only: register the release's new RNA samples in the
# Terra processing workspace under the release's sample set name.
if isCCLE:
    # Imported inside the guard so that depmap_omics_upload is only required
    # when isCCLE is set.
    from depmap_omics_upload import loading

    loading.addSamplesToDepMapWorkspace(
        "rna",
        env_config.RNAWORKSPACE,
        samplesetname=constants.SAMPLESETNAME,
    )

# Run pipeline on Terra

We are using Dalmatian to send requests to Terra. See [our readme](https://github.com/broadinstitute/depmap_omics/blob/master/documentation/DepMap_processing_pipeline.md) for detailed breakdown of the subtasks in our RNAseq pipeline.

For non-internal users: your Terra workspace needs to be set up correctly.

Please follow the instructions in the readme and make sure that you have created your sample set.

In [None]:
print("running Terra pipeline")
refwm = dm.WorkspaceManager(env_config.RNAWORKSPACE)
submission_id = refwm.create_submission("RNA_pipeline", constants.SAMPLESETNAME,'sample_set',expression='this.samples')
await terra.waitForSubmission(env_config.RNAWORKSPACE, submission_id)

In [None]:
submission_id = refwm.create_submission("RNA_aggregate", 'all')
await terra.waitForSubmission(env_config.RNAWORKSPACE, submission_id)

### Save the workflow configurations used

In [None]:
# Snapshot the RNA workspace into a per-release folder:
# data/<SAMPLESETNAME>/RNAconfig/
terra.saveWorkspace(
    env_config.RNAWORKSPACE,
    "data/" + constants.SAMPLESETNAME + "/RNAconfig/",
)

# Postprocessing on local


### Expression post processing

In [None]:
if isCCLE:
    await dm_omics.expressionPostProcessing(samplesetname=constants.SAMPLESETNAME, recompute_ssgsea=False, compute_enrichment=False)
else:
    await expressions.postProcessing(refworkspace=env_config.RNAWORKSPACE, samplesetname=constants.SAMPLESETNAME, recompute_ssgsea=False, compute_enrichment=False)

### Fusion post processing

In [None]:
if isCCLE:
    await dm_omics.fusionPostProcessing(samplesetname=env_config.SAMPLESETNAME)
else:
    await fusions.postProcessing(env_config.RNAWORKSPACE)

## cleaning workspaces

In [None]:
if doCleanup:
    print("cleaning workspaces")
    torm = await terra.deleteHeavyFiles(env_config.RNAWORKSPACE)
    h.parrun(['gsutil rm '+i for i in torm], cores=8)
    terra.removeFromFailedWorkflows(env_config.RNAWORKSPACE, dryrun=False, everythingFor=[])
    print("done")