# Loading the necessary packages

In [1]:
# Install depmapomics' firecloud patches first.
# NOTE(review): presumably this has to run before dalmatian / firecloud are
# imported in the next cell — confirm against depmapomics.patch_firecloud.
import depmapomics.patch_firecloud
depmapomics.patch_firecloud.install_patches()

In [2]:
from __future__ import print_function

from depmapomics import constants
from depmapomics import env_config

from depmapomics import dm_omics
from depmapomics import mutations as omics_mut
from depmapomics import copynumbers as omics_cn
from depmapomics import fingerprinting as fp

from mgenepy import terra
import dalmatian as dm
from bokeh.plotting import output_notebook

# Auto-reload imported modules before each cell executes, so edits to the
# depmapomics sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render bokeh plots inline in the notebook.
output_notebook()

In [3]:
# isCCLE gates the CCLE-specific branches below (sample loading,
# fingerprinting, and the CCLE variants of the postprocessing calls).
isCCLE = True
# doCleanup gates the "cleaning workspaces" cell near the end of the notebook.
doCleanup = False

# Loading new data

Currently, sequencing data for DepMap is generated by the Genomics Platform (GP) at the Broad, which deposits it into several different Terra workspaces. Therefore, the first step of this pipeline is to look at these workspaces and:

 - identify new samples by looking at the bam files and comparing them with the bams we have already onboarded
 - remove duplicates and ones with broken file paths
 - map files to profiles in Gumbo, if possible
 - onboard new samples and new versions of old cell lines if we find any

#### The following two cells scan the delivery workspaces and add new samples to gumbo. They are currently commented out while this step is being reworked to run regularly off-cycle.

In [None]:
# Currently working on running this step off-cycle
# if isCCLE:
#     print("loading new WGS data")
#     from depmap_omics_upload import loading
#     wgssamples, unmapped = loading.loadFromMultipleWorkspaces(WGSWORKSPACES, EXTRACT_DEFAULTS["sm_id"], "SMIDOrdered", "wgs", bamcol="cram_path")

In [None]:
# Currently working on running this step off-cycle
# if isCCLE:
#     from depmap_omics_upload import loading
#     # write samples to Sequencing table, copy bam files to internal storage bucket:
#     wgssamples, cmds = loading.addSamplesToGumbo(wgssamples, 'wgs', WGS_GCS_PATH, filetypes=["cram", "crai"])

#### All WGS sequencingIDs in gumbo that are not in the WGS terra workspace yet are considered "new" for the current release. Here we add them to the terra processing workspace as a sample set.

In [None]:
if isCCLE:
    from depmap_omics_upload import loading
    # add new WGS samples from gumbo to the WGS terra workspace as sample set
    # constants.SAMPLESETNAME, and also add them to the 'allcurrent' sample set:
    loading.addSamplesToDepMapWorkspace('wgs', env_config.WGSWORKSPACE, samplesetname=constants.SAMPLESETNAME, add_to_samplesets=['allcurrent'])

# Run SNP fingerprinting, new (rna + wgs) vs all existing samples

In [None]:
# Dalmatian handles for the WGS and RNA Terra workspaces configured in env_config.
wgs_wm = dm.WorkspaceManager(env_config.WGSWORKSPACE)
rna_wm = dm.WorkspaceManager(env_config.RNAWORKSPACE)

In [None]:
# "samples" entry of the current release's sample set (constants.SAMPLESETNAME)
# in each workspace. Uses wgs_wm / rna_wm from the previous cell.
wgs_all = wgs_wm.get_sample_sets().loc[constants.SAMPLESETNAME, "samples"]
rna_all = rna_wm.get_sample_sets().loc[constants.SAMPLESETNAME, "samples"]

In [None]:
if isCCLE:
    # Run SNP fingerprinting on the RNA and WGS samples of the current sample set.
    # NOTE(review): _CCLEFingerPrint is a private helper of
    # depmapomics.fingerprinting; the meaning of the three returned values is
    # assumed from their names here — confirm against its definition.
    updated_lod_mat, mismatches, matches = await fp._CCLEFingerPrint(rna_all, wgs_all)

# Run pipeline on Terra

We are using Dalmatian to send requests to Terra. See [our readme](https://github.com/broadinstitute/depmap_omics/blob/master/documentation/DepMap_processing_pipeline.md) for a detailed breakdown of the subtasks in our WGS pipeline.

For non-internal users: please make sure that your workspace is set up correctly.

To set up your workspace, follow the instructions in the README page.

In [None]:
print("running Terra pipeline")
refwm = dm.WorkspaceManager(env_config.WGSWORKSPACE)
submission_id = refwm.create_submission("WGS_pipeline", constants.SAMPLESETNAME, 'sample_set', expression='this.samples')
await terra.waitForSubmission(env_config.WGSWORKSPACE, submission_id)

In [None]:
# Submit the "Aggregate_CN_seg_files" workflow on the 'all' entity and wait on it.
# Relies on `refwm` created in the previous cell.
submission_id = refwm.create_submission("Aggregate_CN_seg_files", 'all')
await terra.waitForSubmission(env_config.WGSWORKSPACE, submission_id)

### Save the workflow configurations used

In [None]:
# Save the WGS workspace's configuration under data/<SAMPLESETNAME>/WGSconfig/.
terra.saveWorkspace(env_config.WGSWORKSPACE,'data/'+constants.SAMPLESETNAME+'/WGSconfig/')

# Postprocessing on local


### Copy Number

In [None]:
wgs_wm = dm.WorkspaceManager(env_config.WGSWORKSPACE)
wgs_samples = wgs_wm.get_samples()

# Keep only samples whose PureCN_loh entry is present: neither missing nor
# the literal string "NA".
has_purecn_loh = wgs_samples.PureCN_loh.notna() & (wgs_samples.PureCN_loh != "NA")
wgs_purecn = wgs_samples.loc[has_purecn_loh].index.tolist()

# (Re)define the "PureCN" sample set in the workspace from those sample ids.
wgs_wm.update_sample_set(sample_set_id="PureCN", sample_ids=wgs_purecn)

In [None]:
if isCCLE:
    wespriosegs, wgspriosegs = await dm_omics.cnPostProcessing(samplesetname=constants.SAMPLESETNAME, wesrefworkspace=env_config.WESCNWORKSPACE, wgsrefworkspace=env_config.WGSWORKSPACE, dryrun=False, useCache=False)
else:
    segments, genecn, failed, purecn_segments, purecn_genecn, loh_status, feature_table = await omics_cn.postProcess(env_config.WGSWORKSPACE, sampleset=constants.SAMPLESETNAME)

### Somatic Mutations

In [6]:
from depmapomics.config_prod import *

In [11]:
!mkdir -p output/23Q4

In [12]:
if isCCLE:
    await dm_omics.mutationPostProcessing(wesrefworkspace=WESCNWORKSPACE, wgsrefworkspace=WGSWORKSPACE, run_guidemat=False, run_sv=False, mafcol="depmap_maf_23q4")
else:
    await omics_mut.postProcess(env_config.WGSWORKSPACE, samplesetname=constants.SAMPLESETNAME)

DOING WES
loading from Terra
MUTECT1_CS_SNV
MUTECT1_VEP_annotated_vcf
MUTECT2_VCF_ALL
MUTECT2_VCF_INDELS
MUTECT2_VEP_annotated_vcf
MuTect1_merged_coverage_wig
MuTect1_merged_power_wig
PDO
ProfileID
PureCN_chromosomes_pdf
PureCN_cin
PureCN_cin_allele_specific
PureCN_cin_allele_specific_ploidy_robust
PureCN_cin_ploidy_robust
PureCN_comment
PureCN_contamination
PureCN_curated
PureCN_curated_solution
PureCN_dnacopy
PureCN_failed
PureCN_flagged
PureCN_genes
PureCN_local_optima_pdf
PureCN_log
PureCN_loh
PureCN_loh_fraction
PureCN_ploidy
PureCN_purity
PureCN_rds
PureCN_segmentation
PureCN_selected_solution
PureCN_solutions_pdf
PureCN_variants
PureCN_wgd
STRELKA_VEP_annotated_vcf
SmId
absolute_highres_plot
absolute_rdata
age
allele_fraction_legacy_segments_normal
allele_fraction_legacy_segments_tumor
allele_fraction_parameters_begin_normal
allele_fraction_parameters_begin_tumor
allele_fraction_parameters_normal
allele_fraction_parameters_tumor
allelic_counts_entity_id_normal
allelic_counts_ent

  maf = pd.read_csv(row[mafcol])
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2095/2095 [07:49<00:00,  4.46it/s]


further filtering and standardizing maf
saving somatic mutations (all)
done
connecting to gumbo@localhost:5432/gumbo
	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (127.0.0.1), port 5432 failed: server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.

	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (127.0.0.1), port 5432 failed: server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.

setting username to szhang


  df = pd.read_sql(


clearing username
DOING WGS
loading from Terra
MUTECT1_CS_SNV
MUTECT1_VEP_annotated_vcf
MUTECT2_VCF_ALL
MUTECT2_VCF_INDELS
MUTECT2_VEP_annotated_vcf
MuTect1_merged_coverage_wig
MuTect1_merged_power_wig
PDO
PDO-ID
PdoId
ProfileID
PureCN_chromosomes_pdf
PureCN_cin
PureCN_cin_allele_specific
PureCN_cin_allele_specific_ploidy_robust
PureCN_cin_ploidy_robust
PureCN_comment
PureCN_contamination
PureCN_curated
PureCN_curated_solution
PureCN_dnacopy
PureCN_failed
PureCN_flagged
PureCN_genes
PureCN_local_optima_pdf
PureCN_log
PureCN_loh
PureCN_loh_fraction
PureCN_ploidy
PureCN_purity
PureCN_rds
PureCN_segmentation
PureCN_selected_solution
PureCN_solutions_pdf
PureCN_variants
PureCN_wgd
SM-ID
STRELKA_VEP_annotated_vcf
SmId
absolute_highres_plot
absolute_rdata
age
allele_fraction_legacy_segments_normal
allele_fraction_legacy_segments_tumor
allele_fraction_parameters_begin_normal
allele_fraction_parameters_begin_tumor
allele_fraction_parameters_normal
allele_fraction_parameters_tumor
allelic_count

  maf = pd.read_csv(row[mafcol])
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1259/1259 [04:46<00:00,  4.40it/s]


further filtering and standardizing maf
saving somatic mutations (all)
done
merging WES and WGS
downloading gene names from biomart
adding entrez id column


  if "Y" in mergedmutations[col].values:
  if "Y" in merged[col].values:


downloading gene names from biomart
adding entrez id column
creating mutation matrices
generating genotyped driver and damaging mutation matrix
No dataset version provided. Using version 25.


FileNotFoundError: [Errno 2] No such file or directory: 'output/23Q4/merged_binary_germline_avana.csv'

# Subset and upload

Based on release dates and embargo status in gumbo, subset and upload datasets for each release audience, and hand off to the portal team.

In [None]:
from depmap_omics_upload import tracker
from depmap_omics_upload import upload
from mgenepy.utils import helper as h

In [None]:
import datetime
from datetime import date

# Date passed as `today=` to the upload helpers in the cells below.
# NOTE(review): the recorded outputs above reference 23Q4 — confirm this date
# matches the release being uploaded.
release_date = date(year=2023, month=5, day=3)

In [None]:
# Initialise the virtual datasets for this release; the result is passed to
# the upload helpers below as `taiga_ids` / `virtual_ids`.
virtual = upload.initVirtualDatasets(samplesetname=constants.SAMPLESETNAME)

In [None]:
# NOTE(review): presumably validates release/embargo permissions in gumbo
# before anything is uploaded — confirm against depmap_omics_upload.upload.
upload.checkDataPermission()

In [None]:
# Upload the auxiliary tables to the virtual datasets, using release_date as "today".
upload.uploadAuxTables(taiga_ids=virtual, today=release_date)

In [None]:
# Build and upload the model-level matrices, using release_date as "today".
upload.makeModelLvMatrices(virtual_ids=virtual, today=release_date)

In [None]:
# Only the raw somatic-mutation MAF is uploaded at profile level here; the
# numeric-matrix and table inputs are deliberately left empty.
# NOTE(review): presumably maps taiga dataset -> {file in that dataset:
# file name in the release} — confirm against upload.makePRLvMatrices.
raw_profile_files = {
    "mutations-latest-ed72": {
        "somaticMutations_profile_maf": "OmicsSomaticMutationsMAFProfile.maf",
    },
}

upload.makePRLvMatrices(
    virtual_ids=virtual,
    files_nummat={},
    files_table={},
    files_raw=raw_profile_files,
    today=release_date,
)

# Managing release readmes

In [None]:
# ! cd .. && git clone https://github.com/broadinstitute/depmap-release-readmes.git && cd -

In [None]:
# Bring the local depmap-release-readmes checkout (expected one directory up)
# up to date; --no-commit stops git from auto-committing a merge.
! cd ../depmap-release-readmes && git pull --no-commit

In [None]:
!cd ../depmap-release-readmes/ && python3 make_new_release.py $constants.RELEASE  && git add . && git commit -m $constants.RELEASE && git push 

### cleaning workspaces

In [None]:
from depmap_omics_upload.mgenepy import terra as terra_cleanup

In [None]:
if doCleanup:
    print("cleaning workspaces")
    torm = await terra_cleanup.deleteHeavyFiles(env_config.WGSWORKSPACE)
    h.parrun(['gsutil rm '+i for i in torm], cores=8)
    terra_cleanup.removeFromFailedWorkflows(env_config.WGSWORKSPACE, dryrun=False)

### Saving workspace configs

In [None]:
# Export the WGS workspace with terra-sync into data/<SAMPLESETNAME>/WGSconfig.
# NOTE(review): the workspace name is hard-coded here rather than taken from
# env_config.WGSWORKSPACE — confirm the two match.
! terra-sync export broad-firecloud-ccle/DepMap_WGS_CN data/$constants.SAMPLESETNAME/WGSconfig

In [None]:
# Export the RNA workspace with terra-sync into data/<SAMPLESETNAME>/RNAconfig.
# NOTE(review): the workspace name is hard-coded here rather than taken from
# env_config.RNAWORKSPACE — confirm the two match.
! terra-sync export broad-firecloud-ccle/DepMap_hg38_RNAseq data/$constants.SAMPLESETNAME/RNAconfig

In [None]:
# Flatten the exported tree: move everything found three levels down up into
# WGSconfig itself, then remove the configs/ directory.
! cd data/$constants.SAMPLESETNAME/WGSconfig && mv */*/* . && rm -r configs/

In [None]:
! cd ../RNAconfig && mv */*/* . && rm -r configs/