In [None]:
import dalmatian as dm
import pandas as pd
from genepy.utils import helper as h
import numpy as np

In [None]:
# Open the three Terra workspaces used in this notebook (the CCLE_v2 target
# plus the WGS and WES source workspaces), calling .disable_hound() on each
# manager and keeping what it returns.
ccle_wm, wgs_wm, wes_wm = (
    dm.WorkspaceManager(workspace).disable_hound()
    for workspace in (
        "fccredits-silver-tan-7621/CCLE_v2",
        "broad-firecloud-ccle/DepMap_WGS_CN",
        "broad-firecloud-ccle/DepMap_WES_CN_hg38",
    )
)

In [None]:
# Fetch the sample table of each workspace, in the same order as the
# managers were created: CCLE, WGS, WES.
ccle_samples, wgs_samples, wes_samples = (
    manager.get_samples() for manager in (ccle_wm, wgs_wm, wes_wm)
)

In [None]:
# Build the gsutil copy commands for the WGS Mutect2 outputs and record the
# destination paths in the CCLE sample table (`mutect2_vcf`, `mutect2_parquet`).
# Nothing is copied here; the commands are only collected in `cmds`.
cmds = []

for i in ccle_samples.index:
    wgs_bam = ccle_samples.loc[i, "hg38_wgs_bam"]
    if pd.isnull(wgs_bam):
        # No hg38 WGS BAM recorded for this sample: nothing to link.
        continue

    # The first 10 characters of the BAM file name are used as the row key
    # into the WGS workspace sample table.
    cdsid = wgs_bam.split('/')[-1][:10]

    vcf_old = wgs_samples.loc[cdsid, "final_vcf"]
    # Test for a missing VCF *before* touching the value: a missing entry
    # (NaN/None) has no .split(), so deriving the new path first would raise.
    if not pd.isnull(vcf_old):
        vcf_new = "gs://ccle-mutation/mutect2_vcf/" + vcf_old.split('/')[-1]
        ccle_samples.loc[i, "mutect2_vcf"] = vcf_new
        cmds.append("gsutil -u broad-firecloud-ccle cp " + vcf_old + " " + vcf_new)

    parquet_old_all = wgs_samples.loc[cdsid, "dna_pipeline_main_parquet"]
    # A missing attribute is a scalar, and len() of a scalar NaN/None raises
    # TypeError; only iterate when the value really is a non-empty collection.
    if pd.api.types.is_list_like(parquet_old_all) and len(parquet_old_all) > 0:
        new_parquet_list = []
        for p in parquet_old_all:
            new_parquet = "gs://ccle-mutation/mutect2_parquet/" + p.split('/')[-1]
            cmds.append("gsutil -u broad-firecloud-ccle cp " + p + " " + new_parquet)
            new_parquet_list.append(new_parquet)
        # .at stores the whole list in the single cell.
        ccle_samples.at[i, "mutect2_parquet"] = new_parquet_list

In [None]:
# Display the number of rows in the CCLE sample table.
len(ccle_samples)

In [None]:
# Hand the collected gsutil commands to genepy's `parrun` helper with cores=8
# (presumably runs them in parallel; confirm against genepy).
h.parrun(cmds, cores=8)

In [None]:
# Visible marker in the notebook output for the end of the copy step.
print("done copying")

In [None]:
# Send the CCLE sample table, as modified by the WGS loop (mutect2_vcf /
# mutect2_parquet), to the CCLE workspace via its manager.
ccle_wm.update_sample_attributes(ccle_samples)

In [None]:
# Keep only the "participant" column; double brackets keep it a DataFrame.
df = ccle_samples[["participant"]]

In [None]:
# Write the participant column to participants.csv in the working directory.
# index=False drops the sample-table index from the file.
df.to_csv("participants.csv", index=False)

## First, populate the WES workspace with the fixed VCFs

In [None]:
from google.cloud import storage

# Scan the bucket prefix below for "*_fixed.vcf.gz" objects and record each
# one in the WES sample table under "mutect2_fixed_vcf", without overwriting
# a value that is already present.
fixed_vcf_prefix = '6001c090-b09a-4785-8b8a-33aa9c3a7ec6/omics_post_mutect2'

client = storage.Client()
for blob in client.list_blobs('fc-secure-d2a2d895-a7af-4117-bdc7-652d7d268324', prefix=fixed_vcf_prefix):
    if not blob.name.endswith("_fixed.vcf.gz"):
        continue
    # The first 10 characters of the object's file name are used as the
    # sample-table row key.
    cdsid = blob.name.split('/')[-1][:10]
    already_set = not pd.isnull(wes_samples.loc[cdsid, "mutect2_fixed_vcf"])
    if already_set:
        continue
    wes_samples.loc[cdsid, "mutect2_fixed_vcf"] = "gs://fc-secure-d2a2d895-a7af-4117-bdc7-652d7d268324/" + blob.name

In [None]:
# Send the WES sample table, including any mutect2_fixed_vcf entries filled
# in by the bucket scan, to the WES workspace via its manager.
wes_wm.update_sample_attributes(wes_samples)

In [None]:
# Build the gsutil copy commands for the WES Mutect2 outputs and record the
# destination paths in the CCLE sample table (`mutect2_vcf_wes`,
# `mutect2_parquet_wes`). Nothing is copied here; the commands are only
# collected in `cmds`, which is reset first.
cmds = []

for i in ccle_samples.index:
    wes_bam = ccle_samples.loc[i, "hg38_wes_bam"]
    if pd.isnull(wes_bam):
        # No hg38 WES BAM recorded for this sample: nothing to link.
        continue

    # The first 10 characters of the BAM file name are used as the row key
    # into the WES workspace sample table.
    cdsid = wes_bam.split('/')[-1][:10]

    vcf_old = wes_samples.loc[cdsid, "mutect2_fixed_vcf"]
    # Test for a missing VCF *before* touching the value: a missing entry
    # (NaN/None) has no .split(), so deriving the new path first would raise.
    if not pd.isnull(vcf_old):
        vcf_new = "gs://ccle-mutation/mutect2_vcf/" + vcf_old.split('/')[-1]
        ccle_samples.loc[i, "mutect2_vcf_wes"] = vcf_new
        cmds.append("gsutil -u broad-firecloud-ccle cp " + vcf_old + " " + vcf_new)

    parquet_old_all = wes_samples.loc[cdsid, "dna_pipeline_main_parquet"]
    # A missing attribute is a scalar, and len() of a scalar NaN/None raises
    # TypeError; only iterate when the value really is a non-empty collection.
    if pd.api.types.is_list_like(parquet_old_all) and len(parquet_old_all) > 0:
        new_parquet_list = []
        for p in parquet_old_all:
            new_parquet = "gs://ccle-mutation/mutect2_parquet/" + p.split('/')[-1]
            cmds.append("gsutil -u broad-firecloud-ccle cp " + p + " " + new_parquet)
            new_parquet_list.append(new_parquet)
        # .at stores the whole list in the single cell.
        ccle_samples.at[i, "mutect2_parquet_wes"] = new_parquet_list

In [None]:
# Display the queued WES copy commands for inspection before running them.
cmds

In [None]:
# Hand the collected WES gsutil commands to genepy's `parrun` helper with
# cores=8 (presumably runs them in parallel; confirm against genepy), then
# print a marker once the call has returned.
h.parrun(cmds, cores=8)
print("done copying")

In [None]:
# Display the mutect2_parquet_wes column to check what the WES loop wrote.
ccle_samples[["mutect2_parquet_wes"]]

In [None]:
# Send the CCLE sample table, as modified by the WES loop (mutect2_vcf_wes /
# mutect2_parquet_wes), to the CCLE workspace via its manager.
ccle_wm.update_sample_attributes(ccle_samples)