In [None]:
import pandas as pd

import dalmatian as dm
from depmapomics import tracker
from genepy import terra

from taigapy import TaigaClient
# Taiga client; used further down to fetch the CCLE release tables.
tc = TaigaClient()

In [None]:
# Sample tracker table. The cells below filter it on its arxspan_id, datatype,
# blacklist, version and source columns.
track = tracker.getTracker()

In [None]:
# Open the Terra workspace with hound disabled, then take a snapshot of its
# sample table.
workspace_manager = dm.WorkspaceManager('fccredits-silver-tan-7621/CCLE_v2')
wm = workspace_manager.disable_hound()
da = wm.get_samples()  # all samples currently in the workspace


In [None]:
# Fetch the segment-level copy number and expression tables from the same
# Taiga dataset/version. Only their cell-line identifiers are used below.
taiga_release = dict(name='public-21q3-bf1e', version=11)
CCLE_segment_cn = tc.get(file='CCLE_segment_cn', **taiga_release)
CCLE_expression = tc.get(file='CCLE_expression', **taiga_release)

# Lines present in the expression table (presumably indexed by DepMap ID --
# it is matched against arxspan_id below) and lines with copy-number segments.
lines_expression = CCLE_expression.index
lines_genome = CCLE_segment_cn.DepMap_ID.unique()


In [None]:
# Adding hg38 WGS data (plus the legacy hg19 paths): one workspace sample per
# non-blacklisted line that has copy-number segments in the release.
data = track[track.arxspan_id.isin(lines_genome) & (
    track.datatype == 'wgs') & (track.blacklist == 0)]

# Keep only the most recent version of each line. Comparing against the
# per-line maximum removes the hard-coded upper bound of the previous loop
# (range(6, 1, -1)), which left older rows in place once a version above 6
# existed. Rows with a missing version are still kept, as before, because
# `<` evaluates to False for NaN.
latest_version = data.groupby("arxspan_id")["version"].transform("max")
data = data[~(data["version"] < latest_version)]

# Reshape to the workspace schema: index is the sample id, the tracker path
# columns become the workspace attribute names.
data = data[["arxspan_id", "internal_bam_filepath", "internal_bai_filepath", "legacy_bam_filepath", "legacy_bai_filepath"]].set_index("arxspan_id").rename(columns={
    "internal_bam_filepath": "wgs_bam",
    "internal_bai_filepath": "wgs_bai",
    "legacy_bam_filepath": "hg19_wgs_bam",
    "legacy_bai_filepath": "hg19_wgs_bai",
})
data.index.name = 'sample_id'
data['participant_id'] = data.index  # participant id mirrors the sample id
wm.upload_samples(data)

In [None]:
# Adding hg38 WES data (plus the legacy hg19 paths): one workspace sample per
# non-blacklisted line that has copy-number segments in the release.
# The resulting `data` frame is uploaded by the next cell.
data = track[track.arxspan_id.isin(lines_genome) & (
    track.datatype == 'wes') & (track.blacklist == 0)]

# Keep only the most recent version of each line. Comparing against the
# per-line maximum removes the hard-coded upper bound of the previous loop
# (range(6, 1, -1)), which left older rows in place once a version above 6
# existed. Rows with a missing version are still kept, as before, because
# `<` evaluates to False for NaN.
latest_version = data.groupby("arxspan_id")["version"].transform("max")
data = data[~(data["version"] < latest_version)]

# Reshape to the workspace schema: index is the sample id, the tracker path
# columns become the workspace attribute names.
data = data[["arxspan_id", "internal_bam_filepath", "internal_bai_filepath", "legacy_bam_filepath", "legacy_bai_filepath"]].set_index("arxspan_id").rename(columns={
    "internal_bam_filepath": "wes_bam",
    "internal_bai_filepath": "wes_bai",
    "legacy_bam_filepath": "hg19_wes_bam",
    "legacy_bai_filepath": "hg19_wes_bai",
})
data.index.name = 'sample_id'
data['participant_id'] = data.index  # participant id mirrors the sample id


In [None]:
# Upload the WES sample table assembled in the previous cell.
wm.upload_samples(data)


In [None]:
# Adding RNA data: one workspace sample per non-blacklisted line that is
# present in the release expression table.
data = track[track.arxspan_id.isin(lines_expression) & (
    track.datatype == 'rna') & (track.blacklist == 0)]

# Keep only the most recent version of each line. Comparing against the
# per-line maximum removes the hard-coded upper bound of the previous loop
# (range(6, 1, -1)), which left older rows in place once a version above 6
# existed. Rows with a missing version are still kept, as before, because
# `<` evaluates to False for NaN.
latest_version = data.groupby("arxspan_id")["version"].transform("max")
data = data[~(data["version"] < latest_version)]

# Reshape to the workspace schema: index is the sample id, the tracker path
# columns become the workspace attribute names.
data = data[["arxspan_id", "internal_bam_filepath", "internal_bai_filepath", "legacy_bam_filepath", "legacy_bai_filepath"]].set_index("arxspan_id").rename(columns={
    "internal_bam_filepath": "rna_bam",
    "internal_bai_filepath": "rna_bai",
    "legacy_bam_filepath": "hg19_rna_bam",
    "legacy_bai_filepath": "hg19_rna_bai",
})
data.index.name = 'sample_id'
data['participant_id'] = data.index  # participant id mirrors the sample id
wm.upload_samples(data)

In [None]:
# Lines with WGS tracker rows whose source is anything other than CCLE2;
# the next cell removes their WGS attributes from the workspace.
is_wgs = track.datatype == 'wgs'
not_ccle2 = track.source != 'CCLE2'
torm = track.loc[is_wgs & not_ccle2, 'arxspan_id'].unique()

In [None]:
# Remove the WGS file attributes of the lines in `torm` from the workspace.

# Refresh the sample snapshot first: the earlier `da` was taken before the
# upload cells ran, so samples created by this run would be missed.
da = wm.get_samples()

# NOTE(review): the original list named 'hg38_wgs_hc_cnn_filtered_vcf_index'
# twice, which selected a duplicated column. One of them was probably meant to
# be the VCF itself -- confirm the column name and add it here if so.
# NOTE(review): the WGS upload cell writes 'wgs_bam'/'wgs_bai', which are not
# in this list -- confirm whether they should be removed as well.
wgs_attributes = ['hg19_wgs_bam', 'hg19_wgs_bai', 'hg38_wgs_bam', 'hg38_wgs_bai',
                  'hg38_wgs_hc_cnn_filtered_vcf_index']

# Select rows with a boolean mask: pandas deprecated set indexers in .loc
# (1.5) and rejects them in 2.0, so `da.loc[set(...)]` no longer works.
wm.delete_sample_attributes(
    da.loc[da.index.isin(torm), wgs_attributes], dry_run=False)

In [None]:
# Remove the WES file attributes of lines whose only WES data comes from
# Sanger.

# Refresh the sample snapshot first: the earlier `da` was taken before the
# upload cells ran, so it may lack the freshly uploaded WES attributes.
da = wm.get_samples()

# Workspace samples that currently carry an hg19 WES BAM.
sam = da[~da.hg19_wes_bam.isna()].index

# Lines with Sanger WES rows and no WES rows from any other source.
is_wes = track.datatype == 'wes'
sangerwes = track[is_wes & (track.source == 'SANGER')].arxspan_id.unique()
sangeronly = set(sangerwes) - set(track[is_wes & (track.source != 'SANGER')].arxspan_id.unique())

torm = set(sangeronly) & set(sam)

# NOTE(review): the original list named 'hg38_wes_hc_cnn_filtered_vcf_index'
# twice, which selected a duplicated column. One of them was probably meant to
# be the VCF itself -- confirm the column name and add it here if so.
wes_attributes = ['hg19_wes_bam', 'hg19_wes_bai', 'hg38_wes_bam', 'hg38_wes_bai',
                  'hg38_wes_hc_cnn_filtered_vcf_index']

# Select rows with a boolean mask: pandas deprecated set indexers in .loc
# (1.5) and rejects them in 2.0, so `da.loc[set(...)]` no longer works.
wm.delete_sample_attributes(
    da.loc[da.index.isin(torm), wes_attributes], dry_run=False)


In [None]:
# Share the WES BAM/BAI files (both builds) of the workspace with
# "allAuthenticatedUsers".

# Refresh the sample snapshot first: the earlier `da` predates both the WES
# upload and the attribute deletions above, so it would skip samples added in
# this run and still list samples whose hg19 WES BAM was just removed.
da = wm.get_samples()
samples_with_wes = da[~da.hg19_wes_bam.isna()].index

terra.shareTerraBams("allAuthenticatedUsers",
                     'fccredits-silver-tan-7621/CCLE_v2',
                     samples_with_wes,
                     ['wes_bam', 'hg19_wes_bam', 'wes_bai', 'hg19_wes_bai'])
