In [None]:
from __future__ import print_function
import pandas as pd
from gsheets import Sheets
import os 
from depmapomics import tracker
import numpy as np
import dalmatian as dm
import firecloud.api

from taigapy import TaigaClient
tc = TaigaClient()


# Credential file locations, tracker-sheet settings and a scratch directory.
# NOTE(review): none of these constants are referenced in the cells visible
# here — confirm they are still needed before keeping them.
SHEETCREDS = '../.credentials.json'
MY_ID = '~/.client_secret.json'
MYSTORAGE_ID = "~/.storage.json"
SHEETNAME = 'ccle sample tracker'
REFSHEET_URL = "https://docs.google.com/spreadsheets/d/1Pgb5fIClGnErEqzxpU7qqX6ULpGTDjvzWwDN8XUJKIY"
WORKING_DIR = "temp/"

In [None]:
# Handles on the CCLE_SNP_QC workspace (`wm`) and on its "-copy" workspace
# (`wm_copy`); hound is disabled on both.
wm = dm.WorkspaceManager('broad-firecloud-ccle/CCLE_SNP_QC').disable_hound()
wm_copy = dm.WorkspaceManager('broad-firecloud-ccle/CCLE_SNP_QC-copy').disable_hound()


In [None]:
# Create a sample_set for every sample_group that does not have one yet.
sample_group_df = wm.get_entities("sample_group")
existing_sample_sets = wm.get_entities("sample_set").index
vcf_groups = sample_group_df[~sample_group_df.index.isin(existing_sample_sets)]
for i in vcf_groups.index:
    # `vcf_group` holds the path of a headerless, tab-separated listing;
    # its first column contains one file path per sample.
    fn = vcf_groups.loc[i, 'vcf_group']
    df = pd.read_csv(fn, sep='\t', header=None, comment='#')
    # Sample id = first 10 characters of the 5th '/'-separated path component.
    idx = [x.split('/')[4][:10] for x in df[0].tolist()]
    wm.update_sample_set(sample_set_id=i,
                         sample_ids=idx)
# Show every set created by this run. A bare `i` here raised NameError
# whenever there was nothing new to create (the loop body never ran).
vcf_groups.index.tolist()

In [None]:
# Ids of every sample_set in the copy workspace; reused by the pairing and
# LOD-matrix cells below.
all_sample_sets = wm_copy.get_entities("sample_set").index
all_sample_sets

In [None]:
# Every ordered combination of two sample sets (self-pairs included), with the
# first set varying slowest.
sample_set_a_list = [first for first in all_sample_sets for _ in all_sample_sets]
sample_set_b_list = [second for _ in all_sample_sets for second in all_sample_sets]
pair_ids = [
    first + '-' + second
    for first, second in zip(sample_set_a_list, sample_set_b_list)
]

# One row per pair, indexed by "<set a>-<set b>".
pair_matrix = np.array([sample_set_a_list, sample_set_b_list]).T
pair_df = pd.DataFrame(
    pair_matrix,
    index=pair_ids,
    columns=['sample_batch_a', 'sample_batch_b'],
)
pair_df.index.name = 'entity:sample_batch_pair_id'
pair_df

In [None]:
# update dalmatian function (moved to terra.py)
def update_sample_batch_references(wm, etype, attrs):
    """Add or update attributes on workspace entities, one entity per row.

    For the entity types ``sample``, ``pair`` and ``sample_batch_pair`` a
    mapping of reserved attribute names to entity types is passed to
    ``wm._process_attribute_value`` so those attributes can be encoded as
    references rather than plain values. The update is first attempted through
    ``dm.wmanager._batch_update_entities``; if that does not succeed, each
    entity is updated individually through ``firecloud.api.update_entity``.

    Args:
        wm: workspace manager exposing ``namespace``, ``workspace``, ``hound``
            and ``_process_attribute_value``.
        etype: entity type of the rows in ``attrs`` (e.g. ``'sample_batch_pair'``).
        attrs: ``pandas.DataFrame`` indexed by entity name, one column per
            attribute. Null cells are skipped.

    Raises:
        APIException (or ``RuntimeError`` if dalmatian does not expose it):
            when the per-entity fallback request returns a status >= 400.
    """
    # stdlib; imported here because the notebook never imports it at the top.
    import traceback

    # The bare name `APIException` is not defined in this notebook; take it
    # from the dalmatian module that already provides the batch helper.
    api_exception = getattr(dm.wmanager, 'APIException', RuntimeError)

    reserved_by_type = {
        'sample': {'participant': 'participant'},
        'pair': {'participant': 'participant', 'case_sample': 'sample', 'control_sample': 'sample'},
        'sample_batch_pair': {'sample_batch_a': 'sample_set', 'sample_batch_b': 'sample_set'},
    }
    reserved_attrs = reserved_by_type.get(etype, {})

    # `Series.items()` replaces `iteritems()`, which pandas 2.0 removed.
    attr_list = [
        {
            'name': entity,
            'entityType': etype,
            'operations': [
                {
                    "op": "AddUpdateAttribute",
                    "attributeName": attr_name,
                    "addUpdateAttribute": wm._process_attribute_value(attr_name, attr_value, reserved_attrs)
                }
                for attr_name, attr_value in row.items()
                if not np.any(pd.isnull(attr_value))
            ]
        }
        for entity, row in attrs.iterrows()
    ]

    # try rawls batch call if available
    r = dm.wmanager._batch_update_entities(wm.namespace, wm.workspace, attr_list)
    try:
        if r.status_code == 204:
            # `attrs` is always a DataFrame here (iterrows() above requires it).
            print("Successfully updated attributes '{}' for {} {}s.".format(attrs.columns.tolist(), attrs.shape[0], etype))
        elif r.status_code >= 400:
            raise api_exception("Unable to update entity attributes", r)
        else:
            print(r.text)
    except Exception:  # revert to public API; do not swallow KeyboardInterrupt
        traceback.print_exc()
        print("Failed to use batch update endpoint; switching to slower fallback")
        for update in attr_list:
            r = firecloud.api.update_entity(
                wm.namespace,
                wm.workspace,
                update['entityType'],
                update['name'],
                update['operations']
            )
            if r.status_code == 200:
                print('Successfully updated {}.'.format(update['name']))
            elif r.status_code >= 400:
                raise api_exception("Unable to update entity attributes", r)
            else:
                print(r.text)

    if wm.hound._enabled:
        with wm.hound.batch():
            for obj in attr_list:
                wm.hound.update_entity_meta(
                    etype,
                    obj['name'],
                    "Updating {} attributes: {}".format(
                        len(obj['operations']),
                        ', '.join(attr['attributeName'] for attr in obj['operations'])
                    )
                )
                for attr in obj['operations']:
                    logged_value = attr['addUpdateAttribute']
                    # Only dict payloads need unpacking; plain values (strings,
                    # numbers, ...) are logged as they are instead of failing
                    # on the `in` test.
                    if isinstance(logged_value, dict):
                        if 'entityName' in logged_value:
                            logged_value = logged_value['entityName']
                        else:
                            logged_value = logged_value['items']
                    wm.hound.update_entity_attribute(
                        etype,
                        obj['name'],
                        attr['attributeName'],
                        logged_value
                    )

In [None]:
# Write the pair table as a TSV. Per the original note, this file was then
# uploaded to Terra by hand.
pair_df.to_csv("sample_batch_pairs.tsv", sep='\t')

In [None]:
# Display the pair table again before pushing it to the workspace.
pair_df

In [None]:
# Push the pair table to the copy workspace. For 'sample_batch_pair' the
# function treats sample_batch_a / sample_batch_b as sample_set attributes
# (presumably stored as entity references — confirm in Terra).
update_sample_batch_references(wm_copy, 'sample_batch_pair', pair_df)

In [None]:
# Membership table placing every sample_batch_pair of the copy workspace in a
# single set named 'all'.
unique_pairs = wm_copy.get_entities('sample_batch_pair').index.tolist()
membership_labels = ['all'] * len(unique_pairs)
sample_set_pair_set_df = pd.DataFrame(
    np.transpose(unique_pairs),
    columns=['sample_batch_pair'],
    index=membership_labels,
)
sample_set_pair_set_df.index.name = 'membership:sample_batch_pair_set_id'
sample_set_pair_set_df

In [None]:
# Write the membership table as a TSV. Per the original note, the workspace
# was then updated from this file by hand.
sample_set_pair_set_df.to_csv("sample_batch_pair_set.tsv", sep='\t')

In [None]:
# Load the crosscheck output named 21Q4-vcf_batch_6_crosscheck
# (tab-separated; lines starting with '#' are skipped).
df1 = pd.read_csv('gs://fc-secure-5a880d76-79f9-4141-844c-9e95369ced4e/2eab2e8e-f2e5-43a5-9a21-9938b64ebdbb/crosscheck/029926a1-e3b8-4cd5-be50-2c5b982bc3fe/call-run_crosscheck/21Q4-vcf_batch_6_crosscheck', sep='\t', comment='#')
df1

In [None]:
# Row(s) comparing CDS-y7yI24 (left) against CDS-WSxpHG (right).
df1[(df1.RIGHT_SAMPLE == 'CDS-WSxpHG') & (df1.LEFT_SAMPLE == 'CDS-y7yI24')]

In [None]:
# Row(s) comparing CDS-y7yI24 (left) against CDS-WfPJyi (right).
df1[(df1.RIGHT_SAMPLE == 'CDS-WfPJyi') & (df1.LEFT_SAMPLE == 'CDS-y7yI24')]

In [None]:
# Crosscheck output named 21Q4-vcf_batch_1_crosscheck: row(s) comparing
# CDS-y7yI24 (left) against CDS-7fm9Do (right).
# NOTE(review): `df2` is rebound to a different file in each of the following
# cells, so its content depends on which cell ran last.
df2 = pd.read_csv('gs://fc-secure-5a880d76-79f9-4141-844c-9e95369ced4e/2eab2e8e-f2e5-43a5-9a21-9938b64ebdbb/crosscheck/597eaf73-9def-45fc-b31c-552c71492fa1/call-run_crosscheck/21Q4-vcf_batch_1_crosscheck', sep='\t', comment='#')
df2[(df2.RIGHT_SAMPLE == 'CDS-7fm9Do') & (df2.LEFT_SAMPLE == 'CDS-y7yI24')]

In [None]:
# Crosscheck output named vcf_batch_1-vcf_batch_6_crosscheck: row(s) comparing
# CDS-7fm9Do (left) against CDS-WfPJyi (right).
df2 = pd.read_csv('gs://fc-secure-5a880d76-79f9-4141-844c-9e95369ced4e/b2aa5f3d-bc4a-4fae-ad0d-89436ea9a1f6/crosscheck/09e1df45-3aef-469f-bce8-4cdc290f90af/call-run_crosscheck/vcf_batch_1-vcf_batch_6_crosscheck', sep='\t', comment='#')
df2[(df2.RIGHT_SAMPLE == 'CDS-WfPJyi') & (df2.LEFT_SAMPLE == 'CDS-7fm9Do')]

In [None]:
# Crosscheck output named 21Q4-vcf_batch_3_crosscheck: row(s) comparing
# CDS-eKPJ34 (left) against CDS-FXX26z (right).
df2 = pd.read_csv('gs://fc-secure-5a880d76-79f9-4141-844c-9e95369ced4e/b2aa5f3d-bc4a-4fae-ad0d-89436ea9a1f6/crosscheck/6f9ff836-605f-452b-8c3f-1c37c08cbac0/call-run_crosscheck/21Q4-vcf_batch_3_crosscheck', sep='\t', comment='#')
df2[(df2.RIGHT_SAMPLE == 'CDS-FXX26z') & (df2.LEFT_SAMPLE == 'CDS-eKPJ34')]

In [None]:
# Crosscheck output named vcf_batch_3-vcf_batch_7_crosscheck: row(s) comparing
# CDS-FXX26z (left) against CDS-f4JXdG (right).
df2 = pd.read_csv('gs://fc-secure-5a880d76-79f9-4141-844c-9e95369ced4e/b2aa5f3d-bc4a-4fae-ad0d-89436ea9a1f6/crosscheck/39b6b4de-5a45-4a6d-9db3-b3268cc91207/call-run_crosscheck/vcf_batch_3-vcf_batch_7_crosscheck', sep='\t', comment='#')
df2[(df2.RIGHT_SAMPLE == 'CDS-f4JXdG') & (df2.LEFT_SAMPLE == 'CDS-FXX26z')]

In [None]:
# Crosscheck output named 21Q4-vcf_batch_7_crosscheck: row(s) comparing
# CDS-eKPJ34 (left) against CDS-f4JXdG (right).
df2 = pd.read_csv('gs://fc-secure-5a880d76-79f9-4141-844c-9e95369ced4e/b2aa5f3d-bc4a-4fae-ad0d-89436ea9a1f6/crosscheck/e00f40e8-7594-4a63-9513-45f3c63f54b3/call-run_crosscheck/21Q4-vcf_batch_7_crosscheck', sep='\t', comment='#')
df2[(df2.RIGHT_SAMPLE == 'CDS-f4JXdG') & (df2.LEFT_SAMPLE == 'CDS-eKPJ34')]

In [None]:
# Combine the per-pair crosscheck outputs into a single LOD matrix

In [None]:
# Sample sets that the LOD matrix is assembled from.
all_sample_sets

In [None]:
# Assemble the LOD matrix from the copy workspace. For every sample set, the
# crosscheck outputs of all pairs whose `sample_batch_b` is that set are
# pivoted (LEFT_SAMPLE x RIGHT_SAMPLE -> LOD_SCORE), stacked and transposed;
# the per-set blocks are then stacked on top of each other.

# Fetched once: the pair table is the same for every sample set, so there is
# no reason to query the workspace on each iteration.
sample_batch_pair_df = wm_copy.get_entities("sample_batch_pair")

# Separate name for the list so `updated_lod_mat` is only ever a DataFrame.
lod_blocks = []
for s in all_sample_sets:
    is_batch_b = sample_batch_pair_df.sample_batch_b.apply(lambda x: x['entityName'] == s)
    crosscheck_paths = sample_batch_pair_df[is_batch_b]['cross_checks_out'].tolist()
    new_lod_list = []
    for batch in crosscheck_paths:
        df = pd.read_csv(batch, sep='\t', comment='#')
        lod_mat = df.pivot(index="LEFT_SAMPLE", columns="RIGHT_SAMPLE", values="LOD_SCORE")
        new_lod_list.append(lod_mat)
    new_lod_mat = pd.concat(new_lod_list)
    new_lod_mat.index.name = None
    new_lod_mat = new_lod_mat.T
    lod_blocks.append(new_lod_mat)
updated_lod_mat = pd.concat(lod_blocks)
updated_lod_mat.index.name = None
updated_lod_mat

In [None]:
# Sample ids involved in the swap under investigation.
# NOTE(review): bam_from, bam_mapped_to and bam_from_wgs are not used in the
# cells visible here.
prev_id = 'CDS-7fm9Do'
bam_from = 'CDS-WSxpHG'
bam_mapped_to = 'CDS-9HXF2k'
bam_from_wgs = 'CDS-wmOGjh'

# Print the LOD score for each (row, column) pair of interest.
lod_lookups = [
    ('wes - rna: ', 'CDS-WfPJyi', prev_id),
    ('wgs - rna: ', 'CDS-y7yI24', prev_id),
    ('wes - wgs: ', 'CDS-WfPJyi', 'CDS-y7yI24'),
]
for label, row_id, col_id in lod_lookups:
    print(label, updated_lod_mat.loc[row_id, col_id])

In [None]:
# Save the combined matrix; the upload cell below reads this same file name.
updated_lod_mat.to_csv('fingerprint_lod_matrix.csv')

In [None]:
# Upload updated LOD matrix to Taiga as a new version of the fingerprint
# dataset; the dataset's other existing files are carried over.
lod_matrix_upload = {
    "path": 'fingerprint_lod_matrix.csv',
    "name": 'fingerprint_lod_matrix',
    "format": "NumericMatrixCSV",
    "encoding": "utf-8",
}
tc.update_dataset(
    dataset_permaname="ccle-bam-fingerprints-6f30",
    changes_description="Fixed swapped samples",
    upload_files=[lod_matrix_upload],
    add_all_existing_files=True,
)

In [None]:
# Fetch version 4 of the fingerprint LOD matrix from Taiga for comparison with
# the rebuilt one.
# NOTE(review): TaigaClient is already imported and instantiated in the first
# cell; the two lines below repeat that.
from taigapy import TaigaClient
tc = TaigaClient()
old_lod_mat = tc.get(name='ccle-bam-fingerprints-6f30', version=4, file='fingerprint_lod_matrix')

In [None]:
# Number of row ids in the rebuilt matrix that are absent from the old one.
len(set(updated_lod_mat.index) - set(old_lod_mat.index))

In [None]:
# Row ids present in the rebuilt matrix but not in the old one.
new_sample_ids = set(updated_lod_mat.index) - set(old_lod_mat.index)
# 'CDS-sGjmuP' is excluded from the tracker lookup. Set difference is used
# instead of list.remove so the cell does not raise ValueError when that id is
# not among the new ones; sorting makes the order (and the output below)
# reproducible, which iterating a set is not.
targeted = sorted(new_sample_ids - {'CDS-sGjmuP'})
t = tracker.getTracker()
# Datatypes recorded in the tracker for the newly added samples.
t.loc[targeted, :].datatype.unique()

In [None]:
# Launch "crosscheck_vcfs" on the sample_batch_pair_set named 'all', expanded
# through the expression this.sample_batch_pairs.
# NOTE(review): this targets `wm`, whereas the pair entities above were written
# to `wm_copy` — confirm the main workspace holds the same entities.
submission_id = wm.create_submission("crosscheck_vcfs", 'all', 'sample_batch_pair_set', expression='this.sample_batch_pairs')

In [None]:
# Rebuild the LOD matrix, this time from the main workspace (`wm`). For every
# sample set, the crosscheck outputs of all pairs whose `sample_batch_b` is
# that set are pivoted (LEFT_SAMPLE x RIGHT_SAMPLE -> LOD_SCORE), stacked and
# transposed; the per-set blocks are then stacked on top of each other.

# Fetched once: the pair table is the same for every sample set, so there is
# no reason to query the workspace on each iteration.
sample_batch_pair_df = wm.get_entities("sample_batch_pair")

# Separate name for the list so `updated_lod_mat_new` is only ever a DataFrame.
lod_blocks_new = []
for s in all_sample_sets:
    is_batch_b = sample_batch_pair_df.sample_batch_b.apply(lambda x: x['entityName'] == s)
    crosscheck_paths = sample_batch_pair_df[is_batch_b]['cross_checks_out'].tolist()
    new_lod_list = []
    for batch in crosscheck_paths:
        df = pd.read_csv(batch, sep='\t', comment='#')
        lod_mat = df.pivot(index="LEFT_SAMPLE", columns="RIGHT_SAMPLE", values="LOD_SCORE")
        new_lod_list.append(lod_mat)
    new_lod_mat = pd.concat(new_lod_list)
    new_lod_mat.index.name = None
    new_lod_mat = new_lod_mat.T
    lod_blocks_new.append(new_lod_mat)
updated_lod_mat_new = pd.concat(lod_blocks_new)
updated_lod_mat_new.index.name = None
updated_lod_mat_new

In [None]:
# Sample ids involved in the swap under investigation.
# NOTE(review): bam_from, bam_mapped_to and bam_from_wgs are not used in the
# cells visible here.
prev_id = 'CDS-7fm9Do'
bam_from = 'CDS-WSxpHG'
bam_mapped_to = 'CDS-9HXF2k'
bam_from_wgs = 'CDS-wmOGjh'

# Same three lookups as before, now against the rebuilt matrix.
wes_id, wgs_id = 'CDS-WfPJyi', 'CDS-y7yI24'
wes_rna_lod = updated_lod_mat_new.loc[wes_id, prev_id]
print('wes - rna: ', wes_rna_lod)
wgs_rna_lod = updated_lod_mat_new.loc[wgs_id, prev_id]
print('wgs - rna: ', wgs_rna_lod)
wes_wgs_lod = updated_lod_mat_new.loc[wes_id, wgs_id]
print('wes - wgs: ', wes_wgs_lod)

In [None]:
# Inspect the full row of CDS-sGjmuP, the id that was left out of the tracker
# lookup earlier.
updated_lod_mat_new.loc['CDS-sGjmuP',:]

In [None]:
# Remove the newly added ids, plus 'CDS-sGjmuP', from both rows and columns.
# Guarded append: re-running the cell must not add the id a second time.
if 'CDS-sGjmuP' not in targeted:
    targeted.append('CDS-sGjmuP')
updated_lod_mat_new = updated_lod_mat_new[~updated_lod_mat_new.index.isin(targeted)]
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a strict drop would raise KeyError.
updated_lod_mat_new = updated_lod_mat_new.drop(columns=targeted, errors='ignore')
updated_lod_mat_new

In [None]:
# Save the filtered matrix, then upload it to Taiga as a new version of the
# fingerprint dataset; the dataset's other existing files are carried over.
LOD_MATRIX_CSV = 'fingerprint_lod_matrix.csv'
updated_lod_mat_new.to_csv(LOD_MATRIX_CSV)
# Upload updated LOD matrix to Taiga
filtered_matrix_upload = dict(
    path=LOD_MATRIX_CSV,
    name='fingerprint_lod_matrix',
    format="NumericMatrixCSV",
    encoding="utf-8",
)
tc.update_dataset(
    dataset_permaname="ccle-bam-fingerprints-6f30",
    changes_description="drop targeted seq that were accidentally added",
    upload_files=[filtered_matrix_upload],
    add_all_existing_files=True,
)