In [None]:
from __future__ import print_function
import os.path
import dalmatian as dm
import pandas as pd
import sys
sys.path.insert(0, '../JKBio/')
import TerraFunction as terra
from Helper import *
import numpy as np
from gsheets import Sheets
%load_ext autoreload
%autoreload 2

In [None]:
# --- Run configuration ---------------------------------------------------
# Workspace the raw samples are read from (see `wfrom`).
data_namespace = "broad-genomics-delivery"
data_workspace = "Cancer_Cell_Line_Factory_CCLF_PanCancer_PanelSeq"
# Workspace the new samples, sample sets and pairs are written to (see `wto`).
proc_namespace = "nci-mimoun-bi-org"
proc_workspace = "PANCAN_TWIST"
# Value written to the `source` column of every uploaded sample.
source = "CCLF"
# Prefix of the sample-set IDs created by this run ("<name>_all", "<name>_tumors", ...).
samplesetname = 'trial_twist2'
# Written to the `aggregation_product_name_validation` column.
TSCA_version = "TSCA Rapid Cancer Detection Panel v2"
# Written to the column of the same name.
picard_aggregation_type_validation = "PCR"
# Sample IDs to keep even when they already exist in the processing workspace.
forcekeep = []
# Google Sheets holding the sample metadata; the first tab of each is loaded.
gsheeturllist = [
    "https://docs.google.com/spreadsheets/d/1LR8OFylVClxf0kmZpAdlVjrn3RBcfZKpNoDYtKdnHB8",
    "https://docs.google.com/spreadsheets/d/128dkFhL1A0GqTjmR7iMvBZE8j6ymO8krBL9WX-wUAn4",
]

# The settings below are not referenced by any cell visible in this notebook.
recreateID = False
site = "HT33MBCX2"
tsca_id = "TSCA45"
date = "201904"

In [None]:
# Create the gsheets client from the two credential files in the home directory.
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
# Maps the one-letter code ending a sample identifier to a sample-type label.
replace = {'T': 'Tumor', 'N': 'Normal', 'm': 'Unknown', 'L': 'Unknown'}

In [None]:
# wfrom: workspace the samples are read from; wto: workspace this notebook writes to.
wfrom = dm.WorkspaceManager(data_namespace, data_workspace)
wto = dm.WorkspaceManager(proc_namespace, proc_workspace)

In [None]:
# NOTE(review): destructive cleanup cell. `newpairs` is only defined further
# down, so this fails on a fresh kernel and otherwise deletes the pairs of a
# previous run -- presumably meant for manual rollback only; confirm before running.
wto.delete_entity('pair',newpairs.index)

In [None]:
# NOTE(review): same caveat -- `sample_info` is built in a later cell; this
# deletes the samples listed in its index from the processing workspace.
wto.delete_sample(sample_info.index)

In [None]:

# Samples already present in the processing workspace; their IDs decide later
# which incoming samples count as new.
refsamples = wto.get_samples()
refids = refsamples.index

# Load the first tab of every Google Sheet and stack them into one dataframe.
metadata = pd.concat([sheets.get(url).sheets[0].to_frame() for url in gsheeturllist])

# Samples of the source workspace, with missing values replaced by ''.
samples1 = wfrom.get_samples().replace(np.nan, '', regex=True)

# Drop metadata rows that have no collaborator sample ID. `axis` is passed by
# keyword because pandas 2.0 no longer accepts it positionally.
metadata = metadata.dropna(axis=0, subset=['Collaborator Sample ID'])
# Sample type decoded from the last character of the second '_'-separated field.
ttype = [replace[i.split('_')[1][-1]] for i in metadata["Collaborator Sample ID"]]
# sample_id = <participant>-<sample type>-<exported SM-ID>, built row by row.
metadata['sample_id'] = [
    participant + '-' + kind + '-' + sm_id
    for participant, kind, sm_id in zip(
        metadata['Collaborator Participant ID'], ttype, metadata['Exported DNA SM-ID'])
]
# NOTE: recreateSampleID is defined in a cell further down; that cell must have
# been run before this one.
samples1.index = recreateSampleID(samples1.index)

In [None]:
# Display the sample_type column of the source-workspace samples.
samples1['sample_type']

In [None]:
# filtering
# Keep samples that are not yet in the processing workspace, plus any listed in
# `forcekeep`. .copy() makes this an independent frame so the column added
# below does not write into a slice of samples1 (SettingWithCopyWarning).
newsamples = samples1[(~samples1.index.isin(refids)) | samples1.index.isin(forcekeep)].copy()
# The SM-ID is whatever follows the last '-SM-' in the rebuilt sample ID.
newsamples['SM_ID'] = ['SM-' + i.split('-SM-')[-1] for i in newsamples.index]
# SM-IDs present both in the metadata sheets and among the new samples.
tokeep = set(metadata['Exported DNA SM-ID']) & set(newsamples['SM_ID'])

In [None]:
# Display the SM-IDs shared by the metadata and the new samples.
tokeep

In [None]:

# Restrict both tables to the shared SM-IDs and index them by SM-ID so they can
# be joined column-wise in a later cell.
newsamples = newsamples.loc[newsamples['SM_ID'].isin(tokeep)].set_index('SM_ID')
metadata = metadata.loc[metadata['Exported DNA SM-ID'].isin(tokeep)]
newmetadata = metadata.set_index('Exported DNA SM-ID')

In [None]:
def recreateSampleID(listLike, type_codes=None):
    """Rebuild sample IDs from '_'-separated identifiers.

    Each identifier is split on '_' and reassembled as
    '<field 3>_<field 4 without its last character>-<sample type>-<field 2>',
    where the sample type is looked up from the last character of field 4.

    Args:
        listLike: iterable of identifier strings with at least five
            '_'-separated fields.
        type_codes: optional mapping from the one-letter code to the sample
            type label. Defaults to the notebook-level `replace` dict.

    Returns:
        list of rebuilt ID strings, in input order.

    Raises:
        IndexError: if an identifier has fewer than five fields.
        KeyError: if the type code is not in the mapping.
    """
    if type_codes is None:
        type_codes = replace
    rebuilt = []
    for raw_id in listLike:
        # Split once instead of once per field.
        fields = raw_id.split('_')
        stem, code = fields[4][:-1], fields[4][-1]
        rebuilt.append(fields[3] + '_' + stem + '-' + type_codes[code] + '-' + fields[2])
    return rebuilt

In [None]:
# (rows, columns) of the samples kept after filtering.
newsamples.shape

In [None]:
# Display the retained samples.
newsamples

In [None]:
# Put metadata and workspace columns side by side; rows are aligned on the
# shared SM-ID index and sorted by it.
df = pd.concat([newmetadata, newsamples], axis=1,sort=True)

In [None]:
print('creating new df')
# from this new set we create a dataframe which will get uploaded to terra
# .copy(): the column selection is otherwise a slice of `df`, and adding
# columns to a slice triggers SettingWithCopyWarning.
sample_info = df[['crai_or_bai_path', 'cram_or_bam_path']].copy()
sample_info['reference_id'] = df.index
sample_info['participant'] = df['Collaborator Participant ID']
# Constant columns: a scalar is broadcast to every row.
sample_info['aggregation_product_name_validation'] = TSCA_version
sample_info['bsp_sample_id_validation'] = sample_info['reference_id']
sample_info['stock_sample_id_validation'] = df['Stock DNA SM-ID']
sample_info['sample_type'] = df['Sample Type']
sample_info['picard_aggregation_type_validation'] = picard_aggregation_type_validation
sample_info['tumor_subtype'] = df['Tumor Type']
sample_info['source_subtype_validation'] = df['Original Material Type']
sample_info['processed_subtype_validation'] = df['Material Type']
sample_info['primary_disease'] = df['Primary Disease']
sample_info['media'] = df['Media on Tube']
sample_info['Collection'] = df['Collection']
sample_info['tissue_site'] = df['Tissue Site']
sample_info['source'] = source
sample_info['sample_id'] = df['sample_id']

# The rebuilt sample_id becomes the index, i.e. the ID used for the upload.
sample_info = sample_info.set_index('sample_id')

In [None]:
def _participants_with_type(frame, wanted_type):
    """Return the participant of every row of `frame` whose sample_type equals `wanted_type`."""
    return [row["participant"] for _, row in frame.iterrows() if row["sample_type"] == wanted_type]


# Participants of the samples being added in this run ...
normals = _participants_with_type(sample_info, "Normal")
tumors = _participants_with_type(sample_info, "Tumor")
# ... and of the samples that were already in the processing workspace.
prevtumors = _participants_with_type(refsamples, "Tumor")
prevnormals = _participants_with_type(refsamples, "Normal")

print("creating new pairs")

In [None]:
# Pair table, one entry per new tumor sample: the tumor is the case sample and
# the participant's first new normal (or the placeholder 'NA') is the control.
newpairs = {'pair_id': [], 'case_sample': [], 'control_sample': [], 'participant':[]}

# Participants that have both a tumor and a normal in this batch.
paired = set(tumors) & set(normals)
for val in set(tumors):
    # One combined mask instead of chained boolean indexing, which applies a
    # mask built on the full frame to an already filtered frame.
    of_participant = sample_info['participant'] == val
    tumor_ids = sample_info[of_participant & (sample_info['sample_type'] == 'Tumor')].index.tolist()
    if val in paired:
        normal_id = sample_info[of_participant & (sample_info['sample_type'] == 'Normal')].index.tolist()[0]
    else:
        normal_id = 'NA'
    for tumor_id in tumor_ids:
        newpairs['pair_id'].append(tumor_id + "_" + normal_id)
        newpairs['case_sample'].append(tumor_id)
        newpairs['control_sample'].append(normal_id)
        newpairs['participant'].append(val)

In [None]:
# Display the new samples whose sample_type is 'Tumor'.
sample_info[sample_info['sample_type']=='Tumor']

In [None]:

# NOTE(review): this cell appends to the `newpairs` dict created in the
# previous pairing cell, which already holds one pair per new tumor. Running
# both cells yields repeated pair_id values for participants handled by the
# last loop below -- confirm which of the two cells is meant to be run.

def _sample_ids(frame, participant, sample_type):
    """Index labels of the rows of `frame` matching both participant and sample type."""
    # One combined mask instead of chained boolean indexing.
    mask = (frame['participant'] == participant) & (frame['sample_type'] == sample_type)
    return frame[mask].index.tolist()


def _add_pair(tumor_id, normal_id, participant):
    """Append one tumor/normal pair to `newpairs`."""
    newpairs['pair_id'].append(tumor_id + '_' + normal_id)
    newpairs['case_sample'].append(tumor_id)
    newpairs['control_sample'].append(normal_id)
    newpairs['participant'].append(participant)


# New tumors whose participant already has a normal in the workspace.
toreprocess_normals = set(tumors) & set(prevnormals)
for val in toreprocess_normals:
    normal_id = _sample_ids(refsamples, val, 'Normal')[0]
    for tumor_id in _sample_ids(sample_info, val, 'Tumor'):
        _add_pair(tumor_id, normal_id, val)

# New normals whose participant already has tumors in the workspace.
toreprocess_tumors = set(normals) & set(prevtumors)
for val in toreprocess_tumors:
    normal_id = _sample_ids(sample_info, val, 'Normal')[0]
    for tumor_id in _sample_ids(refsamples, val, 'Tumor'):
        _add_pair(tumor_id, normal_id, val)

# Remaining new tumors: pair within the batch, or with the 'NA' placeholder
# when the participant has no normal. The placeholder must be a string: the
# previous `None` made the pair_id concatenation raise TypeError.
paired = set(tumors) & set(normals)
for val in set(tumors) - (toreprocess_normals | toreprocess_tumors):
    normal_id = _sample_ids(sample_info, val, 'Normal')[0] if val in paired else 'NA'
    for tumor_id in _sample_ids(sample_info, val, 'Tumor'):
        _add_pair(tumor_id, normal_id, val)

In [None]:
# Upload the new samples to the processing workspace.
wto.upload_samples(sample_info)

In [None]:
# A sample set is made of sample IDs, i.e. index labels of sample_info.
# `tumors` holds participant values, so the IDs are selected from the frame.
wto.update_sample_set(
    sample_set_id=samplesetname + "_tumors",
    sample_ids=sample_info[sample_info['sample_type'] == "Tumor"].index.tolist())

In [None]:
# Sample set containing every sample uploaded in this run.
wto.update_sample_set(sample_set_id=samplesetname + "_all", sample_ids=sample_info.index.values)

In [None]:
# Sample IDs (index labels of sample_info) of the new normals and new tumors.
normalsid = sample_info[sample_info['sample_type'] == "Normal"].index.tolist()
tumorsid = sample_info[sample_info['sample_type'] == "Tumor"].index.tolist()

In [None]:
# Sample set of the new tumor sample IDs.
wto.update_sample_set(sample_set_id=samplesetname + "_tumors", sample_ids=tumorsid)

In [None]:
# Sample set of the new normal sample IDs.
wto.update_sample_set(sample_set_id=samplesetname + "_normals", sample_ids=normalsid)

In [None]:
# Add the IDs of the normals that were already in the workspace. This mutates
# `normalsid` in place, so re-running the cell appends the same IDs again.
normalsid.extend([k for k, val in refsamples.iterrows() if val.sample_type == "Normal"])

In [None]:
# Set of all normals: new ones plus the previously uploaded ones.
wto.update_sample_set(sample_set_id="All_normals", sample_ids=normalsid)

In [None]:
# Set of every sample currently in the processing workspace (re-queried here).
wto.update_sample_set(sample_set_id="All_samples", sample_ids = wto.get_samples().index.tolist())

In [None]:
# Turn the dict of lists into a dataframe indexed by pair_id. The name is
# rebound, so running this cell twice fails: 'pair_id' is then the index.
newpairs = pd.DataFrame(newpairs).set_index('pair_id')

In [None]:
# Display the pairs about to be uploaded.
newpairs

In [None]:
wto.upload_entities('pair', newpairs)

In [None]:
# NOTE(review): the same pair table is uploaded as 'pair_set' entities --
# confirm this is the intended payload for pair sets.
wto.upload_entities('pair_set', newpairs, index=True)

In [None]:
# Upload a placeholder sample named 'NA'; 'NA' is also the control_sample
# value used for tumors without a normal.
wto.upload_samples(pd.DataFrame({'sample_id':['NA'],'bam_filename':['NA'],'participant':['NA']}).set_index('sample_id'))

In [None]:
# Single-cell version of the upload steps: samples, sample sets, then pairs.
wto.upload_samples(sample_info)
print("creating a sample set")
# NOTE(review): toreprocess_tumors holds participant values, whereas the pair
# IDs built in this notebook are '<tumor sample id>_<normal sample id>'.
# Confirm that '<participant>_NA' matches the names of the pairs to remove.
wto.delete_entity('pair',[ID + '_NA' for ID in toreprocess_tumors])

# Sample sets take sample IDs (index labels of sample_info). `tumors` and
# `normals` hold participant values, so the IDs are selected from the frame.
new_tumor_ids = sample_info[sample_info['sample_type'] == "Tumor"].index.tolist()
new_normal_ids = sample_info[sample_info['sample_type'] == "Normal"].index.tolist()
wto.update_sample_set(sample_set_id=samplesetname + "_all", sample_ids=sample_info.index.values)
wto.update_sample_set(sample_set_id=samplesetname + "_tumors", sample_ids=new_tumor_ids)
wto.update_sample_set(sample_set_id=samplesetname + "_normals", sample_ids=new_normal_ids)

# Cumulative set of normals (original note: "Same as cum pon but better").
# Built as a new list so that re-running the cell does not grow `normals`.
previous_normal_ids = [k for k, val in refsamples.iterrows() if val.sample_type == "Normal"]
wto.update_sample_set(sample_set_id="All_normals", sample_ids=new_normal_ids + previous_normal_ids)
wto.update_sample_set(sample_set_id="All_samples", sample_ids=wto.get_samples().index.tolist())

# Convert only once: an earlier cell may already have turned `newpairs` into a
# dataframe indexed by pair_id, in which case set_index would raise KeyError.
if not isinstance(newpairs, pd.DataFrame):
    newpairs = pd.DataFrame(newpairs).set_index('pair_id')
wto.upload_entities('pair', newpairs)
wto.upload_entities('pair_set', newpairs, index=True)

In [None]:
# NOTE(review): from here on the cells repeat earlier steps one statement at a
# time -- looks like scratch work from an earlier iteration; confirm whether it
# should stay in the notebook.
# Preview of the pair table. Raises KeyError when `newpairs` has already been
# converted to a dataframe indexed by pair_id.
pd.DataFrame(newpairs).set_index('pair_id')

In [None]:
# Display the 'External ID' metadata column.
metadata['External ID']

In [None]:
# Re-fetch the source-workspace samples, replacing missing values by ''.
samples1 = wfrom.get_samples().replace(np.nan, '', regex=True)

In [None]:
# Rebuild the sample IDs with the notebook's helper, which applies the same
# '<field 3>_<field 4 stem>-<sample type>-<field 2>' recipe.
samples1.index = recreateSampleID(samples1.index)

In [None]:
# Samples not yet in the processing workspace. .copy() because the next cell
# adds a column to this frame, which must not be a slice of samples1.
newsamples = samples1[~samples1.index.isin(refids)].copy()

In [None]:
# The SM-ID is whatever follows the last '-SM-' in the rebuilt sample ID.
newsamples['SM_ID'] = ['SM-'+i.split('-SM-')[-1] for i in newsamples.index]

In [None]:
# Display the SM-IDs listed in the metadata sheets.
metadata['Exported DNA SM-ID']

In [None]:
# SM-IDs present both in the metadata and among the new samples.
tokeep = set(metadata['Exported DNA SM-ID']) &  set(newsamples['SM_ID'])

In [None]:
# Restrict both tables to the shared SM-IDs.
newsamples= newsamples[newsamples['SM_ID'].isin(tokeep)]
newmetadata = metadata[metadata['Exported DNA SM-ID'].isin(tokeep)]

In [None]:
# Index both tables by SM-ID (this cell and the next); re-running either cell
# fails because the column has then been moved into the index.
newsamples = newsamples.set_index('SM_ID')

In [None]:
newmetadata = newmetadata.set_index('Exported DNA SM-ID')

In [None]:
# Build the upload table from the workspace columns and the metadata columns.
# Both frames are indexed by SM-ID, so metadata Series are aligned by label.
# .copy(): the column selection is otherwise a slice of `newsamples`.
sample_info = newsamples[['crai_or_bai_path','cram_or_bam_path']].copy()
sample_info['reference_id'] = newsamples.index
sample_info['participant'] = newmetadata['Collaborator Participant ID']
# Constant columns: a scalar is broadcast to every row.
sample_info['aggregation_product_name_validation'] = TSCA_version
# Taken from the row's own SM-ID. Assigning `newmetadata.index` directly is
# positional and mislabels rows whenever the two frames are ordered differently.
sample_info['bsp_sample_id_validation'] = sample_info['reference_id']
sample_info['stock_sample_id_validation'] = newmetadata['Stock DNA SM-ID']
sample_info['sample_type'] = newmetadata['Sample Type']
sample_info['picard_aggregation_type_validation'] = picard_aggregation_type_validation
sample_info['tumor_subtype'] = newmetadata['Tumor Type']
sample_info['source_subtype_validation'] = newmetadata['Original Material Type']
sample_info['processed_subtype_validation'] = newmetadata['Material Type']
sample_info['primary_disease'] = newmetadata['Primary Disease']
sample_info['media'] = newmetadata['Media on Tube']
sample_info['Collection'] = newmetadata['Collection']
sample_info['tissue_site'] = newmetadata['Tissue Site']
sample_info['source'] = source
sample_info['sample_id'] = newmetadata['sample_id']

In [None]:
# The rebuilt sample_id becomes the index of the upload table.
sample_info = sample_info.set_index('sample_id')

In [None]:
# Participants of the new normal / tumor samples (one entry per sample, so a
# participant can appear several times).
normals = [r["participant"] for i, r in sample_info.iterrows() if r['sample_type'] == "Normal"]
tumors = [r["participant"] for i, r in sample_info.iterrows() if r['sample_type'] == "Tumor"]

In [None]:
# Participants of the tumor / normal samples already in the processing workspace.
prevtumors = [val["participant"] for k, val in refsamples.iterrows() if val.sample_type =="Tumor"]
prevnormals = [val["participant"] for k, val in refsamples.iterrows() if val.sample_type =="Normal"]

In [None]:
# do we have new tumors/normals for our previous ones
# Pair table keyed by the tumor sample ID: pair_id == tumor_id, and normal_id
# is the first matching normal of the same participant.
newpairs = {'pair_id':[],'tumor_id':[],'normal_id':[]}

# New tumors whose participant already has a normal in the workspace.
toreprocess = set(tumors) & set(prevnormals)
for val in toreprocess:
    # One combined mask instead of chained boolean indexing.
    prev_normal = refsamples[(refsamples['participant']==val) & (refsamples['sample_type']=='Normal')].index.values[0]
    for i in sample_info[(sample_info['participant']==val) & (sample_info['sample_type']=='Tumor')].index.values:
        newpairs['pair_id'].append(i)
        newpairs['tumor_id'].append(i)
        newpairs['normal_id'].append(prev_normal)

# New normals whose participant already has tumors in the workspace.
toreprocess = set(normals) & set(prevtumors)
for val in toreprocess:
    new_normal = sample_info[(sample_info['participant']==val) & (sample_info['sample_type']=='Normal')].index.values[0]
    for i in refsamples[(refsamples['participant']==val) & (refsamples['sample_type']=='Tumor')].index.values:
        newpairs['pair_id'].append(i)
        newpairs['tumor_id'].append(i)
        newpairs['normal_id'].append(new_normal)

# Participants with both a tumor and a normal in this batch. The tumor is the
# loop variable and the normal is looked up; the earlier code stored the tumor
# ID as normal_id and never selected a normal sample.
paired = set(tumors) & set(normals)
for val in paired:
    new_normal = sample_info[(sample_info['participant']==val) & (sample_info['sample_type']=='Normal')].index.values[0]
    for i in sample_info[(sample_info['participant']==val) & (sample_info['sample_type']=='Tumor')].index.values:
        newpairs['pair_id'].append(i)
        newpairs['tumor_id'].append(i)
        newpairs['normal_id'].append(new_normal)

In [None]:
# Turn the dict of lists into a dataframe (pair_id stays a regular column here).
newpairs = pd.DataFrame(newpairs)