# Intro & Loading 

In [120]:
from __future__ import print_function
import gzip
import os.path
import shutil
import sys

import numpy as np
import pandas as pd

sys.path.insert(0, '..')

from src.CCLE_postp_function import *
from JKBio import Datanalytics as da 
from JKBio import TerraFunction as terra
from JKBio import Helper as h
from gsheets import Sheets
from taigapy import TaigaClient
import dalmatian as dm

from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier

from bokeh.plotting import *
from bokeh.models import HoverTool
from collections import OrderedDict
from IPython.display import Image,display


# Notebook setup: auto-reload edited modules and load the rpy2 extension.
%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
# Taiga client used by the cells below to download and update datasets.
tc = TaigaClient()
# presumably bokeh's output_notebook (pulled in by the star import) — TODO confirm
output_notebook()

# Files handed to Sheets.from_files to build the Google Sheets client.
my_id = '~/.client_secret.json'
mystorage_id = "~/.storage.json"
sheets = Sheets.from_files(my_id, mystorage_id)
# NOTE(review): `replace` is not used by any cell visible here — confirm it is still needed.
replace = {'T': 'Tumor', 'N': 'Normal', 'm': 'Unknown', 'L': 'Unknown'}

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [125]:
# Release being prepared; used to build input file names and readme names.
samplesetname = "21Q1"

# Virtual Taiga dataset of the current release, per audience.
virtual = {
    'public': 'public-21q1-4b39',
    'ibm': 'ibm-21q1-abd9',
    'dmc': 'dmc-21q1-0e11',
    'internal': 'internal-21q1-4fc4',
}

# Taiga dataset permanames that receive the uploads, per data type and audience.
# The 'ibm' entries are empty and the upload loops below never iterate over 'ibm'.
taiga_mutation = {
    'internal': "depmap-mutation-calls-9be3",
    'ibm': "",
    'dmc': "depmap-mutation-calls-dfce",
    'public': "depmap-mutation-calls-9a1a",
}

taiga_expression = {
    'internal': "depmap-rnaseq-expression-data-363a",
    'ibm': "",
    'dmc': "depmap-rnaseq-expression-data-80ef",
    'public': "depmap-rnaseq-expression-data-ccd0",
}

taiga_fusion = {
    'internal': "gene-fusions-8b7a",
    'ibm': "",
    'dmc': "gene-fusions-375f",
    'public': "gene-fusions-6212",
}

taiga_copynumber = {
    'internal': "depmap-wes-cn-data-81a7",
    'ibm': "",
    'dmc': "depmap-cn-data-9b9d",
    'public': "depmap-wes-cn-data-97cc",
}

# Previous release: its virtual datasets are read to know what was already released.
prevname = "20Q4"
# The 'dmc' and 'ibm' entries used to be swapped (dmc -> 'ibm-20q4-…', ibm -> 'dmc-20q4-…'),
# which made the DMC comparison read the IBM dataset.
prev_virtual = {
    'public': 'public-20q4-a4b3',
    'dmc': 'dmc-20q4-fcf4',
    'ibm': 'ibm-20q4-269f',
    'internal': 'internal-20q4-2540',
}
# Google Sheet read below for the lines to add, one column per audience.
potential_list_url = "https://docs.google.com/spreadsheets/d/1YuKEgZ1pFKRYzydvncQt9Y_BKToPlHP-oDB-0CAv3gE"

release = samplesetname

In [127]:
# For every audience, list the non-empty cells of its column in the first
# worksheet of the potential-list spreadsheet.
new = {}
gsheets = sheets.get(potential_list_url).sheets[0].to_frame()
for audience, column in (('internal', 'Internal'), ('dmc', 'DMC'),
                         ('ibm', 'IBM'), ('public', 'Public')):
    # empty cells show up as NaN, whose string form is "nan"
    new[audience] = [cell for cell in gsheets[column].values.tolist() if str(cell) != "nan"]

## Getting what was released before

In [None]:
# Sample IDs present in each file of the previous release, per audience.
# These dicts were never created in the notebook, so the loop failed with a
# NameError on a fresh kernel; they are initialised here.
prevmut, prevrna, prevcn, prev = {}, {}, {}, {}
# 'ibm' is not loaded here, matching the upload loops below which skip it as well.
for val in ['internal', 'dmc', 'public']:
    print(val)
    prevmut[val] = set(tc.get(name=prev_virtual[val], file='CCLE_mutations').DepMap_ID)
    prevrna[val] = set(tc.get(name=prev_virtual[val], file='CCLE_expression').index)
    prevcn[val] = set(tc.get(name=prev_virtual[val], file='CCLE_segment_cn').DepMap_ID)
    # union of everything released for this audience, whatever the data type
    prev[val] = prevmut[val] | prevrna[val] | prevcn[val]
    print('mismatch cn/mut')
    print(prevmut[val] ^ prevcn[val])
    print('mismatch rna+cn/mut')
    print(prev[val] - prevmut[val])
    print('mismatch mut+cn/rna')
    print(prev[val] - prevrna[val])

## Managing the READMEs

In [6]:
! cd .. && git clone https://github.com/broadinstitute/depmap-release-readmes.git && cd -

fatal: destination path 'depmap-release-readmes' already exists and is not an empty directory.


In [7]:
# Bring the local readme checkout up to date
! cd ../depmap-release-readmes && git pull

remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 26 (delta 14), reused 19 (delta 9), pack-reused 0[K
Unpacking objects: 100% (26/26), done.
From https://github.com/broadinstitute/depmap-release-readmes
   5e66713..32bbcb4  master     -> origin/master
Updating 5e66713..32bbcb4
Fast-forward
 .gitignore                   |   1 [32m+[m
 release-20q3/.DS_Store       | Bin [31m0[m -> [32m6148[m bytes
 release-20q3/dmc-20q3.txt    | 348 [32m+++++++++++++++++++++++++++++++++++++++++++[m
 release-20q3/public-20q3.txt | 342 [32m++++++++++++++++++++++++++++++++++++++++++[m
 4 files changed, 691 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 release-20q3/.DS_Store
 create mode 100644 release-20q3/dmc-20q3.txt
 create mode 100644 release-20q3/public-20q3.txt


In [10]:
# Run the readme repository's make_new_release.py for `release`, then commit and push the result
!cd ../depmap-release-readmes/ && python3 make_new_release.py $release && git add . && git commit -m $release && git push 

Making public
Making internal
Making dmc
[master c7fb45e] 20Q4
 4 files changed, 1386 insertions(+)
 create mode 100644 release-20q3/internal-20q3.txt
 create mode 100644 release-20q4/dmc-20q4.txt
 create mode 100644 release-20q4/internal-20q4.txt
 create mode 100644 release-20q4/public-20q4.txt
Counting objects: 6, done.
Delta compression using up to 12 threads.
Compressing objects: 100% (6/6), done.
Writing objects: 100% (6/6), 6.32 KiB | 0 bytes/s, done.
Total 6 (delta 3), reused 0 (delta 0)
remote: Resolving deltas: 100% (3/3), completed with 2 local objects.[K
To https://github.com/broadinstitute/depmap-release-readmes.git
   32bbcb4..c7fb45e  master -> master


In [14]:
! mkdir temp/README/

In [15]:
# Refresh the readme checkout and copy this release's readmes into this repository's readmes/ folder
# NOTE(review): the target path hard-codes the checkout directory name `ccle_processing`.
! cd ../depmap-release-readmes && git pull && cp release-$release/* ../ccle_processing/readmes/ && cd -

Already up-to-date.
/home/jeremie/ccle_processing


In [None]:
# NOW UPDATE THE READMEs

In [None]:
# Commit and push the updated readmes; `git add .` stages every change under the current directory
! git add . && git commit -m "Omics: updating readmes to new release" && git push

# Mutations

## Somatic

In [190]:
# Mutation table for this release.
mutations = pd.read_csv("temp/wes_somatic_mutations_withlegacy_"+release+".csv")

# Boolean matrices, read with their first column as index and cast to integers.
damaging = pd.read_csv(
    'temp/wes_somatic_mutations_boolmatrix_fordepmap_damaging_' + samplesetname + ".csv",
    index_col=0).astype(int)
othercons = pd.read_csv(
    'temp/wes_somatic_mutations_boolmatrix_fordepmap_othercons_' + samplesetname + ".csv",
    index_col=0).astype(int)
othernoncons = pd.read_csv(
    'temp/wes_somatic_mutations_boolmatrix_fordepmap_othernoncons_' + samplesetname + ".csv",
    index_col=0).astype(int)
hotspot = pd.read_csv(
    'temp/wes_somatic_mutations_boolmatrix_fordepmap_hotspot_' + samplesetname + '.csv',
    index_col=0).astype(int)

  interactivity=interactivity, compiler=compiler, result=result)


In [176]:
#reverting to previous versions
# Columns kept in the released mutation table, in output order.
legacy_columns = [
    'Hugo_Symbol', 'Entrez_Gene_Id', 'NCBI_Build', 'Chromosome',
    'Start_position', 'End_position', 'Strand', 'Variant_Classification',
    'Variant_Type', 'Reference_Allele', 'Tumor_Allele', 'dbSNP_RS',
    'dbSNP_Val_Status', 'Genome_Change', 'Annotation_Transcript',
    'DepMap_ID', 'cDNA_Change', 'Codon_Change', 'Protein_Change', 'isDeleterious',
    'isTCGAhotspot', 'TCGAhsCnt', 'isCOSMIChotspot', 'COSMIChsCnt',
    'ExAC_AF', "Variant_annotation", 'CGA_WES_AC', 'HC_AC',
    'RD_AC', 'RNAseq_AC', 'SangerWES_AC', 'WGS_AC',
]
# Drop the rows flagged is_likely_immortalization, then select and rename columns.
kept_calls = mutations[mutations.is_likely_immortalization != True]
mutations = kept_calls[legacy_columns].rename(columns={"Tumor_Allele": "Tumor_Seq_Allele1"})

In [177]:
# DepMap IDs removed from every mutation file (the reason is not recorded in this notebook).
excluded_lines = [
    "ACH-001714",
    "ACH-002709",
    "ACH-002874",
    "ACH-002875",
    "ACH-001189",
    "ACH-002303",
    "ACH-002315",
    "ACH-002341",
]
# The mutation table carries the ID in a column, the boolean matrices carry it as
# column labels. A boolean mask is used instead of indexing with a set: sets are not
# valid indexers in recent pandas and would also scramble the column order.
mutations = mutations[~mutations.DepMap_ID.isin(excluded_lines)]
damaging = damaging.loc[:, ~damaging.columns.isin(excluded_lines)]
othercons = othercons.loc[:, ~othercons.columns.isin(excluded_lines)]
othernoncons = othernoncons.loc[:, ~othernoncons.columns.isin(excluded_lines)]
hotspot = hotspot.loc[:, ~hotspot.columns.isin(excluded_lines)]

In [None]:
# Report, per audience, which previously released lines are absent from the new
# mutation table. The audience is iterated explicitly; the cell used to rely on
# whatever `val` an earlier loop had left behind.
mutation_ids = set(mutations.DepMap_ID)
for val in ['internal', 'dmc', 'public']:
    print(val)
    # released before in any data type, but absent from the mutations now
    print('not present')
    removed = set(prev[val]) - mutation_ids
    print(removed)
    # had mutations in the previous release, but absent now
    print('removed')
    removed = set(prevmut[val]) - mutation_ids
    print(removed)

In [95]:
# Filter, export and upload the somatic mutation files for each audience.
# The tables are re-assigned inside the loop, so the filtering is cumulative: each
# audience starts from what the previous one kept, and the audience order matters.
for val in ["internal", "dmc", "public"]:
    print(val)
    newlines = set(new[val])
    missing = newlines - set(mutations.DepMap_ID)
    # lines that were neither in the previous release nor listed as new for this audience
    blacklist = set(mutations.DepMap_ID) - (prev[val] | newlines)
    print('missing')
    print(missing)
    print('blacklist')
    print(len(blacklist), blacklist)

    # mutation table: rows are filtered on the DepMap_ID column
    a = len(mutations)
    mutations = mutations[~mutations.DepMap_ID.isin(blacklist)]
    print(a - len(mutations))
    mutations.to_csv('temp/all_somatic_mutations_withlegacy.csv', index=False)

    # boolean matrices: the IDs are column labels. A mask is used rather than a set
    # indexer, which recent pandas rejects and which would shuffle the column order.
    a = len(damaging.columns)
    damaging = damaging.loc[:, ~damaging.columns.isin(blacklist)]
    print(a - len(damaging.columns))
    damaging.to_csv('temp/all_somatic_mutations_boolmatrix_fordepmap_damaging.csv')

    a = len(othercons.columns)
    othercons = othercons.loc[:, ~othercons.columns.isin(blacklist)]
    print(a - len(othercons.columns))
    othercons.to_csv('temp/all_somatic_mutations_boolmatrix_fordepmap_othercons.csv',)

    a = len(othernoncons.columns)
    othernoncons = othernoncons.loc[:, ~othernoncons.columns.isin(blacklist)]
    print(a - len(othernoncons.columns))
    othernoncons.to_csv('temp/all_somatic_mutations_boolmatrix_fordepmap_othernoncons.csv',)

    a = len(hotspot.columns)
    hotspot = hotspot.loc[:, ~hotspot.columns.isin(blacklist)]
    print(a - len(hotspot.columns))
    hotspot.to_csv('temp/all_somatic_mutations_boolmatrix_fordepmap_hotspot.csv',)

    # adding files
    # Copy this audience's readme synchronously; os.popen('cp …') returned before the
    # copy had finished and hid a missing source file.
    shutil.copy('readmes/'+val+'-'+release+'.txt', 'temp/README')

    # updating on taiga
    tc.update_dataset(dataset_permaname=taiga_mutation[val],
                     upload_file_path_dict={
    'temp/all_somatic_mutations_withlegacy.csv': 'TableCSV',
    'temp/all_somatic_mutations_boolmatrix_fordepmap_damaging.csv': 'NumericMatrixCSV',
    'temp/all_somatic_mutations_boolmatrix_fordepmap_othernoncons.csv': 'NumericMatrixCSV',
    'temp/all_somatic_mutations_boolmatrix_fordepmap_othercons.csv': 'NumericMatrixCSV',
    'temp/all_somatic_mutations_boolmatrix_fordepmap_hotspot.csv': 'NumericMatrixCSV',
    'temp/README': 'Raw'},
                     dataset_description="""
    #Mutations Omics:

    for informations, see README

    NEW LINES:
    """+str(newlines)+"""

    BLACKLISTED:
    """+str(blacklist))

    # To add to a virtual dataset
    AddToVirtual(virtual[val], taiga_mutation[val], [('CCLE_all_somatic_mutations_withlegacy', 'all_somatic_mutations_withlegacy'),
    ('CCLE_all_somatic_mutations_boolmatrix_fordepmap_damaging', 'all_somatic_mutations_boolmatrix_fordepmap_damaging'),
    ('CCLE_all_somatic_mutations_boolmatrix_fordepmap_othernoncons', 'all_somatic_mutations_boolmatrix_fordepmap_othernoncons'),
    ('CCLE_all_somatic_mutations_boolmatrix_fordepmap_othercons', 'all_somatic_mutations_boolmatrix_fordepmap_othercons'),
    ('CCLE_all_somatic_mutations_boolmatrix_fordepmap_hotspot', 'all_somatic_mutations_boolmatrix_fordepmap_hotspot'), ('README','README')])

internal


NameError: name 'new' is not defined

In [181]:
# To add to an eternal dataset
# Links the internal mutation files into the 'depmap-a0ab' dataset; unlike the
# per-audience call in the loop above, no README entry is passed here.
AddToVirtual('depmap-a0ab', taiga_mutation['internal'], [('CCLE_all_somatic_mutations_withlegacy', 'all_somatic_mutations_withlegacy'),
('CCLE_all_somatic_mutations_boolmatrix_fordepmap_damaging', 'all_somatic_mutations_boolmatrix_fordepmap_damaging'),
('CCLE_all_somatic_mutations_boolmatrix_fordepmap_othernoncons', 'all_somatic_mutations_boolmatrix_fordepmap_othernoncons'),
('CCLE_all_somatic_mutations_boolmatrix_fordepmap_othercons', 'all_somatic_mutations_boolmatrix_fordepmap_othercons'),
('CCLE_all_somatic_mutations_boolmatrix_fordepmap_hotspot', 'all_somatic_mutations_boolmatrix_fordepmap_hotspot')])

[('CCLE_mutations', 'depmap-mutation-calls-9be3.30/all_somatic_mutations_withlegacy'), ('all_somatic_mutations_boolmatrix_fordepmap_damaging', 'depmap-mutation-calls-9be3.30/all_somatic_mutations_boolmatrix_fordepmap_damaging'), ('all_somatic_mutations_boolmatrix_fordepmap_othernoncons', 'depmap-mutation-calls-9be3.30/all_somatic_mutations_boolmatrix_fordepmap_othernoncons'), ('all_somatic_mutations_boolmatrix_fordepmap_othercons', 'depmap-mutation-calls-9be3.30/all_somatic_mutations_boolmatrix_fordepmap_othercons'), ('all_somatic_mutations_boolmatrix_fordepmap_hotspot', 'depmap-mutation-calls-9be3.30/all_somatic_mutations_boolmatrix_fordepmap_hotspot')]
hitting https://cds.team/taiga/api/datafile/cf78b026ca274b6d8ae86c787012c606
hitting https://cds.team/taiga/api/datafile/cf78b026ca274b6d8ae86c787012c606
hitting https://cds.team/taiga/api/datafile/cf78b026ca274b6d8ae86c787012c606
hitting https://cds.team/taiga/api/datafile/cf78b026ca274b6d8ae86c787012c606
hitting https://cds.team/taig

hitting https://cds.team/taiga/api/datafile/cc3395ea162f47c4a7035ef510c5d877
hitting https://cds.team/taiga/api/datafile/cc3395ea162f47c4a7035ef510c5d877
hitting https://cds.team/taiga/api/datafile/cc3395ea162f47c4a7035ef510c5d877
hitting https://cds.team/taiga/api/datafile/cc3395ea162f47c4a7035ef510c5d877
hitting https://cds.team/taiga/api/datafile/cc3395ea162f47c4a7035ef510c5d877
hitting https://cds.team/taiga/api/datafile/cc3395ea162f47c4a7035ef510c5d877
hitting https://cds.team/taiga/api/datasetVersion

Dataset version with id b53a973b68e34705a05996b04208d8ff created. You can access to this dataset version directly with this url: https://cds.team/taiga/dataset_version/b53a973b68e34705a05996b04208d8ff


# Copy Number

In [11]:
# Copy-number inputs for this release: the gene-level matrix is read with its first
# column as index, the segment table as a plain table.
gene_cn_path = 'temp/all_'+release+'_gene_cn.csv'
segment_path = 'temp/all_'+release+'_segment.csv'
genecn = pd.read_csv(gene_cn_path, index_col=0)
segmentcn = pd.read_csv(segment_path)

In [13]:
#genecn = genecn.apply(lambda x: (x**2)-1)

In [16]:
# Filter, export and upload the copy-number files for each audience.
# segmentcn and genecn are re-assigned inside the loop, so the filtering is
# cumulative and the audience order matters.
for val in ['internal','dmc','public']:
    segment_ids = set(segmentcn.DepMap_ID)
    print('not present')
    removed = set(prev[val]) - segment_ids
    print(removed)
    print('removed')
    removed = set(prevcn[val]) - segment_ids
    print(removed)
    newlines = set(new[val])
    missing = newlines - segment_ids
    # lines that were neither in the previous release nor listed as new for this audience
    blacklist = segment_ids - (prev[val] | newlines)
    print('missing')
    print(missing)
    print('blacklist')
    print(len(blacklist), blacklist)
    ## for segment removing first blacklisted, then embargoed, to create two datasets
    print(len(segmentcn))
    segmentcn = segmentcn[~segmentcn.DepMap_ID.isin(blacklist)]
    print(len(segmentcn))
    segmentcn.to_csv('temp/all_merged_segments.csv', index=False)
    print(len(genecn))
    genecn = genecn[~genecn.index.isin(blacklist)]
    print(len(genecn))
    genecn.to_csv('temp/all_merged_genes_cn.csv')

    # Copy this audience's readme synchronously; os.popen('cp …') returned before the
    # copy had finished and hid a missing source file.
    shutil.copy('readmes/'+val+'-'+release+'.txt', 'temp/README')

    # Add to Taiga
    # The README format is spelled 'Raw', as in the mutation upload (it was 'raw' here).
    tc.update_dataset(dataset_permaname=taiga_copynumber[val], 
                      upload_file_path_dict={
                        'temp/all_merged_genes_cn.csv': 'NumericMatrixCSV',
                        'temp/all_merged_segments.csv': 'TableCSV',
                        'temp/README': 'Raw'},
                      dataset_description=
    """
    #Copy Number Omics:

    for informations, see README

    NEW LINES:
    """+str(newlines)+"""

    BLACKLIST:
    """+str(blacklist))
    # To add to a virtual dataset
    AddToVirtual(virtual[val], taiga_copynumber[val], [('CCLE_all_merged_genes_cn', 'all_merged_genes_cn'), ('CCLE_all_merged_segments','all_merged_segments'), ('README','README')])

not present
{'ACH-000033', 'ACH-000084', 'ACH-001189', 'ACH-002390', 'ACH-002396', 'ACH-001429', 'ACH-001131', 'ACH-001187', 'ACH-001011', 'ACH-001712', 'ACH-001393', 'ACH-002391', 'ACH-001109', 'ACH-001741', 'ACH-001316', 'ACH-002393', 'ACH-002341', 'ACH-002359', 'ACH-002394', 'ACH-000629', 'ACH-001108', 'ACH-002395', 'ACH-002709', 'ACH-002303', 'ACH-002315', 'ACH-001743'}
removed
{'ACH-001189', 'ACH-002315', 'ACH-002303', 'ACH-002341', 'ACH-002359'}
missing
set()
blacklist
16 {'ACH-001434', 'ACH-001707', 'ACH-001705', 'ACH-002013', 'ACH-001759', 'ACH-001756', 'ACH-001758', 'ACH-001553', 'ACH-002476', 'ACH-001686', 'ACH-001760', 'ACH-001046', 'ACH-001227', 'ACH-001828', 'ACH-002138', 'ACH-002055'}


In [None]:
# To add to an eternal dataset
# Links the internal copy-number files into the 'depmap-a0ab' dataset (no README entry is passed).
AddToVirtual('depmap-a0ab', taiga_copynumber['internal'], [('CCLE_all_merged_genes_cn', 'all_merged_genes_cn'), ('CCLE_all_merged_segments', 'all_merged_segments')])

# RNA

In [79]:
def _read_expression_matrix(path):
    """Read one expression CSV, using its first column as the row index."""
    return pd.read_csv(path, index_col=0)


# Expression matrices for this release, at gene, protein-coding gene and transcript level.
transcripts_tpm = _read_expression_matrix('temp/expression_' + release + '_transcripts_tpm.csv')
genes_tpm = _read_expression_matrix('temp/expression_' + release + '_genes_tpm.csv')
genes_expected_count = _read_expression_matrix('temp/expression_' + release + '_genes_expected_count.csv')
proteincoding_genes_expected_count = _read_expression_matrix('temp/expression_' + release + '_proteincoding_genes_expected_count.csv')
proteincoding_genes_tpm = _read_expression_matrix('temp/expression_' + release + '_proteincoding_genes_tpm.csv')
transcripts_expected_count = _read_expression_matrix('temp/expression_' + release + '_transcripts_expected_count.csv')

In [99]:
# Copy the female GTEx participants that have a WGS CRAM from the GTEx workspace
# into the PON workspace as samples, and group them in the 'WGS_XX_GTEX' sample set.
# dalmatian is already imported as `dm` in the first cell, so it is not re-imported here.
cm = "anvil-datastorage/AnVIL_GTEx_V8_hg38"
bm = "broad-firecloud-ccle/DepMap_WGS_PON_hg38"
wm = dm.WorkspaceManager(cm)
sam = wm.get_participants()
# .copy() so that the assignments below modify an independent frame rather than a
# slice of the participants table (avoids pandas' SettingWithCopyWarning)
sam = sam[(sam.sex=="Female") & (~sam.wgs_cram_file.isna())].copy()
sam.index.name="sample_id"
sam['participant_id']=sam.index
dm.WorkspaceManager(bm).disable_hound().upload_samples(sam[['wgs_cram_file','wgs_cram_index', "participant_id"]])
dm.WorkspaceManager(bm).update_sample_set('WGS_XX_GTEX', sam.index.tolist())

In [116]:
# NOTE(review): these are the same two calls that end the previous cell — confirm this cell is still needed.
dm.WorkspaceManager(bm).disable_hound().upload_samples(sam[['wgs_cram_file','wgs_cram_index', "participant_id"]])
dm.WorkspaceManager(bm).update_sample_set('WGS_XX_GTEX', sam.index.tolist())

Successfully imported 296 participants.
Successfully imported 296 samples.
Successfully imported 1 sample sets:
  * WGS_XX_GTEX (296 samples)


In [118]:
# Rebinds `wm` (previously the GTEx workspace `cm`) to the PON workspace `bm`.
wm = dm.WorkspaceManager(bm)

In [119]:
# Deletes every sample listed in `sam` from the workspace `wm` points to.
# NOTE(review): this removes the samples uploaded by the cells above — confirm it should run in a full pass.
wm.delete_sample(sam.index.tolist())

Sample set "WGS_XX_GTEX" (0 samples) successfully updated.
Sample(s) ['GTEX-1117F', 'GTEX-1122O', 'GTEX-1128S', 'GTEX-113JC', 'GTEX-11DXX', 'GTEX-11EM3', 'GTEX-11EMC', 'GTEX-11GSP', 'GTEX-11I78', 'GTEX-11ILO', 'GTEX-11P81', 'GTEX-11TTK', 'GTEX-11UD1', 'GTEX-11VI4', 'GTEX-11XUK', 'GTEX-11ZTS', 'GTEX-11ZTT', 'GTEX-11ZVC', 'GTEX-1211K', 'GTEX-1269C', 'GTEX-12WS9', 'GTEX-12WSB', 'GTEX-12WSD', 'GTEX-12WSG', 'GTEX-12WSJ', 'GTEX-12WSK', 'GTEX-12ZZX', 'GTEX-13113', 'GTEX-1313W', 'GTEX-131XG', 'GTEX-131XW', 'GTEX-131YS', 'GTEX-132AR', 'GTEX-133LE', 'GTEX-1399S', 'GTEX-1399U', 'GTEX-139D8', 'GTEX-139T4', 'GTEX-13CF3', 'GTEX-13D11', 'GTEX-13FH7', 'GTEX-13FTX', 'GTEX-13FTY', 'GTEX-13JUV', 'GTEX-13N11', 'GTEX-13NZ8', 'GTEX-13O3O', 'GTEX-13OVI', 'GTEX-13OVJ', 'GTEX-13PL6', 'GTEX-13PL7', 'GTEX-13PLJ', 'GTEX-13PVR', 'GTEX-13QBU', 'GTEX-13QIC', 'GTEX-13QJC', 'GTEX-13S7M', 'GTEX-13SLX', 'GTEX-13U4I', 'GTEX-13VXT', 'GTEX-13W3W', 'GTEX-13X6H', 'GTEX-13X6K', 'GTEX-145MI', 'GTEX-146FH', 'GTEX-146FR', 'GTEX-

In [80]:
#putting it back to what it was before

# TPM matrices are released as log2(TPM + 1); the ufunc is applied to each whole frame.
transcripts_tpm = np.log2(transcripts_tpm + 1)
genes_tpm = np.log2(genes_tpm + 1)
proteincoding_genes_tpm = np.log2(proteincoding_genes_tpm + 1)

In [None]:
store -r rename

In [81]:
# Filter, export and upload the expression files for each audience.
# Every matrix is re-assigned inside the loop, so the filtering is cumulative and
# the audience order matters.
for val in ['internal','dmc','public']:
    expression_ids = set(genes_tpm.index)
    print('not present')
    removed = set(prev[val]) - expression_ids
    print(removed)
    print('removed for QC reasons')
    print(set(rename.keys()))
    print('removed')
    removed = set(prevrna[val]) - expression_ids
    print(removed - set(rename.keys()))
    newlines = set(new[val])
    missing = newlines - expression_ids
    # NOTE(review): this blacklist compares against prevrna[val] while the other data
    # types compare against prev[val] — confirm the difference is intended.
    blacklist = expression_ids - (prevrna[val] | newlines)
    print('missing')
    print(missing)
    print('blacklist')
    print(len(blacklist), blacklist)

    ## removing first blacklisted, then embargoed, to create two datasets
    print(len(genes_expected_count))
    genes_expected_count = genes_expected_count[~genes_expected_count.index.isin(blacklist)]
    print(len(genes_expected_count))
    genes_expected_count.to_csv('temp/expression_genes_expected_count.csv')
    print(len(genes_tpm))
    genes_tpm = genes_tpm[~genes_tpm.index.isin(blacklist)]
    print(len(genes_tpm))
    genes_tpm.to_csv('temp/expression_genes_tpm.csv')
    print(len(proteincoding_genes_tpm))
    proteincoding_genes_tpm = proteincoding_genes_tpm[~proteincoding_genes_tpm.index.isin(blacklist)]
    print(len(proteincoding_genes_tpm))
    proteincoding_genes_tpm.to_csv('temp/expression_proteincoding_genes_tpm.csv')
    print(len(transcripts_tpm))
    transcripts_tpm = transcripts_tpm[~transcripts_tpm.index.isin(blacklist)]
    print(len(transcripts_tpm))
    transcripts_tpm.to_csv('temp/expression_transcripts_tpm.csv')
    print(len(proteincoding_genes_expected_count))
    proteincoding_genes_expected_count = proteincoding_genes_expected_count[~proteincoding_genes_expected_count.index.isin(blacklist)]
    print(len(proteincoding_genes_expected_count))
    proteincoding_genes_expected_count.to_csv('temp/expression_proteincoding_genes_expected_count.csv')
    print(len(transcripts_expected_count))
    transcripts_expected_count = transcripts_expected_count[~transcripts_expected_count.index.isin(blacklist)]
    print(len(transcripts_expected_count))
    transcripts_expected_count.to_csv('temp/expression_transcripts_expected_count.csv')

    # Copy this audience's readme synchronously; os.popen('cp …') returned before the
    # copy had finished and hid a missing source file.
    shutil.copy('readmes/'+val+'-'+release+'.txt', 'temp/README')

    # adding to taiga
    # The permanames live in `taiga_expression` (the cell referenced an undefined
    # `taiga_rna`), and the README is uploaded with the 'Raw' format as in the
    # mutation upload ("README" is not a format name).
    # NOTE(review): the description below says "INTERNAL RNA" for every audience.
    tc.update_dataset(dataset_permaname=taiga_expression[val],
                     upload_file_path_dict={
                       'temp/expression_genes_expected_count.csv': 'NumericMatrixCSV',
                       'temp/expression_transcripts_tpm.csv': 'NumericMatrixCSV',
                       'temp/expression_genes_tpm.csv': 'NumericMatrixCSV',
                       'temp/expression_proteincoding_genes_tpm.csv': 'NumericMatrixCSV',
                       'temp/expression_proteincoding_genes_expected_count.csv': 'NumericMatrixCSV',
                       'temp/expression_transcripts_expected_count.csv': 'NumericMatrixCSV',
                     "temp/README": "Raw"},
                      dataset_description=
    """
    # INTERNAL RNA

    for information, see README

    NEW LINES:
    """+str(newlines)+"""

    REMOVED FOR QC REASONS:
    """+str(rename)+"""

    BLACKLIST:
    """+str(blacklist))

    # add to virtual 
    AddToVirtual(virtual[val], taiga_expression[val], files=[
    ('CCLE_expression_genes_tpm', 'expression_genes_tpm'), 
    ('CCLE_expression_transcripts_tpm', 'expression_transcripts_tpm'),
    ('CCLE_expression_genes_expected_count', 'expression_genes_expected_count'),
    ('CCLE_expression_proteincoding_genes_tpm', 'expression_proteincoding_genes_tpm'), ('CCLE_expression_proteincoding_genes_expected_count', 'expression_proteincoding_genes_expected_count'),('CCLE_expression_transcripts_expected_count', 'expression_transcripts_expected_count'), ('README','README')])

not present
{'ACH-002162', 'ACH-001101', 'ACH-002116', 'ACH-002152', 'ACH-002247', 'ACH-002221', 'ACH-002105', 'ACH-001091', 'ACH-002220', 'ACH-001044', 'ACH-001047', 'ACH-002100', 'ACH-001224', 'ACH-002316', 'ACH-002185', 'ACH-002356', 'ACH-002374', 'ACH-002348', 'ACH-002106', 'ACH-002289', 'ACH-002096', 'ACH-002224', 'ACH-002341', 'ACH-002394', 'ACH-001137', 'ACH-002344', 'ACH-001130', 'ACH-002092', 'ACH-002400', 'ACH-002161', 'ACH-001233', 'ACH-002155', 'ACH-001364', 'ACH-002258', 'ACH-002243', 'ACH-002306', 'ACH-002300', 'ACH-001704', 'ACH-001092', 'ACH-001093', 'ACH-002168', 'ACH-002357', 'ACH-002244', 'ACH-001182', 'ACH-002304', 'ACH-002313', 'ACH-002122', 'ACH-001107', 'ACH-002358', 'ACH-002359', 'ACH-002192', 'ACH-002298', 'ACH-002040', 'ACH-002178', 'ACH-000047', 'ACH-002099', 'ACH-002102', 'ACH-001090', 'ACH-002144', 'ACH-002146', 'ACH-002274', 'ACH-002233', 'ACH-002275', 'ACH-002094', 'ACH-001002', 'ACH-002135', 'ACH-002252', 'ACH-002210', 'ACH-001121', 'ACH-002153', 'ACH-00

In [94]:
# Link the internal expression files into the 'depmap-a0ab' dataset.
# The permaname comes from `taiga_expression`; `taiga_rna` is not defined in this notebook.
AddToVirtual('depmap-a0ab', taiga_expression['internal'], files=[
('CCLE_expression_genes_tpm', 'expression_genes_tpm'), 
('CCLE_expression_transcripts_tpm', 'expression_transcripts_tpm'),
('CCLE_expression_genes_expected_count', 'expression_genes_expected_count'),
('CCLE_expression_proteincoding_genes_tpm', 'expression_proteincoding_genes_tpm'), ('CCLE_expression_proteincoding_genes_expected_count', 'expression_proteincoding_genes_expected_count'),('CCLE_expression_transcripts_expected_count', 'expression_transcripts_expected_count')])

[('CCLE_expression_full', 'depmap-rnaseq-expression-data-363a.32/expression_genes_tpm'), ('CCLE_RNAseq_transcripts', 'depmap-rnaseq-expression-data-363a.32/expression_transcripts_tpm'), ('CCLE_RNAseq_reads', 'depmap-rnaseq-expression-data-363a.32/expression_genes_expected_count'), ('CCLE_expression', 'depmap-rnaseq-expression-data-363a.32/expression_proteincoding_genes_tpm'), ('CCLE_expression_proteincoding_genes_expected_count', 'depmap-rnaseq-expression-data-363a.32/expression_proteincoding_genes_expected_count'), ('CCLE_expression_transcripts_expected_count', 'depmap-rnaseq-expression-data-363a.32/expression_transcripts_expected_count')]
hitting https://cds.team/taiga/api/datafile/766a8a1003394a1da6d478547f106c23
hitting https://cds.team/taiga/api/datafile/766a8a1003394a1da6d478547f106c23
hitting https://cds.team/taiga/api/datafile/766a8a1003394a1da6d478547f106c23
hitting https://cds.team/taiga/api/datafile/766a8a1003394a1da6d478547f106c23
hitting https://cds.team/taiga/api/datafile

hitting https://cds.team/taiga/api/datafile/c3b3f1a82b7a4e29b40bf4cd2b64b96f
hitting https://cds.team/taiga/api/datasetVersion

Dataset version with id 1685b01832ce49cf8b58ec9c253ec01b created. You can access to this dataset version directly with this url: https://cds.team/taiga/dataset_version/1685b01832ce49cf8b58ec9c253ec01b


# Fusions

In [55]:
# Fusion tables for this release: the unfiltered calls and the filtered subset.
unfiltered_path = 'temp/unfiltered_fusions_'+release+'.csv'
filtered_path = 'temp/fusions_'+release+'.csv'
fusions = pd.read_csv(unfiltered_path)
filtered = pd.read_csv(filtered_path)

FileNotFoundError: [Errno 2] No such file or directory: 'temp/unfiltered_fusions_20Q4.csv'

In [56]:
# Filter, export and upload the fusion tables for each audience.
# Both tables are re-assigned inside the loop, so the filtering is cumulative and
# the audience order matters.
for val in ['internal', 'dmc', 'public']:
    fusion_ids = set(fusions.DepMap_ID)
    print('not present')
    removed = set(prev[val]) - fusion_ids
    print(removed)
    print('removed for QC reasons')
    print(set(rename.keys()))
    print('removed')
    removed = set(prevrna[val]) - fusion_ids
    print(removed - set(rename.keys()))
    newlines = set(new[val])
    missing = newlines - fusion_ids
    # lines that were neither in the previous release nor listed as new for this audience
    blacklist = fusion_ids - (prev[val] | newlines)
    print('missing')
    print(missing)
    print('blacklist')
    print(len(blacklist), blacklist)
    ## removing first blacklisted, then embargoed, to create two datasets
    print(len(fusions))
    fusions = fusions[~fusions.DepMap_ID.isin(blacklist)]
    print(len(fusions))
    fusions.to_csv('temp/fusions.csv', index=False)
    print(len(filtered))
    filtered= filtered[~filtered.DepMap_ID.isin(blacklist)]
    print(len(filtered))
    filtered.to_csv('temp/filtered_fusions.csv', index=False)

    # Copy this audience's readme synchronously; os.popen('cp …') returned before the
    # copy had finished and hid a missing source file.
    shutil.copy('readmes/'+val+'-'+release+'.txt', 'temp/README')

    # uploading to taiga
    # The README entry needs a colon: `'temp/README', 'raw'` inside the dict literal
    # was a SyntaxError. The format is spelled 'Raw' as in the mutation upload.
    # NOTE(review): the description below says "Internal Fusions" for every audience.
    tc.update_dataset(dataset_permaname=taiga_fusion[val],
                     upload_file_path_dict={
                         'temp/fusions.csv': 'TableCSV',
                         'temp/filtered_fusions.csv': 'TableCSV',
                         'temp/README': 'Raw'},
                      dataset_description=
    """
    # Internal Fusions

    for more information, see README

    NEW LINES:
    """+str(newlines)+"""

    REMOVED FOR QC REASONS:
    """+str(rename)+"""

    BLACKLIST:
    """+str(blacklist))

    # virtual datasets
    AddToVirtual(virtual[val], taiga_fusion[val], files=[('CCLE_fusions', 'fusions'),('CCLE_filtered_fusions', 'filtered_fusions'), ('README','README')])

not present
{'ACH-002196', 'ACH-002205', 'ACH-002289', 'ACH-001064', 'ACH-001641', 'ACH-002127', 'ACH-002344', 'ACH-002384', 'ACH-002383', 'ACH-002157', 'ACH-001037', 'ACH-001143', 'ACH-002216', 'ACH-001680', 'ACH-000727', 'ACH-002351', 'ACH-002207', 'ACH-002114', 'ACH-002300', 'ACH-000795', 'ACH-002377', 'ACH-002204', 'ACH-002360', 'ACH-002292', 'ACH-002058', 'ACH-002104', 'ACH-002314', 'ACH-002183', 'ACH-002213', 'ACH-002168', 'ACH-002382', 'ACH-002260', 'ACH-001208', 'ACH-002095', 'ACH-002200', 'ACH-002269', 'ACH-002387', 'ACH-002375', 'ACH-002129', 'ACH-001199', 'ACH-002312', 'ACH-000561', 'ACH-002282', 'ACH-002395', 'ACH-001338', 'ACH-002232', 'ACH-002267', 'ACH-002190', 'ACH-001502', 'ACH-001093', 'ACH-002336', 'ACH-002281', 'ACH-001137', 'ACH-002199', 'ACH-002208', 'ACH-002185', 'ACH-002276', 'ACH-002374', 'ACH-002396', 'ACH-001383', 'ACH-002237', 'ACH-001151', 'ACH-002119', 'ACH-002136', 'ACH-002359', 'ACH-001182', 'ACH-002399', 'ACH-002106', 'ACH-001187', 'ACH-002394', 'ACH-00

In [52]:
# Link the internal fusion files into the 'depmap-a0ab' dataset. The permaname is
# read from the config (same value, "gene-fusions-8b7a") like the other data types,
# so it follows the config on the next release.
AddToVirtual('depmap-a0ab', taiga_fusion['internal'], files=[('CCLE_fusions', 'fusions'),('CCLE_filtered_fusions', 'filtered_fusions')])

[('CCLE_fusions_unfiltered', 'gene-fusions-8b7a.16/fusions'), ('CCLE_fusions', 'gene-fusions-8b7a.16/filtered_fusions')]
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/a

In [None]:
# Wait on an existing Terra submission, then submit the CNV somatic workflow on the
# samples of the `samplesetname` sample set.
# NOTE(review): `refworkspace` and `refwm` are not defined in any cell visible here —
# confirm they exist before running this cell.
terra.waitForSubmission(refworkspace, "847eef61-01af-496d-8d8e-cb1cbaa48f7c")
submission_id2 = refwm.create_submission("CNV_Somatic_Workflow_on_Sample", samplesetname,'sample_set',expression='this.samples')



status is: Failed for 0 jobs in submission 0. 774 mn elapsed.