# Intro & Loading 

In [1]:
from __future__ import print_function
import os.path
import pandas as pd
import gzip
import sys
import numpy as np

sys.path.insert(0, '..')

from src.CCLE_postp_function import *
from JKBio import Datanalytics as da 
from JKBio import TerraFunction as terra
from JKBio import Helper as h
from gsheets import Sheets
from taigapy import TaigaClient
import dalmatian as dm

from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier

from bokeh.plotting import *
from bokeh.models import HoverTool
from collections import OrderedDict
from IPython.display import Image,display


%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
tc = TaigaClient()
output_notebook()

my_id = '~/.client_secret.json'
mystorage_id = "~/.storage.json"
sheets = Sheets.from_files(my_id, mystorage_id)
replace = {'T': 'Tumor', 'N': 'Normal', 'm': 'Unknown', 'L': 'Unknown'}

you need to have JKBio in your path:
e.g. have installed JKBio in the same folder as ccle_processing


  from pandas.core.index import Index as PandasIndex


In [2]:
samplesetname = "20Q4"
prevname="20Q3"

prevprevname ='20Q2'

virtual_public='public-20q4-a4b3'
virtual_dmc='dmc-20q4-fcf4'
virtual_ibm='ibm-20q4-269f'
virtual_internal='internal-20q4-2540'

prev_virtual_public='public-20q3-3d35'
prev_virtual_dmc='dmc-20q3-deprecated-never-released--5f55'
prev_virtual_internal='internal-20q3-00d0'


prevprev_virtual_internal='internal-20q2-7f46'


workspace1="terra-broad-cancer-prod/DepMap_WGS"
workspace2="terra-broad-cancer-prod/Getz_IBM_CellLines_WGS"


refworkspace="broad-firecloud-ccle/DepMap_WGS_CN"
cgaworkspace="broad-firecloud-ccle/DepMap_Mutation_Calling_CGA_pipeline-wgs"

source1="ccle"
source2="ibm"

refsheet_url = "https://docs.google.com/spreadsheets/d/1XkZypRuOEXzNLxVk9EOHeWRE98Z8_DBvL4PovyM01FE"
sheeturl = "https://docs.google.com/spreadsheets/d/115TUgA1t_mD32SnWAGpW9OKmJ2W5WYAOs3SuSdedpX4"
potential_list_url = "https://docs.google.com/spreadsheets/d/1YuKEgZ1pFKRYzydvncQt9Y_BKToPlHP-oDB-0CAv3gE"
release = samplesetname

In [3]:
gsheets = sheets.get(sheeturl).sheets[6].to_frame()
wes_dmc_embargo = [i for i in gsheets['WES_DMC_embargo'].values.tolist() if str(i) != "nan"]
wes_embargo = [i for i in gsheets['WES_embargo'].values.tolist() if str(i) != "nan"]
blacklist = [i for i in gsheets['blacklist'].values.tolist() if str(i) != "nan"]

In [4]:
gsheets = sheets.get(potential_list_url).sheets[0].to_frame()
internal = [i for i in gsheets['Internal'].values.tolist() if str(i) != "nan"]
dmc = [i for i in gsheets['DMC'].values.tolist() if str(i) != "nan"]
ibm = [i for i in gsheets['IBM'].values.tolist() if str(i) != "nan"]
public = [i for i in gsheets['Public'].values.tolist() if str(i) != "nan"]

## Getting what was released before

In [5]:
print('internal')
internal_mut = set(tc.get(name=prev_virtual_internal, file='CCLE_mutations').DepMap_ID)
internal_rna = set(tc.get(name=prev_virtual_internal, file='CCLE_expression').index)
internal_cn = set(tc.get(name=prev_virtual_internal, file='CCLE_segment_cn').DepMap_ID)
previnternal = internal_mut | internal_rna | internal_cn
print('mismatch cn/mut')
print(internal_mut ^ internal_cn)
print('mismatch rna+cn/mut')
print(previnternal - internal_mut)
print('mismatch mut+cn/rna')
print(previnternal - internal_rna)

#ibm_mut = set(tc.get(name=prev_virtual_ibm, file='CCLE_mutations').DepMap_ID)
#ibm_rna = set(tc.get(name=prev_virtual_ibm, file='CCLE_expression').index)
#ibm_cn = tc.get(name=prev_virtual_ibm, file='CCLE_segment_cn')

print('dmc')
dmc_mut = set(tc.get(name=prev_virtual_dmc, file='CCLE_mutations').DepMap_ID)
dmc_rna = set(tc.get(name=prev_virtual_dmc, file='CCLE_expression').index)
dmc_cn = set(tc.get(name=prev_virtual_dmc, file='CCLE_segment_cn').DepMap_ID)
prevdmc = dmc_mut | dmc_rna | dmc_cn
print('mismatch cn/mut')
print(dmc_mut ^ dmc_cn)
print('mismatch rna+cn/mut')
print(prevdmc - dmc_mut)
print('mismatch mut+cn/rna')
print(prevdmc - dmc_rna)

print('public')
public_mut = set(tc.get(name=prev_virtual_public, file='CCLE_mutations').DepMap_ID)
public_rna = set(tc.get(name=prev_virtual_public, file='CCLE_expression').index)
public_cn = set(tc.get(name=prev_virtual_public, file='CCLE_segment_cn').DepMap_ID)
prevpublic = public_mut | public_rna | public_cn
print('mismatch cn/mut')
print(public_mut ^ public_cn)
print('mismatch rna+cn/mut')
print(prevpublic - public_mut)
print('mismatch mut+cn/rna')
print(prevpublic - public_rna)

internal
mismatch cn/mut
{'ACH-001037', 'ACH-002396', 'ACH-000629', 'ACH-001045', 'ACH-001018', 'ACH-001678', 'ACH-001098', 'ACH-001101', 'ACH-001225', 'ACH-001017', 'ACH-001224', 'ACH-001108', 'ACH-001087', 'ACH-001071', 'ACH-001171', 'ACH-001198', 'ACH-001121', 'ACH-001079', 'ACH-000010', 'ACH-001175', 'ACH-000084', 'ACH-001194', 'ACH-002204', 'ACH-002395', 'ACH-001187', 'ACH-002394', 'ACH-002048', 'ACH-002391', 'ACH-001011', 'ACH-001109', 'ACH-001078', 'ACH-002392', 'ACH-001847', 'ACH-000033', 'ACH-001000', 'ACH-001131', 'ACH-002393', 'ACH-001061', 'ACH-001210', 'ACH-001015', 'ACH-002390', 'ACH-000712', 'ACH-002335', 'ACH-001088', 'ACH-001249'}
mismatch rna+cn/mut
{'ACH-001249', 'ACH-001079', 'ACH-002709', 'ACH-002335', 'ACH-001088', 'ACH-001000', 'ACH-001101', 'ACH-001045', 'ACH-001493', 'ACH-001017', 'ACH-001449', 'ACH-001712', 'ACH-001087', 'ACH-001224', 'ACH-001171', 'ACH-002010', 'ACH-001037', 'ACH-001071', 'ACH-001502', 'ACH-001662', 'ACH-001669', 'ACH-001708', 'ACH-001672', '

mismatch cn/mut
{'ACH-001037', 'ACH-002396', 'ACH-000629', 'ACH-001045', 'ACH-001018', 'ACH-001098', 'ACH-001101', 'ACH-001225', 'ACH-001017', 'ACH-001224', 'ACH-001108', 'ACH-001087', 'ACH-001071', 'ACH-001171', 'ACH-001198', 'ACH-001121', 'ACH-001079', 'ACH-000010', 'ACH-001175', 'ACH-000084', 'ACH-001194', 'ACH-002204', 'ACH-002395', 'ACH-001187', 'ACH-002394', 'ACH-002391', 'ACH-001011', 'ACH-001109', 'ACH-001078', 'ACH-002392', 'ACH-000033', 'ACH-001000', 'ACH-001131', 'ACH-002393', 'ACH-001061', 'ACH-001210', 'ACH-001015', 'ACH-002390', 'ACH-000712', 'ACH-002335', 'ACH-001088', 'ACH-001249'}
mismatch rna+cn/mut
{'ACH-001249', 'ACH-001079', 'ACH-002335', 'ACH-001088', 'ACH-001000', 'ACH-001101', 'ACH-001045', 'ACH-001017', 'ACH-001712', 'ACH-001087', 'ACH-001224', 'ACH-001171', 'ACH-001037', 'ACH-001071', 'ACH-001743', 'ACH-001225', 'ACH-001316', 'ACH-001061', 'ACH-001741', 'ACH-001018', 'ACH-001194', 'ACH-002204', 'ACH-001175', 'ACH-001198', 'ACH-001098', 'ACH-001015', 'ACH-00112

## managing the readmes

In [6]:
! cd .. && git clone https://github.com/broadinstitute/depmap-release-readmes.git && cd -

fatal: destination path 'depmap-release-readmes' already exists and is not an empty directory.


In [7]:
! cd ../depmap-release-readmes && git pull

remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 26 (delta 14), reused 19 (delta 9), pack-reused 0[K
Unpacking objects: 100% (26/26), done.
From https://github.com/broadinstitute/depmap-release-readmes
   5e66713..32bbcb4  master     -> origin/master
Updating 5e66713..32bbcb4
Fast-forward
 .gitignore                   |   1 [32m+[m
 release-20q3/.DS_Store       | Bin [31m0[m -> [32m6148[m bytes
 release-20q3/dmc-20q3.txt    | 348 [32m+++++++++++++++++++++++++++++++++++++++++++[m
 release-20q3/public-20q3.txt | 342 [32m++++++++++++++++++++++++++++++++++++++++++[m
 4 files changed, 691 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 release-20q3/.DS_Store
 create mode 100644 release-20q3/dmc-20q3.txt
 create mode 100644 release-20q3/public-20q3.txt


In [10]:
!cd ../depmap-release-readmes/ && python3 make_new_release.py $release && git add . && git commit -m $release && git push 

Making public
Making internal
Making dmc
[master c7fb45e] 20Q4
 4 files changed, 1386 insertions(+)
 create mode 100644 release-20q3/internal-20q3.txt
 create mode 100644 release-20q4/dmc-20q4.txt
 create mode 100644 release-20q4/internal-20q4.txt
 create mode 100644 release-20q4/public-20q4.txt
Counting objects: 6, done.
Delta compression using up to 12 threads.
Compressing objects: 100% (6/6), done.
Writing objects: 100% (6/6), 6.32 KiB | 0 bytes/s, done.
Total 6 (delta 3), reused 0 (delta 0)
remote: Resolving deltas: 100% (3/3), completed with 2 local objects.[K
To https://github.com/broadinstitute/depmap-release-readmes.git
   32bbcb4..c7fb45e  master -> master


In [14]:
! mkdir temp/README/

In [15]:
! cd ../depmap-release-readmes && git pull && cp -r release-* ../ccle_processing/temp/README/ && cd -

Already up-to-date.
/home/jeremie/ccle_processing


# Mutations

## Somatic

In [190]:
mutations = pd.read_csv("temp/wes_somatic_mutations_withlegacy_"+release+".csv")
#damaging = pd.read_csv('temp/wes_somatic_mutations_boolmatrix_fordepmap_damaging_' + samplesetname + ".csv", index_col=0)
#othercons = pd.read_csv('temp/wes_somatic_mutations_boolmatrix_fordepmap_othercons_' + samplesetname + ".csv", index_col=0)
#othernoncons = pd.read_csv('temp/wes_somatic_mutations_boolmatrix_fordepmap_othernoncons_' + samplesetname + ".csv", index_col=0)
#hotspot = pd.read_csv('temp/wes_somatic_mutations_boolmatrix_fordepmap_hotspot_' + samplesetname + '.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [191]:
mutations[mutations.DepMap_ID=="ACH-002359"]

Unnamed: 0.1,Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,NCBI_Build,Chromosome,Start_position,End_position,Strand,Variant_Classification,Variant_Type,...,ExAC_AF,PASS,is_likely_immortalization,CGA_WES_AC,HC_AC,Variant_annotation,RD_AC,RNAseq_AC,SangerWES_AC,WGS_AC
1302147,1204721,COL16A1,1307,37,1,32163671,32163671,+,Missense_Mutation,SNP,...,0.000008,True,False,45:43,,other non-conserving,,,,
1302148,1204722,MTF1,4520,37,1,38289412,38289412,+,Missense_Mutation,SNP,...,0.000008,True,False,21:19,,other non-conserving,,,,
1302149,1204723,FAAH,2166,37,1,46872008,46872008,+,Missense_Mutation,SNP,...,,True,False,21:30,,other non-conserving,,,,
1302150,1204724,ZCCHC11,23318,37,1,52896892,52896892,+,Missense_Mutation,SNP,...,0.000041,True,False,31:24,,other non-conserving,,,,
1302151,1204725,SAMD13,148418,37,1,84815454,84815454,+,Silent,SNP,...,0.000008,True,False,61:71,,silent,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1302339,1204803,UNC13B,10497,37,9,35397690,35397690,+,Missense_Mutation,SNP,...,0.000033,True,False,20:16,,other non-conserving,,,,
1302340,1204804,TESK1,7016,37,9,35609184,35609184,+,Silent,SNP,...,,True,False,38:27,,silent,,,,
1302341,1204805,RNF20,56254,37,9,104324630,104324630,+,Missense_Mutation,SNP,...,,True,False,20:60,,other non-conserving,,,,
1302342,1204806,SH2D3C,10044,37,9,130507316,130507316,+,Missense_Mutation,SNP,...,0.000043,True,False,6:15,,other non-conserving,,,,


In [175]:
mutations = mutations[mutations.is_likely_immortalization!=True]

In [176]:
mutations = mutations[['Hugo_Symbol', 'Entrez_Gene_Id', 'NCBI_Build', 'Chromosome',
       'Start_position', 'End_position', 'Strand', 'Variant_Classification',
       'Variant_Type', 'Reference_Allele', 'Tumor_Allele', 'dbSNP_RS',
       'dbSNP_Val_Status', 'Genome_Change', 'Annotation_Transcript',
       'DepMap_ID', 'cDNA_Change', 'Codon_Change', 'Protein_Change', 'isDeleterious',
       'isTCGAhotspot', 'TCGAhsCnt', 'isCOSMIChotspot', 'COSMIChsCnt',
       'ExAC_AF',"Variant_annotation", 'CGA_WES_AC', 'HC_AC',
       'RD_AC', 'RNAseq_AC', 'SangerWES_AC', 'WGS_AC']].rename(columns={"Tumor_Allele":"Tumor_Seq_Allele1"})

In [177]:
mutations = mutations[~mutations.DepMap_ID.isin(["ACH-001714",
"ACH-002709",
"ACH-002874",
"ACH-002875",
"ACH-001189",
"ACH-002303",
"ACH-002315",
"ACH-002341"])]
damaging = damaging[set(damaging.columns)-set(["ACH-001714",
"ACH-002709",
"ACH-002874",
"ACH-002875",
"ACH-001189",
"ACH-002303",
"ACH-002315",
"ACH-002341"])]
othercons = othercons[set(othercons.columns)-set(["ACH-001714",
"ACH-002709",
"ACH-002874",
"ACH-002875",
"ACH-001189",
"ACH-002303",
"ACH-002315",
"ACH-002341"])]
othernoncons = othernoncons[set(othernoncons.columns)-set(["ACH-001714",
"ACH-002709",
"ACH-002874",
"ACH-002875",
"ACH-001189",
"ACH-002303",
"ACH-002315",
"ACH-002341"])]
hotspot = hotspot[set(hotspot.columns)-set(["ACH-001714",
"ACH-002709",
"ACH-002874",
"ACH-002875",
"ACH-001189",
"ACH-002303",
"ACH-002315",
"ACH-002341"])]

In [100]:
hotspot=hotspot.astype(int)
damaging=damaging.astype(int)
othercons=othercons.astype(int)
othernoncons=othernoncons.astype(int)

## Internal

In [178]:
print('nott present')
removed = set(previnternal) - set(mutations.DepMap_ID)
print(removed)
print('removed')
removed = set(internal_mut) - set(mutations.DepMap_ID)
print(removed)
missing = set(internal) - set(mutations.DepMap_ID)
blacklist = set(mutations.DepMap_ID) - (previnternal | set(internal))
print('missing')
print(missing)
newlines = set(internal) 
print('blacklist')
print(len(blacklist), blacklist)

nott present
{'ACH-001189', 'ACH-001249', 'ACH-001079', 'ACH-002709', 'ACH-002335', 'ACH-001088', 'ACH-001000', 'ACH-001101', 'ACH-001045', 'ACH-001017', 'ACH-001712', 'ACH-001087', 'ACH-001224', 'ACH-001171', 'ACH-001037', 'ACH-001071', 'ACH-001743', 'ACH-002303', 'ACH-001225', 'ACH-001316', 'ACH-001741', 'ACH-001018', 'ACH-001175', 'ACH-001393', 'ACH-001198', 'ACH-002315', 'ACH-002341', 'ACH-001015', 'ACH-001121', 'ACH-001429'}
removed
{'ACH-002341', 'ACH-001189', 'ACH-002315', 'ACH-002303'}
missing
set()
blacklist
17 {'ACH-001756', 'ACH-001705', 'ACH-002055', 'ACH-001760', 'ACH-001828', 'ACH-001553', 'ACH-002476', 'ACH-001227', 'ACH-002138', 'ACH-001707', 'ACH-001046', 'ACH-001686', 'ACH-002013', 'ACH-003000', 'ACH-001758', 'ACH-001759', 'ACH-001434'}


In [179]:
a = len(mutations)
mutations = mutations[~mutations.DepMap_ID.isin(blacklist)]
print(a - len(mutations))
mutations.to_csv('temp/all_somatic_mutations_withlegacy.csv', index=False)
a = len(damaging.columns)
damaging = damaging[set(damaging.columns) -blacklist]
print(a - len(damaging.columns))
damaging.to_csv('temp/all_somatic_mutations_boolmatrix_fordepmap_damaging.csv')
a = len(othercons.columns)
othercons = othercons[set(othercons.columns) -blacklist]
print(a - len(othercons.columns))
othercons.to_csv('temp/all_somatic_mutations_boolmatrix_fordepmap_othercons.csv',)
a = len(othernoncons.columns)
othernoncons = othernoncons[set(othernoncons.columns) -blacklist]
print(a - len(othernoncons.columns))
othernoncons.to_csv('temp/all_somatic_mutations_boolmatrix_fordepmap_othernoncons.csv',)
a = len(hotspot.columns)
hotspot = hotspot[set(hotspot.columns) -blacklist]
print(a - len(hotspot.columns))
hotspot.to_csv('temp/all_somatic_mutations_boolmatrix_fordepmap_hotspot.csv',)

7053
0
0
0
0


In [180]:
tc.update_dataset(dataset_permaname="depmap-mutation-calls-9be3",
                 upload_file_path_dict={
'temp/all_somatic_mutations_withlegacy.csv': 'TableCSV',
'temp/all_somatic_mutations_boolmatrix_fordepmap_damaging.csv': 'NumericMatrixCSV',
'temp/all_somatic_mutations_boolmatrix_fordepmap_othernoncons.csv': 'NumericMatrixCSV',
'temp/all_somatic_mutations_boolmatrix_fordepmap_othercons.csv': 'NumericMatrixCSV',
'temp/all_somatic_mutations_boolmatrix_fordepmap_hotspot.csv': 'NumericMatrixCSV',
                                       },#'temp/README': 'Raw'},
                 dataset_description="""
# Internal Mutations

Mutation calls for Internal DepMap data

* Version 1 Internal 18Q1*

original source: `/xchip/ccle_dist/broad_only/CMAG/mutations/CCLE_depMap_18Q1_maf_20180202.txt`
* Version 2-4 Internal 18Q2*

merged mutations and indels file (1,606 cell lines, including CCLE and Sanger WES reanalysis)
original source: 
`/xchip/ccle_dist/broad_only/CMAG/mutations/CCLE_depMap_18q2_maf_20180502.txt`
Binary matrices:
- damaging: if isDeleterious is true
- missense: if isDeleterious is false
- hotspot: if missense and either TCGA or COSMIC hotspot
Version 2 contains the MAF file
* Version 5-6 Internal 18Q3*

version 5 deprecated

original source: `/xchip/ccle_dist/broad_only/CMAG/mutations/CCLE_depMap_18q3_maf_20180716.txt`

Binary matrices:
- damaging: if isDeleterious is true
- missense: if isDeleterious is false
- hotspot: if missense and either TCGA or COSMIC hotspot
- Rows: cell line, Broad (arxspan) IDs

Columns: Gene, HGNC symbol (Entrez ID)

MAF file

* Version 7-8 Internal 18Q4*

version 8 just changes a column name in the MAF file from Broad_ID to DepMap_ID

original source: `/xchip/ccle_dist/broad_only/CMAG/mutations/CCLE_DepMap_18Q4_maf_20181028.txt`

* Version 9-12 Internal 19Q1*

version 12 updates the column name from VA_WES_AC to CCLE_WES_AC

version 11+ uses an updated definition for hotspot mutations

version 12 contains the correct data for 19Q1

* Version 13 Internal 19Q2*

* Version 14-15 Internal 19Q3*

version 15 fixed entrez ids

* Version 16 Internal 19Q4*

adding 35 new cell lines.

* Version 16 Internal 19Q4*
uploading as matrices

* Version 17 Internal 19Q4*
removing unauthorized lines and setting as matrices

* Version 18 Internal 19Q4*
removing unauthorized lines and setting as matrices

* Version 19 Internal 20Q1*
uploading 8 new lines

* Version 20 Internal 20Q1*
removing unauthorized cl

* Version 21 Internal 20Q2*
uploading 8 new lines and adding .all to express the fact that this data is the aggregate of all different sequencing methods.

* Version 22 Internal 20Q2*
removing 2 cell lines

* Version 23 Internal 20Q3*
nothing different from 20Q2. no new cell lines

* Version 24 Internal 20Q3*
updating the blacklists

* Version 25 Internal 20Q4*
removed 'ACH-002303' because wrong line. new dataset, adding wgs mutations, full reprocessing of the mutations, improved filtering

* Version 26 Internal 20Q4*
failed matrix upload

* Version 26 Internal 20Q4*
renaming filies

*** Variant annotation column ***

MAF file, added column (Variant_annotation) classifying each variant as either silent, damaging, other conserving, or other non-conserving, based on this mapping (old annotation from Variant_Classification column - new annotation):

Silent - silent
Splice_Site - damaging
Missense_Mutation - other non-conserving
Nonsense_Mutation - damaging
De_novo_Start_OutOfFrame - damaging
Nonstop_Mutation - other non-conserving
Frame_Shift_Del - damaging
Frame_Shift_Ins - damaging
In_Frame_Del - other non-conserving
In_Frame_Ins - other non-conserving
Stop_Codon_Del - other non-conserving
Stop_Codon_Ins - other non-conserving
Start_Codon_SNP - damaging
Start_Codon_Del - damaging
Start_Codon_Ins - damaging
5'Flank - other conserving
Intron - other conserving
IGR - other conserving
3'UTR - other conserving
5'UTR - other conserving
Binary matrices:

- damaging: if damaging
- other: if other conserving or other non-conserving
- hotspot: if it is not a silent mutation and is either TCGA or COSMIC hotspot
- Rows: cell line, DepMap (arxspan) IDs

Columns: Gene, HGNC symbol (Entrez ID)

NEW LINES:
"""+str(newlines)+"""

BLACKLISTED:
"""+str(blacklist))

Uploading all_somatic_mutations_withlegacy...
hitting https://cds.team/taiga/api/datafile/d44b1c7cd81d403dbb3248c4a1d3de23
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading 

'1dfeca4e84dc4b76955694fee3a243f3'

In [181]:
# To add to a virtual dataset
AddToVirtual(virtual_internal, 'depmap-mutation-calls-9be3', [('CCLE_mutations', 'all_somatic_mutations_withlegacy'),
('all_somatic_mutations_boolmatrix_fordepmap_damaging', 'all_somatic_mutations_boolmatrix_fordepmap_damaging'),
('all_somatic_mutations_boolmatrix_fordepmap_othernoncons', 'all_somatic_mutations_boolmatrix_fordepmap_othernoncons'),
('all_somatic_mutations_boolmatrix_fordepmap_othercons', 'all_somatic_mutations_boolmatrix_fordepmap_othercons'),
('all_somatic_mutations_boolmatrix_fordepmap_hotspot', 'all_somatic_mutations_boolmatrix_fordepmap_hotspot')])#('README','README')])
# To add to a eternal dataset
AddToVirtual('depmap-a0ab', 'depmap-mutation-calls-9be3', [('CCLE_mutations', 'all_somatic_mutations_withlegacy'),
('all_somatic_mutations_boolmatrix_fordepmap_damaging', 'all_somatic_mutations_boolmatrix_fordepmap_damaging'),
('all_somatic_mutations_boolmatrix_fordepmap_othernoncons', 'all_somatic_mutations_boolmatrix_fordepmap_othernoncons'),
('all_somatic_mutations_boolmatrix_fordepmap_othercons', 'all_somatic_mutations_boolmatrix_fordepmap_othercons'),
('all_somatic_mutations_boolmatrix_fordepmap_hotspot', 'all_somatic_mutations_boolmatrix_fordepmap_hotspot')])

[('CCLE_mutations', 'depmap-mutation-calls-9be3.30/all_somatic_mutations_withlegacy'), ('all_somatic_mutations_boolmatrix_fordepmap_damaging', 'depmap-mutation-calls-9be3.30/all_somatic_mutations_boolmatrix_fordepmap_damaging'), ('all_somatic_mutations_boolmatrix_fordepmap_othernoncons', 'depmap-mutation-calls-9be3.30/all_somatic_mutations_boolmatrix_fordepmap_othernoncons'), ('all_somatic_mutations_boolmatrix_fordepmap_othercons', 'depmap-mutation-calls-9be3.30/all_somatic_mutations_boolmatrix_fordepmap_othercons'), ('all_somatic_mutations_boolmatrix_fordepmap_hotspot', 'depmap-mutation-calls-9be3.30/all_somatic_mutations_boolmatrix_fordepmap_hotspot')]
hitting https://cds.team/taiga/api/datafile/cf78b026ca274b6d8ae86c787012c606
hitting https://cds.team/taiga/api/datafile/cf78b026ca274b6d8ae86c787012c606
hitting https://cds.team/taiga/api/datafile/cf78b026ca274b6d8ae86c787012c606
hitting https://cds.team/taiga/api/datafile/cf78b026ca274b6d8ae86c787012c606
hitting https://cds.team/taig

hitting https://cds.team/taiga/api/datafile/cc3395ea162f47c4a7035ef510c5d877
hitting https://cds.team/taiga/api/datafile/cc3395ea162f47c4a7035ef510c5d877
hitting https://cds.team/taiga/api/datafile/cc3395ea162f47c4a7035ef510c5d877
hitting https://cds.team/taiga/api/datafile/cc3395ea162f47c4a7035ef510c5d877
hitting https://cds.team/taiga/api/datafile/cc3395ea162f47c4a7035ef510c5d877
hitting https://cds.team/taiga/api/datafile/cc3395ea162f47c4a7035ef510c5d877
hitting https://cds.team/taiga/api/datasetVersion

Dataset version with id b53a973b68e34705a05996b04208d8ff created. You can access to this dataset version directly with this url: https://cds.team/taiga/dataset_version/b53a973b68e34705a05996b04208d8ff


## IBM

In [None]:
print('missing')
missing = set(ibm) - set(mutations.DepMap_ID)
print(missing)
print('ibm_embargo')
ibm_embargo = set(mutations.DepMap_ID) - (prevdmc | set(ibm))
print(len(ibm_embargo), ibm_embargo)
newlines = set(ibm) 
print(len(newlines))

In [None]:
a = len(mutations)
b = mutations[~mutations.DepMap_ID.isin(ibm_embargo| prevdmc)]
print(a - len(b))
b.to_csv('temp/all_somatic_mutations_withlegacy.csv', index=False)

In [103]:
tc.update_dataset(dataset_permaname="mutations-b05c",
                 upload_file_path_dict={
'temp/all_somatic_mutations_withlegacy.csv': 'TableCSV',
                                       },#'temp/README': 'Raw'},
                 dataset_description="""
# DMC Mutations

## Versions

* Version 1 empty*

* Version 2 Internal 20Q4*
First IBM version

* Version 3 Internal 20Q4*
updated issue in IBM version 2 (had duplicate lines with DMC)

* Version 4 Internal 20Q4*
reworking the readme

* Version 5 Internal 20Q4*
renaming files

## Notations

MAF file, added column (Variant_annotation) classifying each variant as either silent, damaging, other conserving, or other non-conserving, based on this mapping (old annotation from Variant_Classification column - new annotation):

Silent - silent
Splice_Site - damaging
Missense_Mutation - other non-conserving
Nonsense_Mutation - damaging
De_novo_Start_OutOfFrame - damaging
Nonstop_Mutation - other non-conserving
Frame_Shift_Del - damaging
Frame_Shift_Ins - damaging
In_Frame_Del - other non-conserving
In_Frame_Ins - other non-conserving
Stop_Codon_Del - other non-conserving
Stop_Codon_Ins - other non-conserving
Start_Codon_SNP - damaging
Start_Codon_Del - damaging
Start_Codon_Ins - damaging
5'Flank - other conserving
Intron - other conserving
IGR - other conserving
3'UTR - other conserving
5'UTR - other conserving
Binary matrices:
- damaging: if damaging
- other cons: if other conserving
- other non cons: if other non-conserving
- hotspot: if it is not a silent mutation and is either TCGA or COSMIC hotspot
- Rows: cell line, DepMap (arxspan) IDs

Columns: Gene, HGNC symbol (Entrez ID)

## Update

NEW LINES:
"""+str(newlines)+"""

EMBARGO:
"""+str(ibm_embargo|blacklist))

Uploading wes_somatic_mutations_withlegacy...
hitting https://cds.team/taiga/api/datafile/6773edbe2a724e01b90f2f15d37644c7
Conversion and upload...:
	 Uploading to S3

	 Done: wes_somatic_mutations_withlegacy properly converted and uploaded
hitting https://cds.team/taiga/api/datasetVersion

Dataset version with id 83fb71cb317c48db8e9d52e5b50c01fe created. You can access to this dataset version directly with this url: https://cds.team/taiga/dataset_version/83fb71cb317c48db8e9d52e5b50c01fe


'83fb71cb317c48db8e9d52e5b50c01fe'

In [256]:
# To add to a virtual dataset
AddToVirtual(virtual_ibm, 'mutations-b05c', [('all_somatic_mutations_withlegacy', 'all_somatic_mutations_withlegacy'),])#('README','README')])

[('all_somatic_mutations_withlegacy', 'mutations-b05c.5/all_somatic_mutations_withlegacy')]
hitting https://cds.team/taiga/api/datafile/bbd8eda063f0466cb00fc672fa98cfe9
hitting https://cds.team/taiga/api/datafile/bbd8eda063f0466cb00fc672fa98cfe9
hitting https://cds.team/taiga/api/datasetVersion

Dataset version with id b4616d4c78b14a35a705ca6b6c83b025 created. You can access to this dataset version directly with this url: https://cds.team/taiga/dataset_version/b4616d4c78b14a35a705ca6b6c83b025


## DMC

In [182]:
print('missing')
missing = set(dmc) - set(mutations.DepMap_ID)
dmc_embargo = set(mutations.DepMap_ID) - (prevdmc | set(dmc))
print(missing)
newlines = set(dmc) 
print('dmc_embargo')
print(len(dmc_embargo), dmc_embargo)
print(len(newlines))

missing
set()
dmc_embargo
2 {'ACH-001512', 'ACH-001708'}
79


In [183]:
a = len(mutations)
mutations = mutations[~mutations.DepMap_ID.isin(dmc_embargo)]
print(a - len(mutations))
mutations.to_csv('temp/all_somatic_mutations_withlegacy.csv', index=False)

585


In [184]:
tc.update_dataset(dataset_permaname="depmap-mutation-calls-dfce",
                 upload_file_path_dict={
'temp/all_somatic_mutations_withlegacy.csv': 'TableCSV',
                                       },#'temp/README': 'Raw'},
                 dataset_description="""
# DMC Mutations

* Version 1-5 DMC 19Q1*

version 5 is a one-off portal thing because dmc wanted to be able to plot if a gene has any mutation as one-hot encoded value in the x/y axes of the data explorer It adds the any_mutation matrix, but does not change the others. Code used to generate:

```
from taigapy import TaigaClient

c = TaigaClient()

dmc_19q1_mutation_taiga_root = "depmap-mutation-calls-dfce.3/"
other_matrix = c.get(dmc_19q1_mutation_taiga_root + "other_mutation")
damaging_matrix = c.get(dmc_19q1_mutation_taiga_root + "damaging_mutation")
hotspot_matrix = c.get(dmc_19q1_mutation_taiga_root + "hotspot_mutation")

df = other_matrix.append(damaging_matrix)
df = df.groupby(level=0).sum()

df = df.append(hotspot_matrix)
df = df.groupby(level=0).sum()

df[df > 1] = 1

df.to_csv('any_mutation.csv')
```
The code uses version 3 because the dmc portal was using version 3

version 4 updates the column name from VA_WES_AC to CCLE_WES_AC

version 3 has an updated definition for hotspot mutations

version 2+ contains the correct data for 19Q1

* Version 6 DMC 19Q2*

* Version 7-8 DMC 19Q3*
version 8 fixed entrez ids

* Version 9 DMC 19Q4*
adding 52 new cell lines.

* Version 10 DMC 19Q4*
removing unauthorized lines and setting as matrices

* Version 11 DMC 19Q4*
removing unauthorized lines and setting as matrices

* Version 12 DMC 20Q1*
uploading 8 new lines

* Version 13 DMC 20Q1*
removing unauthorized cl

* Version 14 DMC 20Q2*
uploading 8 new lines and adding .all to express the fact that this data is the aggregate of all different sequencing methods.

* Version 15 DMC 20Q2*
removing 2 lines

* Version 15 DMC 20Q3*
nothing different from 20Q2. no new cell lines

* Version 15 DMC 20Q3*
updating the blacklists

* Version 16 DMC 20Q4*
adding more lines, new dataset, adding wgs mutations, full reprocessing of the mutations, improved filtering

* Version 17 DMC 20Q4*
reaming files

MAF file, added column (Variant_annotation) classifying each variant as either silent, damaging, other conserving, or other non-conserving, based on this mapping (old annotation from Variant_Classification column - new annotation):

Silent - silent
Splice_Site - damaging
Missense_Mutation - other non-conserving
Nonsense_Mutation - damaging
De_novo_Start_OutOfFrame - damaging
Nonstop_Mutation - other non-conserving
Frame_Shift_Del - damaging
Frame_Shift_Ins - damaging
In_Frame_Del - other non-conserving
In_Frame_Ins - other non-conserving
Stop_Codon_Del - other non-conserving
Stop_Codon_Ins - other non-conserving
Start_Codon_SNP - damaging
Start_Codon_Del - damaging
Start_Codon_Ins - damaging
5'Flank - other conserving
Intron - other conserving
IGR - other conserving
3'UTR - other conserving
5'UTR - other conserving
Binary matrices:
- damaging: if damaging
- other cons: if other conserving
- other non cons: if other non-conserving
- hotspot: if it is not a silent mutation and is either TCGA or COSMIC hotspot
- Rows: cell line, DepMap (arxspan) IDs

Columns: Gene, HGNC symbol (Entrez ID)

NEW LINES:
"""+str(newlines)+"""

EMBARGO:
"""+str(dmc_embargo|blacklist))

Uploading all_somatic_mutations_withlegacy...
hitting https://cds.team/taiga/api/datafile/f39e38d7dde841e794fa288f3fc7f2be
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading 

'1b7e5173dca44a739c2e83a35d820b77'

In [185]:
# To add to a virtual dataset
AddToVirtual(virtual_dmc, 'depmap-mutation-calls-dfce', [('CCLE_mutations', 'all_somatic_mutations_withlegacy'),])#('README','README')])

[('CCLE_mutations', 'depmap-mutation-calls-dfce.22/all_somatic_mutations_withlegacy')]
hitting https://cds.team/taiga/api/datafile/252e7801fb02469c88097ac93c1601fc
hitting https://cds.team/taiga/api/datafile/252e7801fb02469c88097ac93c1601fc
hitting https://cds.team/taiga/api/datafile/252e7801fb02469c88097ac93c1601fc
hitting https://cds.team/taiga/api/datafile/252e7801fb02469c88097ac93c1601fc
hitting https://cds.team/taiga/api/datafile/252e7801fb02469c88097ac93c1601fc
hitting https://cds.team/taiga/api/datafile/252e7801fb02469c88097ac93c1601fc
hitting https://cds.team/taiga/api/datafile/252e7801fb02469c88097ac93c1601fc
hitting https://cds.team/taiga/api/datafile/252e7801fb02469c88097ac93c1601fc
hitting https://cds.team/taiga/api/datafile/252e7801fb02469c88097ac93c1601fc
hitting https://cds.team/taiga/api/datafile/252e7801fb02469c88097ac93c1601fc
hitting https://cds.team/taiga/api/datafile/252e7801fb02469c88097ac93c1601fc
hitting https://cds.team/taiga/api/datafile/252e7801fb02469c88097a

## Public

In [186]:
print('missing')
missing = set(public) - set(mutations.DepMap_ID)
embargo = set(mutations.DepMap_ID) - (prevpublic | set(public))
print(missing)
newlines = set(public) 
print('embargo')
print(len(embargo), embargo)
print(len(newlines))

missing
{'ACH-002709'}
embargo
32 {'ACH-001493', 'ACH-001970', 'ACH-001449', 'ACH-002401', 'ACH-002010', 'ACH-001502', 'ACH-001662', 'ACH-001669', 'ACH-001672', 'ACH-002465', 'ACH-001973', 'ACH-001693', 'ACH-001547', 'ACH-001293', 'ACH-001533', 'ACH-001971', 'ACH-001847', 'ACH-002014', 'ACH-001854', 'ACH-001349', 'ACH-001696', 'ACH-001437', 'ACH-002021', 'ACH-002048', 'ACH-001679', 'ACH-001678', 'ACH-001676', 'ACH-001537', 'ACH-001855', 'ACH-002512', 'ACH-001438', 'ACH-002400'}
52


In [187]:
a = len(mutations)
mutations = mutations[~mutations.DepMap_ID.isin(embargo)]
print(a - len(mutations))
mutations.to_csv('temp/all_somatic_mutations_withlegacy.csv', index=False)

15632


In [188]:
description="""
# Public Mutations

Mutation calls for Public DepMap data

* Version 1 Public 18Q1*

original source: CCLE data portal
* Version 2 Public 18Q2*

merged mutations and indels file (1,549 cell lines total, including data for 63 newly released cell lines)
original source: `/xchip/ccle_dist/public/DepMap_18Q2/CCLE_DepMap_18Q2_maf_20180502.txt`
* Version 3-4 Public 18Q3*

version 3 deprecated

original source: `/xchip/ccle_dist/public/DepMap_18Q3/CCLE_DepMap_18q3_maf_20180718.txt`

Binary matrices:
damaging: if isDeleterious is true
missense: if isDeleterious is false
hotspot: if missense and either TCGA or COSMIC hotspot
Rows: cell line, Broad (arxspan) IDs

Columns: Gene, HGNC symbol (Entrez ID)

MAF file

* Version 5 Public 18Q4*

original source: `/xchip/ccle_dist/public/DepMap_18Q4/CCLE_DepMap_18q4_maf_20181029.txt`

* Version 6-9 Public 19Q1*

version 9 updates the column name from VA_WES_AC to CCLE_WES_AC

version 8 uses an updated definition for hotspot mutations

version 9 contains the correct data for 19Q1

* Version 10 Public 19Q2*

* Version 11-12 Public 19Q3*

version 12 fixed entrez ids

* Version 13 Public 19Q4*

adding 52 new cell lines

* Version 14 Public 19Q4*
removing unauthorized lines and setting matrices

* Version 15 Public 20Q1*
adding 8 new lines 

* Version 16 Public 20Q1*
removing an unauthorized line

* Version 17 Public 20Q2*
uploading 8 new lines and adding .all to express the fact that this data is the aggregate of all different sequencing methods.

* Version 18 Public 20Q2*
removing 2 lines

* Version 19 Public 20Q3*
nothing different from 20Q2. no new cell lines

* Version 20 Public 20Q3*
updating the blacklists

* Version 21 Public 20Q3*
updating the dmc

* Version 22 Public 20Q3*
readding two already released samples to the public list

* Version 23 Public 20Q4*
new samples, new dataset, adding wgs mutations, full reprocessing of the mutations, improved filtering

* Version 23 Public 20Q4*
renaming files

* Version 24 Public 20Q4*
removing lines

MAF file, added column (Variant_annotation) classifying each variant as either silent, damaging, other conserving, or other non-conserving, based on this mapping (old annotation from Variant_Classification column - new annotation):

Silent - silent
Splice_Site - damaging
Missense_Mutation - other non-conserving
Nonsense_Mutation - damaging
De_novo_Start_OutOfFrame - damaging
Nonstop_Mutation - other non-conserving
Frame_Shift_Del - damaging
Frame_Shift_Ins - damaging
In_Frame_Del - other non-conserving
In_Frame_Ins - other non-conserving
Stop_Codon_Del - other non-conserving
Stop_Codon_Ins - other non-conserving
Start_Codon_SNP - damaging
Start_Codon_Del - damaging
Start_Codon_Ins - damaging
5'Flank - other conserving
Intron - other conserving
IGR - other conserving
3'UTR - other conserving
5'UTR - other conserving
Binary matrices:

- damaging: if damaging
- other: if other conserving or other non-conserving
- hotspot: if it is not a silent mutation and is either TCGA or COSMIC hotspot
- Rows: cell line, DepMap (arxspan) IDs

Columns: Gene, HGNC symbol (Entrez ID)

NEW LINES:
"""+str(newlines)+"""

EMBARGOED:
"""+str(embargo)

tc.update_dataset(dataset_permaname="depmap-mutation-calls-9a1a",
                 upload_file_path_dict={
        'temp/all_somatic_mutations_withlegacy.csv': 'TableCSV'
                                       },#'temp/README': 'Raw'},
                 dataset_description=description)

Uploading all_somatic_mutations_withlegacy...
hitting https://cds.team/taiga/api/datafile/f8891533f4484ae3b6c6b682fc534b49
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading 

'2d192136539b439f8f71dc9eba2d0882'

In [189]:
# To add to a virtual dataset
AddToVirtual(virtual_public, 'depmap-mutation-calls-9a1a', [('CCLE_mutations', 'all_somatic_mutations_withlegacy'),])#('README','README')])

[('CCLE_mutations', 'depmap-mutation-calls-9a1a.30/all_somatic_mutations_withlegacy')]
hitting https://cds.team/taiga/api/datafile/c0b325228a3f48c1a7fdf5093175b1fa
hitting https://cds.team/taiga/api/datafile/c0b325228a3f48c1a7fdf5093175b1fa
hitting https://cds.team/taiga/api/datafile/c0b325228a3f48c1a7fdf5093175b1fa
hitting https://cds.team/taiga/api/datafile/c0b325228a3f48c1a7fdf5093175b1fa
hitting https://cds.team/taiga/api/datafile/c0b325228a3f48c1a7fdf5093175b1fa
hitting https://cds.team/taiga/api/datafile/c0b325228a3f48c1a7fdf5093175b1fa
hitting https://cds.team/taiga/api/datafile/c0b325228a3f48c1a7fdf5093175b1fa
hitting https://cds.team/taiga/api/datafile/c0b325228a3f48c1a7fdf5093175b1fa
hitting https://cds.team/taiga/api/datafile/c0b325228a3f48c1a7fdf5093175b1fa
hitting https://cds.team/taiga/api/datafile/c0b325228a3f48c1a7fdf5093175b1fa
hitting https://cds.team/taiga/api/datafile/c0b325228a3f48c1a7fdf5093175b1fa
hitting https://cds.team/taiga/api/datafile/c0b325228a3f48c1a7fdf5

# Copy Number

In [203]:
genecn= pd.read_csv('temp/all_'+release+'_gene_cn.csv',index_col=0)
segmentcn = pd.read_csv('temp/all_'+release+'_segment.csv')

In [204]:
set(["ACH-001714",
"ACH-002709",
"ACH-002874",
"ACH-002875",
"ACH-003000", 
"ACH-002875", 
"ACH-002874",
"ACH-001189",
"ACH-002303",
"ACH-002315",
"ACH-002341"])&set(genecn.index)

{'ACH-002874', 'ACH-002875', 'ACH-003000'}

In [205]:
genecn = genecn.drop(set(["ACH-001714",
"ACH-002709",
"ACH-002874",
"ACH-002875",
"ACH-003000",
"ACH-001189",
"ACH-002303",
"ACH-002315",
"ACH-002341"])&set(genecn.index))#.apply(lambda x: (x**2)-1)
segmentcn = segmentcn[~segmentcn.DepMap_ID.isin(["ACH-001714",
"ACH-002709",
"ACH-002874",
"ACH-002875",
"ACH-003000",
"ACH-001189",
"ACH-002303",
"ACH-002315",
"ACH-002341"])]

## Internal

In [206]:
print('not present')
removed = set(previnternal) - set(segmentcn.DepMap_ID)
print(removed)
print('removed')
removed = set(internal_cn) - set(segmentcn.DepMap_ID)
print(removed)
missing = set(internal) - set(segmentcn.DepMap_ID)
blacklist = set(segmentcn.DepMap_ID) - (previnternal | set(internal))
print('missing')
print(missing)
newlines = set(internal) 
print('blacklist')
print(len(blacklist), blacklist)

not present
{'ACH-001011', 'ACH-001189', 'ACH-002709', 'ACH-001187', 'ACH-002394', 'ACH-000084', 'ACH-001712', 'ACH-002395', 'ACH-001743', 'ACH-002303', 'ACH-001108', 'ACH-000033', 'ACH-001109', 'ACH-001316', 'ACH-002396', 'ACH-001131', 'ACH-001741', 'ACH-001393', 'ACH-002391', 'ACH-002393', 'ACH-002315', 'ACH-002390', 'ACH-002341', 'ACH-002359', 'ACH-001429', 'ACH-000629'}
removed
{'ACH-001189', 'ACH-002315', 'ACH-002341', 'ACH-002359', 'ACH-002303'}
missing
set()
blacklist
16 {'ACH-001756', 'ACH-001705', 'ACH-002055', 'ACH-001760', 'ACH-001828', 'ACH-001553', 'ACH-002476', 'ACH-001227', 'ACH-002138', 'ACH-001707', 'ACH-001046', 'ACH-001686', 'ACH-002013', 'ACH-001758', 'ACH-001759', 'ACH-001434'}


In [207]:
## for segment removing first blacklisted, then embargoed, to create two datasets
print(len(segmentcn))
segmentcn = segmentcn[~segmentcn.DepMap_ID.isin(blacklist)]
print(len(segmentcn))
segmentcn.to_csv('temp/all_merged_segments.csv', index=False)
print(len(genecn))
genecn = genecn[~genecn.index.isin(blacklist)]
print(len(genecn))
genecn.to_csv('temp/all_merged_genes_cn.csv')

3231832
3173701
1803
1787


In [208]:
tc.update_dataset(dataset_permaname="depmap-wes-cn-data-81a7", 
                  upload_file_path_dict={
                    'temp/all_merged_genes_cn.csv': 'NumericMatrixCSV',
                    'temp/all_merged_segments.csv': 'TableCSV'},
                  dataset_description=
"""
# Copy Number


## ** Version 1 Internal 18Q1****

Generated with the following script:

```
wes_pri <- taigr::load.from.taiga(data.name='gene-level-cn-87aa', 
                                  data.version=5, 
                                  data.file='gene_CN_WES_priority')
source_info <- data.frame(ccle_name=gsub("snp_|sangerWES_|ccleWES_|achillesWES_", 
                                         "", row.names(wes_pri)), 
                          source=gsub("_.*", "", row.names(wes_pri)))
wes_pri %<>% magrittr::set_rownames(source_info$ccle_name)

```

## ** Version 2 Internal 18Q2****

Generated with the following script:

```
wes_pri <- taigr::load.from.taiga(data.name='segmented-cn-wes-prioritzed-7fe1', data.version=9, data.file='wes_priority_cn_gene_matrix') %>% log2()


```

## ** Version 3 Internal 18Q2****

Generated with the following script:

```
wes_pri <- taigr::load.from.taiga(data.name='segmented-cn-wes-prioritzed-7fe1', data.version=11, data.file='wes_priority_cn_gene_matrix') %>% log2()


```
## ** Version 4-6 Internal 18Q3****

__Description__: log2 gene level copy number data

Generated with the following script:

```
wes_pri <- taigr::load.from.taiga(data.name='segmented-cn-wes-prioritzed-7fe1', data.version=15, data.file='wes_priority_cn_gene_matrix') %>% log2()


```
__Rows__: Broad (arxspan) cell line IDs

__Columns__: gene names in the format HGNC\_symbol (Entrez\_ID)

Version 5 has updated cell line name mapping

Version 4 and 5 the segment level CN for Sanger's data is off by a factor of 2, version 6 corrects this

**** Version 7 Internal 18Q4****

__Description__: log2 gene level copy number data

Generated with the following script:

```
wes_pri <- taigr::load.from.taiga(data.name='segmented-cn-wes-prioritzed-7fe1', data.version=17, data.file='wes_priority_cn_gene_matrix') %>% log2()


```
__Rows__: DepMap (arxspan) cell line IDs

__Columns__: gene names in the format HGNC\_symbol (Entrez\_ID)

## ** Version 8-9 Internal 19Q1****

version 9 has the correct data for 19Q1

## ** Version 10-11 Internal 19Q2****

__version 11 added an additional 13 cell lines and adds the segment level copy number data__

## ** Version 12 Internal 19Q3****

__Description__: log2(X + 1) gene level copy number data (data is now log2 transformed with a __pseudocount of 1__ added). CN data is generated using __hg38__. 


## ** Version 15 Internal 19Q4****

Adding 35 new cell lines

## ** Version 16 Internal 19Q4****
resolving problem with not having log2 transform 

## ** Version 17 Internal 19Q4****
resolving problem with having log2 transform on segments

## ** Version 18 Internal 20Q1****
adding 8 new cell lines

## ** Version 19 Internal 20Q1****
unlog2 transforming segmentcn

## ** Version 20 Internal 20Q1****
adding new cell lines

## ** Version 21 Internal 20Q1****
reparing some missing lines

Some cells lines have been flagged as:

 - having bad looking copy ration plots = ACH-002511 (M140325) and ACH-001370 (OCIP5X)
 - having too many segments (format: sample seg_count) = ACH-001079 2586, ACH-000044 1202, ACH-000258 872, ACH-001230 947, ACH-000068 812, ACH-000454 1051, ACH-000216 925, ACH-001150 782, ACH-001214 889, ACH-002335 1312, ACH-000836 1001, ACH-001957 1426, ACH-000960 913, ACH-000458 762, ACH-000578 869, ACH-000327 819, ACH-000090 1024, ACH-000488 954, ACH-000848 1171, ACH-000923 1469, ACH-000904 868, ACH-000452 816, ACH-000600 939, ACH-001656 902, ACH-000854 899, ACH-000774 953, ACH-001000 980, ACH-000941 813, ACH-000887 1408, ACH-001017 1223, ACH-001171 792, ACH-001071 1175, ACH-000593 764, ACH-001239 851, ACH-000071 1287, ACH-001956 1368, ACH-000509 873, ACH-002204 1318, ACH-000550 974, ACH-000738 1064, ACH-000870 1557, ACH-001036 858, ACH-001043 825, ACH-000028 868, ACH-001955 1296, ACH-000419 826, ACH-001234 819, ACH-001094 1036, ACH-001225 792, ACH-000118 794, ACH-000300 1431, ACH-001113 1072, ACH-001045 822, ACH-000444 974, ACH-000901 816, ACH-000865 1358, ACH-000961 763, ACH-001249 1756, ACH-000167 838, ACH-001101 1005, ACH-000842 929, ACH-000837 1015, ACH-000710 968, ACH-000195 2029, ACH-000064 1203, ACH-000690 771, ACH-000635 1368, ACH-000356 1294, ACH-000659 1129, ACH-000868 1422, ACH-000128 767, ACH-000658 927, ACH-001088 1337
 - Genes having a similar CN value accross all: []
 
## ** Version 20 Internal 20Q2****
Added 7 samples.

Some cells lines have been flagged as:

 - having bad looking copy ratio plots (appear to have too many segments): ACH-002399 (CDS-sukIAT, 21NT\_1), ACH-002401 (CDS-tVy3GF, 21MT2\_1), ACH-002400 (CDS-VUHMHG, 21MT1\_1)
 - having too many segments (format: sample seg_count): same as for 20Q1
 - Genes having a similar CN value accross all samples: []
 
 
## ** Version 21 Internal 20Q2****
 
Duplicating the CN data in genecn and segmentcn for ACH-000219 so we have CN data for ACH-002874, the same cell line grown in different media. This step is required for Achilles / CERES.

Version 22: removing two weird undefined lines 

## ** Version 23 Internal 20Q3****
same  as  20Q2 for this release. no new lines

## ** Version 24 Internal 20Q3****
updating the blacklists

## ** Version 25 Internal 20Q4****
more cell lines, new full reprocessing, adding a new columns for amplification status, new way to create gene level cn

## ** Version 26 Internal 20Q4****
reparing issue with foldtransform

## ** Version 27 Internal 20Q4****
adding a missing line

## ** Version 27 Internal 20Q4****
reverting to logfold change

Gene level CN data:

__Rows__: DepMap cell line IDs

__Columns__: gene names in the format HGNC\_symbol (Entrez\_ID)

Segment level data:

__Columns__: DepMap\_ID, Chromosome, Start, End, Num\_Probes, Segment\_Mean

NEW LINES:
"""+str(newlines)+"""

BLACKLIST:
"""+str(blacklist))

Uploading all_merged_genes_cn...
hitting https://cds.team/taiga/api/datafile/84479914f6db47ea8e273d7da5a76f96
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (l

'800b41429da649359f30c82e3b92feda'

In [209]:
# To add to a virtual dataset
AddToVirtual(virtual_internal, 'depmap-wes-cn-data-81a7', [('CCLE_gene_cn', 'all_merged_genes_cn'),('CCLE_segment_cn','all_merged_segments')])
# To add to a eternal dataset
AddToVirtual('depmap-a0ab', 'depmap-wes-cn-data-81a7', [('CCLE_gene_cn', 'all_merged_genes_cn'),('CCLE_segment_cn', 'all_merged_segments')])

[('CCLE_gene_cn', 'depmap-wes-cn-data-81a7.30/all_merged_genes_cn'), ('CCLE_segment_cn', 'depmap-wes-cn-data-81a7.30/all_merged_segments')]
hitting https://cds.team/taiga/api/datafile/716086914a0448beb961bb7413fe9358
hitting https://cds.team/taiga/api/datafile/716086914a0448beb961bb7413fe9358
hitting https://cds.team/taiga/api/datafile/716086914a0448beb961bb7413fe9358
hitting https://cds.team/taiga/api/datafile/716086914a0448beb961bb7413fe9358
hitting https://cds.team/taiga/api/datafile/716086914a0448beb961bb7413fe9358
hitting https://cds.team/taiga/api/datafile/716086914a0448beb961bb7413fe9358
hitting https://cds.team/taiga/api/datafile/716086914a0448beb961bb7413fe9358
hitting https://cds.team/taiga/api/datafile/716086914a0448beb961bb7413fe9358
hitting https://cds.team/taiga/api/datafile/716086914a0448beb961bb7413fe9358
hitting https://cds.team/taiga/api/datafile/716086914a0448beb961bb7413fe9358
hitting https://cds.team/taiga/api/datafile/716086914a0448beb961bb7413fe9358
hitting https

## IBM

In [230]:
print('missing')
missing = set(ibm) - set(segmentcn.DepMap_ID)
print(missing)
print('ibm_embargo')
ibm_embargo = set(segmentcn.DepMap_ID) - (prevdmc | set(ibm))
print(len(ibm_embargo), ibm_embargo)
newlines = set(ibm) 
print(len(newlines))

missing
set()
ibm_embargo
2 {'ACH-001512', 'ACH-001708'}
34


In [231]:
## for segment removing first blacklisted, then embargoed, to create two datasets
print(len(segmentcn))
a = segmentcn[~segmentcn.DepMap_ID.isin(ibm_embargo|prevdmc)]
print(len(a))
a.to_csv('temp/all_merged_segments.csv', index=False)
print(len(genecn))
b = genecn[~genecn.index.isin(ibm_embargo|prevdmc)]
print(len(b))
b.to_csv('temp/all_merged_genes_cn.csv')

3173701
0
1787
0


In [213]:
tc.update_dataset(dataset_permaname="cn-e20f",
                upload_file_path_dict={
                    'temp/all_merged_genes_cn.csv':'NumericMatrixCSV',
                    'temp/all_merged_segments.csv': 'TableCSV',
                   },
                  changes_description=
"""
""",
                
                  dataset_description="""

## Versions:

V1: 20Q4
new cell lines, new full reprocessing, adding a new columns for amplification status, new way to create gene level cn


## Annotations:

__Description__: log2(X + 1) gene level copy number data (data is now log2 transformed with a __pseudocount of 1__ added). CN data is generated using __hg38__.  The segment copy number data includes the mean segment copy number segments.

Gene level CN data:

__Rows__: DepMap cell line IDs

__Columns__: gene names in the format HGNC\_symbol (Entrez\_ID)

Segment level data:

__Columns__: DepMap\_ID, Chromosome, Start, End, Num\_Probes, Segment\_Mean

NEW LINES:
"""+str(newlines)+"""

EMBARGOED:
"""+str(ibm_embargo))

Uploading all_merged_genes_cn...
hitting https://cds.team/taiga/api/datafile/c95080b2f41a438d8a4c089f5a3ca858
Conversion and upload...:
	 Downloading the file from S3


AttributeError: 'dict' object has no attribute 'message'

In [262]:
AddToVirtual(virtual_ibm, "cn-e20f", files=[('all_merged_genes_cn', 'all_merged_genes_cn'),('all_merged_segments', 'all_merged_segments'),])

[('all_merged_genes_cn', 'cn-e20f.1/all_merged_genes_cn'), ('all_merged_segments', 'cn-e20f.1/all_merged_segments')]
hitting https://cds.team/taiga/api/datafile/f5aba24ec71b4d139c9c4d359113d0ec


Exception: Bad status code: 400

## DMC

* **NOTE: change as of 20Q2 onwards**. We need to remove lines in WES_DMC_embargo from the Internal version of the CN datasets before we upload the `genecn` and `segmentcn` files to DMC.

In [210]:
print('missing')
missing = set(dmc) - set(segmentcn.DepMap_ID)
print(missing)
print('dmc_embargo')
dmc_embargo = set(segmentcn.DepMap_ID) - (prevdmc | set(dmc))
print(len(dmc_embargo), dmc_embargo)
newlines = set(dmc) 
print(len(newlines))

missing
set()
dmc_embargo
2 {'ACH-001512', 'ACH-001708'}
79


In [211]:
## for segment removing first blacklisted, then embargoed, to create two datasets
print(len(segmentcn))
segmentcn = segmentcn[~segmentcn.DepMap_ID.isin(dmc_embargo)]
print(len(segmentcn))
segmentcn.to_csv('temp/all_merged_segments.csv', index=False)
print(len(genecn))
genecn = genecn[~genecn.index.isin(dmc_embargo)]
print(len(genecn))
genecn.to_csv('temp/all_merged_genes_cn.csv')

3173701
3173256
1787
1785


In [212]:
tc.update_dataset(dataset_permaname="depmap-cn-data-9b9d",
                upload_file_path_dict={
                    'temp/all_merged_segments.csv':'TableCSV',
                    'temp/all_merged_genes_cn.csv': 'NumericMatrixCSV',
                   },
                  changes_description=
"""
""",
                
                  dataset_description="""
                  
## Versions:

**** Version 1-2 DMC 19Q1****

version 2 contains the correct data for 19Q1

**** Version 3-4 DMC 19Q2****

__version 4 added an additional 13 cell lines and adds the segment level copy number data__

**** Version 5 DMC 19Q3***

**** Version 7 DMC 19Q4***
adding 35 new cell lines

**** Version 8 DMC 19Q4****
resolving problem with not having log2 transform 

**** Version 9 DMC 19Q4****
resolving problem with having log2 transformed the segments

**** Version 10 DMC 20Q1****
adding new samples

**** Version 11 DMC 20Q1****
unlog2 transforming segmentcn

**** Version 12 DMC 20Q2****
Adding samples to be included in 20Q2

**** Version 13 DMC 20Q2****
unknown changes

**** Version 14 DMC 20Q3****
updated blacklists

**** Version 15 DMC 20Q3****
issues with blacklists

**** Version 16 DMC 20Q4****
new cell lines, new full reprocessing, adding a new columns for amplification status, new way to create gene level cn

**** Version 17 DMC 20Q4****
adding a missing line

**** Version 17 DMC 20Q4****
reverting log transform

## Annotations

__Description__: log2(X + 1) gene level copy number data (data is now log2 transformed with a __pseudocount of 1__ added). CN data is generated using __hg38__.  The segment copy number data includes the mean segment copy number segments.

Gene level CN data:

__Rows__: DepMap cell line IDs

__Columns__: gene names in the format HGNC\_symbol (Entrez\_ID)

Segment level data:

__Columns__: DepMap\_ID, Chromosome, Start, End, Num\_Probes, Segment\_Mean

NEW LINES:
"""+str(newlines)+"""

EMBARGOED:
"""+str(dmc_embargo))

Uploading all_merged_segments...
hitting https://cds.team/taiga/api/datafile/0d6150560adb4d2c8e395efdd17ca2a9
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from

'8707f9507f58402eb67ec4748ef85080'

In [213]:
AddToVirtual(virtual_dmc, "depmap-cn-data-9b9d", files=[('CCLE_gene_cn', 'all_merged_genes_cn'),('CCLE_segment_cn', 'all_merged_segments')])

[('CCLE_gene_cn', 'depmap-cn-data-9b9d.19/all_merged_genes_cn'), ('CCLE_segment_cn', 'depmap-cn-data-9b9d.19/all_merged_segments')]
hitting https://cds.team/taiga/api/datafile/f2f38a131464455696f5f575f6f39157
hitting https://cds.team/taiga/api/datafile/f2f38a131464455696f5f575f6f39157
hitting https://cds.team/taiga/api/datafile/f2f38a131464455696f5f575f6f39157
hitting https://cds.team/taiga/api/datafile/f2f38a131464455696f5f575f6f39157
hitting https://cds.team/taiga/api/datafile/f2f38a131464455696f5f575f6f39157
hitting https://cds.team/taiga/api/datafile/f2f38a131464455696f5f575f6f39157
hitting https://cds.team/taiga/api/datafile/f2f38a131464455696f5f575f6f39157
hitting https://cds.team/taiga/api/datafile/f2f38a131464455696f5f575f6f39157
hitting https://cds.team/taiga/api/datafile/f2f38a131464455696f5f575f6f39157
hitting https://cds.team/taiga/api/datafile/f2f38a131464455696f5f575f6f39157
hitting https://cds.team/taiga/api/datafile/f2f38a131464455696f5f575f6f39157
hitting https://cds.t

## Public

We add to public as internal minus dmc embargoed and only cell lines from previous previous release (6 month)

In [124]:
print('missing')
missing = set(public) - set(segmentcn.DepMap_ID)
embargo = set(segmentcn.DepMap_ID) - (prevpublic | set(public))
print(missing)
newlines = set(public) 
print('embargo')
print(len(embargo), embargo)
print(len(newlines))

missing
{'ACH-002709'}
embargo
32 {'ACH-001493', 'ACH-001970', 'ACH-001449', 'ACH-002401', 'ACH-002010', 'ACH-001502', 'ACH-001662', 'ACH-001669', 'ACH-001672', 'ACH-002465', 'ACH-001973', 'ACH-001693', 'ACH-001547', 'ACH-001293', 'ACH-001533', 'ACH-001971', 'ACH-001847', 'ACH-002014', 'ACH-001854', 'ACH-001349', 'ACH-001696', 'ACH-001437', 'ACH-002021', 'ACH-002048', 'ACH-001679', 'ACH-001678', 'ACH-001676', 'ACH-001537', 'ACH-001855', 'ACH-002512', 'ACH-001438', 'ACH-002400'}
52


In [125]:
print(len(segmentcn))
a = segmentcn[~segmentcn.DepMap_ID.isin(set(embargo))]
print(len(a))
a.to_csv('temp/all_merged_segments.csv', index=False)
print(len(genecn))
b = genecn[~genecn.index.isin(set(embargo))]
print(len(b))
b.to_csv('temp/all_merged_genes_cn.csv',)

3173256
3134184
1787
1755


In [126]:
tc.update_dataset(dataset_permaname='depmap-wes-cn-data-97cc',
                    upload_file_path_dict={
                    'temp/all_merged_genes_cn.csv':'NumericMatrixCSV',
                    'temp/all_merged_segments.csv': 'TableCSV',
                   },
                  dataset_description="""
**** Versions 1-5 Public 18Q1****

Gene-level WES copy-number data for publicly accessible CCLE data. 

```

internal_lines <- readr::read_csv("~/Downloads/avana-broad-18q1_v2-sample-info.csv")$cell_line
public_lines <- readr::read_csv("~/Downloads/avana-public-tentative-18q1_v5-sample-info.csv")$cell_line
non_public_lines <- setdiff(internal_lines, public_lines)

full_cn_set <- taigr::load.from.taiga(data.name='gene-level-cn-87aa', data.version=5, data.file='full_gene_CN')
source_info <- data.frame(source=gsub("_.*", "", row.names(full_cn_set)),
                          ccle_name=gsub("snp_|achillesWES_|ccleWES_|sangerWES_", "",
                                         row.names(full_cn_set)),
                          row_idx=1:nrow(full_cn_set))
to_remove <- source_info %>%
  dplyr::filter(ccle_name %in% non_public_lines,
                source %in% c("ccleWES", "achillesWES"))
also_to_remove <- source_info %>%
                    dplyr::filter(source == "sangerWES")
indices_to_remove <- c(to_remove$row_idx, also_to_remove$row_idx) %>% unique()
indices_to_keep <- source_info %>%
  dplyr::filter(!(row_idx %in% indices_to_remove)) %>%
  dplyr::group_by(ccle_name) %>%
  dplyr::mutate(priority=ifelse(source == "snp", 4,
                                ifelse(source == "sangerWES", 3,
                                       ifelse(source == "ccleWES", 2, 1)))) %>%
  dplyr::filter(priority == min(priority)) %>%
  dplyr::ungroup()

public_cn <- full_cn_set[indices_to_keep$row_idx,]
source_info <- data.frame(source=gsub("_.*", "", row.names(public_cn)),
                          ccle_name=gsub("snp_|achillesWES_|ccleWES_|sangerWES_", "",
                                         row.names(public_cn)))
public_cn %<>% magrittr::set_rownames(source_info$ccle_name)
```

CN data are on a log2 scale.

`WES_source_info` tracks the source data for each cell line. Sources are `snp`, `achillesWES`, `ccleWES`, and `sangerWES`

NOTE: Version 1 contained WES data from cell lines not available in the 18Q1 Public release. Versions 2-4 contained Sanger's WES CN data

**** Version 6 Public 18Q2****

Generated by running:

```
wes_pri <- taigr::load.from.taiga(data.name='segmented-cn-wes-prioritzed-7fe1', data.version=10, data.file='public_wes_priority_cn_gene_matrix') %>% log2()
```

**** Version 7 Public 18Q2****

Generated by running:

```
wes_pri <- taigr::load.from.taiga(data.name='segmented-cn-wes-prioritzed-7fe1', data.version=11, data.file='public_wes_priority_cn_gene_matrix') %>% log2()
```


**** Version 8-9 Public 18Q3****

Generated by running:

```
wes_pri <- taigr::load.from.taiga(data.name='segmented-cn-wes-prioritzed-7fe1', data.version=15, data.file='public_wes_priority_cn_gene_matrix') %>% log2()
```
Version 8 the segment level CN for Sanger's data is off by a factor of 2, version 9 corrects this

includes cell lines that should not be public

**** Version 10 Public 18Q1, 18Q2, 18Q3****

__use version 10 for 18Q1, 18Q2 and 18Q3 datasets__ 

Version 10 is the most up-to-date version of "public\_18Q3\_gene\_cn.csv". The three datasets have been updated to remove cell lines that should not have been made public. They are named in the portal and google bucket for portal downloads as v2, e.g. public\_18Q3\_gene\_cn\_v2.csv.

__Rows__: Broad (arxspan) cell line IDs

__Columns__: gene names in the format HGNC\_symbol (Entrez\_ID)

**** Version 11 Public 18Q4****

Generated by running:

```
wes_pri <- taigr::load.from.taiga(data.name='segmented-cn-wes-prioritzed-7fe1', data.version=17, data.file='public_wes_priority_cn_gene_matrix') %>% log2()
```

__Rows__: DepMap (arxspan) cell line IDs

__Columns__: gene names in the format HGNC\_symbol (Entrez\_ID)

**** Version 12-14 Public 19Q1****

version 14 contains the correct data for 19Q1

version 13 is the same as v12 except that it uses the original hg19 coordinates not hg38. The Achilles public data set uses the hg19 coordinates. 

**** Version 15-16 Public 19Q2****

__version 16 also adds the segment level copy number data__

**** Version 17-18 Public 19Q3****


**** Version 23 Public 19Q4****
adding new cell lines

**** Version 24 Public 19Q4****
resolving problem with not having log2 transform 

**** Version 25 Public 19Q4****
another issue in log transform

**** Version 26 Public 19Q4****
unlog2 transforming segmentcn FINAL

**** Version 27 Public 20Q1****
adding new samples

**** Version 28 Public 20Q1****
log 2 transform issues

**** Version 29 Public 20Q1****
readding one missing column in segments

**** Version 30 Public 20Q2****
Adding new samples

**** Version 31 Public 20Q2****
unknown changes

**** Version 32 Public 20Q3****
same  as  20Q2 for this release. no new lines

**** Version 33 Public 20Q3****
updated blacklist

**** Version 34 Public 20Q3****
updated dmc list

**** Version 35 Public 20Q4****
new cell lines, new full reprocessing, adding a new columns for amplification status, new way to create gene level cn

**** Version 36 Public 20Q4****
reverting to logtransform

**** Version 37 Public 20Q4****
removing lines

## Gene level CN data:

__data is hg38 liftover__

__Description__: log2 + 1 gene level copy number data (data is log2 transformed with a __pseudocount of 1__ added). It uses hg19 coordinates. Also the segment level copy number data.

__Rows__: DepMap cell line IDs

__Columns__: gene names in the format HGNC\_symbol (Entrez\_ID)

Segment level data:

__Columns__: DepMap\_ID, Chromosome, Start, End, Num\_Probes, Segment\_Mean, CCLE\_name

NEW LINES:
"""+str(newlines)+"""

EMBARGOED:
"""+str(embargo))

Uploading all_merged_genes_cn...
hitting https://cds.team/taiga/api/datafile/29507b79b9fc4dda855a0afa67461297
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (l

'deb6c8996f0d46419a4a1b9e574b16cb'

In [217]:
!git push

Counting objects: 76, done.
Delta compression using up to 12 threads.
Compressing objects: 100% (76/76), done.
Writing objects: 100% (76/76), 6.69 MiB | 1.53 MiB/s, done.
Total 76 (delta 57), reused 0 (delta 0)
remote: Resolving deltas: 100% (57/57), completed with 9 local objects.[K
remote: error: GH001: Large files detected. You may want to try Git Large File Storage - https://git-lfs.github.com.[K
remote: error: Trace: 5938e8fa34910bcbf615473a02ae2b5b0ef97f74049e9de0a1416d0696f03b17[K
remote: error: See http://git.io/iEPt8g for more information.[K
remote: error: File 20Q4.csv is 178.93 MB; this exceeds GitHub's file size limit of 100.00 MB[K
To https://github.com/broadinstitute/ccle_processing.git
 ! [remote rejected] master -> master (pre-receive hook declined)
error: failed to push some refs to 'https://github.com/broadinstitute/ccle_processing.git'


In [127]:
# To add to a virtual dataset
AddToVirtual(virtual_public, "depmap-wes-cn-data-97cc", files=[('CCLE_gene_cn', 'all_merged_genes_cn'),('CCLE_segment_cn', 'all_merged_segments')])

[('CCLE_gene_cn', 'depmap-wes-cn-data-97cc.38/all_merged_genes_cn'), ('CCLE_segment_cn', 'depmap-wes-cn-data-97cc.38/all_merged_segments')]
hitting https://cds.team/taiga/api/datafile/d5c6453657944089b45c08320012f809
hitting https://cds.team/taiga/api/datafile/d5c6453657944089b45c08320012f809
hitting https://cds.team/taiga/api/datafile/d5c6453657944089b45c08320012f809
hitting https://cds.team/taiga/api/datafile/d5c6453657944089b45c08320012f809
hitting https://cds.team/taiga/api/datafile/d5c6453657944089b45c08320012f809
hitting https://cds.team/taiga/api/datafile/d5c6453657944089b45c08320012f809
hitting https://cds.team/taiga/api/datafile/d5c6453657944089b45c08320012f809
hitting https://cds.team/taiga/api/datafile/d5c6453657944089b45c08320012f809
hitting https://cds.team/taiga/api/datafile/d5c6453657944089b45c08320012f809
hitting https://cds.team/taiga/api/datafile/d5c6453657944089b45c08320012f809
hitting https://cds.team/taiga/api/datafile/d5c6453657944089b45c08320012f809
hitting https

# RNA

In [128]:
transcripts_tpm = pd.read_csv('temp/expression_' + release + '_transcripts_tpm.csv',index_col=0)
genes_tpm = pd.read_csv('temp/expression_' + release + '_genes_tpm.csv',index_col=0)
genes_expected_count = pd.read_csv('temp/expression_' + release + '_genes_expected_count.csv',index_col=0)
proteincoding_genes_expected_count = pd.read_csv('temp/expression_' + release + '_proteincoding_genes_expected_count.csv',index_col=0)
proteincoding_genes_tpm = pd.read_csv('temp/expression_' + release + '_proteincoding_genes_tpm.csv',index_col=0)
transcripts_expected_count = pd.read_csv('temp/expression_' + release + '_transcripts_expected_count.csv',index_col=0)

In [133]:
proteincoding_genes_expected_count = proteincoding_genes_expected_count[[i for i in proteincoding_genes_expected_count.columns if ' (' in i]]
proteincoding_genes_tpm = proteincoding_genes_tpm[[i for i in proteincoding_genes_tpm.columns if ' (' in i]]

## Internal

In [134]:
store -r rename

In [135]:
print('not present')
removed = set(previnternal) - set(genes_tpm.index)
print(removed)
print('removed for QC reasons')
print(set(rename.keys()))
print('removed')
removed = set(internal_rna) - set(genes_tpm.index)
print(removed - set(rename.keys()))
missing = set(internal) - set(genes_tpm.index)
blacklist = set(genes_tpm.index) - (previnternal | set(internal))
print('missing')
print(missing)
newlines = set(internal) 
print('blacklist')
print(len(blacklist), blacklist)

not present
{'ACH-002196', 'ACH-002205', 'ACH-002289', 'ACH-001064', 'ACH-001641', 'ACH-002127', 'ACH-002344', 'ACH-002384', 'ACH-002383', 'ACH-002157', 'ACH-001037', 'ACH-001143', 'ACH-002216', 'ACH-001680', 'ACH-000727', 'ACH-002351', 'ACH-002207', 'ACH-002114', 'ACH-002300', 'ACH-000795', 'ACH-002377', 'ACH-002204', 'ACH-002360', 'ACH-002292', 'ACH-002058', 'ACH-002104', 'ACH-002314', 'ACH-002183', 'ACH-002213', 'ACH-002168', 'ACH-002382', 'ACH-002260', 'ACH-001208', 'ACH-002095', 'ACH-002200', 'ACH-002269', 'ACH-002387', 'ACH-002375', 'ACH-002129', 'ACH-001199', 'ACH-002312', 'ACH-002282', 'ACH-002395', 'ACH-001338', 'ACH-002232', 'ACH-002267', 'ACH-002190', 'ACH-001502', 'ACH-001093', 'ACH-002336', 'ACH-002281', 'ACH-001137', 'ACH-002199', 'ACH-002208', 'ACH-002185', 'ACH-002276', 'ACH-002374', 'ACH-002396', 'ACH-001383', 'ACH-002237', 'ACH-002119', 'ACH-002136', 'ACH-002359', 'ACH-001182', 'ACH-002399', 'ACH-002106', 'ACH-001187', 'ACH-002394', 'ACH-002180', 'ACH-001045', 'ACH-00

In [136]:
transcripts_tpm=transcripts_tpm.apply(lambda x: np.log2(x+1))
genes_tpm=genes_tpm.apply(lambda x: np.log2(x+1))
genes_expected_count=genes_expected_count.apply(lambda x: np.log2(x+1))
proteincoding_genes_expected_count=proteincoding_genes_expected_count.apply(lambda x: np.log2(x+1))
proteincoding_genes_tpm=proteincoding_genes_tpm.apply(lambda x: np.log2(x+1))
transcripts_expected_count=transcripts_expected_count.apply(lambda x: np.log2(x+1))

In [137]:
## removing first blacklisted, then embargoed, to create two datasets
print(len(genes_expected_count))
genes_expected_count = genes_expected_count[~genes_expected_count.index.isin(blacklist)]
print(len(genes_expected_count))
genes_expected_count.to_csv('temp/expression_genes_expected_count.csv')
print(len(genes_tpm))
genes_tpm = genes_tpm[~genes_tpm.index.isin(blacklist)]
print(len(genes_tpm))
genes_tpm.to_csv('temp/expression_genes_tpm.csv')
print(len(proteincoding_genes_tpm))
proteincoding_genes_tpm = proteincoding_genes_tpm[~proteincoding_genes_tpm.index.isin(blacklist)]
print(len(proteincoding_genes_tpm))
proteincoding_genes_tpm.to_csv('temp/expression_proteincoding_genes_tpm.csv')
print(len(transcripts_tpm))
transcripts_tpm = transcripts_tpm[~transcripts_tpm.index.isin(blacklist)]
print(len(transcripts_tpm))
transcripts_tpm.to_csv('temp/expression_transcripts_tpm.csv')
print(len(proteincoding_genes_expected_count))
proteincoding_genes_expected_count = proteincoding_genes_expected_count[~proteincoding_genes_expected_count.index.isin(blacklist)]
print(len(proteincoding_genes_expected_count))
proteincoding_genes_expected_count.to_csv('temp/expression_proteincoding_genes_expected_count.csv')
print(len(transcripts_expected_count))
transcripts_expected_count = transcripts_expected_count[~transcripts_expected_count.index.isin(blacklist)]
print(len(transcripts_expected_count))
transcripts_expected_count.to_csv('temp/expression_transcripts_expected_count.csv')

1418
1407
1418
1407
1418
1407
1418
1407
1417
1406
1417
1406


In [138]:
tc.update_dataset(dataset_permaname="depmap-rnaseq-expression-data-363a",
                 upload_file_path_dict={
                   'temp/expression_genes_expected_count.csv': 'NumericMatrixCSV',
                   'temp/expression_transcripts_tpm.csv': 'NumericMatrixCSV',
                   'temp/expression_genes_tpm.csv': 'NumericMatrixCSV',
                   'temp/expression_proteincoding_genes_tpm.csv': 'NumericMatrixCSV',
                   'temp/expression_proteincoding_genes_expected_count.csv': 'NumericMatrixCSV',
                   'temp/expression_transcripts_expected_count.csv': 'NumericMatrixCSV'},
                  dataset_description=
"""
# INTERNAL RNA

* Version 1-3 Internal 18Q1*

All CCLE cell lines with RNAseq data.

Original data source:

`/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_depMap_18Q1_RNAseq_reads_20180201.gct`
`/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_depMap_18Q1_RNAseq_RPKM_20180201.gct`

Version 2 of RPKM data are log2-transformed with a noisy floor at -3 (-3 + N(0, 0.1))

* Version 4-6 Internal 18Q2*

Original data source:

`/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_depMap_18q2_RNAseq_reads_20180420.gct` `/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_depMap_18q2_RNAseq_RPKM_20180420.gct`

RPKM data are log2-transformed with a noisy floor at -3 (-3 + N(0, 0.1)). Reads file unaltered aside from formatting row / column names.

* Version 7 Internal 18Q2*

Includes a matrix with genes filtered by HGNC protein-coding gene locus group.

* Version 8-10 Internal 18Q3*

use version 10

Original data source:

`/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_depMap_18q3_RNAseq_reads_20180716.gct` `/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_depMap_18q3_RNAseq_RPKM_20180716.gct`

RPKM data are log2-transformed with a noisy floor at -3 (-3 + N(0, 0.1)). Reads file unaltered aside from formatting row / column names.

The ProteinCoding datasets contain just protein coding genes and are based on protein coding annotations from https://www.genenames.org/cgi-bin/download (locus_group == "protein-coding gene")

Rows: are Broad (Arxspan) cell line IDs.

Columns: In the complete RPKM and read datasets column names are HGNC_symbol (Ensembl_ID), while in the ProteinCoding RPKM and read datasets column names are HGNC_symbol (Entrez_ID)

version 9 updates names, and slightly different RPKM values due to randomly added noisy floor (using a seed of 4)

version 10 removes duplicate gene names from the protein coding datasets

* Version 11-12 Internal 18Q4*

18Q4 transcript level data is found in version 14. (In versions 1-13 transcript data contains only gene level not transcript level data)

changing to TPM expression

Original data source:

`/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_DepMap_18Q4_rsem_genes_tpm_20181029.txt` `/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_DepMap_18Q4_rsem_transcripts_tpm_20181029.gct` `/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_DepMap_18Q4_RNAseq_reads_20181029.gct` `/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_DepMap_18Q4_RNAseq_RPKM_20181029.gct`

TPM data is the primary expression data now. It is log2-transformed with a pseudo count of 1 added. The TPM data contains 4 cell lines not included in the RPKM data.

RPKM data are log2-transformed with a pseudo count of 1 added. RPKM values are no longer thresholded.

Reads/transcript files unaltered aside from formatting row / column names.

The ProteinCoding datasets contain just protein coding genes and are based on protein coding annotations from https://www.genenames.org/cgi-bin/download (locus_group == "protein-coding gene")

Rows: are DepMap (Arxspan) cell line IDs

Columns: In the complete TPM, TPM transcripts, RPKM and read datasets column names are HGNC_symbol (Ensembl_ID), while in the ProteinCoding TPM dataset column names are HGNC_symbol (Entrez_ID)

* Version 13-15 Internal 19Q1*

Version 15 contains the correct data sets for 19Q1 - 2 cell lines are removed

Version 14 contains the correct transcript level data for 18Q4

* Version 16 Internal 19Q2*

* Version 17 Internal 19Q3*

* Version 18 Internal 19Q4*

Adding 93 new cell lines - Blacklisted
Some cells lines have been removed because they:

 - had too many 0 values = ["ACH-001388" "ACH-001577" "ACH-001767" "ACH-002463"] 

Some cells lines have been flagged as:

 - None
* Version 19 Internal 19Q4
removing blacklisted

* Version 20 Internal 19Q4
removing blacklisted in transcripts

* Version 21 Internal 19Q4
uploading as matrices 

* Version 22 Internal 20Q1
adding 6 new cell lines

* Version 23 Internal 20Q2
adding  new cell lines

* Version 24 Internal 20Q2
adding  back ACH-000052

* Version 25 Internal 20Q3
nothing different from  20Q2. no new cell lines added

* Version 26 Internal 20Q3
some lines were wrongly added to the blacklist

* Version 27 Internal 20Q4
new samples, adding a new QC method that removed 10 samples. fully adding the reprocessed samples, removing some wrong genes, adding falsely removed genes (that are still expressed)

* Version 28 Internal 20Q4
revertig log transform

* Version 29 Internal 20Q4
reverting gene names

data is aligned to hg38

read count data is created using RSEM, which, when a read maps to multiple places, splits the counts between genes, with the weight based on the likelihood that it came from one gene or the other, so counts data may not be integers

TPM data is log2-transformed with a pseudo count of 1 added. log2(X+1)

Reads/transcript files unaltered aside from formatting row / column names.

The ProteinCoding datasets contain just protein coding genes and are based on protein coding annotations from https://www.genenames.org/cgi-bin/download (locus_group == "protein-coding gene")

Rows: are DepMap cell line IDs Mapping between Broad IDs and CCLE IDs can be done using a R or python package

To install R implementation: options(repos = c("https://iwww.broadinstitute.org/~datasci/R-packages", "https://cran.cnr.berkeley.edu")) install.packages('celllinemapr')

To install python implementation: pip install https://intranet.broadinstitute.org/~datasci/python-packages/cell_line_mapper-0.1.9.tar.gz)

Columns: In the complete TPM and read counts datasets column names are HGNC_symbol (Ensembl_ID), while in the ProteinCoding TPM dataset column names are HGNC_symbol (Entrez_ID). In the TPM transcript dataset column names are HGNC_symbol (Transcript_ID) - the HGNC symbols are not unique, use the transcript IDs for unique identifiers.


NEW LINES:
"""+str(newlines)+"""

REMOVED FOR QC REASONS:
"""+str(rename)+"""

BLACKLIST:
"""+str(blacklist))

Uploading expression_genes_expected_count...
hitting https://cds.team/taiga/api/datafile/ea1cd7fbce31449092a43af9f71bd262
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scannin

	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to d

	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploadin

	 Conversion in progress, line 750
	 Conversion in progress, line 750
	 Conversion in progress, line 750
	 Conversion in progress, line 750
	 Conversion in progress, line 750
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to 

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downlo

	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, li

	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploadin

'fa49f001e31142bb9cd9bbff1e0868de'

In [139]:
AddToVirtual('depmap-a0ab', "depmap-rnaseq-expression-data-363a", files=[
('CCLE_expression_full', 'expression_genes_expected_count'), 
('CCLE_RNAseq_transcripts', 'expression_transcripts_tpm'),
('CCLE_RNAseq_reads', 'expression_genes_tpm'),
('CCLE_expression', 'expression_proteincoding_genes_tpm'), ('expression_proteincoding_genes_expected_count', 'expression_proteincoding_genes_expected_count'),('expression_transcripts_expected_count', 'expression_transcripts_expected_count')])

AddToVirtual(virtual_internal, "depmap-rnaseq-expression-data-363a", files=[
('CCLE_expression_full', 'expression_genes_expected_count'), 
('CCLE_RNAseq_transcripts', 'expression_transcripts_tpm'),
('CCLE_RNAseq_reads', 'expression_genes_tpm'),
('CCLE_expression', 'expression_proteincoding_genes_tpm'), ('expression_proteincoding_genes_expected_count', 'expression_proteincoding_genes_expected_count'),('expression_transcripts_expected_count', 'expression_transcripts_expected_count')])

[('CCLE_expression_full', 'depmap-rnaseq-expression-data-363a.30/expression_genes_expected_count'), ('CCLE_RNAseq_transcripts', 'depmap-rnaseq-expression-data-363a.30/expression_transcripts_tpm'), ('CCLE_RNAseq_reads', 'depmap-rnaseq-expression-data-363a.30/expression_genes_tpm'), ('CCLE_expression', 'depmap-rnaseq-expression-data-363a.30/expression_proteincoding_genes_tpm'), ('expression_proteincoding_genes_expected_count', 'depmap-rnaseq-expression-data-363a.30/expression_proteincoding_genes_expected_count'), ('expression_transcripts_expected_count', 'depmap-rnaseq-expression-data-363a.30/expression_transcripts_expected_count')]
hitting https://cds.team/taiga/api/datafile/8b3bdc4cdbdc47abb6512719b440cc0a
hitting https://cds.team/taiga/api/datafile/8b3bdc4cdbdc47abb6512719b440cc0a
hitting https://cds.team/taiga/api/datafile/8b3bdc4cdbdc47abb6512719b440cc0a
hitting https://cds.team/taiga/api/datafile/8b3bdc4cdbdc47abb6512719b440cc0a
hitting https://cds.team/taiga/api/datafile/8b3bdc4cd

hitting https://cds.team/taiga/api/datafile/18928658694c47f19f4b0e5540c3ca03
hitting https://cds.team/taiga/api/datafile/18928658694c47f19f4b0e5540c3ca03
hitting https://cds.team/taiga/api/datafile/18928658694c47f19f4b0e5540c3ca03
hitting https://cds.team/taiga/api/datafile/18928658694c47f19f4b0e5540c3ca03
hitting https://cds.team/taiga/api/datafile/18928658694c47f19f4b0e5540c3ca03
hitting https://cds.team/taiga/api/datasetVersion

Dataset version with id 2e2b44672a5b4924aab28ebd8f5fb29f created. You can access to this dataset version directly with this url: https://cds.team/taiga/dataset_version/2e2b44672a5b4924aab28ebd8f5fb29f


## IBM

In [140]:
print('not present')
removed = set(prevdmc) - set(genes_tpm.index)
print(removed)
print('removed for QC reasons')
print(set(rename.keys()))
print('removed')
removed = set(dmc_rna) - set(genes_tpm.index)
print(removed - set(rename.keys()))
missing = set(ibm) - set(genes_tpm.index)
embargo_ibm = set(genes_tpm.index) - (prevdmc | set(ibm))
print('missing')
print(missing)
newlines = set(ibm) 
print('embargo_ibm')
print(len(embargo_ibm), embargo_ibm)

not present
{'ACH-002196', 'ACH-002205', 'ACH-002289', 'ACH-001064', 'ACH-001641', 'ACH-002127', 'ACH-002344', 'ACH-002384', 'ACH-002383', 'ACH-002157', 'ACH-001037', 'ACH-001143', 'ACH-002216', 'ACH-001680', 'ACH-000727', 'ACH-002351', 'ACH-002207', 'ACH-002114', 'ACH-002300', 'ACH-000795', 'ACH-002377', 'ACH-002204', 'ACH-002360', 'ACH-002292', 'ACH-002058', 'ACH-002104', 'ACH-002314', 'ACH-002183', 'ACH-002213', 'ACH-002168', 'ACH-002382', 'ACH-002260', 'ACH-001208', 'ACH-002095', 'ACH-002200', 'ACH-002269', 'ACH-002387', 'ACH-002375', 'ACH-002129', 'ACH-001199', 'ACH-002312', 'ACH-002282', 'ACH-002395', 'ACH-001338', 'ACH-002232', 'ACH-002267', 'ACH-002190', 'ACH-001502', 'ACH-001093', 'ACH-002336', 'ACH-002281', 'ACH-001137', 'ACH-002199', 'ACH-002208', 'ACH-002185', 'ACH-002276', 'ACH-002374', 'ACH-002396', 'ACH-001383', 'ACH-002237', 'ACH-002119', 'ACH-002136', 'ACH-002359', 'ACH-001182', 'ACH-002399', 'ACH-002106', 'ACH-001187', 'ACH-002394', 'ACH-002180', 'ACH-001045', 'ACH-00

In [141]:
## removing first blacklisted, then embargoed, to create two datasets
print(len(genes_expected_count))
a = genes_expected_count[~genes_expected_count.index.isin(embargo_ibm|prevdmc)]
print(len(a))
a.to_csv('temp/expression_genes_expected_count.csv')
print(len(genes_tpm))
a = genes_tpm[~genes_tpm.index.isin(embargo_ibm|prevdmc)]
print(len(a))
a.to_csv('temp/expression_genes_tpm.csv')
print(len(proteincoding_genes_tpm))
a = proteincoding_genes_tpm[~proteincoding_genes_tpm.index.isin(embargo_ibm|prevdmc)]
print(len(a))
a.to_csv('temp/expression_proteincoding_genes_tpm.csv')
print(len(transcripts_tpm))
a = transcripts_tpm[~transcripts_tpm.index.isin(embargo_ibm|prevdmc)]
print(len(a))
a.to_csv('temp/expression_transcripts_tpm.csv')
print(len(proteincoding_genes_expected_count))
a = proteincoding_genes_expected_count[~proteincoding_genes_expected_count.index.isin(embargo_ibm|prevdmc)]
print(len(a))
a.to_csv('temp/expression_proteincoding_genes_expected_count.csv')
print(len(transcripts_expected_count))
a = transcripts_expected_count[~transcripts_expected_count.index.isin(embargo_ibm|prevdmc)]
print(len(a))
a.to_csv('temp/expression_transcripts_expected_count.csv')

1407
0
1407
0
1407
0
1407
0
1406
0
1406
0


In [142]:
tc.update_dataset(dataset_permaname="depmap-rnaseq-expression-data-80ef",
                 upload_file_path_dict={
                     'temp/expression_genes_expected_count.csv': 'NumericMatrixCSV',
                   'temp/expression_transcripts_tpm.csv': 'NumericMatrixCSV',
                   'temp/expression_genes_tpm.csv': 'NumericMatrixCSV',
                   'temp/expression_proteincoding_genes_tpm.csv': 'NumericMatrixCSV',
                   'temp/expression_proteincoding_genes_expected_count.csv': 'NumericMatrixCSV',
                   'temp/expression_transcripts_expected_count.csv': 'NumericMatrixCSV'},
                  dataset_description=
"""
# IBM RNA

* Version 1-3 DMC 19Q1*


data is aligned to hg38

read count data is created using RSEM, which, when a read maps to multiple places, splits the counts between genes, with the weight based on the likelihood that it came from one gene or the other, so counts data may not be integers

TPM data is log2-transformed with a pseudo count of 1 added.

Reads/transcript files unaltered aside from formatting row / column names.

The ProteinCoding datasets contain just protein coding genes and are based on protein coding annotations from https://www.genenames.org/cgi-bin/download (locus_group == "protein-coding gene")

Rows: are DepMap cell line IDs

Columns: In the complete TPM and read counts datasets column names are HGNC_symbol (Ensembl_ID), while in the ProteinCoding TPM dataset column names are HGNC_symbol (Entrez_ID). In the TPM transcript dataset column names are HGNC_symbol (Transcript_ID) - the HGNC symbols are not unique, use the transcript IDs for unique identifiers.


NEW LINES:
"""+str(newlines)+"""

REMOVED FOR QC REASONS:
"""+str(rename)+"""

BLACKLIST:
"""+str(blacklist))

Uploading expression_genes_expected_count...
hitting https://cds.team/taiga/api/datafile/db47114a5a204fa69578a449a8601b78
Conversion and upload...:
	 Downloading the file from S3


AttributeError: 'dict' object has no attribute 'message'

In [None]:
AddToVirtual(virtual_dmc, "depmap-rnaseq-expression-data-80ef", files=[('CCLE_expression_full', 'expression_genes_expected_count'), 
('CCLE_RNAseq_transcripts', 'expression_transcripts_tpm'),
('CCLE_RNAseq_reads', 'expression_genes_tpm'),
('CCLE_expression', 'expression_proteincoding_genes_tpm'), ('expression_proteincoding_genes_expected_count', 'expression_proteincoding_genes_expected_count'),('expression_transcripts_expected_count', 'expression_transcripts_expected_count')])

## DMC

In [143]:
print('not present')
removed = set(prevdmc) - set(genes_tpm.index)
print(removed)
print('removed for QC reasons')
print(set(rename.keys()))
print('removed')
removed = set(dmc_rna) - set(genes_tpm.index)
print(removed - set(rename.keys()))
missing = set(internal) - set(genes_tpm.index)
embargo_dmc = set(genes_tpm.index) - (prevdmc | set(dmc))
print('missing')
print(missing)
newlines = set(dmc) 
print('embargo_dmc')
print(len(embargo_dmc), embargo_dmc)

not present
{'ACH-002196', 'ACH-002205', 'ACH-002289', 'ACH-001064', 'ACH-001641', 'ACH-002127', 'ACH-002344', 'ACH-002384', 'ACH-002383', 'ACH-002157', 'ACH-001037', 'ACH-001143', 'ACH-002216', 'ACH-001680', 'ACH-000727', 'ACH-002351', 'ACH-002207', 'ACH-002114', 'ACH-002300', 'ACH-000795', 'ACH-002377', 'ACH-002204', 'ACH-002360', 'ACH-002292', 'ACH-002058', 'ACH-002104', 'ACH-002314', 'ACH-002183', 'ACH-002213', 'ACH-002168', 'ACH-002382', 'ACH-002260', 'ACH-001208', 'ACH-002095', 'ACH-002200', 'ACH-002269', 'ACH-002387', 'ACH-002375', 'ACH-002129', 'ACH-001199', 'ACH-002312', 'ACH-002282', 'ACH-002395', 'ACH-001338', 'ACH-002232', 'ACH-002267', 'ACH-002190', 'ACH-001502', 'ACH-001093', 'ACH-002336', 'ACH-002281', 'ACH-001137', 'ACH-002199', 'ACH-002208', 'ACH-002185', 'ACH-002276', 'ACH-002374', 'ACH-002396', 'ACH-001383', 'ACH-002237', 'ACH-002119', 'ACH-002136', 'ACH-002359', 'ACH-001182', 'ACH-002399', 'ACH-002106', 'ACH-001187', 'ACH-002394', 'ACH-002180', 'ACH-001045', 'ACH-00

In [144]:
## removing first blacklisted, then embargoed, to create two datasets
print(len(genes_expected_count))
genes_expected_count = genes_expected_count[~genes_expected_count.index.isin(embargo_dmc)]
print(len(genes_expected_count))
genes_expected_count.to_csv('temp/expression_genes_expected_count.csv')
print(len(genes_tpm))
genes_tpm = genes_tpm[~genes_tpm.index.isin(embargo_dmc)]
print(len(genes_tpm))
genes_tpm.to_csv('temp/expression_genes_tpm.csv')
print(len(proteincoding_genes_tpm))
proteincoding_genes_tpm = proteincoding_genes_tpm[~proteincoding_genes_tpm.index.isin(embargo_dmc)]
print(len(proteincoding_genes_tpm))
proteincoding_genes_tpm.to_csv('temp/expression_proteincoding_genes_tpm.csv')
print(len(transcripts_tpm))
transcripts_tpm = transcripts_tpm[~transcripts_tpm.index.isin(embargo_dmc)]
print(len(transcripts_tpm))
transcripts_tpm.to_csv('temp/expression_transcripts_tpm.csv')
print(len(proteincoding_genes_expected_count))
proteincoding_genes_expected_count = proteincoding_genes_expected_count[~proteincoding_genes_expected_count.index.isin(embargo_dmc)]
print(len(proteincoding_genes_expected_count))
proteincoding_genes_expected_count.to_csv('temp/expression_proteincoding_genes_expected_count.csv')
print(len(transcripts_expected_count))
transcripts_expected_count = transcripts_expected_count[~transcripts_expected_count.index.isin(embargo_dmc)]
print(len(transcripts_expected_count))
transcripts_expected_count.to_csv('temp/expression_transcripts_expected_count.csv')

1407
1405
1407
1405
1407
1405
1407
1405
1406
1404
1406
1404


In [145]:
tc.update_dataset(dataset_permaname="depmap-rnaseq-expression-data-80ef",
                 upload_file_path_dict={
                     'temp/expression_genes_expected_count.csv': 'NumericMatrixCSV',
                   'temp/expression_transcripts_tpm.csv': 'NumericMatrixCSV',
                   'temp/expression_genes_tpm.csv': 'NumericMatrixCSV',
                   'temp/expression_proteincoding_genes_tpm.csv': 'NumericMatrixCSV',
                   'temp/expression_proteincoding_genes_expected_count.csv': 'NumericMatrixCSV',
                   'temp/expression_transcripts_expected_count.csv': 'NumericMatrixCSV'},
                  dataset_description=
"""
# DMC RNA

* Version 1-3 DMC 19Q1*

version 3 contains the correct data for 19Q1

version 2 contains correct TPM transcript data (in version 1 transcript data contains only gene level not transcript level data)

* Version 4 DMC 19Q2*

* Version 5 DMC 19Q3*

* Version 6 DMC 19Q4*

Adding 93 new cell lines - Blacklisted - IBM
Some cells lines have been removed because they:

 - had too many 0 values = ["ACH-001388" "ACH-001577" "ACH-001767" "ACH-002463"] 

Some cells lines have been flagged as:

 - None

* Version 7 DMC 19Q4

removing blacklisted

* Version 8 DMC 19Q4

removing blacklisted in transcripts

* Version 9 DMC 19Q4

uploading as numeric matrix

* Version 10 DMC 20Q1
unknown reupload

* Version 11 DMC 20Q2
adding  new cell lines

* Version 12 DMC 20Q2
unknown reupload

* Version 13 DMC 20Q2
removing some missed blacklisted lines

* Version 14 DMC 20Q2
Adding one missing line

* Version 15 DMC 20Q3
nothing different from  20Q2. no new cell lines added

* Version 16 DMC 20Q3
Some wrong annotations in the blacklists

* Version 17 DMC 20Q3
Updated annotations in the blacklists

* Version 18 DMC 20Q4
new samples, adding a new QC method that removed 10 samples. fully adding the reprocessed samples, removing some wrong genes, adding falsely removed genes (that are still expressed)

* Version 18 DMC 20Q4
revertig logtransform

* Version 19 DMC 20Q4
reverting gene names

data is aligned to hg38

read count data is created using RSEM, which, when a read maps to multiple places, splits the counts between genes, with the weight based on the likelihood that it came from one gene or the other, so counts data may not be integers

TPM data is log2-transformed with a pseudo count of 1 added.

Reads/transcript files unaltered aside from formatting row / column names.

The ProteinCoding datasets contain just protein coding genes and are based on protein coding annotations from https://www.genenames.org/cgi-bin/download (locus_group == "protein-coding gene")

Rows: are DepMap cell line IDs

Columns: In the complete TPM and read counts datasets column names are HGNC_symbol (Ensembl_ID), while in the ProteinCoding TPM dataset column names are HGNC_symbol (Entrez_ID). In the TPM transcript dataset column names are HGNC_symbol (Transcript_ID) - the HGNC symbols are not unique, use the transcript IDs for unique identifiers.


NEW LINES:
"""+str(newlines)+"""

REMOVED FOR QC REASONS:
"""+str(rename)+"""

BLACKLIST:
"""+str(embargo_dmc))

Uploading expression_genes_expected_count...
hitting https://cds.team/taiga/api/datafile/19b98e19792d47749558e1b906dd8cec
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to deter

	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to d

	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploadin

	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploadin

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downlo

	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1000
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, line 1250
	 Conversion in progress, li

	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploadin

'c3c447b0823a438396c828b9b05a3596'

In [146]:
AddToVirtual(virtual_dmc, "depmap-rnaseq-expression-data-80ef", files=[('CCLE_expression_full', 'expression_genes_expected_count'), 
('CCLE_RNAseq_transcripts', 'expression_transcripts_tpm'),
('CCLE_RNAseq_reads', 'expression_genes_tpm'),
('CCLE_expression', 'expression_proteincoding_genes_tpm'), ('expression_proteincoding_genes_expected_count', 'expression_proteincoding_genes_expected_count'),('expression_transcripts_expected_count', 'expression_transcripts_expected_count')])

[('CCLE_expression_full', 'depmap-rnaseq-expression-data-80ef.21/expression_genes_expected_count'), ('CCLE_RNAseq_transcripts', 'depmap-rnaseq-expression-data-80ef.21/expression_transcripts_tpm'), ('CCLE_RNAseq_reads', 'depmap-rnaseq-expression-data-80ef.21/expression_genes_tpm'), ('CCLE_expression', 'depmap-rnaseq-expression-data-80ef.21/expression_proteincoding_genes_tpm'), ('expression_proteincoding_genes_expected_count', 'depmap-rnaseq-expression-data-80ef.21/expression_proteincoding_genes_expected_count'), ('expression_transcripts_expected_count', 'depmap-rnaseq-expression-data-80ef.21/expression_transcripts_expected_count')]
hitting https://cds.team/taiga/api/datafile/e90c75655bcd49828604a9956132b74c
hitting https://cds.team/taiga/api/datafile/e90c75655bcd49828604a9956132b74c
hitting https://cds.team/taiga/api/datafile/e90c75655bcd49828604a9956132b74c
hitting https://cds.team/taiga/api/datafile/e90c75655bcd49828604a9956132b74c
hitting https://cds.team/taiga/api/datafile/e90c75655

## Public

In [147]:
print('not present')
removed = set(prevpublic) - set(genes_tpm.index)
print(removed)
print('removed for QC reasons')
print(set(rename.keys()))
print('removed')
removed = set(public_rna) - set(genes_tpm.index)
print(removed - set(rename.keys()))
missing = set(public) - set(genes_tpm.index)
embargo = set(genes_tpm.index) - (prevpublic | set(public))
print('missing')
print(missing)
newlines = set(public) 
print('embargo')
print(len(embargo), embargo)
print(newlines)

not present
{'ACH-002196', 'ACH-002205', 'ACH-002289', 'ACH-001064', 'ACH-001641', 'ACH-002127', 'ACH-002344', 'ACH-002384', 'ACH-002383', 'ACH-002157', 'ACH-001037', 'ACH-001143', 'ACH-002216', 'ACH-001680', 'ACH-000727', 'ACH-002351', 'ACH-002207', 'ACH-002114', 'ACH-002300', 'ACH-000795', 'ACH-002377', 'ACH-002204', 'ACH-002360', 'ACH-002292', 'ACH-002058', 'ACH-002104', 'ACH-002314', 'ACH-002183', 'ACH-002213', 'ACH-002168', 'ACH-002382', 'ACH-002260', 'ACH-001208', 'ACH-002095', 'ACH-002200', 'ACH-002269', 'ACH-002387', 'ACH-002375', 'ACH-002129', 'ACH-001199', 'ACH-002312', 'ACH-002282', 'ACH-002395', 'ACH-001338', 'ACH-002232', 'ACH-002267', 'ACH-002190', 'ACH-001093', 'ACH-002336', 'ACH-002281', 'ACH-001137', 'ACH-002199', 'ACH-002208', 'ACH-002185', 'ACH-002276', 'ACH-002374', 'ACH-002396', 'ACH-001383', 'ACH-002237', 'ACH-002119', 'ACH-002136', 'ACH-002359', 'ACH-001182', 'ACH-002106', 'ACH-001187', 'ACH-002394', 'ACH-002180', 'ACH-001045', 'ACH-002291', 'ACH-002230', 'ACH-00

In [148]:
## removing first blacklisted, then embargoed, to create two datasets
print(len(genes_expected_count))
genes_expected_count = genes_expected_count[~genes_expected_count.index.isin(embargo)]
print(len(genes_expected_count))
genes_expected_count.to_csv('temp/expression_genes_expected_count.csv')
print(len(genes_tpm))
genes_tpm = genes_tpm[~genes_tpm.index.isin(embargo)]
print(len(genes_tpm))
genes_tpm.to_csv('temp/expression_genes_tpm.csv')
print(len(proteincoding_genes_tpm))
proteincoding_genes_tpm = proteincoding_genes_tpm[~proteincoding_genes_tpm.index.isin(embargo)]
print(len(proteincoding_genes_tpm))
proteincoding_genes_tpm.to_csv('temp/expression_proteincoding_genes_tpm.csv')
print(len(transcripts_tpm))
transcripts_tpm = transcripts_tpm[~transcripts_tpm.index.isin(embargo)]
print(len(transcripts_tpm))
transcripts_tpm.to_csv('temp/expression_transcripts_tpm.csv')
print(len(proteincoding_genes_expected_count))
proteincoding_genes_expected_count = proteincoding_genes_expected_count[~proteincoding_genes_expected_count.index.isin(embargo)]
print(len(proteincoding_genes_expected_count))
proteincoding_genes_expected_count.to_csv('temp/expression_proteincoding_genes_expected_count.csv')
print(len(transcripts_expected_count))
transcripts_expected_count = transcripts_expected_count[~transcripts_expected_count.index.isin(embargo)]
print(len(transcripts_expected_count))
transcripts_expected_count.to_csv('temp/expression_transcripts_expected_count.csv')

1405
1376
1405
1376
1405
1376
1405
1376
1404
1375
1404
1375


In [149]:
tc.update_dataset(dataset_permaname="depmap-rnaseq-expression-data-ccd0",
                 upload_file_path_dict={
                     'temp/expression_genes_expected_count.csv': 'NumericMatrixCSV',
                   'temp/expression_transcripts_tpm.csv': 'NumericMatrixCSV',
                   'temp/expression_genes_tpm.csv': 'NumericMatrixCSV',
                   'temp/expression_proteincoding_genes_tpm.csv': 'NumericMatrixCSV',
                   'temp/expression_proteincoding_genes_expected_count.csv': 'NumericMatrixCSV',
                   'temp/expression_transcripts_expected_count.csv': 'NumericMatrixCSV'},
                  dataset_description=
"""
# PUBLIC RNA

* Version 1-2 Public 18Q1*

Original source (`CCLE_DepMap_18Q1_RNAseq_reads_20180214.gct`, `CCLE_DepMap_18Q1_RNAseq_RPKM_20180214.gct`) downloaded from portals.broadinstitute.org/ccle
RPKM file is log2(RPKM) with a "noisy floor" around -3 (-3 + N(0, 0.1))

* Version 3-5 Public 18Q2*

gene expression data (RNAseq for1,076 cell lines, including data for 28 newly released cell lines)

original source: (`/xchip/ccle_dist/public/DepMap_18Q2/CCLE_DepMap_18Q2_RNAseq_RPKM_20180502.gct`, `/xchip/ccle_dist/public/DepMap_18Q2/CCLE_DepMap_18Q2_RNAseq_reads_20180502.gct`)
* Version 6-7 Public 18Q3*

gene expression data (80 newly released cell lines)

Original data source:

`/xchip/ccle_dist/public/DepMap_18Q3/CCLE_DepMap_18q3_RNAseq_reads_20180718.gct`
`/xchip/ccle_dist/public/DepMap_18Q3/CCLE_DepMap_18q3_RNAseq_RPKM_20180718.gct`

RPKM data are log2-transformed with a noisy floor at -3 (-3 + N(0, 0.1)). Reads file unaltered aside from formatting row / column names.

The ProteinCoding datasets contain just protein coding genes and are based on protein coding annotations from https://www.genenames.org/cgi-bin/download (locus_group == "protein-coding gene")

Rows: are Broad (Arxspan) cell line IDs. Mapping between Broad IDs and CCLE IDs can be done using a R or python package

To install R implementation: options(repos = c("https://iwww.broadinstitute.org/~datasci/R-packages", "https://cran.cnr.berkeley.edu")) install.packages('celllinemapr')

To install python implementation: pip install https://intranet.broadinstitute.org/~datasci/python-packages/cell_line_mapper-0.1.9.tar.gz)

Columns: In the complete RPKM and read datasets column names are HGNC_symbol (Ensembl_ID), while in the ProteinCoding RPKM and read datasets column names are HGNC_symbol (Entrez_ID)

version 7 removes duplicate gene names from the protein coding datasets

* Version 8-9 Public 18Q4*

_ 18Q4 transcript level data is found in version 11. (In versions 8-9 transcript data contains only gene level not transcript level data)

changing to TPM expression

Original data source:

`/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_DepMap_18Q4_rsem_genes_tpm_20181029.txt`
`/xchip/ccle_dist/broad_only/CMAG/expression/CCLE_DepMap_18Q4_rsem_transcripts_tpm_20181029.gct`
`/xchip/ccle_dist/public/DepMap_18Q4/CCLE_DepMap_18q4_RNAseq_reads_20181029.gct`

`/xchip/ccle_dist/public/DepMap_18Q4/CCLE_DepMap_18q4_RNAseq_RPKM_20181029.gct`

TPM data is subsetted to just public cell lines using the cell line found in the RPKM dataset.

TPM data is the primary expression data now. It is log2-transformed with a pseudo count of 1 added

RPKM data are log2-transformed with a pseudo count of 1 added. RPKM values are no longer thresholded.

Reads/transcript files unaltered aside from formatting row / column names.

The ProteinCoding datasets contain just protein coding genes and are based on protein coding annotations from https://www.genenames.org/cgi-bin/download (locus_group == "protein-coding gene")

Rows: are DepMap (Arxspan) cell line IDs

Columns: In the complete TPM, TPM transcripts, RPKM and read datasets column names are HGNC_symbol (Ensembl_ID), while in the ProteinCoding TPM and read datasets column names are HGNC_symbol (Entrez_ID)

* Version 10-12 Public 19Q1*

version 12 contains the correct data for 19Q1

version 11 contains the correct transcript level data for 19Q1 and 18Q4

* Version 13 Public 19Q2*

* Version 14 Public 19Q3*

* Version 15 Public 19Q4*
Adding 93 new cell lines - Blacklisted - IBM - DMC
Some cells lines have been removed because they:

 - had too many 0 values = ["ACH-001388" "ACH-001577" "ACH-001767" "ACH-002463"] 

Some cells lines have been flagged as:

 - None
 
* Version 16 Public 19Q4*
removing unauthorized cell lines

* Version 17 Public 20Q1*
adding 6 new lines

* Version 18 Public 20Q1
Unknown reupdate

* Version 19 Public 20Q2
adding  new cell lines

* Version 20 Public 20Q2
unknown reupload

* Version 21 Public 20Q2
removing some missed blacklisted lines

* Version 22 Public 20Q3
nothing different from  20Q2. no new cell lines added

* Version 23 Public 20Q3
Some wrong annotations in the blacklists

* Version 24 Public 20Q3
Updated annotations in the blacklists

* Version 25 Public 20Q3
Updated the dmc list

* Version 25 Public 20Q3
Readding a cell line that was already in public before.

* Version 26 Public 20Q4
new samples, adding a new QC method that removed 10 samples. fully adding the reprocessed samples, removing some wrong genes, adding falsely removed genes (that are still expressed)

* Version 26 Public 20Q4
adding log transform

* Version 27 Public 20Q4
reverting gene names

* Version 28 Public 20Q4
removing more lines

data is hg38 aligned

read count data is created using RSEM, which, when a read maps to multiple places, splits the counts between genes, with the weight based on the likelihood that it came from one gene or the other, so counts data may not be integers

TPM data is log2-transformed with a pseudo count of 1 added. log2(X+1)

Reads/transcript files unaltered aside from formatting row / column names.

The ProteinCoding datasets contain just protein coding genes and are based on protein coding annotations from https://www.genenames.org/cgi-bin/download (locus_group == "protein-coding gene")

reverting gene names
Rows: are DepMap cell line IDs

Columns: In the complete TPM and read counts datasets column names are HGNC_symbol (Ensembl_ID), while in the ProteinCoding TPM dataset column names are HGNC_symbol (Entrez_ID). In the TPM transcript dataset column names are HGNC_symbol (Transcript_ID) - the HGNC symbols are not unique, use the transcript IDs for unique identifiers.


NEW LINES:
"""+str(newlines)+"""

REMOVED FOR QC REASONS:
"""+str(rename)+"""

BLACKLIST:
"""+str(embargo))

Uploading expression_genes_expected_count...
hitting https://cds.team/taiga/api/datafile/f185c92e3504461ea65f139ccf545935
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to deter

	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to d

	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploadin

	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploadin

	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 1001)
	 Scanning through file to determine size (line 10

	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploadin


Dataset version with id e964dfa4d9414ee2b211977b2e2877db created. You can access to this dataset version directly with this url: https://cds.team/taiga/dataset_version/e964dfa4d9414ee2b211977b2e2877db


'e964dfa4d9414ee2b211977b2e2877db'

In [150]:
AddToVirtual(virtual_public, "depmap-rnaseq-expression-data-ccd0", files=[('CCLE_expression_full', 'expression_genes_expected_count'), 
('CCLE_RNAseq_transcripts', 'expression_transcripts_tpm'),
('CCLE_RNAseq_reads', 'expression_genes_tpm'),
('CCLE_expression', 'expression_proteincoding_genes_tpm'), ('expression_proteincoding_genes_expected_count', 'expression_proteincoding_genes_expected_count'),('expression_transcripts_expected_count', 'expression_transcripts_expected_count')])

[('CCLE_expression_full', 'depmap-rnaseq-expression-data-ccd0.31/expression_genes_expected_count'), ('CCLE_RNAseq_transcripts', 'depmap-rnaseq-expression-data-ccd0.31/expression_transcripts_tpm'), ('CCLE_RNAseq_reads', 'depmap-rnaseq-expression-data-ccd0.31/expression_genes_tpm'), ('CCLE_expression', 'depmap-rnaseq-expression-data-ccd0.31/expression_proteincoding_genes_tpm'), ('expression_proteincoding_genes_expected_count', 'depmap-rnaseq-expression-data-ccd0.31/expression_proteincoding_genes_expected_count'), ('expression_transcripts_expected_count', 'depmap-rnaseq-expression-data-ccd0.31/expression_transcripts_expected_count')]
hitting https://cds.team/taiga/api/datafile/2df40e32ef21449dbff869bce1d1d918
hitting https://cds.team/taiga/api/datafile/2df40e32ef21449dbff869bce1d1d918
hitting https://cds.team/taiga/api/datafile/2df40e32ef21449dbff869bce1d1d918
hitting https://cds.team/taiga/api/datafile/2df40e32ef21449dbff869bce1d1d918
hitting https://cds.team/taiga/api/datafile/2df40e32e

# Fusions

In [55]:
fusions=pd.read_csv('temp/unfiltered_fusions_'+release+'.csv')
filtered=pd.read_csv('temp/fusions_'+release+'.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'temp/unfiltered_fusions_20Q4.csv'

## Internal

In [56]:
print('not present')
removed = set(previnternal) - set(fusions.DepMap_ID)
print(removed)
print('removed for QC reasons')
print(set(rename.keys()))
print('removed')
removed = set(internal_rna) - set(fusions.DepMap_ID)
print(removed - set(rename.keys()))
missing = set(internal) - set(fusions.DepMap_ID)
blacklist = set(fusions.DepMap_ID) - (previnternal | set(internal))
print('missing')
print(missing)
newlines = set(internal) 
print('blacklist')
print(len(blacklist), blacklist)

not present
{'ACH-002196', 'ACH-002205', 'ACH-002289', 'ACH-001064', 'ACH-001641', 'ACH-002127', 'ACH-002344', 'ACH-002384', 'ACH-002383', 'ACH-002157', 'ACH-001037', 'ACH-001143', 'ACH-002216', 'ACH-001680', 'ACH-000727', 'ACH-002351', 'ACH-002207', 'ACH-002114', 'ACH-002300', 'ACH-000795', 'ACH-002377', 'ACH-002204', 'ACH-002360', 'ACH-002292', 'ACH-002058', 'ACH-002104', 'ACH-002314', 'ACH-002183', 'ACH-002213', 'ACH-002168', 'ACH-002382', 'ACH-002260', 'ACH-001208', 'ACH-002095', 'ACH-002200', 'ACH-002269', 'ACH-002387', 'ACH-002375', 'ACH-002129', 'ACH-001199', 'ACH-002312', 'ACH-000561', 'ACH-002282', 'ACH-002395', 'ACH-001338', 'ACH-002232', 'ACH-002267', 'ACH-002190', 'ACH-001502', 'ACH-001093', 'ACH-002336', 'ACH-002281', 'ACH-001137', 'ACH-002199', 'ACH-002208', 'ACH-002185', 'ACH-002276', 'ACH-002374', 'ACH-002396', 'ACH-001383', 'ACH-002237', 'ACH-001151', 'ACH-002119', 'ACH-002136', 'ACH-002359', 'ACH-001182', 'ACH-002399', 'ACH-002106', 'ACH-001187', 'ACH-002394', 'ACH-00

In [57]:
## removing first blacklisted, then embargoed, to create two datasets
print(len(fusions))
fusions = fusions[~fusions.DepMap_ID.isin(blacklist)]
print(len(fusions))
fusions.to_csv('temp/fusions.csv', index=False)
print(len(filtered))
filtered= filtered[~filtered.DepMap_ID.isin(blacklist)]
print(len(filtered))
filtered.to_csv('temp/filtered_fusions.csv', index=False)

357747
356696
44411
44066


In [50]:
tc.update_dataset(dataset_permaname="gene-fusions-8b7a",
                 upload_file_path_dict={
                     'temp/fusions.csv': 'TableCSV',
                     'temp/filtered_fusions.csv': 'TableCSV'},
                  dataset_description=
"""
# Internal Fusions

Description: Gene fusions derived from RNAseq data.

Rows: cell lines

Original Raw Data: Generated by Mahmoud Ghandi on April 25, 2017. Can be found at xchip_ccle_dist/broad_only/unpublished_Novartis_data/RNAseq/fusions.txt

Version 3: added a column containing the Broad_ID

* Version 4-5 Internal 19Q1*

version 5 contains the correct data for 19Q1

Description: Gene fusions derived from RNAseq data.

Rows: cell lines

* Version 6 Internal 19Q2*

* Version 7*

Josh D added "common_fusion_matrix".

Binary matrix of the most common gene fusions (those where the two involved genes are fused in at least 5 cell lines) with no additional filtering. Use at your own risk.

Description: Gene fusions derived from RNAseq data.

Rows: cell lines

* Version 8 Internal 19Q3*

* Version 9 Internal 19Q4*

* Version 10 Internal 20Q1*
Adding new lines

* Version 11 Internal 20Q1*
Adding a file

* Version 12 Internal 20Q2*
Adding 50 new lines

* Version 13 Internal 20Q3*
nothing different from  20Q2. no new cell lines added

* Version 14 Internal 20Q3*
issues with the blacklists

* Version 15 Internal 20Q4*
adding new lines, new fusion filtering, debugged sample filtering (should recover the same sample as in expression dataset).

* Version 16 Internal 20Q4*
renaming fusions

Description: Gene fusions derived from RNAseq data.

Rows: cell lines, IDs contained in the column DepMap_ID

Unfiltered data contains all output fusions, while the filtered data uses the filters suggested by the star fusion docs. These filters are:
- FFPM > 0.1 -  a cutoff of 0.1 means&nbsp;at least 1 fusion-supporting RNAseq fragment per 10M total reads
- Remove known false positives, such as GTEx recurrent fusions and certain paralogs
- Genes that are next to each other
- Fusions with mitochondrial breakpoints
- Removing fusion involving mitochondrial chromosomes or HLA genes
- Removed common false positive fusions (red herring annotations as described in the STAR-Fusion docs)
- Recurrent fusions observed in CCLE across cell lines (in 10% or more of the samples)
- Removed fusions where SpliceType='INCL_NON_REF_SPLICE' and LargeAnchorSupport='NO_LDAS' and FFPM < 0.1
- FFPM < 0.05

NEW LINES:
"""+str(newlines)+"""

REMOVED FOR QC REASONS:
"""+str(rename)+"""

BLACKLIST:
"""+str(blacklist))

Uploading fusions...
hitting https://cds.team/taiga/api/datafile/5a3c17c40fc44cf6946175ded53da89d
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3

	 Done: fusions properly converted and uploaded
Uploading filtered_fusions...
hitting https://cds.team/taiga/api/datafile/5a3c17c40fc44cf6946175ded53da89d
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Uploading to S3


'3fac9d6b279142438588a794f0da440a'

In [52]:
AddToVirtual('depmap-a0ab', "gene-fusions-8b7a", files=[('CCLE_fusions_unfiltered', 'fusions'),('CCLE_fusions', 'filtered_fusions')])

AddToVirtual(virtual_internal, "gene-fusions-8b7a", files=[('CCLE_fusions_unfiltered', 'fusions'),('CCLE_fusions', 'filtered_fusions')])

[('CCLE_fusions_unfiltered', 'gene-fusions-8b7a.16/fusions'), ('CCLE_fusions', 'gene-fusions-8b7a.16/filtered_fusions')]
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/api/datafile/e9e5878dd5db4d1585d0184db1854b46
hitting https://cds.team/taiga/a

## IBM

In [47]:
print('not present')
removed = set(prevdmc) - set(fusions.DepMap_ID)
print(removed)
print('removed for QC reasons')
print(set(rename.keys()))
print('removed')
removed = set(dmc_rna) - set(fusions.DepMap_ID)
print(removed - set(rename.keys()))
missing = set(ibm) - set(fusions.DepMap_ID)
embargo_ibm = set(fusions.DepMap_ID) - (prevdmc | set(ibm))
print('missing')
print(missing)
newlines = set(ibm) 
print('embargo_ibm')
print(len(embargo_ibm), embargo_ibm)

not present
{'ACH-002121', 'ACH-002384', 'ACH-002168', 'ACH-002337', 'ACH-002139', 'ACH-002231', 'ACH-002148', 'ACH-000561', 'ACH-002193', 'ACH-002400', 'ACH-002091', 'ACH-002236', 'ACH-001092', 'ACH-002360', 'ACH-002123', 'ACH-002260', 'ACH-001767', 'ACH-002052', 'ACH-002051', 'ACH-002093', 'ACH-002096', 'ACH-002274', 'ACH-002129', 'ACH-002122', 'ACH-001037', 'ACH-002314', 'ACH-001127', 'ACH-002294', 'ACH-000426', 'ACH-002124', 'ACH-002275', 'ACH-002105', 'ACH-001131', 'ACH-002194', 'ACH-002097', 'ACH-002264', 'ACH-002208', 'ACH-001064', 'ACH-002313', 'ACH-002133', 'ACH-002358', 'ACH-002154', 'ACH-002396', 'ACH-002181', 'ACH-001047', 'ACH-002229', 'ACH-002307', 'ACH-002374', 'ACH-001639', 'ACH-002226', 'ACH-002190', 'ACH-001045', 'ACH-002151', 'ACH-002182', 'ACH-002201', 'ACH-002270', 'ACH-001118', 'ACH-002204', 'ACH-002178', 'ACH-002210', 'ACH-002271', 'ACH-002283', 'ACH-001225', 'ACH-002369', 'ACH-002285', 'ACH-001224', 'ACH-002180', 'ACH-002290', 'ACH-001359', 'ACH-002394', 'ACH-00

In [48]:
## removing first blacklisted, then embargoed, to create two datasets
print(len(fusions))
a = fusions[~fusions.DepMap_ID.isin(embargo_ibm | prevdmc)]
print(len(a))
a.to_csv('temp/fusions.csv', index=False)
print(len(filtered))
a= filtered[~filtered.DepMap_ID.isin(embargo_ibm | prevdmc)]
print(len(a))
a.to_csv('temp/unfiltered_fusions.csv', index=False)

356696
0
44066
0


In [None]:
tc.update_dataset(dataset_permaname="gene-fusions-375f",
                 upload_file_path_dict={
                     'temp/fusions.csv': 'TableCSV',
                     'temp/unfiltered_fusions.csv': 'TableCSV'},
                  dataset_description=
"""
# IBM Fusions

* Version 1*
first ibm version, adding new lines, new fusion filtering, debugged sample filtering (should recover the same sample as in expression dataset).


Rows: cell lines, IDs contained in the column DepMap_ID

Unfiltered data contains all output fusions, while the filtered data uses the filters suggested by the star fusion docs. These filters are:
- FFPM > 0.1 -  a cutoff of 0.1 means&nbsp;at least 1 fusion-supporting RNAseq fragment per 10M total reads
- Remove known false positives, such as GTEx recurrent fusions and certain paralogs
- Genes that are next to each other
- Fusions with mitochondrial breakpoints
- Removing fusion involving mitochondrial chromosomes or HLA genes
- Removed common false positive fusions (red herring annotations as described in the STAR-Fusion docs)
- Recurrent fusions observed in CCLE across cell lines (in 10% or more of the samples)
- Removed fusions where SpliceType='INCL_NON_REF_SPLICE' and LargeAnchorSupport='NO_LDAS' and FFPM < 0.1
- FFPM < 0.05

NEW LINES:
"""+str(newlines)+"""

REMOVED FOR QC REASONS:
"""+str(rename)+"""

BLACKLIST:
"""+str(blacklist))

In [None]:
AddToVirtual(virtual_ibm, "gene-fusions-375f", files=[('CCLE_fusions', 'filtered_fusions_'+release),('CCLE_fusions_unfiltered', 'unfiltered_fusions_'+release)])

## DMC

In [58]:
print('not present')
removed = set(prevdmc) - set(fusions.DepMap_ID)
print(removed)
print('removed for QC reasons')
print(set(rename.keys()))
print('removed')
removed = set(dmc_rna) - set(fusions.DepMap_ID)
print(removed - set(rename.keys()))
missing = set(dmc) - set(fusions.DepMap_ID)
dmc_embargo = set(fusions.DepMap_ID) - (prevdmc | set(dmc))
print('missing')
print(missing)
newlines = set(dmc) 
print('dmc_embargo')
print(len(dmc_embargo), dmc_embargo)

not present
{'ACH-002196', 'ACH-002205', 'ACH-002289', 'ACH-001064', 'ACH-001641', 'ACH-002127', 'ACH-002344', 'ACH-002384', 'ACH-002383', 'ACH-002157', 'ACH-001037', 'ACH-001143', 'ACH-002216', 'ACH-001680', 'ACH-000727', 'ACH-002351', 'ACH-002207', 'ACH-002114', 'ACH-002300', 'ACH-000795', 'ACH-002377', 'ACH-002204', 'ACH-002360', 'ACH-002292', 'ACH-002058', 'ACH-002104', 'ACH-002314', 'ACH-002183', 'ACH-002213', 'ACH-002168', 'ACH-002382', 'ACH-002260', 'ACH-001208', 'ACH-002095', 'ACH-002200', 'ACH-002269', 'ACH-002387', 'ACH-002375', 'ACH-002129', 'ACH-001199', 'ACH-002312', 'ACH-000561', 'ACH-002282', 'ACH-002395', 'ACH-001338', 'ACH-002232', 'ACH-002267', 'ACH-002190', 'ACH-001502', 'ACH-001093', 'ACH-002336', 'ACH-002281', 'ACH-001137', 'ACH-002199', 'ACH-002208', 'ACH-002185', 'ACH-002276', 'ACH-002374', 'ACH-002396', 'ACH-001383', 'ACH-002237', 'ACH-001151', 'ACH-002119', 'ACH-002136', 'ACH-002359', 'ACH-001182', 'ACH-002399', 'ACH-002106', 'ACH-001187', 'ACH-002394', 'ACH-00

In [59]:
## removing first blacklisted, then embargoed, to create two datasets
print(len(fusions))
fusions = fusions[~fusions.DepMap_ID.isin(dmc_embargo)]
print(len(fusions))
fusions.to_csv('temp/fusions.csv', index=False)
print(len(filtered))
filtered= filtered[~filtered.DepMap_ID.isin(dmc_embargo)]
print(len(filtered))
filtered.to_csv('temp/filtered_fusions.csv', index=False)

356696
356375
44066
44018


In [56]:
tc.update_dataset(dataset_permaname="gene-fusions-375f",
                 upload_file_path_dict={
                     'temp/fusions.csv': 'TableCSV',
                     'temp/filtered_fusions.csv': 'TableCSV'},
                  dataset_description=
"""
# DMC Fusions

* Version 1-2 DMC 19Q1*

version 2 contains the correct data for 19Q1

Description: Gene fusions derived from RNAseq data.

Rows: cell lines

Unfiltered data contains all output fusions, while the filtered data uses the filters suggested by the star fusion docs. These filters are:
- FFPM > 0.1 -  a cutoff of 0.1 means&nbsp;at least 1 fusion-supporting RNAseq fragment per 10M total reads
- Remove known false positives, such as GTEx recurrent fusions and certain paralogs
- Genes that are next to each other
- Fusions with mitochondrial breakpoints

* Version 3 DMC 19Q2*

* Version 4 DMC 19Q3*

* Version 5 DMC 19Q4*

* Version 6 DMC 20Q1*

* Version 7 DMC 20Q2*
adding 50 new lines

* Version 7 DMC 20Q2*
Unknown change

* Version 9 DMC 20Q3*
nothing different from  20Q2. no new cell lines added

* Version 10 DMC 20Q3*
updating the blacklists

* Version 11 DMC 20Q3*
issues with the blacklists resolved
Description: Gene fusions derived from RNAseq data.

* Version 12 DMC 20Q3*
update in dmc list

* Version 13 DMC 20Q4*
adding new lines, new fusion filtering, debugged sample filtering (should recover the same sample as in expression dataset).

* Version 14 DMC 20Q4*
renaming fusios


Rows: cell lines, IDs contained in the column DepMap_ID
LeftGene and RightGene separated by an ampersand ("&").

Unfiltered data contains all output fusions, while the filtered data uses the filters suggested by the star fusion docs. These filters are:
- FFPM > 0.1 -  a cutoff of 0.1 means&nbsp;at least 1 fusion-supporting RNAseq fragment per 10M total reads
- Remove known false positives, such as GTEx recurrent fusions and certain paralogs
- Genes that are next to each other
- Fusions with mitochondrial breakpoints
- Removing fusion involving mitochondrial chromosomes or HLA genes
- Removed common false positive fusions (red herring annotations as described in the STAR-Fusion docs)
- Recurrent fusions observed in CCLE across cell lines (in 10% or more the samples)
- Removed fusions where SpliceType='INCL_NON_REF_SPLICE' and LargeAnchorSupport='NO_LDAS' and FFPM < 0.1
- FFPM < 0.05

NEW LINES:
"""+str(newlines)+"""

REMOVED FOR QC REASONS:
"""+str(rename)+"""

EMBARGO:
"""+str(dmc_embargo))

Uploading fusions...
hitting https://cds.team/taiga/api/datafile/2231566eb1634a3ba2a9349912f3fed7
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3

	 Done: fusions properly converted and uploaded
Uploading filtered_fusions...
hitting https://cds.team/taiga/api/datafile/2231566eb1634a3ba2a9349912f3fed7
Conversion and upload...:
	 Downloading the file fro

'9b521d5f1d644899af1451e2b3fe453f'

In [57]:
AddToVirtual(virtual_dmc, "gene-fusions-375f", files=[('CCLE_fusions_unfiltered', 'fusions'),('CCLE_fusions', 'filtered_fusions')])

[('CCLE_fusions_unfiltered', 'gene-fusions-375f.14/fusions'), ('CCLE_fusions', 'gene-fusions-375f.14/filtered_fusions')]
hitting https://cds.team/taiga/api/datafile/e2a36844d9b248ecb82600efa0988594
hitting https://cds.team/taiga/api/datafile/e2a36844d9b248ecb82600efa0988594
hitting https://cds.team/taiga/api/datafile/e2a36844d9b248ecb82600efa0988594
hitting https://cds.team/taiga/api/datafile/e2a36844d9b248ecb82600efa0988594
hitting https://cds.team/taiga/api/datafile/e2a36844d9b248ecb82600efa0988594
hitting https://cds.team/taiga/api/datafile/e2a36844d9b248ecb82600efa0988594
hitting https://cds.team/taiga/api/datafile/e2a36844d9b248ecb82600efa0988594
hitting https://cds.team/taiga/api/datafile/e2a36844d9b248ecb82600efa0988594
hitting https://cds.team/taiga/api/datafile/e2a36844d9b248ecb82600efa0988594
hitting https://cds.team/taiga/api/datafile/e2a36844d9b248ecb82600efa0988594
hitting https://cds.team/taiga/api/datafile/e2a36844d9b248ecb82600efa0988594
hitting https://cds.team/taiga/a

## Public

In [60]:
print('not present')
removed = set(prevpublic) - set(genes_tpm.index)
print(removed)
print('removed for QC reasons')
print(set(rename.keys()))
print('removed')
removed = set(public_rna) - set(genes_tpm.index)
print(removed - set(rename.keys()))
missing = set(public) - set(genes_tpm.index)
blacklist = set(genes_tpm.index) - (prevpublic | set(public))
print('missing')
print(missing)
newlines = set(public) 
print('embargo')
print(len(embargo), embargo)
print(newlines)

not present
{'ACH-002196', 'ACH-002205', 'ACH-002289', 'ACH-001064', 'ACH-001641', 'ACH-002127', 'ACH-002344', 'ACH-002384', 'ACH-002383', 'ACH-002157', 'ACH-001037', 'ACH-001143', 'ACH-002216', 'ACH-001680', 'ACH-000727', 'ACH-002351', 'ACH-002207', 'ACH-002114', 'ACH-002300', 'ACH-000795', 'ACH-002377', 'ACH-002204', 'ACH-002360', 'ACH-002292', 'ACH-002058', 'ACH-002104', 'ACH-002314', 'ACH-002183', 'ACH-002213', 'ACH-002168', 'ACH-002382', 'ACH-002260', 'ACH-001208', 'ACH-002095', 'ACH-002200', 'ACH-002269', 'ACH-002387', 'ACH-002375', 'ACH-002129', 'ACH-001199', 'ACH-002312', 'ACH-002282', 'ACH-002395', 'ACH-001338', 'ACH-002232', 'ACH-002267', 'ACH-002190', 'ACH-001093', 'ACH-002336', 'ACH-002281', 'ACH-001137', 'ACH-002199', 'ACH-002208', 'ACH-002185', 'ACH-002276', 'ACH-002374', 'ACH-002396', 'ACH-001383', 'ACH-002237', 'ACH-002119', 'ACH-002136', 'ACH-002359', 'ACH-001182', 'ACH-002106', 'ACH-001187', 'ACH-002394', 'ACH-002180', 'ACH-001045', 'ACH-002291', 'ACH-002230', 'ACH-00

In [61]:
## removing first blacklisted, then embargoed, to create two datasets
print(len(fusions))
fusions = fusions[~fusions.DepMap_ID.isin(embargo)]
print(len(fusions))
fusions.to_csv('temp/fusions.csv', index=False)
print(len(filtered))
filtered= filtered[~filtered.DepMap_ID.isin(embargo)]
print(len(filtered))
filtered.to_csv('temp/filtered_fusions.csv', index=False)

356375
352748
44018
43394


In [62]:
tc.update_dataset(dataset_permaname="gene-fusions-6212",
                 upload_file_path_dict={
                     'temp/fusions.csv': 'TableCSV',
                     'temp/filtered_fusions.csv': 'TableCSV'},
                  dataset_description=
"""
# PUBLIC Fusions

* Version 1 Public 2017 data*

Description: Gene fusions derived from RNAseq data.

Rows: cell lines, ID contained in the column Broad_ID

Original Raw Data: Generated by Mahmoud Ghandi on April 25, 2017. Can be found at xchip_ccle_dist/broad_only/unpublished_Novartis_data/RNAseq/fusions.txt

* Version 2-3 Public 19Q1*

version 3 contains the correct data for 19Q1

* Version 4-6 Public 19Q2*

in version 5 formatting of the columns is improved

* Version 7 Public 19Q3*

* Version 8 Public 19Q4*

* Version 9 Public 20Q1*

* Version 10 Public 20Q2*
adding 50 new lines

* Version 11 Public 20Q3*
nothing different from  20Q2. no new cell lines added

* Version 12 Public 20Q3*
updated blacklists

* Version 13 Public 20Q3*
issues with the blacklists

* Version 14 Public 20Q3*
updating the dmc list 

* Version 15 Public 20Q3*
re adding two missing, already released samples

* Version 16 Public 20Q4*
adding new lines, new fusion filtering, debugged sample filtering (should recover the same sample as in expression dataset).

* Version 17 Public 20Q4*
renaming fusions

* Version 18 Public 20Q4*
removing lines

Description: Gene fusions derived from RNAseq data.

Rows: cell lines, IDs contained in the column DepMap_ID
LeftGene and RightGene separated by an ampersand ("&").

Unfiltered data contains all output fusions, while the filtered data uses the filters suggested by the star fusion docs. These filters are:
- FFPM > 0.1 -  a cutoff of 0.1 means&nbsp;at least 1 fusion-supporting RNAseq fragment per 10M total reads
- Remove known false positives, such as GTEx recurrent fusions and certain paralogs
- Genes that are next to each other
- Fusions with mitochondrial breakpoints
- Removing fusion involving mitochondrial chromosomes or HLA genes
- Removed common false positive fusions (red herring annotations as described in the STAR-Fusion docs)
- Recurrent fusions observed in CCLE across cell lines (in 10% or more of the samples)
- Removed fusions where SpliceType='INCL_NON_REF_SPLICE' and LargeAnchorSupport='NO_LDAS' and FFPM < 0.1
- FFPM < 0.05

NEW LINES:
"""+str(newlines)+"""

REMOVED FOR QC REASONS:
"""+str(rename)+"""

EMBARGO:
"""+str(embargo))

Uploading fusions...
hitting https://cds.team/taiga/api/datafile/1ee63cfe016347a68cecce9a96529cb9
Conversion and upload...:
	 Waiting in the task queue
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Downloading the file from S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3
	 Uploading to S3

	 Done: fusions properly converted and uploaded
Uploading filtered_fusions...
hitting https://cds.team/taiga/api/datafile/1ee63cfe016347a68cecce9a96529cb9
Conversion and upload...:
	 Downloading the file from S3
	 Downloading the file from S3
	 Uploading to S3
	 U

'53052a872f974ab98820c6b0a2367972'

In [63]:
AddToVirtual(virtual_public, "gene-fusions-6212", files=[('CCLE_fusions_unfiltered', 'fusions'),('CCLE_fusions', 'filtered_fusions')])

[('CCLE_fusions_unfiltered', 'gene-fusions-6212.18/fusions'), ('CCLE_fusions', 'gene-fusions-6212.18/filtered_fusions')]
hitting https://cds.team/taiga/api/datafile/6300de9da8644133a9158a143914330d
hitting https://cds.team/taiga/api/datafile/6300de9da8644133a9158a143914330d
hitting https://cds.team/taiga/api/datafile/6300de9da8644133a9158a143914330d
hitting https://cds.team/taiga/api/datafile/6300de9da8644133a9158a143914330d
hitting https://cds.team/taiga/api/datafile/6300de9da8644133a9158a143914330d
hitting https://cds.team/taiga/api/datafile/6300de9da8644133a9158a143914330d
hitting https://cds.team/taiga/api/datafile/6300de9da8644133a9158a143914330d
hitting https://cds.team/taiga/api/datafile/6300de9da8644133a9158a143914330d
hitting https://cds.team/taiga/api/datafile/6300de9da8644133a9158a143914330d
hitting https://cds.team/taiga/api/datafile/6300de9da8644133a9158a143914330d
hitting https://cds.team/taiga/api/datafile/6300de9da8644133a9158a143914330d
hitting https://cds.team/taiga/a