# load tons of datasets (~60,000 RNAseq samples)

In [3]:
from taigapy import TaigaClient
tc = TaigaClient()

from depmapomics import tracker as track
from depmapomics import expressions

from genepy.utils import helper as h

import dalmatian as dm
from gsheets import Sheets
import pandas as pd

from anndata import AnnData, read_h5ad

MY_ID = '~/.client_secret.json'
MYSTORAGE_ID = "~/.storage.json"

Sheets.from_files(MY_ID, MYSTORAGE_ID)
#autoreload
%load_ext autoreload
%autoreload 2

## CCLE + TCGA

In [4]:
# load from taiga public (figshare link)
# load internal expression,
# latest version can be found at https://depmap.org/portal/download/
# can also be loaded like so pd.read_csv('gs://ccle_default_params/celligner_ex/CCLE_expression.csv.gz', index_col=0)
CCLE_expression = tc.get(name='internal-21q3-fe4c',
                         file='CCLE_expression_full')  #40,000x1,500


# load  TCGA expression
# this dataset was generated from  ,using this script: 
# caan be found here: pd.read_csv('gs://ccle_default_params/celligner_ex/TCGA_expression.csv.gz', index_col=0)
TCGA_expression = tc.get(name='celligner-input-9827',
                       file='tumor_expression') # 40,000x13,000

No dataset version provided. Using version 16.
No dataset version provided. Using version 1.


In [6]:
# loading annotations
CCLE_annotation = track.getTracker() # the function uses pygsheets to load this: REFSHEET_URL=https://docs.google.com/spreadsheets/d/1Pgb5fIClGnErEqzxpU7qqX6ULpGTDjvzWwDN8XUJKIY
# Sheets.from_files(MY_ID, MYSTORAGE_ID).get(REFSHEET_URL).sheets[0].to_frame(index_col=0)
# you can also get it from pd.read_csv('gs://ccle_default_params/celligner_ex/CCLE_annotation.csv.gz', index_col=0)


# can be loaded from 
# pd.read_csv('gs://ccle_default_params/celligner_ex/TCGA_annotation.csv.gz', index_col=0)
TCGA_annotation = tc.get(name='celligner-input-9827',
                         file='tumor_annotations') # generated manually 

No dataset version provided. Using version 1.


In [7]:
CCLE_annotation.iloc[0, :-25]

arxspan_id                                                ACH-001188
version                                                            1
sm_id                                                            NaN
PDO                                                              NaN
datatype                                                         wgs
                                         ...                        
atcclink                                                    CRL-2266
dsmzlink                                                     ACC-209
sequencing_date                                            13-9-2020
release_date                                                     NaN
bam_qc             ['gs://fc-secure-fd4a7e9f-cb15-423c-9805-04995...
Name: CDS-rIh3Gy, Length: 62, dtype: object

In [8]:
len(CCLE_annotation[(CCLE_annotation.blacklist==0)&(CCLE_annotation.version>1)&(CCLE_annotation.datatype=="rna")]) 
# we have .. replicates in CCLE

72

In [11]:
rename = {
'Acquisition Site': "collection_site",
'Actual Seq Technology': 'sequencer',
'Age At Acquisition (Years)': "age",
'age_at_dx':"age", 
'Age': "age",
'Aggregated': "aggregated",
'Assay title': 'assay',
'arxspan_id': "tissue_id",
'biopsy_tissue': "collection_site",
'Case ID': 'participant_id', 
'CCLF_ID': 'sample_id',
'Clinical Tumor Diagnosis': 'disease_type',
'Collaborator Participant ID': "participant_id",
'collection': 'collection_site', 
'Contamination % (First Agg)': 'contamination', 
'Contamination %': "contamination",
'Culture Medium': "media_type",
'Days to First Agg': 'exp_date', 
'Diagnosis': 'disease_type',
'disease': "disease_type", 
'Expansion Status': "expansion",
'External ID for BAM': "tissue_id",
'gender': "sex",
'Gender': "sex",
'Inferred_Ethnicity': "ethnicity",
'lineage': 'tissue_type',
'mediatype':"media_type",
'Original Material Type': 'history',
'Passage Number': "passage_number",
'Phase':'stage',
'Primary Disease': "disease_type",
'Primary Site': 'collection_site',
'primary_disease': "disease_type", 
'Product': 'preparation',
'Race': "ethnicity",
'RNA_Seq_cancertype': 'disease_type', 
'sample_source': 'participant_id',
'Sample_type': 'lineage', 
'sampleID': 'sample_id',
'Sequencing on Tissue or Cell model? (MT confirm)': 'cell_type', 
'Sex': "sex",
'tissue': "lineage",
'Participant ID': 'participant_id',
'RIN score from PAXgene tissue Aliquot': 'rin_score',
'RIN': 'rin_score',
'Cell types level 3': 'tissue_type',
'Age_bin':'age',
'cohort': "disease_type",
'site_donor_id': "participant_id",
'site_id': "tissue_id",
'Site.of.Specimen ': "collection_site",
'stripped_cell_line_name': "sample_id", 
'subtype': 'disease_subtype', 
'tc': 'tumor_purity',
'Therapy': "therapy",
'Tissue Site': "collection_site",
'Tissue Status': "metastatic/primary",
'Tumor Percent': "contamination",
'Tumor Type': "metastatic/primary",
'Tumor_type': 'cell_type', 
'type': "cell_type",
'Type': "cell_type",
'Sample Type': "cell_type",
}

In [12]:
CCLE_annotation = CCLE_annotation.rename(columns=rename)[['origin', 'sequencing_type', 'doublingt','hasebv'] + list(rename.values())]
CCLE_annotation['method']="bulk"
CCLE_annotation['cell_type']="historical_CL; 2D"
CCLE_annotation['sequencer']="Illumina Hiseq 2000"
CCLE_annotation['ends']="paired end"
CCLE_annotation['reference']="hg38"
CCLE_annotation['dataset']="ccle"

KeyError: "['origin', 'sequencer', 'aggregated', 'assay', 'contamination', 'exp_date', 'expansion', 'ethnicity', 'history', 'passage_number', 'stage', 'preparation', 'lineage', 'cell_type', 'rin_score', 'tumor_purity', 'therapy', 'metastatic/primary'] not in index"

In [None]:
TCGA_annotation.iloc[0]

In [None]:
TCGA_annotation = TCGA_annotation.rename(columns=rename)[rename.values()]
TCGA_annotation['method']="bulk"
TCGA_annotation['cell_type']="tumor"
TCGA_annotation['metastasis']="Primary"
TCGA_annotation['sequencer']= "Illumina Hiseq 2000"
TCGA_annotation['reference']= "hg38"
TCGA_annotation['ends']="paired end"
TCGA_annotation['dataset']="tcga"

In [None]:
pd.concat([CCLE_annotation, TCGA_annotation])

In [None]:
AnnData()

## CCLF

In [21]:
cclf_orga_info = tc.get(name='cclf-organoids-c23d', version=1, file='cclf_orga_info')
cclf_orga_info = cclf_orga_info.rename(columns=rename)[rename.values()]
cclf_orga_rnaseq = tc.get(name='cclf-organoids-c23d', version=1, file='cclf_orga_rnaseq').T # 40,000x24
cclf_orga_info.index = [i.split("_")[1] for i in cclf_orga_info.sample_id]
cclf_orga_rnaseq.index = [i.split('_')[0][:-1] for i in cclf_orga_rnaseq.index]

NameError: name 'rename' is not defined

In [536]:
cclf_orga_info

Unnamed: 0,Genomic_Seq,RNA_Seq_cancertype,RNA_Seq_marker,CCLF_ID,Diagnosis,Subtype_patient,Tumor_type,Sample_type
0,Pending,BREAST_BASAL,,CCLF_cRCRF1048,Advanced breast cancer,"ER/PR+, HER2-",Metastatic,Pleural effusion
1,Pending,BREAST_BASAL,,CCLF_cRCRF1060,Invasive breast carcinoma,"ER+/PR-, Her2 IHC 1+",Metastatic,Pleural effusion
2,Pending,BREAST_BASAL,,CCLF_KL1337,Breast met to brain,Pending,Metastatic,Brain met resection
3,Pending,BREAST_BASAL/LUMINAL,,CCLF_KL1310,Breast met to brain,Pending,Metastatic,
4,Pending,BREAST_LUMINAL,ER+,CCLF_KL1271,Breast met to brain,Pending,Metastatic,Brain met resection
5,Pending,BREAST_LUMINAL,ER+,CCLF_KL1333,Invasive breast carcinoma,Pending,Metastatic,Cryopreserved tissue
6,Pending,BREAST_LUMINAL,ER+,CCLF_KL1351,Breast met to brain,Pending,Metastatic,Brain met resection
7,Pending,BREAST_BASAL,,CCLF_cRCRF1092,Advanced breast cancer,"ER/PR+, HER2 negative",Metastatic,Pleural effusion
8,Pending,BREAST_LUMINAL,HER2+,CCLF_KL1312,Breast met to brain,Pending,Metastatic,Brain met resection
9,Pending,FIBROBLAST,,CCLF_cRCRF1038,Advanced breast cancer,"ER/PR+, HER2-",Metastatic,Pleural effusion


In [None]:
cclf_orga_info['sequencer'] = "illumina Hiseq 2000"
cclf_orga_info['method'] = "bulk"
cclf_orga_info['cell_type'] = "organoid; cancer"
cclf_orga_info['reference']= "hg38"
cclf_orga_info['ends']="paired end"
cclf_orga_info['dataset'] = "cclf_orga"

In [480]:
#cclf other
cclfrna = dm.WorkspaceManager("nci-mimoun-bi-org/CCLF_RNA_2_0").get_samples() #40,000x160

cclfrna_anno = cclfrna[["external_id_rna"]].replace({'NA': np.nan})

cclfrna_annot = Sheets.from_files(MY_ID, MYSTORAGE_ID).get("https://docs.google.com/spreadsheets/d/1O9IV_v2vMbebkk_KDWu3LdKBQ16c8lThJKiiWvRxMUo").sheets[2].to_frame()

cclfrna_annot2 = Sheets.from_files(MY_ID, MYSTORAGE_ID).get("https://docs.google.com/spreadsheets/d/1O9IV_v2vMbebkk_KDWu3LdKBQ16c8lThJKiiWvRxMUo").sheets[3].to_frame()
# get it  from https://docs.google.com/spreadsheets/d/1O9IV_v2vMbebkk_KDWu3LdKBQ16c8lThJKiiWvRxMUo and get 

In [392]:
#files, failed, _, _, lowqual, _ = await expressions.postProcess("nci-mimoun-bi-org/CCLF_RNA_2_0", "all_samples", samplesetToLoad = "all_samples", compute_enrichment=False, trancriptLevelCols = ['rsem_transcripts_expected_count', 'rsem_transcripts_tpm'], geneLevelCols = ["rsem_genes_tpm", "rsem_genes_expected_count"], save_output="data/")
#cclfrna = files['rsem_genes_tpm']
cclfrna = pd.read_csv('data/expression_genes_tpm.csv', index_col=0)

In [481]:
ina = (cclfrna_annot2['Passage Number'].isna() | (cclfrna_annot2['Passage Number']=="Unknown")) & ~(cclfrna_annot2["Passage Number on Receipt"].isna() | (cclfrna_annot2["Passage Number on Receipt"]=="Unknown"))
cclfrna_annot2.loc[ina, "Passage Number"] = cclfrna_annot2.loc[ina, "Passage Number on Receipt"].values

ina = (cclfrna_annot2['Gender'].isna() | (cclfrna_annot2['Gender']=="Unknown")) & ~(cclfrna_annot2["Gender.1"].isna() | (cclfrna_annot2["Gender.1"]=="Unknown"))
cclfrna_annot2.loc[ina, "genderA"] = cclfrna_annot2.loc[ina, "Gender.1"].values

ina = (cclfrna_annot2['Gender'].isna() | (cclfrna_annot2['Gender']=="Unknown")) & ~(cclfrna_annot2["FP Gender"].isna() | (cclfrna_annot2["FP Gender"]=="Unknown"))
cclfrna_annot2.loc[ina, "Gender"] = cclfrna_annot2.loc[ina, "FP Gender"].values

ina = (cclfrna_annot2['Race'].isna() | (cclfrna_annot2['Race']=="Unknown")) & ~(cclfrna_annot2["Ethnicity"].isna() | (cclfrna_annot2["Ethnicity"]=="Unknown"))
cclfrna_annot2.loc[ina, "Race"] = cclfrna_annot2.loc[ina, "Ethnicity"].values


In [537]:
cclfrna_annot2.iloc[0]

Age                              NaN
Gender                           NaN
Sample Type                   Normal
Tumor Type                       NaN
Tissue Site                      NaN
Tumor Percent                    NaN
Primary Disease    Mouse Fibroblasts
Race                             NaN
Culture Medium                   NaN
Passage Number                   NaN
index                            NaN
Name: 3T3/J2_CL, dtype: object

In [538]:
cclfrna_annot.iloc[0]

Sequencing on Tissue or Cell model? (MT confirm)                                           Cell model
External ID for BAM                                                                         AB002T_BT
Product                                             Tru-Seq Strand Specific Large Insert RNA Seque...
Original Material Type                                                        Cells:Cell Line, Viable
Collaborator Participant ID                                                                     AB002
Aggregated                                                                                        0.0
Actual Seq Technology                                                                             NaN
Contamination %                                                                                  1.07
Age                                                                                           Unknown
Gender                                                                            

In [482]:
cclfrna_annot2 = cclfrna_annot2.set_index('Collaborator Sample ID')[["Age",
"Gender",
"Tumor Type",
"Tissue Site",
"Primary Disease",
"Race",
"Culture Medium",
"Passage Number",]]

In [483]:
cclfrna_annot = cclfrna_annot[[
 'Sequencing on Tissue or Cell model? (MT confirm)',
 'External ID for BAM',
 'Product',
 'RIN',
 'Collaborator Sample ID',
 'Original Material Type',
 'Collaborator Participant ID',
 'Aggregated',
 'Actual Seq Technology',
 'Contamination %',
]].set_index('Collaborator Sample ID', drop=True)

In [522]:
for val in h.dups(cclfrna_annot2.index):
    for i in range(len(cclfrna_annot2.loc[val])-1):
        if cclfrna_annot2.loc[val].iloc[0].isna().sum() > cclfrna_annot2.loc[val].iloc[i+1].isna().sum():
            cclfrna_annot2.iloc[np.argwhere(cclfrna_annot2.index == val).flatten()[0]] = cclfrna_annot2.loc[val].iloc[i+1].values
cclfrna_annot2 = cclfrna_annot2[~cclfrna_annot2.index.duplicated(keep='first')]

In [525]:
for val in h.dups(cclfrna_annot.index):
    for i in range(len(cclfrna_annot.loc[val])-1):
        if cclfrna_annot.loc[val].iloc[0].isna().sum() > cclfrna_annot.loc[val].iloc[i+1].isna().sum():
            cclfrna_annot.iloc[np.argwhere(cclfrna_annot.index == val).flatten()[0]] = cclfrna_annot.loc[val].iloc[i+1].values
cclfrna_annot = cclfrna_annot[~cclfrna_annot.index.duplicated(keep='first')]

In [526]:
cclfrna_annot = pd.concat([cclfrna_annot, cclfrna_annot2], axis=1)

In [528]:
for i, val in cclfrna_annot.iterrows():
    cclfrna_anno.loc[cclfrna_anno.external_id_rna==i, cclfrna_annot.columns] = val.values
del cclfrna_annot

In [None]:
cclfrna_anno['reference']= "hg38"
cclfrna_anno['ends']="paired end"
cclfrna_anno['sequencer']="Illumina HiSeq 2000"
cclfrna_anno['method'] = "bulk"
cclfrna_anno['dataset'] = "cclf"

## MET500 and PDXs

In [542]:
# met500 
met500_meta = tc.get(name='met500-fc3c', file='met500_meta')
met500_TPM = tc.get(name='met500-fc3c', file='met500_TPM') #20,979x868 matrix

#Novartis_PDX
Novartis_PDX_ann = tc.get(name='pdx-data-3d29', file='Novartis_PDX_ann')
Novartis_PDX_TPM = tc.get(name='pdx-data-3d29', file='Novartis_PDX_TPM').T # 38,087x445

#pediatric_PDX
pediatric_PDX_ann = tc.get(name='pdx-data-3d29', file='pediatric_PDX_ann')
pediatric_PDX_TPM = tc.get(name='pdx-data-3d29', file='pediatric_PDX_TPM') #80,000x250

No dataset version provided. Using version 1.
No dataset version provided. Using version 1.
No dataset version provided. Using version 1.
No dataset version provided. Using version 2.
No dataset version provided. Using version 2.
No dataset version provided. Using version 2.
No dataset version provided. Using version 2.


In [544]:
met500_meta.iloc[0]

Sample_id        ES_5001-capt-SI_5013-C0LAMACXX
sample_type                               tumor
sample_source                           ES_5001
dataset                                    mctp
tissue                                   breast
cohort                                     BRCA
run.id           ES_5001-capt-SI_5013-C0LAMACXX
idx                                         461
test                                      False
tc                                         0.93
biopsy_tissue                             brain
Name: 0, dtype: object

In [606]:
tcga_dict = {
"LAML":	"Acute Myeloid Leukemia",
"ACC":	"Adrenocortical carcinoma",
"BLCA":	"Bladder Urothelial Carcinoma",
"BOCA":	"Bone Cancer",
"LGG":	"Brain Lower Grade Glioma",
"BRCA":	"Breast invasive carcinoma",
"CESC":	"Cervical squamous cell carcinoma and endocervical adenocarcinoma",
"CHOL":	"Cholangiocarcinoma",
"CLLE":	"Chronic Lymphocytic Leukemia",
"CMDI":	"Chronic Myeloid Disorders",
"COAD":	"Colon adenocarcinoma",
"COLO":	"Colorectal Cancer",
"COADREAD":	"Colorectal cancer",
"EOPC":	"Early Onset Prostate Cancer",
"ESAD":	"Esophageal Adenocarcinoma",
"ESCA":	"Esophageal carcinoma",
"CHOL":	"Gallbladder cancer",
"GBM":	"Glioblastoma multiforme",
"HNSC":	"Head and Neck squamous cell carcinoma",
"KDNY":	"Kidney Cancer",
"KICH":	"Kidney Chromophobe",
"KIRC":	"Kidney renal clear cell carcinoma",
"KIRP":	"Kidney renal papillary cell carcinoma",
"LIRI":	"Liver Cancer",
"LICA":	"Liver Cancer",
"LINC":	"Liver Cancer",
"HCC":	"Liver hepatocellular carcinoma",
"LIHC":	"Liver hepatocellular carcinoma",
"LGG":	"Lower Grade GLioma",
"LUNG":	"Lung Cancer",
"LUAD":	"Lung adenocarcinoma",
"LUSC":	"Lung squamous cell carcinoma",
"DLBC":	"Lymphoid Neoplasm Diffuse Large B-cell Lymphoma",
"MCTP":	"MCTP",
"MALY":	"Malignant Lymphoma",
"MESO":	"Mesothelioma",
"NBL":	"Neuroblastoma",
"ORCA":	"Oral Cancer",
"MISC":	"Other Cancer",
"OV":	"Ovarian serous cystadenocarcinoma",
"PACA":	"Pancreatic Cancer",
"PAEN":	"Pancreatic Cancer Endocrine neoplasms",
"PAAD":	"Pancreatic adenocarcinoma",
"PBCA":	"Pediatric Brain Cancer",
"PCPG":	"Pheochromocytoma and Paraganglioma",
"PRAD":	"Prostate adenocarcinoma",
"READ":	"Rectum adenocarcinoma",
"RECA":	"Renal Cancer",
"SARC":	"Sarcoma",
"SECR":	"Secretory Cancer",
"SKCM":	"Skin Cutaneous Melanoma",
"STAD":	"Stomach adenocarcinoma",
"TGCT":	"Testicular Germ Cell Tumor",
"TGCT":	"Testicular Germ Cell Tumors",
"THYM":	"Thymoma",
"THYM":	"Thymoma",
"THCA":	"Thyroid carcinoma",
"UCS":	"Uterine Carcinosarcoma",
"UCEC":	"Uterine Corpus Endometrial Carcinoma",
"UVM":	"Uveal Melanoma",
"ACC":	"adrenocortical carcinoma",
}

In [607]:
met500_meta = met500_meta.replace({"cohort": tcga_dict})

In [None]:
met500_meta = met500_meta.rename(columns={**rename, **{'subtype': "disease_type"}}).set_index('sample_id', drop=True)[rename.values()]

In [None]:
met500_meta['sequencer'] = "Illumina HiSeq 2000"
met500_meta['method'] = "bulk"
met500_meta['cell_type'] = "tumor"
met500_meta['reference']= "hg38"
met500_meta['ends']="paired end"
met500_meta['metastatic/primary'] = "metastatic"
met500_meta['dataset'] = "met500"

In [347]:
pediatric_PDX_ann.iloc[0]

sampleID                                                                                  ALL-102
lineage                                                                                     blood
subtype                                                                                       ALL
Histology                                                                                     ALL
Histology.Detailed                                                                     Ph-likeALL
Histology-Detailed2                                                                    Ph-likeALL
Molecular-Subtype-Brain                                                                      None
PI                                                                                           Lock
Sex                                                                                          Male
Phase                                                                                   Diagnosis
Age                 

In [70]:
[(i.split('me patient as ')[-1].split(' (')[0],v) if type(i) is str and 'ame patient' in i else '' for v, i in pediatric_PDX_ann[["sampleID","Other_info1"]].values]

[('ALL-105', 'ALL-102'),
 ('ALL-102', 'ALL-105'),
 '',
 '',
 '',
 '',
 ('ALL-102', 'ALL-115'),
 '',
 '',
 '',
 ('ALL-46', 'ALL-121'),
 '',
 ('ALL-58', 'ALL-123'),
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 ('ALL-121', 'ALL-46'),
 '',
 '',
 '',
 '',
 '',
 ('ALL-123', 'ALL-58'),
 '',
 ('ALL-25', 'ALL-61'),
 '',
 '',
 ('ALL-81', 'ALL-80'),
 ('ALL-80', 'ALL-81'),
 '',
 ('ALL-32', 'ALL-90'),
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 ('ALL-61', 'ALL-25'),
 '',
 '',
 '',
 ('ALL-90', 'ALL-32'),
 '',
 '',
 '',
 '',
 '',
 '',
 ('ALL-83', 'ALL-82'),
 ('ALL-82', 'ALL-83'),
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 ('COG-N-453x', 'COG-N-452x'),
 ('COG-N-452x', 'COG-N-453x'),
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 ('COG-N-623x', 'COG-N-603x'),
 ('COG-N-618x', 'COG-N-619x'),
 '',
 ('COG-N-603x', 'COG-N-623x'),
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 ('98

In [None]:
pediatric_PDX_ann['participant_id'] = pediatric_PDX_ann.index

#created frrom manual inspection
samepatient = [('NCH-CA-2', 'NCH-CA-1'), ('ALL-105', 'ALL-102', "ALL-115"), ('ALL-46', 'ALL-121'), ('ALL-25', 'ALL-61'), ('ALL-81', 'ALL-80'), ('ALL-32', 'ALL-90'), ('ALL-58', 'ALL-123'), ('ALL-82', 'ALL-83'), ("COG-N-623x", "COG-N-603x"), ("COG-N-453x","COG-N-452x"), ("COG-N-618x", "COG-N-619x"), ('22909PNET', '9850PNET'), ('OS-34', 'OS-34-SJ'), ('OS-36', 'OS-36-SJ', 'OS-32'),  ('Rh-30R', 'Rh-30')]

for val in samepatient:
    for i in val[1:]:
        pediatric_PDX_ann.loc[i, 'participant_id']=val[0]
        
pediatric_PDX_ann['age'] = ['adult' if i =='Adult' else 'child' for i in pediatric_PDX_ann['Other_info1']]

pediatric_PDX_ann.rename(columns={**rename, **{'subtype': "disease_type"}})[rename.values()]

In [None]:
pediatric_PDX_ann['sequencer'] = "Illumina HiSeq 2000"
pediatric_PDX_ann['align'] = "BWA"
pediatric_PDX_ann['method'] = "bulk"
pediatric_PDX_ann['reference']= "hg19"
pediatric_PDX_ann['ends']="paired end"
pediatric_PDX_ann['dataset'] = "pediatric_PDX"

In [350]:
Novartis_PDX_ann.iloc[0]

sampleID           0931HXXTM
lineage             pancreas
subtype     ductal_carcinoma
type                     PDX
Name: 0, dtype: object

In [None]:
Novartis_PDX_ann = Novartis_PDX_ann.rename(columns=rename).set_index('sample_id', drop=True)

In [None]:
Novartis_PDX_ann['sequencer'] = "Illumina HiSeq 2000"
Novartis_PDX_ann['method'] = "bulk"
Novartis_PDX_ann['align'] = "STAR"
Novartis_PDX_ann['reference']= "hg19"
Novartis_PDX_ann['ends']="paired end"
Novartis_PDX_ann['dataset'] = "Novartis_PDX"

## tumor inf elife

In [13]:
elife_tumorinf = tc.get(name='tumor-infiltration-3307', version=1, file='elife_tumorinf')

In [14]:
elife_tumorinf

Unnamed: 0,Bcells,CAFs,CD4_Tcells,CD8_Tcells,Endothelial,Macrophages,NKcells
A1BG,7.75210,4.47890,4.6702,5.02000,0.0000,3.46370,1.74260
A1BG-AS1,0.27272,28.98800,6.7471,11.76400,0.0000,2.36580,8.92140
A1CF,0.94832,0.03625,0.4978,0.56538,0.2582,0.13695,0.25936
A2M,0.14435,184.45000,2.9531,15.38600,1575.2000,418.67000,8.94110
A2M-AS1,0.00000,0.34500,7.3420,11.59100,8.6042,0.35669,0.21549
...,...,...,...,...,...,...,...
ZYG11A,13.37900,3.64120,18.2580,18.17900,9.0574,7.47320,16.04700
ZYG11B,18.64800,10.59500,15.2930,13.80700,5.1119,5.00460,8.10280
ZYX,31.07300,205.32000,79.5490,64.75700,70.3870,206.18000,102.91000
ZZEF1,5.69700,2.98250,9.9508,14.66500,7.2201,5.78570,16.66700


In [None]:
elife_tumorinf = elife_tumorinf.rename(columns={"Bcells": "B-cell", "CAFs": "CAF", "CD4_Tcells": "CD4_T-cells", "CD8_Tcells": "CD8_T-cells","macrophage": "macrophage", "Endothelial": "endothelial", "NKcells": "NK-cell"})

In [13]:
elife_tumorinf_ann = pd.DataFrame()
elife_tumorinf_ann["cell_type"] = "normal"
elife_tumorinf_ann["tissue_type"] = elife_tumorinf.columns
elife_tumorinf_ann["sample_ID"] = elife_tumorinf.columns
elife_tumorinf_ann['sequencer'] = ""
elife_tumorinf_ann['align'] = "bowtie"
elife_tumorinf_ann['method'] = "singlecell"
elife_tumorinf_ann['reference']= "hg19"
LAU125	59	male	iliac lymph node
LAU355	70	female	iliac-obturator lymph node
LAU1255	87	male	axillary lymph node
LAU1314	81	male	iliac-obturator lymph node
elife_tumorinf_ann['age'] = ""
elife_tumorinf_ann['sex'] = ""
elife_tumorinf_ann['dataset'] = "elife_tumorinf"

## tirosh's melanoma

In [4]:
melanoma = tc.get(name='tirosh-melanoma-scrnaseq-60f0', file='melanoma')

No dataset version provided. Using version 1.


In [19]:
melanoma.columns = [i.replace('-', '_').replace('Cy', "CY").replace('cy', "CY").replace('CY88C', 'CY88_C').replace('CY89A', "CY89_A").replace('CY89C', 'CY89_C').replace('CY89F', 'CY89_F').replace('CY89N', 'CY89_N').replace('CY94C', 'CY94_C') for i in melanoma.columns]

In [20]:
melanoma_ann = pd.DataFrame()

typ={1:"normal", 2:"tumor",0: np.nan}
orig={1:"melanoma", 2:"B-cell", 3: "macrophage", 4: "endothelial", 5: "CAF", 6:"NK-cell", 0: np.nan}

melanoma_ann['age'] = [int(i) for i in melanoma.loc['tumor']]
melanoma_ann["cell_type"] = [typ[int(i)] for i in melanoma.loc['malignant(1=no,2=yes,0=unresolved)']]
melanoma_ann['tissue_type'] = [orig[int(i)] for i in melanoma.loc['non-malignant cell type (1=T,2=B,3=Macro.4=Endo.,5=CAF;6=NK)']]
melanoma_ann['name'] = [i.split('_')[0] for i in melanoma.columns]
melanoma_ann['sample_id'] = melanoma.columns
melanoma_ann['other'] = [i.split('_')[-2] for i in melanoma.columns]
melanoma_ann['sequencer'] = ""
melanoma_ann['method'] = "singlecell"

melanoma_ann['reference']= "hg38"
melanoma_ann['ends']="paired end"
melanoma_ann['dataset'] = "melanoma"

## GTEX

In [9]:
#! curl https://storage.googleapis.com/gtex_analysis_v9/snrna_seq_data/GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad --output temp/gtex_8_atlas_public.h5ad

## GTEX additional
https://storage.googleapis.com/gtex_external_datasets/eyegex_data/rna_seq_data/EyeGEx_retina_combined_genelevel_expectedcounts_byrid_nooutlier.tpm.matrix.gct
    
https://storage.googleapis.com/gtex_external_datasets/eyegex_data/annotations/EyeGEx_meta_combined_inferior_retina_summary_deidentified_geo_ids.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1810M  100 1810M    0     0   126M      0  0:00:14  0:00:14 --:--:--  126M


In [614]:
gtex_v9 = read_h5ad("temp/gtex_8_atlas_public.h5ad") #209,126 × 17,695

In [623]:
gtex_v9.obs = gtex_v9.obs[["Age_bin","Sex","Sample ID", "Participant ID", "RIN score from PAXgene tissue Aliquot", "Tissue", "tissue"]].rename(columns={**rename, **{"tissue": "tissue_type", "Tissue": "collection_site"}})

n_genes                                                                    2658
fpr                                                                         0.1
tissue                                                           skeletalmuscle
prep                                                                        CST
individual                                                                   01
nGenes                                                                     2902
nUMIs                                                                   11544.0
PercentMito                                                             0.07623
PercentRibo                                                            0.051195
Age_bin                                                                   51-60
Sex                                                                        Male
Sample ID                                              GTEX-1HSMQ-5011-SM-GKSJH
Participant ID                          

In [None]:
gtex_v9.obs['sequencer']="Illumina HiSeq 2000"
gtex_v9.obs['method']="bulk"
gtex_v9.obs['reference']= "hg38"
gtex_v9.obs['ends']="paired end"
gtex_v9.obs['dataset']="gtex"

In [None]:
gtex_add = # 80,000 x 500

## THEIS LAB scRNAseq datasets

In [None]:
https://theislab.github.io/sfaira-portal/Datasets #50,000x13,000


## HCMI

In [6]:
# HCMI dataset
# Code to generate this dataset can be found here:
# https://github.com/broadinstitute/hcmi-processing/blob/main/hcmi-rna-analysis-210226.ipynb
hcmi_ltpm = tc.get(name='hcmi-data-ac4b', file='hcmi_ltpm').T # 60486 x 157
hcmi_sample_info = tc.get(name='hcmi-data-ac4b', file='hcmi_sample_info')
#sample_info = tc.get(name='hcmi-data-ac4b', file='sample-info')

No dataset version provided. Using version 7.
No dataset version provided. Using version 7.


In [7]:
hcmi_sample_info['type'] = [i.split('_')[1] +"; "+str(j) for i,j in hcmi_sample_info[['type', 'Type']].values]

In [8]:
hcmi_sample_info[['Case ID', 'Clinical Tumor Diagnosis', 'subtype', 'Tissue Status', "Acquisition Site", 'Gender', 'Race', 'Age At Acquisition (Years)', 'Expansion Status', 'sampleID', 'type', 'lineage']].rename(columns=rename)

Case ID                                                                  HCM-CSHL-0092-C25
Primary Site                                                                      Pancreas
Clinical Tumor Diagnosis                                                 Pancreatic cancer
subtype                                                         Adenocarcinoma ductal type
Tissue Status                                                                      Primary
Acquisition Site                                                           Pancreatic head
Gender                                                                                Male
Race                                                                               Unknown
Age At Acquisition (Years)                                                            69.0
Age At Diagnosis (Years)                                                              69.0
Disease Status                                                         Progressive disease

In [None]:
hcmi_sample_info['sequencer'] =""
hcmi_sample_info['method'] =""
hcmi_sample_info['reference']= ""
hcmi_sample_info['ends']=""
hcmi_sample_info['dataset']="hcmi"

## L1000 dataset

In [4]:
import subprocess
from anndata import AnnData 

In [25]:
# you will need R > 4.0 
# https://www.charlesbordet.com/en/how-to-upgrade-to-R-4-0-0-on-debian/#the-naive-solution
! R -e "if(!requireNamespace('BiocManager', quietly = TRUE)){install.packages('BiocManager', repos='http://cran.us.r-project.org')};BiocManager::install('cmapR');"


R version 4.1.2 (2021-11-01) -- "Bird Hippie"
Copyright (C) 2021 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> if(!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager", repos="http://cran.us.r-project.org")};BiocManager::install(c("limma"));
Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
  'lib = "/usr/local/lib/R/site-library"' is not writable
Error in inst

In [31]:
folder = "gs://ccle_default_params/celligner_ex/"
res = []
for val in ["level5_beta_ctl_n58022x12328.gctx",
            "level5_beta_trt_cp_n720216x12328.gctx",
            "level5_beta_trt_misc_n8283x12328.gctx"
            "level5_beta_trt_oe_n34171x12328.gctx",
            "level5_beta_trt_sh_n238351x12328.gctx",
            "level5_beta_trt_xpr_n142901x12328.gctx",]:
    cmd = "gsutil cp " + folder + val + " temp/"
    ! $cmd
    res.append(h.loadGCTXasAnnData('temp/'+val))

Copying gs://ccle_default_params/celligner_ex/level5_beta_trt_misc_n8283x12328.gctx...
==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").


Operation completed over 1 objects/389.6 MiB.                                    


NameError: name 'pg' is not defined

In [None]:
ann['sequencer'] = "L1000"
ann['method'] = "L1000"
ann['reference']= "L1000"
ann['ends']="L1000"
ann['dataset']="L1000"

## encode

In [136]:
todl = h.fileToList('data/encode_rna.txt')

In [134]:
# 40,000 x 1100
report = pd.read_csv('data/encode_report.tsv', sep="\t", skiprows=1)
report = report[report.columns[report.isna().sum()!=len(report)]]
report

Unnamed: 0,ID,Accession,Assay name,Assay title,Biosample summary,Biosample term name,Dbxrefs,Description,Lab,Project,...,Biosample treatment,Biosample treatment ontology ID,Biosample treatment amount,Biosample treatment amount units,Biosample treatment duration,Biosample treatment duration units,Biosample modification site target organism,Replicates,Cellular component,Library construction method
0,/experiments/ENCSR620LQN/,ENCSR620LQN,RNA-seq,total RNA-seq,Homo sapiens esophagus muscularis mucosa tissu...,esophagus muscularis mucosa,GEO:GSE88409,The libraries contained in this Experiment com...,"Thomas Gingeras, CSHL",ENCODE,...,,,,,,,,/replicates/499e1412-5ef3-4ac2-98be-fd55bb01fad5/,,
1,/experiments/ENCSR406SAW/,ENCSR406SAW,RNA-seq,total RNA-seq,Homo sapiens upper lobe of left lung tissue fe...,upper lobe of left lung,GEO:GSE88254,The libraries contained in this Experiment com...,"Thomas Gingeras, CSHL",ENCODE,...,,,,,,,,/replicates/782b72f8-3197-4a66-951c-e2de88158b6a/,,
2,/experiments/ENCSR019MXZ/,ENCSR019MXZ,polyA plus RNA-seq,polyA plus RNA-seq,Homo sapiens HepG2 insoluble cytoplasmic fraction,HepG2,GEO:GSE87958,Initial insoluble fractions on HepG2 Long Poly...,"Eric Lécuyer, IRCM",ENCODE,...,,,,,,,,/replicates/443af8c7-5ed1-4930-b4f7-e1ca62c553...,insoluble cytoplasmic fraction,
3,/experiments/ENCSR630VJN/,ENCSR630VJN,RNA-seq,total RNA-seq,Homo sapiens transverse colon tissue male adul...,transverse colon,GEO:GSE88418,The libraries contained in this Experiment com...,"Thomas Gingeras, CSHL",ENCODE,...,,,,,,,,/replicates/aa7ff7b6-5bf5-4f0d-a09c-8612c945df37/,,
4,/experiments/ENCSR035SKV/,ENCSR035SKV,RNA-seq,total RNA-seq,Homo sapiens gastroesophageal sphincter tissue...,gastroesophageal sphincter,GEO:GSE87978,The libraries contained in this Experiment com...,"Thomas Gingeras, CSHL",ENCODE,...,,,,,,,,/replicates/8c825033-47e6-4659-8b02-0b399fab0435/,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1006,/experiments/ENCSR672JUF/,ENCSR672JUF,polyA plus RNA-seq,polyA plus RNA-seq,Homo sapiens foreskin keratinocyte male newborn,foreskin keratinocyte,GEO:GSM958177,,"Joseph Costello, UCSF",Roadmap,...,,,,,,,,/replicates/dd125c50-da07-473b-90cf-6be56b7fe968/,,
1007,/experiments/ENCSR999CPT/,ENCSR999CPT,polyA plus RNA-seq,polyA plus RNA-seq,Homo sapiens placental basal plate tissue fema...,placental basal plate,GEO:GSM1127098,,"Joseph Costello, UCSF",Roadmap,...,,,,,,,,/replicates/d92f45a2-c13f-42d4-a52d-2f974f6e7f41/,,
1008,/experiments/ENCSR634LOX/,ENCSR634LOX,polyA plus RNA-seq,polyA plus RNA-seq,Homo sapiens foreskin melanocyte male newborn,foreskin melanocyte,GEO:GSM958174,,"Joseph Costello, UCSF",Roadmap,...,,,,,,,,/replicates/f0421981-9298-4ade-a256-59aa55666f27/,,
1009,/experiments/ENCSR714QAF/,ENCSR714QAF,polyA plus RNA-seq,polyA plus RNA-seq,Homo sapiens mole tissue female,mole,GEO:GSM1582478,The RNA-Seq libraries contained in this experi...,"Joseph Costello, UCSF",Roadmap,...,,,,,,,,/replicates/919a47f8-b23e-431c-9ea0-9dcb6990df63/,,


In [137]:
region = []
platform = []

for val in list(report['Description']):
    if val is np.nan:
        region.append('')
        platform.append('')
        continue
    if "cytosol" in val or "cytosolic" in val:
        region.append('cytosol')
    elif "nucleolus" in val:
        region.append('nucleolus')
    elif "nucleus" in val:
        region.append('nucleus')
    elif "whole cell" in val:
        region.append('whole')
    elif "chromatin" in val:
        region.append('chromatin')
    elif "cytoplasmic" in val:
        region.append('cytoplasm')
    else:
        region.append(None)
    if "Hi-Seq" in val:
        platform.append('illumina Hi-Seq 2000')
    elif "Illumina GAIIx" in val:
        platform.append('illumina gaiix')
    else:
        platform.append('')

In [138]:
report.loc[report['Cellular component'].isna(), 'Cellular component'] = np.array(region)[report[report['Cellular component'].isna()].index]

In [139]:
report['sequencer'] = platform

In [140]:
report['Files'] = report['Files'].str.split(',').apply(lambda x: [i.split('/')[-2] for i in x])

In [141]:
report = report[report['Cellular component'].isna() | (report['Cellular component']=="nucleus")]
report = report[['Assay title', 'Biosample summary', 'Biosample term name', 'Description', 'Lab', 'Project', 'Files', 'Biosample accession', 'Organism', 'Life stage', 'Biosample age', 'sequencer', 'Biosample treatment ontology ID', 'Biosample treatment amount',
'Biosample treatment amount units', 'Biosample treatment duration',
'Biosample treatment duration units']].rename(columns=rename)

NameError: name 'rename' is not defined

In [145]:
set(report.platform)

{'', 'illumina Hi-Seq 2000', 'illumina gaiix'}

In [None]:
match = {}
for i, val in report['Files'].iteritems():
    for j in val:
        match[j] = i

In [None]:
encode_ann = pd.DataFrame(columns=report.columns)
for val in todl:
    val = val.split('/')[-1].split('.')[0]
    if val not in match:
        print(val)
    else:
        encode_ann.loc[val] = report.loc[match[val]].values

In [None]:
encode_ann['sequencer'] = ""# MISSING
encode_ann['reference']= "hg38"
encode_ann['ends']="paired-end"
encode_ann['align'] = "STAR"
encode_ann['dataset']="encode"

In [17]:
res = []
for val in encode_ann.index:
    vl = pd.read_csv('https://www.encodeproject.org/files/'+val+'/@@download/'+val+'.tsv', sep='\t')
    try:
        vl = vl.set_index('gene_id', drop=True)['TPM'].rename(val)
    except KeyError:
        vl = vl.set_index('gene_ID', drop=True)[vl.columns[-1]].rename(val)
    res.append(vl)
encode = pd.concat(res, axis=1)

ValueError: cannot reindex from a duplicate axis

## ICGC

In [70]:
icgclist = ["https://dcc.icgc.org/api/v1/download?fn=/current/Projects/BOCA-FR/exp_seq.BOCA-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/BPLL-FR/exp_seq.BPLL-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/BRCA-KR/exp_seq.BRCA-KR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/LICA-FR/exp_seq.LICA-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/LIRI-JP/exp_seq.LIRI-JP.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/ORCA-IN/exp_seq.ORCA-IN.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/OV-AU/exp_seq.OV-AU.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/PACA-AU/exp_seq.PACA-AU.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/PACA-CA/exp_seq.PACA-CA.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/PRAD-CA/exp_seq.PRAD-CA.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/PRAD-FR/exp_seq.PRAD-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/CLLE-ES/exp_seq.CLLE-ES.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/MALY-DE/exp_seq.MALY-DE.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/PAEN-AU/exp_seq.PAEN-AU.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/RECA-EU/exp_seq.RECA-EU.tsv.gz"]

In [69]:
res = []
for val in icgc:
    val = pd.read_csv(val, sep='\t')
    print(set(val['platform']), set(val['gene_model']), set(val['experimental_protocol']), set(val['assembly_version']), set(val['alignment_algorithm']), set(val['normalization_algorithm']))
    res.append(pd.concat([val.loc[val['icgc_sample_id']==i, ['gene_id', 'normalized_read_count']].set_index('gene_id').rename(columns={'normalized_read_count':i}) for i in list(set(val['icgc_sample_id']))], axis=1))
icgc = pd.concat(res, axis=1)

'download\\?fn\\=/current/Projects/BOCA-FR/exp_seq.BOCA-FR.tsv.gz'

In [49]:
donor = pd.read_csv("data/donor.tsv", sep="\t", index_col=0)
donor.columns

Index(['project_code', 'study_donor_involved_in', 'submitted_donor_id',
       'donor_sex', 'donor_vital_status', 'disease_status_last_followup',
       'donor_relapse_type', 'donor_age_at_diagnosis',
       'donor_age_at_enrollment', 'donor_age_at_last_followup',
       'donor_relapse_interval', 'donor_diagnosis_icd10',
       'donor_tumour_staging_system_at_diagnosis',
       'donor_tumour_stage_at_diagnosis',
       'donor_tumour_stage_at_diagnosis_supplemental', 'donor_survival_time',
       'donor_interval_of_last_followup', 'prior_malignancy',
       'cancer_type_prior_malignancy', 'cancer_history_first_degree_relative'],
      dtype='object')

In [51]:
more = pd.read_csv('data/donors_more_csv', index_col=0)
donor.loc[more.index, 'Primary Site'] = more['Primary Site']

In [52]:
donor = donor[["donor_sex", "donor_relapse_type", "donor_age_at_enrollment", "donor_diagnosis_icd10", "donor_tumour_stage_at_diagnosis", 'Primary Site']]

In [53]:
specimen = pd.read_csv("data/specimen.tsv", sep="\t", index_col=0)
specimen.iloc[0]

project_code                                                                     CLLE-ES
study_specimen_involved_in                                                           NaN
submitted_specimen_id                                                      060-0123-01TD
icgc_donor_id                                                                    DO51966
submitted_donor_id                                                                    60
specimen_type                          Primary tumour - blood derived (peripheral blood)
specimen_type_other                                                                  NaN
specimen_interval                                                                 3586.0
specimen_donor_treatment_type                                               no treatment
specimen_donor_treatment_type_other                                                  NaN
specimen_processing                                                                fresh
specimen_processing_o

In [54]:
specimen = specimen[["icgc_donor_id", "specimen_type", "specimen_storage", "specimen_processing"]]

In [55]:
for i, val in specimen.iterrows():
    specimen.loc[i, donor.columns] = donor.loc[val.icgc_donor_id].values 

In [59]:
sample = pd.read_csv("data/sample.tsv", sep="\t", index_col=0)
sample

Unnamed: 0_level_0,project_code,submitted_sample_id,icgc_specimen_id,submitted_specimen_id,icgc_donor_id,submitted_donor_id,analyzed_sample_interval,percentage_cellularity,level_of_cellularity,study
icgc_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SA564770,CLLE-ES,060-0123-01TD,SP130360,060-0123-01TD,DO51966,60,315.0,97.0,,
SA564151,CLLE-ES,060-01-2ND,SP130410,060-01-2ND,DO51966,60,,98.0,,
SA602505,CLLE-ES,060-0123-02ND,SP199735,060-0123-02ND,DO51966,60,,,,
SA564150,CLLE-ES,060-02-01ND,SP130409,060-02-01ND,DO51966,60,,99.0,,
SA538928,CLLE-ES,060-0123-03TR,SP114999,060-0123-03TR,DO51966,60,315.0,,,
...,...,...,...,...,...,...,...,...,...,...
SA607197,BPLL-FR,14_3,SP202799,B-PLL_32_tumor,DO233963,B-PLL_32,,,,
SA607150,BPLL-FR,14_2,SP202986,B-PLL_32_control,DO233963,B-PLL_32,,,,
SA538993,CLLE-ES,356-01-8ND,SP115086,356-01-8ND,DO7084,356,,100.0,,PCAWG
SA86542,CLLE-ES,356-01-4TR,SP15992,356-01-4TR,DO7084,356,7.0,95.0,,PCAWG


In [60]:
icgcann = sample[['project_code', 'submitted_sample_id', 'icgc_specimen_id', 'percentage_cellularity']]

In [62]:
for i, val in icgcann.iterrows():
    icgcann.loc[i, specimen.columns] = specimen.loc[val.icgc_specimen_id]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, v, pi)


In [None]:
icgcann['sequencer'] = "" # missing
icgcann['method'] = "bulk"
icgcann['reference']= "hg38"
icgcann['ends']="paired end"

## st jude

In [None]:
pd.read_csv('') # 40,000 x 3500


## NCI 60

## tumor cell atlas

## other random datasets from SRA

## DUOS datasets

In [None]:
#https://duos.broadinstitute.org/dataset_catalog

## hartwig

## Pancreas from Sri

## Neurosphere from Keith

## EBI

# QC

## get the same set of overlapping genes

## compute correlation

- find unknown duplicates
- find problematic duplicates

## analyse  annotation files

- find mismatch annotations
- add missing annotations, need: 
    - sequencer, 
    - expression_type, 
    - media, 
    - origin, 
    - tissue type,
    - disease,
    - sub_disease,
    - cell type, 
    - group, 
    - sex, 
    - age, 
    - contamination
    - organism
- find good set of names for annotations