# load tons of datasets (~60,000 RNAseq samples)

In [8]:
from taigapy import TaigaClient
tc = TaigaClient()

from depmapomics import tracker as track
from depmapomics import expressions

from genepy.utils import helper as h

import dalmatian as dm
from gsheets import Sheets
import pandas as pd

from anndata import AnnData, read_h5ad

MY_ID = '~/.client_secret.json'
MYSTORAGE_ID = "~/.storage.json"

Sheets.from_files(MY_ID, MYSTORAGE_ID)
#autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
rename = {
'Acquisition Site': "collection_site",
'Actual Seq Technology': 'sequencer',
'Age At Acquisition (Years)': "age",
'age_at_dx':"age", 
'Age': "age",
#'Aggregated': "aggregated",
'Assay title': 'assay',
'arxspan_id': "tissue_id",
'biopsy_tissue': "collection_site",
'Case ID': 'participant_id', 
'CCLF_ID': 'sample_id',
'Clinical Tumor Diagnosis': 'disease_type',
'Collaborator Participant ID': "participant_id",
'collection': 'collection_site', 
'Contamination % (First Agg)': 'contamination', 
'Contamination %': "contamination",
'Culture Medium': "media_type",
'Days to First Agg': 'exp_date', 
'Diagnosis': 'disease_subtype',
'disease': "disease_type", 
'Expansion Status': "expansion",
'External ID for BAM': "tissue_id",
'gender': "sex",
'Gender': "sex",
'Histology.Detailed': "disease_subtype",
'Inferred_Ethnicity': "ethnicity",
'infered_ethnicity': "ethnicity",
'lineage': 'tissue_type',
'mediatype':"media_type",
'Original Material Type': 'history',
'Passage Number': "passage_number",
'Phase':'stage',
'Primary Disease': "disease_type",
'Primary Site': 'collection_site',
'primary_disease': "disease_type", 
'Product': 'preparation',
'Race': "ethnicity",
'RNA_Seq_cancertype': 'lineage', 
'sample_source': 'participant_id',
'Sample_type': 'lineage', 
'sampleID': 'sample_id',
'Sample_id': 'sample_id',
'Sequencing on Tissue or Cell model? (MT confirm)': 'cell_type', 
'Sex': "sex",
'tissue': "lineage",
'Participant ID': 'participant_id',
'RIN score from PAXgene tissue Aliquot': 'rin_score',
'RIN': 'rin_score',
'Cell types level 3': 'tissue_type',
'Age_bin':'age',
'cohort': "disease_type",
'site_donor_id': "participant_id",
#'site_id': "tissue_id",
'Site.of.Specimen': "collection_site",
'stripped_cell_line_name': "sample_name",
'arxspan_id': 'tissue_id',
'subtype': 'disease_subtype', 
'tc': 'contamination',
'Therapy': "therapy",
'Tissue Site': "collection_site",
'Tissue Status': "metastatic/primary",
'cancer_format': "metastatic/primary",
'Tumor Percent': "contamination",
'Tumor Type': "metastatic/primary",
'Tumor_type': 'cell_type', 
'instability': "instability",
'type': "cell_type",
'Type': "cell_type",
'Sample Type': "cell_type",
'sample_type': 'sample_type',
'dataset': 'dataset',
'ends': 'ends',
'reference': 'reference',
'doublingt': 'doublingt',
'method': 'method',
'hasebv': 'hasebv',
'sequencing_type':'sequencing_type',
'base_media': 'base_media',
'growth_pattern': 'morphology'
}

## CCLE + TCGA

In [24]:
# load from taiga public (figshare link)
# load internal expression,
# latest version can be found at https://depmap.org/portal/download/
# can also be loaded like so pd.read_csv('gs://ccle_default_params/celligner_ex/CCLE_expression.csv.gz', index_col=0)
CCLE_expression = tc.get(name='internal-21q3-fe4c',
                         file='CCLE_expression_full')  #40,000x1,500


# load  TCGA expression
# this dataset was generated from  ,using this script: 
# caan be found here: pd.read_csv('gs://ccle_default_params/celligner_ex/TCGA_expression.csv.gz', index_col=0)
TCGA_expression = tc.get(name='celligner-input-9827',
                       file='tumor_expression') # 40,000x13,000

No dataset version provided. Using version 16.


In [4]:
# loading annotations
CCLE_annotation = track.getTracker() # the function uses pygsheets to load this: REFSHEET_URL=https://docs.google.com/spreadsheets/d/1Pgb5fIClGnErEqzxpU7qqX6ULpGTDjvzWwDN8XUJKIY
# Sheets.from_files(MY_ID, MYSTORAGE_ID).get(REFSHEET_URL).sheets[0].to_frame(index_col=0)
# you can also get it from pd.read_csv('gs://ccle_default_params/celligner_ex/CCLE_annotation.csv.gz', index_col=0)


# can be loaded from 
# pd.read_csv('gs://ccle_default_params/celligner_ex/TCGA_annotation.csv.gz', index_col=0)
TCGA_annotation = tc.get(name='celligner-input-9827',
                         file='tumor_annotations') # generated manually 

No dataset version provided. Using version 1.


In [323]:
CCLE_annotation.iloc[0, :-25]

arxspan_id                       ACH-001200
participant_id                  PT-P85FcSaJ
sex                                    Male
age                                       0
stated_race                             NaN
                             ...           
update_time                             NaN
processing_qc                           NaN
sequencing_date                  15-10-2014
crc32c_hash                        M9Me+A==
md5_hash           gqfEDcwyI389m8TXG8ovMg==
Name: CDS-3wjudI, Length: 94, dtype: object

In [31]:
len(CCLE_annotation[(CCLE_annotation.blacklist==0)&(CCLE_annotation.version>1)&(CCLE_annotation.datatype=="rna")]) 
# we have .. replicates in CCLE

79

In [328]:
CCLE_annotation = CCLE_annotation.set_index('arxspan_id', drop=True).loc[CCLE_expression.index]

In [330]:
CCLE_annotation = CCLE_annotation.rename(columns=rename)
print(set(CCLE_annotation.columns)-set(rename.values()))
CCLE_annotation = CCLE_annotation[set(rename.values()) & set(CCLE_annotation.columns)]

CCLE_annotation['method']="bulk"
CCLE_annotation['cell_type']="historical_CL; 2D"
CCLE_annotation['sequencer']="Illumina Hiseq 2000"
CCLE_annotation['ends']="paired end"
CCLE_annotation['reference']="hg38"
CCLE_annotation['dataset']="ccle"
CCLE_annotation['align'] = "STAR"
CCLE_annotation['counter'] = "RSEM"

CCLE_annotation = CCLE_annotation[set(CCLE_annotation.columns) & set(rename.values())]

{'pathology_subtype2', '19q3', 'sequence_chemistry', 'atcclink', 'serums', 'release_date', 'histology', 'baits', 'visual_characteristics', 'its', 'legacy_size', '18q4', 'internal_bam_filepath', 'month_passaging_billed', 'datatype', 'CCLE_doubling_time', 'pathology_subtype3', 'observed_cell_representation', 'low_quality', 'issue', 'subsubtype', 'pen_strep', 'dsmzlink', 'bam_qc', '19q4', 'update_time', 'bam_public_sra_path', '21Q2', 'DMX_doubling_time', 'version', 'geo_loc', 'size', 'blacklist', 'characteristics', '22minerva', 'finding_site', 'freeze_media', 'puromycin_selection_ug_ml', 'histology_subtype', 'prioritized', '18q2', 'sequencing_date', 'GPP_cell_doubling_time', 'parent_cell_line', '20q2', 'sodium_pyruv', 'md5_hash', 'histology_subtype2', '21Q4v2', '21Q4', 'internal_bai_filepath', 'pathology', '20q4', 'comments', 'lineage_molecular_subtype', 'other_media', 'source', 'original_source', 'year_passaging_billed', 'year_sequencing_billed', 'gm_csf', 'condition', 'cellosaurus_origi

In [331]:
CCLE_annotation.index.name = "sample_id"

In [10]:
CCLE_expression.columns = [i.split('(')[-1][:-1] if '(' in i else i for i in CCLE_expression.columns]

In [1]:
TCGA_annotation.iloc[0]

NameError: name 'TCGA_annotation' is not defined

In [7]:
set(TCGA_annotation.lineage)

{'adrenal',
 'bile_duct',
 'blood',
 'bone',
 'breast',
 'central_nervous_system',
 'cervix',
 'colorectal',
 'endocrine',
 'esophagus',
 'eye',
 'gastric',
 'germ_cell',
 'kidney',
 'liver',
 'lung',
 'lymphocyte',
 'nasopharynx',
 'nerve',
 'ovary',
 'pancreas',
 'peripheral_nervous_system',
 'pineal',
 'prostate',
 'skin',
 'soft_tissue',
 'teratoma',
 'thymus',
 'thyroid',
 'unknown',
 'upper_aerodigestive',
 'urinary_tract',
 'uterus'}

In [391]:
TCGA_annotation = TCGA_annotation.rename(columns=rename)
print(set(TCGA_annotation.columns)-set(rename.values()))
TCGA_annotation['tissue_id'] = TCGA_annotation['sample_id']
TCGA_annotation = TCGA_annotation[set(rename.values()) & set(TCGA_annotation.columns)]
TCGA_annotation['method']="bulk"
TCGA_annotation['cell_type']="tumor"
TCGA_annotation['metastatic/primary']="Primary"
TCGA_annotation['sequencer']= "Illumina Hiseq 2000"
TCGA_annotation['reference']= "hg38"
TCGA_annotation['ends']="paired end"
TCGA_annotation['dataset']="tcga"
TCGA_annotation['align'] = "STAR"
TCGA_annotation['counter'] = "RSEM"

TCGA_annotation = TCGA_annotation.set_index('sample_id')

{'Cancer type', 'UMAP_1', 'CPE', 'site_id', 'TCGA_disease', 'ABSOLUTE', 'pedaya', 'UMAP_2', 'site_sampleid', 'TCGA_primary_site', 'cluster'}


In [392]:
TCGA_annotation

Unnamed: 0_level_0,sex,tissue_id,age,disease_subtype,sample_type,tissue_type,participant_id,disease_type,method,cell_type,metastatic/primary,sequencer,reference,ends,dataset,align,counter
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
TH27_1241_S01,unknown,TH27_1241_S01,,glioma,,central_nervous_system,,Brain Cancer,bulk,tumor,Primary,Illumina Hiseq 2000,hg38,paired end,tcga,STAR,RSEM
TARGET-40-0A4I65-01A-01R,male,TARGET-40-0A4I65-01A-01R,17.86,osteosarcoma,,bone,TARGET-40-0A4I65,Bone Cancer,bulk,tumor,Primary,Illumina Hiseq 2000,hg38,paired end,tcga,STAR,RSEM
THR24_1965_S01,male,THR24_1965_S01,13.27,glioma,,central_nervous_system,SJHGG060,Brain Cancer,bulk,tumor,Primary,Illumina Hiseq 2000,hg38,paired end,tcga,STAR,RSEM
THR24_2080_S01,male,THR24_2080_S01,3.75,,,skin,SJMEL001005,Skin Cancer,bulk,tumor,Primary,Illumina Hiseq 2000,hg38,paired end,tcga,STAR,RSEM
THR20_0494_S01,female,THR20_0494_S01,6.00,medulloblastoma,,central_nervous_system,icgc/_EGAR00001049890_RNA_tumor_ICGC_MB46,Brain Cancer,bulk,tumor,Primary,Illumina Hiseq 2000,hg38,paired end,tcga,STAR,RSEM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-95-7947-01,male,TCGA-95-7947-01,67.00,lung adenocarcinoma,Primary Tumor,lung,TCGA-95-7947,Lung Cancer,bulk,tumor,Primary,Illumina Hiseq 2000,hg38,paired end,tcga,STAR,RSEM
TCGA-VQ-AA6F-01,male,TCGA-VQ-AA6F-01,57.00,stomach adenocarcinoma,Primary Tumor,gastric,TCGA-VQ-AA6F,Gastric Cancer,bulk,tumor,Primary,Illumina Hiseq 2000,hg38,paired end,tcga,STAR,RSEM
TCGA-BR-8588-01,female,TCGA-BR-8588-01,55.00,stomach adenocarcinoma,Primary Tumor,gastric,TCGA-BR-8588,Gastric Cancer,bulk,tumor,Primary,Illumina Hiseq 2000,hg38,paired end,tcga,STAR,RSEM
TCGA-24-2254-01,female,TCGA-24-2254-01,66.00,ovarian serous cystadenocarcinoma,Primary Tumor,ovary,TCGA-24-2254,Ovarian Cancer,bulk,tumor,Primary,Illumina Hiseq 2000,hg38,paired end,tcga,STAR,RSEM


In [12]:
set(CCLE_expression.index) & set(TCGA_expression.index)

set()

In [46]:
len(set(CCLE_expression.columns) - set(TCGA_expression.columns)), len(set(CCLE_expression.columns) & set(TCGA_expression.columns)), len(set(TCGA_expression.columns) - set(CCLE_expression.columns)),

(22411, 29643, 4390)

## CCLF

In [13]:
cclf_orga_info = tc.get(name='cclf-organoids-c23d', version=1, file='cclf_orga_info')
cclf_orga_info.iloc[0]

Genomic_Seq                          Pending
RNA_Seq_cancertype              BREAST_BASAL
RNA_Seq_marker                          None
CCLF_ID                       CCLF_cRCRF1048
Diagnosis             Advanced breast cancer
Subtype_patient                ER/PR+, HER2-
Tumor_type                        Metastatic
Sample_type                 Pleural effusion
Name: 0, dtype: object

In [184]:
cclf_orga_info
# age, sex, media?

Unnamed: 0,lineage,collection_site,cell_type,sample_id,disease_subtype,sequencer,method,reference,ends,dataset
cRCRF1048,BREAST_BASAL,Pleural effusion,organoid; cancer,CCLF_cRCRF1048,Advanced breast cancer,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
cRCRF1060,BREAST_BASAL,Pleural effusion,organoid; cancer,CCLF_cRCRF1060,Invasive breast carcinoma,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1337,BREAST_BASAL,Brain met resection,organoid; cancer,CCLF_KL1337,Breast met to brain,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1310,BREAST_BASAL/LUMINAL,,organoid; cancer,CCLF_KL1310,Breast met to brain,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1271,BREAST_LUMINAL,Brain met resection,organoid; cancer,CCLF_KL1271,Breast met to brain,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1333,BREAST_LUMINAL,Cryopreserved tissue,organoid; cancer,CCLF_KL1333,Invasive breast carcinoma,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1351,BREAST_LUMINAL,Brain met resection,organoid; cancer,CCLF_KL1351,Breast met to brain,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
cRCRF1092,BREAST_BASAL,Pleural effusion,organoid; cancer,CCLF_cRCRF1092,Advanced breast cancer,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1312,BREAST_LUMINAL,Brain met resection,organoid; cancer,CCLF_KL1312,Breast met to brain,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
cRCRF1038,FIBROBLAST,Pleural effusion,organoid; cancer,CCLF_cRCRF1038,Advanced breast cancer,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga


In [14]:
cclf_orga_info = cclf_orga_info.rename(columns={**rename, **{"Sample_type": "collection_site"}})
print(set(cclf_orga_info.columns)-set(rename.values()))
cclf_orga_info = cclf_orga_info[set(rename.values()) & set(cclf_orga_info.columns)]
cclf_orga_rnaseq = tc.get(name='cclf-organoids-c23d', version=1, file='cclf_orga_rnaseq').T # 40,000x24
cclf_orga_info.index = [i.split("_")[1] for i in cclf_orga_info.sample_id]
cclf_orga_rnaseq.index = [i.split('_')[0][:-1] for i in cclf_orga_rnaseq.index]

cclf_orga_rnaseq.columns = [i.split('.')[0] for i in cclf_orga_rnaseq.columns]

{'Genomic_Seq', 'RNA_Seq_marker', 'Subtype_patient'}


In [15]:
cclf_orga_info['sequencer'] = "illumina Hiseq 2000"
cclf_orga_info['method'] = "bulk"
cclf_orga_info['cell_type'] = "organoid; cancer"
cclf_orga_info['reference']= "hg38"
cclf_orga_info['ends']="paired end"
cclf_orga_info['dataset'] = "cclf_orga"
cclf_orga_info['align'] = ""
cclf_orga_info['counter'] = ""

In [63]:
cclf_orga_info[cclf_orga_info.disease_subtype.str.contains(' met')]

Unnamed: 0,disease_type,cell_type,disease_subtype,lineage,sample_id,sequencer,method,reference,ends,dataset
KL1337,BREAST_BASAL,organoid; cancer,Breast met to brain,Brain met resection,CCLF_KL1337,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1310,BREAST_BASAL/LUMINAL,organoid; cancer,Breast met to brain,,CCLF_KL1310,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1271,BREAST_LUMINAL,organoid; cancer,Breast met to brain,Brain met resection,CCLF_KL1271,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1351,BREAST_LUMINAL,organoid; cancer,Breast met to brain,Brain met resection,CCLF_KL1351,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1312,BREAST_LUMINAL,organoid; cancer,Breast met to brain,Brain met resection,CCLF_KL1312,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1220,,organoid; cancer,Breast met to brain,Brain met resection,CCLF_KL1220,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1282,,organoid; cancer,Ovarian met to brain,Brain met resection,CCLF_KL1282,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga


In [61]:
cclf_orga_info

Unnamed: 0,disease_type,cell_type,disease_subtype,lineage,sample_id,sequencer,method,reference,ends,dataset
cRCRF1048,BREAST_BASAL,organoid; cancer,Advanced breast cancer,Pleural effusion,CCLF_cRCRF1048,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
cRCRF1060,BREAST_BASAL,organoid; cancer,Invasive breast carcinoma,Pleural effusion,CCLF_cRCRF1060,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1337,BREAST_BASAL,organoid; cancer,Breast met to brain,Brain met resection,CCLF_KL1337,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1310,BREAST_BASAL/LUMINAL,organoid; cancer,Breast met to brain,,CCLF_KL1310,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1271,BREAST_LUMINAL,organoid; cancer,Breast met to brain,Brain met resection,CCLF_KL1271,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1333,BREAST_LUMINAL,organoid; cancer,Invasive breast carcinoma,Cryopreserved tissue,CCLF_KL1333,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1351,BREAST_LUMINAL,organoid; cancer,Breast met to brain,Brain met resection,CCLF_KL1351,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
cRCRF1092,BREAST_BASAL,organoid; cancer,Advanced breast cancer,Pleural effusion,CCLF_cRCRF1092,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
KL1312,BREAST_LUMINAL,organoid; cancer,Breast met to brain,Brain met resection,CCLF_KL1312,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga
cRCRF1038,FIBROBLAST,organoid; cancer,Advanced breast cancer,Pleural effusion,CCLF_cRCRF1038,illumina Hiseq 2000,bulk,hg38,paired end,cclf_orga


In [16]:
#cclf other
cclfrna = dm.WorkspaceManager("nci-mimoun-bi-org/CCLF_RNA_2_0").get_samples() #40,000x160

cclfrna_anno = cclfrna[["external_id_rna"]].replace({'NA': np.nan})

cclfrna_annot = Sheets.from_files(MY_ID, MYSTORAGE_ID).get("https://docs.google.com/spreadsheets/d/1O9IV_v2vMbebkk_KDWu3LdKBQ16c8lThJKiiWvRxMUo").sheets[2].to_frame()

cclfrna_annot2 = Sheets.from_files(MY_ID, MYSTORAGE_ID).get("https://docs.google.com/spreadsheets/d/1O9IV_v2vMbebkk_KDWu3LdKBQ16c8lThJKiiWvRxMUo").sheets[3].to_frame()
# get it  from https://docs.google.com/spreadsheets/d/1O9IV_v2vMbebkk_KDWu3LdKBQ16c8lThJKiiWvRxMUo and from Terra: in the workspace: nci-mimoun-bi-org/CCLF_RNA_2_0



In [17]:
#files, failed, _, _, lowqual, _ = await expressions.postProcess("nci-mimoun-bi-org/CCLF_RNA_2_0", "all_samples", samplesetToLoad = "all_samples", compute_enrichment=False, trancriptLevelCols = ['rsem_transcripts_expected_count', 'rsem_transcripts_tpm'], geneLevelCols = ["rsem_genes_tpm", "rsem_genes_expected_count"], save_output="data/")
#cclfrna = files['rsem_genes_tpm']
cclfrna = pd.read_csv('data/expression_genes_tpm.csv.gz', index_col=0)
cclfrna = np.log2(1+cclfrna)

In [18]:
ina = (cclfrna_annot2['Passage Number'].isna() | (cclfrna_annot2['Passage Number']=="Unknown")) & ~(cclfrna_annot2["Passage Number on Receipt"].isna() | (cclfrna_annot2["Passage Number on Receipt"]=="Unknown"))
cclfrna_annot2.loc[ina, "Passage Number"] = cclfrna_annot2.loc[ina, "Passage Number on Receipt"].values

ina = (cclfrna_annot2['Gender'].isna() | (cclfrna_annot2['Gender']=="Unknown")) & ~(cclfrna_annot2["Gender.1"].isna() | (cclfrna_annot2["Gender.1"]=="Unknown"))
cclfrna_annot2.loc[ina, "genderA"] = cclfrna_annot2.loc[ina, "Gender.1"].values

ina = (cclfrna_annot2['Gender'].isna() | (cclfrna_annot2['Gender']=="Unknown")) & ~(cclfrna_annot2["FP Gender"].isna() | (cclfrna_annot2["FP Gender"]=="Unknown"))
cclfrna_annot2.loc[ina, "Gender"] = cclfrna_annot2.loc[ina, "FP Gender"].values

ina = (cclfrna_annot2['Race'].isna() | (cclfrna_annot2['Race']=="Unknown")) & ~(cclfrna_annot2["Ethnicity"].isna() | (cclfrna_annot2["Ethnicity"]=="Unknown"))
cclfrna_annot2.loc[ina, "Race"] = cclfrna_annot2.loc[ina, "Ethnicity"].values


In [123]:
cclfrna_annot2.iloc[0]

Sample ID                                                               SM-5KIB3
Collaborator Participant ID                                               3T3/J2
Collaborator Sample ID                                                 3T3/J2_CL
Sample LSID                             broadinstitute.org:bsp.prod.sample:5KIB3
Sample Status                                                          INSTORAGE
                                                     ...                        
Volume Changed                 Volume changed from 100.0ul to 88.0ul, Volume ...
XL20 Used for Total Weight                                                   NaN
XL20 Used to Tare                                                            NaN
rRNA Height Ratio 28S/18S                                                    NaN
genderA                                                                      NaN
Name: 0, Length: 126, dtype: object

In [124]:
cclfrna_annot.iloc[0]

Data File Type? (CCLF QC, CCLF Full Model Characterization, Other R&D, Other Exclude)                      CCLF Full Model Characterization
Collaborator Sample ID                                                                                                            AB002T_BT
Related project catagory                                                                                                           CCLF-mix
Sequencing purpose? (Cell model genomic profiling/Driver mutation discovery/Mouse material)(MT confirm)        Cell model genomic profiling
Sequencing on Tissue or Cell model? (MT confirm)                                                                                 Cell model
                                                                                                                         ...               
Mean Coverage (Raw)                                                                                                                     0.0
Median Coverage (Raw

In [19]:
cclfrna_annot2 = cclfrna_annot2.set_index('Collaborator Sample ID')[["Age",
"Gender",
"Tumor Type",
"Tissue Site",
"Primary Disease",
"Race",
"Culture Medium",
"Passage Number",]]

In [81]:
len(set(CCLE_expression.columns) - set(cclf_orga_rnaseq.columns)), len(set(CCLE_expression.columns) & set(cclf_orga_rnaseq.columns)), len(set(cclf_orga_rnaseq.columns) - set(CCLE_expression.columns)),

(12002, 40052, 560)

In [20]:
cclfrna_annot = cclfrna_annot[[
 'Sequencing on Tissue or Cell model? (MT confirm)',
 'External ID for BAM',
 'Product',
 'RIN',
 'Collaborator Sample ID',
 'Original Material Type',
 'Collaborator Participant ID',
 'Actual Seq Technology',
 'Contamination %',
]].set_index('Collaborator Sample ID', drop=True)

In [21]:
for val in h.dups(cclfrna_annot2.index):
    for i in range(len(cclfrna_annot2.loc[val])-1):
        if cclfrna_annot2.loc[val].iloc[0].isna().sum() > cclfrna_annot2.loc[val].iloc[i+1].isna().sum():
            cclfrna_annot2.iloc[np.argwhere(cclfrna_annot2.index == val).flatten()[0]] = cclfrna_annot2.loc[val].iloc[i+1].values
cclfrna_annot2 = cclfrna_annot2[~cclfrna_annot2.index.duplicated(keep='first')]

In [22]:
for val in h.dups(cclfrna_annot.index):
    for i in range(len(cclfrna_annot.loc[val])-1):
        if cclfrna_annot.loc[val].iloc[0].isna().sum() > cclfrna_annot.loc[val].iloc[i+1].isna().sum():
            cclfrna_annot.iloc[np.argwhere(cclfrna_annot.index == val).flatten()[0]] = cclfrna_annot.loc[val].iloc[i+1].values
cclfrna_annot = cclfrna_annot[~cclfrna_annot.index.duplicated(keep='first')]

In [23]:
cclfrna_annot = pd.concat([cclfrna_annot, cclfrna_annot2], axis=1)

In [24]:
for i, val in cclfrna_annot.iterrows():
    cclfrna_anno.loc[cclfrna_anno.external_id_rna==i, cclfrna_annot.columns] = val.values
del cclfrna_annot

In [25]:
cclfrna_anno['reference']= "hg38"
cclfrna_anno['ends']="paired end"
cclfrna_anno['method'] = "bulk"
cclfrna_anno['dataset'] = "cclf"
cclfrna_anno['sequencer'] = "illlumina Tru-Seq"
cclfrna_anno['align'] = "STAR"
cclfrna_anno['counter'] = "RSEM"

cclfrna_anno = cclfrna_anno.rename(columns=rename)
print(set(cclfrna_anno.columns)-set(rename.values()))
cclfrna_anno = cclfrna_anno[set(rename.values()) & set(cclfrna_anno.columns)]

{'external_id_rna'}


In [None]:
cclfrna_anno.

In [27]:
cclfrna_anno

Unnamed: 0_level_0,history,collection_site,passage_number,method,rin_score,preparation,reference,age,cell_type,ends,sequencer,participant_id,disease_type,sex,tissue_id,dataset,ethnicity,media_type,metastatic/primary,contamination
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
AA02-Tumor-SM-5LB98,"Cells:Cell Line, Viable",Pleural Cavity,6.0,bulk,,Tru-Seq Strand Specific Large Insert RNA Seque...,hg38,,Cell model,paired end,HISEQ_2500,AA02,Pancreatic Adenocarcinoma,Male,AA02T,cclf,,,Metastatic,1.91
AA13-Tumor-SM-GXH5W,Cells:Pellet frozen,Pancreas,,bulk,,Tru-Seq Strand Specific Large Insert RNA Seque...,hg38,,Cell model,paired end,HISEQ_2500,AA13,Pancreatic adenocarcinoma,,AA13T_Repeat,cclf,,,Primary,0.18
AA22-Tumor-SM-ALFLY,Cells:Growing,Pancreas,3.0,bulk,,Tru-Seq Strand Specific Large Insert RNA Seque...,hg38,,Cell model,paired end,HISEQ_2500,AA22,Pancreatic Adenocarcinoma,Female,AA22T,cclf,,AR5: 100.0%,Primary,0.60
AA22-Tumor-SM-GXH2L,Cells:Pellet frozen,Pancreas,,bulk,,Tru-Seq Strand Specific Large Insert RNA Seque...,hg38,,Cell model,paired end,HISEQ_2500,AA22,Pancreatic Adenocarcinoma,Female,AA22T_OPAC,cclf,,3D/Organoid,Primary,0.42
AA26-Tumor-SM-AU5YZ,Cells:Pellet frozen,Pancreas,,bulk,,Tru-Seq Strand Specific Large Insert RNA Seque...,hg38,,Cell model,paired end,HISEQ_2500,AA26,Pancreatic Adenocarcinoma,Male,AA26T,cclf,,CM/Organoid,Primary,0.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RP-571_OSUDES23T_v1_RNA_OnPrem,,,,bulk,,,hg38,,,paired end,,,,,,cclf,,,,
SM045-Tumor-SM-AREHF,Tissue:Fresh Tissue,Ovary,10.0,bulk,,Tru-Seq Strand Specific Large Insert RNA Seque...,hg38,,Cell model,paired end,HISEQ_2500,SM045,Ovarian Cancer,Female,SM045T,cclf,,WIT-P-NC: 100.0%,Primary,0.41
SP022-Tumor-SM-B289M,"Cells:Cell Line, Viable",,15.0,bulk,,Tru-Seq Strand Specific Large Insert RNA Seque...,hg38,,Cell model,paired end,HISEQ_2500,SP022,HNSCC,Male,SP022T,cclf,,Manufactured in BSP: CM1:021017: CM1:021017 95...,Primary,0.40
SP025-Tumor-SM-C1JOO,"Cells:Cell Line, Viable",,15.0,bulk,,Tru-Seq Strand Specific Large Insert RNA Seque...,hg38,,Cell model,paired end,HISEQ_2500,SP025,HNSCC,Female,SP025T_P,cclf,,"Manufactured in BSP: RETM: 100.0 %, Manufactur...",Primary,0.89


In [33]:
loc = cclfrna_anno[cclfrna_anno.isna().sum(1)> 14].index
cclfrna_anno=cclfrna_anno.drop(index=loc)
cclfrna=cclfrna.drop(index=set(loc)&set(cclfrna.index))

In [34]:
set(cclfrna_anno.preparation)

{'Tru-Seq Strand Specific Large Insert RNA Sequencing (50M pairs) v1',
 'Tru-Seq Strand Specific Large Insert RNA Sequencing - High Coverage (50M pairs)'}

In [35]:
cclfrna_anno = cclfrna_anno.drop(columns=['preparation'])

In [66]:
cclfrna_anno.iloc[0]

reference                                                          hg38
participant_id                                                     AA02
collection_site                                          Pleural Cavity
contamination                                                      1.91
tissue_id                                                         AA02T
history                                         Cells:Cell Line, Viable
sequencer                                                    HISEQ_2500
rin_score                                                           NaN
ends                                                         paired end
disease_type                                  Pancreatic Adenocarcinoma
age                                                                 NaN
ethnicity                                                           NaN
sex                                                                Male
dataset                                                         

In [132]:
cclfrna_anno = cclfrna_anno.drop(columns="rin_score") # because it is empty

Series([], Name: rin_score, dtype: float64)

In [None]:
cclfrna_anno.replace({'age':{'1': 'pediatric',
'49': 'adult',
'52': 'adult',
'53': 'adult',
'61': 'adult',
'66': 'adult',
'67': 'adult',
'77': 'adult',}})

In [188]:
from genepy.google.google_sheet import dfToSheet
from depmapomics.config import SHEETCREDS
dfToSheet(cclfrna_anno, "CCLF sheet", secret=SHEETCREDS)

In [275]:
cclfrna_anno.sample_id

AttributeError: 'DataFrame' object has no attribute 'sample_id'

In [185]:
cclfrna_anno

Unnamed: 0_level_0,history,collection_site,passage_number,method,rin_score,reference,age,cell_type,ends,sequencer,...,disease_type,sex,tissue_id,dataset,ethnicity,media_type,metastatic/primary,contamination,align,counter
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AA02-Tumor-SM-5LB98,"Cells:Cell Line, Viable",Pleural Cavity,6.0,bulk,,hg38,,Cell model,paired end,illlumina Tru-Seq,...,Pancreatic Adenocarcinoma,Male,AA02T,cclf,,,Metastatic,1.91,STAR,RSEM
AA13-Tumor-SM-GXH5W,Cells:Pellet frozen,Pancreas,,bulk,,hg38,,Cell model,paired end,illlumina Tru-Seq,...,Pancreatic adenocarcinoma,,AA13T_Repeat,cclf,,,Primary,0.18,STAR,RSEM
AA22-Tumor-SM-ALFLY,Cells:Growing,Pancreas,3.0,bulk,,hg38,,Cell model,paired end,illlumina Tru-Seq,...,Pancreatic Adenocarcinoma,Female,AA22T,cclf,,AR5: 100.0%,Primary,0.60,STAR,RSEM
AA22-Tumor-SM-GXH2L,Cells:Pellet frozen,Pancreas,,bulk,,hg38,,Cell model,paired end,illlumina Tru-Seq,...,Pancreatic Adenocarcinoma,Female,AA22T_OPAC,cclf,,3D/Organoid,Primary,0.42,STAR,RSEM
AA26-Tumor-SM-AU5YZ,Cells:Pellet frozen,Pancreas,,bulk,,hg38,,Cell model,paired end,illlumina Tru-Seq,...,Pancreatic Adenocarcinoma,Male,AA26T,cclf,,CM/Organoid,Primary,0.53,STAR,RSEM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RP-571_DW036T_v2_RNA_OnPrem,"Cells:Cell Line, Viable",Blood,8.0,bulk,,hg38,,Cell model,paired end,illlumina Tru-Seq,...,T-PLL,Female,DW036T,cclf,,"XVIVO: 100.0%, Manufactured in BSP: XVIVO: 100...",Primary,0.72,STAR,RSEM
SM045-Tumor-SM-AREHF,Tissue:Fresh Tissue,Ovary,10.0,bulk,,hg38,,Cell model,paired end,illlumina Tru-Seq,...,Ovarian Cancer,Female,SM045T,cclf,,WIT-P-NC: 100.0%,Primary,0.41,STAR,RSEM
SP022-Tumor-SM-B289M,"Cells:Cell Line, Viable",,15.0,bulk,,hg38,,Cell model,paired end,illlumina Tru-Seq,...,HNSCC,Male,SP022T,cclf,,Manufactured in BSP: CM1:021017: CM1:021017 95...,Primary,0.40,STAR,RSEM
SP025-Tumor-SM-C1JOO,"Cells:Cell Line, Viable",,15.0,bulk,,hg38,,Cell model,paired end,illlumina Tru-Seq,...,HNSCC,Female,SP025T_P,cclf,,"Manufactured in BSP: RETM: 100.0 %, Manufactur...",Primary,0.89,STAR,RSEM


In [138]:
set(CCLE_expression.index) & set(cclfrna.index)

set()

In [40]:
cclfrna.columns = [i.split('(')[-1][:-1] if '(' in i else i for i in cclfrna.columns]

In [139]:
len(set(CCLE_expression.columns) - set(cclfrna.columns)), len(set(CCLE_expression.columns) & set(cclfrna.columns)), len(set(cclfrna.columns) - set(CCLE_expression.columns)),

(6843, 45211, 10)

## MET500 and PDXs

In [429]:
met500_meta_t = tc.get(name='met500-fc3c', file='met500_meta')

No dataset version provided. Using version 1.


In [430]:
met500_meta_t.iloc[0]

Sample_id        ES_5001-capt-SI_5013-C0LAMACXX
sample_type                               tumor
sample_source                           ES_5001
dataset                                    mctp
tissue                                   breast
cohort                                     BRCA
run.id           ES_5001-capt-SI_5013-C0LAMACXX
idx                                         461
test                                      False
tc                                         0.93
biopsy_tissue                             brain
Name: 0, dtype: object

In [None]:
# met500 
met500_meta = tc.get(name='met500-fc3c', file='met500_meta')
met500_TPM = tc.get(name='met500-fc3c', file='met500_TPM') #20,979x868 matrix

#Novartis_PDX
Novartis_PDX_ann = tc.get(name='pdx-data-3d29', file='Novartis_PDX_ann')
Novartis_PDX_TPM = tc.get(name='pdx-data-3d29', file='Novartis_PDX_TPM').T # 38,087x445

#pediatric_PDX
pediatric_PDX_ann = tc.get(name='pdx-data-3d29', file='pediatric_PDX_ann')
pediatric_PDX_TPM = tc.get(name='pdx-data-3d29', file='pediatric_PDX_TPM') #80,000x250

In [177]:
met500_meta.iloc[0]

Sample_id        ES_5001-capt-SI_5013-C0LAMACXX
sample_type                               tumor
sample_source                           ES_5001
dataset                                    mctp
tissue                                   breast
cohort                                     BRCA
run.id           ES_5001-capt-SI_5013-C0LAMACXX
idx                                         461
test                                      False
tc                                         0.93
biopsy_tissue                             brain
Name: 0, dtype: object

In [42]:
tcga_dict = {
"LAML":	"Acute Myeloid Leukemia",
"ACC":	"Adrenocortical carcinoma",
"BLCA":	"Bladder Urothelial Carcinoma",
"BOCA":	"Bone Cancer",
"LGG":	"Brain Lower Grade Glioma",
"BRCA":	"Breast invasive carcinoma",
"CESC":	"Cervical squamous cell carcinoma and endocervical adenocarcinoma",
"CHOL":	"Cholangiocarcinoma",
"CLLE":	"Chronic Lymphocytic Leukemia",
"CMDI":	"Chronic Myeloid Disorders",
"COAD":	"Colon adenocarcinoma",
"COLO":	"Colorectal Cancer",
"COADREAD":	"Colorectal cancer",
"EOPC":	"Early Onset Prostate Cancer",
"ESAD":	"Esophageal Adenocarcinoma",
"ESCA":	"Esophageal carcinoma",
"CHOL":	"Gallbladder cancer",
"GBM":	"Glioblastoma multiforme",
"HNSC":	"Head and Neck squamous cell carcinoma",
"KDNY":	"Kidney Cancer",
"KICH":	"Kidney Chromophobe",
"KIRC":	"Kidney renal clear cell carcinoma",
"KIRP":	"Kidney renal papillary cell carcinoma",
"LIRI":	"Liver Cancer",
"LICA":	"Liver Cancer",
"LINC":	"Liver Cancer",
"HCC":	"Liver hepatocellular carcinoma",
"LIHC":	"Liver hepatocellular carcinoma",
"LGG":	"Lower Grade GLioma",
"LUNG":	"Lung Cancer",
"LUAD":	"Lung adenocarcinoma",
"LUSC":	"Lung squamous cell carcinoma",
"DLBC":	"Lymphoid Neoplasm Diffuse Large B-cell Lymphoma",
"MCTP":	"MCTP",
"MALY":	"Malignant Lymphoma",
"MESO":	"Mesothelioma",
"NBL":	"Neuroblastoma",
"ORCA":	"Oral Cancer",
"MISC":	"Other Cancer",
"OV":	"Ovarian serous cystadenocarcinoma",
"PACA":	"Pancreatic Cancer",
"PAEN":	"Pancreatic Cancer Endocrine neoplasms",
"PAAD":	"Pancreatic adenocarcinoma",
"PBCA":	"Pediatric Brain Cancer",
"PCPG":	"Pheochromocytoma and Paraganglioma",
"PRAD":	"Prostate adenocarcinoma",
"READ":	"Rectum adenocarcinoma",
"RECA":	"Renal Cancer",
"SARC":	"Sarcoma",
"SECR":	"Secretory Cancer",
"SKCM":	"Skin Cutaneous Melanoma",
"STAD":	"Stomach adenocarcinoma",
"TGCT":	"Testicular Germ Cell Tumor",
"TGCT":	"Testicular Germ Cell Tumors",
"THYM":	"Thymoma",
"THYM":	"Thymoma",
"THCA":	"Thyroid carcinoma",
"UCS":	"Uterine Carcinosarcoma",
"UCEC":	"Uterine Corpus Endometrial Carcinoma",
"UVM":	"Uveal Melanoma",
"ACC":	"adrenocortical carcinoma",
}

In [43]:
met500_meta = met500_meta.replace({"cohort": tcga_dict})

In [44]:
set(met500_meta.columns) - set(rename.values())

{'Sample_id',
 'biopsy_tissue',
 'cohort',
 'idx',
 'run.id',
 'sample_source',
 'tc',
 'test',
 'tissue'}

In [45]:
met500_meta['sequencer'] = "Illumina HiSeq 2000"
met500_meta['method'] = "bulk"
met500_meta['cell_type'] = "tumor"
met500_meta['reference']= "hg38"
met500_meta['ends']="paired end"
met500_meta['metastatic/primary'] = "metastatic"
met500_meta['dataset'] = "met500"
met500_meta['align'] = "STAR"
met500_meta['counter'] = "featureCounts"

In [46]:
met500_meta = met500_meta.rename(columns={**rename, **{'subtype': "disease_type"}})
print(set(met500_meta.columns)-set(rename.values()))
met500_meta = met500_meta[set(rename.values()) & set(met500_meta.columns)].set_index('sample_id', drop=True)

{'run.id', 'test', 'idx', 'counter', 'align'}


In [47]:
met500_meta.contamination = (1-met500_meta.contamination)*100

In [83]:
met500_meta

Unnamed: 0_level_0,lineage,reference,collection_site,cell_type,sample_type,dataset,metastatic/primary,ends,sequencer,participant_id,disease_type,method,contamination
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ES_5001-capt-SI_5013-C0LAMACXX,breast,hg38,brain,tumor,tumor,met500,metastatic,paired end,Illumina HiSeq 2000,ES_5001,Breast invasive carcinoma,bulk,7.0
ES_5004-capt-SI_5834-C19KEACXX,colon,hg38,soft_tissue,tumor,tumor,met500,metastatic,paired end,Illumina HiSeq 2000,ES_5004,Colorectal Cancer,bulk,53.0
ES_5004-poly-SI_5767-C19KEACXX,colon,hg38,soft_tissue,tumor,tumor,met500,metastatic,paired end,Illumina HiSeq 2000,ES_5004,Colorectal Cancer,bulk,53.0
ES_5005-capt-SI_5505-D130HACXX,other,hg38,cervix,tumor,tumor,met500,metastatic,paired end,Illumina HiSeq 2000,ES_5005,Other Cancer,bulk,59.0
ES_5005-poly-SI_5486-D12YGACXX,other,hg38,cervix,tumor,tumor,met500,metastatic,paired end,Illumina HiSeq 2000,ES_5005,Other Cancer,bulk,59.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TP_2123-poly-SI_11689-C7G60ANXX,brain,hg38,brain,tumor,tumor,met500,metastatic,paired end,Illumina HiSeq 2000,TP_2123,Other Cancer,bulk,48.0
TP_2130-capt-SI_11905-C7FMDANXX,,hg38,oral,tumor,tumor,met500,metastatic,paired end,Illumina HiSeq 2000,TP_2130,Head and Neck squamous cell carcinoma,bulk,54.0
TP_2131-capt-SI_11906-C7F4VANXX,bladder,hg38,liver,tumor,tumor,met500,metastatic,paired end,Illumina HiSeq 2000,TP_2131,Bladder Urothelial Carcinoma,bulk,57.0
TP_2141-capt-SI_12056-H53C5ADXX,breast,hg38,liver,tumor,tumor,met500,metastatic,paired end,Illumina HiSeq 2000,TP_2141,Breast invasive carcinoma,bulk,33.0


In [166]:
set(CCLE_expression.index) & set(met500_meta.index)

set()

In [186]:
len(set(CCLE_expression.columns) - set(met500_TPM.columns)), len(set(CCLE_expression.columns) & set(met500_TPM.columns)), len(set(met500_TPM.columns) - set(CCLE_expression.columns)),

(33199, 18855, 2124)

In [187]:
pediatric_PDX_ann.iloc[0]

sampleID                                                                                  ALL-102
lineage                                                                                     blood
subtype                                                                                       ALL
Histology                                                                                     ALL
Histology.Detailed                                                                     Ph-likeALL
Histology-Detailed2                                                                    Ph-likeALL
Molecular-Subtype-Brain                                                                      None
PI                                                                                           Lock
Sex                                                                                          Male
Phase                                                                                   Diagnosis
Age                 

In [188]:
[(i.split('me patient as ')[-1].split(' (')[0],v) if type(i) is str and 'ame patient' in i else '' for v, i in pediatric_PDX_ann[["sampleID","Other_info1"]].values]

[('ALL-105', 'ALL-102'),
 ('ALL-102', 'ALL-105'),
 '',
 '',
 '',
 '',
 ('ALL-102', 'ALL-115'),
 '',
 '',
 '',
 ('ALL-46', 'ALL-121'),
 '',
 ('ALL-58', 'ALL-123'),
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 ('ALL-121', 'ALL-46'),
 '',
 '',
 '',
 '',
 '',
 ('ALL-123', 'ALL-58'),
 '',
 ('ALL-25', 'ALL-61'),
 '',
 '',
 ('ALL-81', 'ALL-80'),
 ('ALL-80', 'ALL-81'),
 '',
 ('ALL-32', 'ALL-90'),
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 ('ALL-61', 'ALL-25'),
 '',
 '',
 '',
 ('ALL-90', 'ALL-32'),
 '',
 '',
 '',
 '',
 '',
 '',
 ('ALL-83', 'ALL-82'),
 ('ALL-82', 'ALL-83'),
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 ('COG-N-453x', 'COG-N-452x'),
 ('COG-N-452x', 'COG-N-453x'),
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 ('COG-N-623x', 'COG-N-603x'),
 ('COG-N-618x', 'COG-N-619x'),
 '',
 ('COG-N-603x', 'COG-N-623x'),
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 ('98

In [87]:
pediatric_PDX_ann.iloc[0]

sampleID                                                                                  ALL-102
lineage                                                                                     blood
subtype                                                                                       ALL
Histology                                                                                     ALL
Histology.Detailed                                                                     Ph-likeALL
Histology-Detailed2                                                                    Ph-likeALL
Molecular-Subtype-Brain                                                                      None
PI                                                                                           Lock
Sex                                                                                          Male
Phase                                                                                   Diagnosis
Age                 

In [48]:
pediatric_PDX_ann = pediatric_PDX_ann.rename(columns={**rename, **{'subtype': "disease_type"}})
print(set(pediatric_PDX_ann.columns)-set(rename.values()))
pediatric_PDX_ann = pediatric_PDX_ann[set(rename.values()) & set(pediatric_PDX_ann.columns)].set_index('sample_id', drop=True)

{'Molecular-Subtype-Brain', 'Prior Therapy', 'PC2', 'Risk.Group', 'PC3', 'Patient.last.alive.year', 'PC1', 'Histology', 'Time.from.Dx.to.sample.for.model.days.', 'Patient.EFS.from.Dx.to.1st.Progression.days.', 'Site.of.Initial.Tumor', 'New Histopathology', 'PI', 'Stage.of.Disease', 'COG studies', 'Other_info1', 'Histology-Detailed2', 'Reported_Ethnicity', 'Patient.OS.to.last.alive.date.days.'}


In [49]:
pediatric_PDX_ann['participant_id'] = pediatric_PDX_ann.index

#created frrom manual inspection
samepatient = [('NCH-CA-2', 'NCH-CA-1'), ('ALL-105', 'ALL-102', "ALL-115"), ('ALL-46', 'ALL-121'), ('ALL-25', 'ALL-61'), ('ALL-81', 'ALL-80'), ('ALL-32', 'ALL-90'), ('ALL-58', 'ALL-123'), ('ALL-82', 'ALL-83'), ("COG-N-623x", "COG-N-603x"), ("COG-N-453x","COG-N-452x"), ("COG-N-618x", "COG-N-619x"), ('22909PNET', '9850PNET'), ('OS-34', 'OS-34-SJ'), ('OS-36', 'OS-36-SJ', 'OS-32'),  ('Rh-30R', 'Rh-30')]

for val in samepatient:
    for i in val[1:]:
        pediatric_PDX_ann.loc[i, 'participant_id']=val[0]

In [51]:
pediatric_PDX_ann['sequencer'] = "Illumina HiSeq 2000"
pediatric_PDX_ann['align'] = "BWA"
pediatric_PDX_ann['method'] = "bulk"
pediatric_PDX_ann['reference']= "hg19"
pediatric_PDX_ann['ends']="paired end"
pediatric_PDX_ann['dataset'] = "pediatric_PDX"
pediatric_PDX_ann['counter'] = "RSEM"

In [104]:
set(CCLE_expression.index) & set(pediatric_PDX_TPM.index)

set()

In [105]:
len(set(CCLE_expression.columns) - set(pediatric_PDX_TPM.columns)), len(set(CCLE_expression.columns) & set(pediatric_PDX_TPM.columns)), len(set(pediatric_PDX_TPM.columns) - set(CCLE_expression.columns)),

(1404, 50650, 9848)

In [350]:
Novartis_PDX_ann.iloc[0]

sampleID           0931HXXTM
lineage             pancreas
subtype     ductal_carcinoma
type                     PDX
Name: 0, dtype: object

In [133]:
Novartis_PDX_TPM

Unnamed: 0,ENSG00000223972,ENSG00000227232,ENSG00000243485,ENSG00000237613,ENSG00000268020,ENSG00000240361,ENSG00000186092,ENSG00000238009,ENSG00000239945,ENSG00000233750,...,ENSG00000198763,ENSG00000198804,ENSG00000198712,ENSG00000198899,ENSG00000198938,ENSG00000198840,ENSG00000212907,ENSG00000198886,ENSG00000198786,ENSG00000198727
0931HXXTM,0.707850,1.654398,0.008603,0.000000,0.000000,0.0,0.000000,0.308393,0.658607,3.893554,...,14.038146,16.646155,15.832115,15.824465,15.962676,14.063832,14.925528,15.572025,14.415241,14.761388
0933HXXTM,0.670265,1.823898,0.200587,0.023538,0.000000,0.0,0.000000,0.231936,0.774517,2.563475,...,14.724974,16.984094,16.727474,16.056407,16.393623,14.953868,14.227997,15.673121,13.746635,15.038422
0991HXXTM,1.370931,0.942223,0.201408,0.000000,0.000000,0.0,0.000000,0.234001,1.818346,2.358349,...,13.735879,16.772542,16.675368,16.420344,16.329661,13.979689,14.767361,15.390771,13.857549,14.874256
1004HXXTM,0.541312,0.790362,0.000873,0.000000,0.000000,0.0,0.000000,0.241857,0.665133,3.041003,...,15.194522,16.454436,16.478030,15.652987,15.454535,14.741263,14.943251,15.482346,13.840229,14.959345
1008HXXTM,1.849403,1.759838,0.000966,0.000000,0.000000,0.0,0.000000,0.583893,0.080184,1.940394,...,14.816793,16.296781,16.462403,15.509920,16.191241,14.327289,15.356956,15.945985,14.666138,15.034136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5974HXXTM,0.622797,2.863350,0.131486,0.000000,0.000000,0.0,0.000000,0.815683,1.705518,3.840716,...,14.781736,16.295266,15.791943,15.441183,15.020428,14.767094,14.909953,15.700394,14.499695,14.045916
5975HXXTM,0.558531,2.922259,0.065139,0.000000,0.062741,0.0,0.000000,1.174320,2.333667,3.645435,...,14.745845,16.608791,16.214100,15.658530,15.807763,14.121360,14.979645,15.963183,13.954107,14.759596
6015HXXTM,1.149507,2.333365,0.260704,0.061446,0.117329,0.0,0.007528,0.224560,1.214549,2.548831,...,14.037049,15.962157,15.163556,14.994434,15.155136,13.666339,14.190415,15.131232,13.256761,13.872659
6030HXXTM,0.373995,2.338552,0.000031,0.000000,0.000000,0.0,0.000000,0.433003,0.561156,3.250460,...,13.881378,15.813251,14.968894,13.985161,15.046663,14.021569,14.081102,14.121672,13.882580,13.515175


In [None]:
Novartis_PDX_ann = Novartis_PDX_ann.rename(columns=rename).set_index('sample_id', drop=True)

In [None]:
Novartis_PDX_ann['sequencer'] = "Illumina HiSeq 2000"
Novartis_PDX_ann['method'] = "bulk"
Novartis_PDX_ann['align'] = "Salmon"
Novartis_PDX_ann['counter'] = "Salmon"
Novartis_PDX_ann['reference']= "U"
Novartis_PDX_ann['ends']="paired end"
Novartis_PDX_ann['dataset'] = "Novartis_PDX"

## GTEX

In [9]:
#! curl https://storage.googleapis.com/gtex_analysis_v9/snrna_seq_data/GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad --output temp/gtex_8_atlas_public.h5ad

## GTEX additional
https://storage.googleapis.com/gtex_external_datasets/eyegex_data/rna_seq_data/EyeGEx_retina_combined_genelevel_expectedcounts_byrid_nooutlier.tpm.matrix.gct
    
https://storage.googleapis.com/gtex_external_datasets/eyegex_data/annotations/EyeGEx_meta_combined_inferior_retina_summary_deidentified_geo_ids.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1810M  100 1810M    0     0   126M      0  0:00:14  0:00:14 --:--:--  126M


In [614]:
gtex_v9 = read_h5ad("temp/gtex_8_atlas_public.h5ad") #209,126 × 17,695

In [623]:
gtex_v9.obs = gtex_v9.obs[["Age_bin","Sex","Sample ID", "Participant ID", "RIN score from PAXgene tissue Aliquot", "Tissue", "tissue"]].rename(columns={**rename, **{"tissue": "tissue_type", "Tissue": "collection_site"}})

n_genes                                                                    2658
fpr                                                                         0.1
tissue                                                           skeletalmuscle
prep                                                                        CST
individual                                                                   01
nGenes                                                                     2902
nUMIs                                                                   11544.0
PercentMito                                                             0.07623
PercentRibo                                                            0.051195
Age_bin                                                                   51-60
Sex                                                                        Male
Sample ID                                              GTEX-1HSMQ-5011-SM-GKSJH
Participant ID                          

In [None]:
gtex_v9.obs['sequencer']="Illumina HiSeq 2000"
gtex_v9.obs['method']="bulk"
gtex_v9.obs['reference']= "hg38"
gtex_v9.obs['ends']="paired end"
gtex_v9.obs['dataset']="gtex"

In [None]:
gtex_add = # 80,000 x 500

## HCMI

In [52]:
# HCMI dataset
# Code to generate this dataset can be found here:
# https://github.com/broadinstitute/hcmi-processing/blob/main/hcmi-rna-analysis-210226.ipynb
hcmi_ltpm = tc.get(name='hcmi-data-ac4b', file='hcmi_ltpm').T # 60486 x 157
hcmi_sample_info = tc.get(name='hcmi-data-ac4b', file='hcmi_sample_info')
#sample_info = tc.get(name='hcmi-data-ac4b', file='sample-info')

No dataset version provided. Using version 7.
No dataset version provided. Using version 7.


In [109]:
hcmi_sample_info.iloc[0]

Case ID                                                                  HCM-CSHL-0092-C25
Primary Site                                                                      Pancreas
Clinical Tumor Diagnosis                                                 Pancreatic cancer
subtype                                                         Adenocarcinoma ductal type
Tissue Status                                                                      Primary
Acquisition Site                                                           Pancreatic head
Gender                                                                                Male
Race                                                                               Unknown
Age At Acquisition (Years)                                                            69.0
Age At Diagnosis (Years)                                                              69.0
Disease Status                                                         Progressive disease

In [53]:
hcmi_sample_info['type'] = [i.split('_')[1] +"; "+str(j) for i,j in hcmi_sample_info[['type', 'Type']].values]

In [54]:
hcmi_sample_info = hcmi_sample_info[['Case ID', 'Clinical Tumor Diagnosis', 'subtype', 'Tissue Status', "Acquisition Site", 'Gender', 'Race', 'Age At Acquisition (Years)', 'Expansion Status', 'sampleID', 'type', 'lineage']].rename(columns=rename)

In [116]:
set(hcmi_sample_info.cell_type)

{'model; 2-D: Adherent',
 'model; 2-D: Conditionally reprogrammed cells',
 'model; 3-D: Organoid',
 'model; 3-D: Other (e.g. neurosphere, air-liquid interface, etc.)',
 'model; Mixed adherent and suspension',
 'model; None',
 'tumor; 2-D: Adherent',
 'tumor; 2-D: Conditionally reprogrammed cells',
 'tumor; 3-D: Organoid',
 'tumor; 3-D: Other (e.g. neurosphere, air-liquid interface, etc.)',
 'tumor; Mixed adherent and suspension',
 'tumor; None'}

In [117]:
hcmi_sample_info

Unnamed: 0,participant_id,disease_type,disease_subtype,metastatic/primary,collection_site,sex,ethnicity,age,expansion,sample_id,cell_type,tissue_type
0,HCM-CSHL-0092-C25,Pancreatic cancer,Adenocarcinoma ductal type,Primary,Pancreatic head,Male,Unknown,69.0,Expanded,HCM-CSHL-0092-C25-01A,tumor; 3-D: Organoid,pancreas
1,HCM-CSHL-0062-C18,Colorectal cancer,Adenocarcinoma,Primary,Rectosigmoid junction,Female,White,70.0,Expanded,HCM-CSHL-0062-C18-01A,tumor; 3-D: Organoid,rectum
2,HCM-CSHL-0384-D37,Other,Tubulovillous adenoma,Primary,Cecum,Male,Black or African American,51.0,Expanded,HCM-CSHL-0384-D37-31B,tumor; 3-D: Organoid,colon
3,HCM-BROD-0027-C34,Lung cancer,Carcinoma (NOS),Metastasis,Brain,Female,White,66.0,Expanded,HCM-BROD-0027-C34-06A,"tumor; 3-D: Other (e.g. neurosphere, air-liqui...",bronchus and lung
4,HCM-BROD-0235-C16,Stomach cancer,Adenocarcinoma (NOS),Metastasis,Ascites,Male,White,51.0,Expanded,HCM-BROD-0235-C16-06A,tumor; 3-D: Organoid,stomach
...,...,...,...,...,...,...,...,...,...,...,...,...
151,HCM-BROD-0036-C41.1,Ewing's sarcoma,--,Metastasis,Pleural cavity,Male,White,26.0,Expanded,HCM-BROD-0036-C41-85A,model; Mixed adherent and suspension,bone
152,HCM-BROD-0025-C16,Stomach cancer,--,Primary,Stomach (NOS),Male,White,74.0,Expanded,HCM-BROD-0025-C16-85A,model; 3-D: Organoid,stomach
153,HCM-BROD-0002-C71.2,Glioblastoma,Gliosarcoma,Primary,Brain,Male,White,66.0,Expanded,HCM-BROD-0002-C71-85A,model; 2-D: Conditionally reprogrammed cells,brain
154,HCM-BROD-0043-C16,Stomach cancer,Adenocarcinoma (NOS),Metastasis,Pleural cavity,Male,White,70.0,Expanded,HCM-BROD-0043-C16-85B,model; 2-D: Adherent,stomach


In [55]:
hcmi_sample_info['sequencer'] ="Illumina HiSeq 4000"
hcmi_sample_info['method'] ="bulk"
hcmi_sample_info['reference']= "hg38"
hcmi_sample_info['align']= ""
hcmi_sample_info['counter']= "HTSEQ"
hcmi_sample_info['ends']="paired end"
hcmi_sample_info['dataset']="hcmi"

In [56]:
hcmi_ltpm = hcmi_ltpm[hcmi_ltpm.columns[~hcmi_ltpm.columns.isna()]]

## encode

In [134]:
# 40,000 x 1100
report = pd.read_csv('data/encode_report.tsv', sep="\t", skiprows=1)
report = report[report.columns[report.isna().sum()!=len(report)]]
report

Unnamed: 0,ID,Accession,Assay name,Assay title,Biosample summary,Biosample term name,Dbxrefs,Description,Lab,Project,...,Biosample treatment,Biosample treatment ontology ID,Biosample treatment amount,Biosample treatment amount units,Biosample treatment duration,Biosample treatment duration units,Biosample modification site target organism,Replicates,Cellular component,Library construction method
0,/experiments/ENCSR620LQN/,ENCSR620LQN,RNA-seq,total RNA-seq,Homo sapiens esophagus muscularis mucosa tissu...,esophagus muscularis mucosa,GEO:GSE88409,The libraries contained in this Experiment com...,"Thomas Gingeras, CSHL",ENCODE,...,,,,,,,,/replicates/499e1412-5ef3-4ac2-98be-fd55bb01fad5/,,
1,/experiments/ENCSR406SAW/,ENCSR406SAW,RNA-seq,total RNA-seq,Homo sapiens upper lobe of left lung tissue fe...,upper lobe of left lung,GEO:GSE88254,The libraries contained in this Experiment com...,"Thomas Gingeras, CSHL",ENCODE,...,,,,,,,,/replicates/782b72f8-3197-4a66-951c-e2de88158b6a/,,
2,/experiments/ENCSR019MXZ/,ENCSR019MXZ,polyA plus RNA-seq,polyA plus RNA-seq,Homo sapiens HepG2 insoluble cytoplasmic fraction,HepG2,GEO:GSE87958,Initial insoluble fractions on HepG2 Long Poly...,"Eric Lécuyer, IRCM",ENCODE,...,,,,,,,,/replicates/443af8c7-5ed1-4930-b4f7-e1ca62c553...,insoluble cytoplasmic fraction,
3,/experiments/ENCSR630VJN/,ENCSR630VJN,RNA-seq,total RNA-seq,Homo sapiens transverse colon tissue male adul...,transverse colon,GEO:GSE88418,The libraries contained in this Experiment com...,"Thomas Gingeras, CSHL",ENCODE,...,,,,,,,,/replicates/aa7ff7b6-5bf5-4f0d-a09c-8612c945df37/,,
4,/experiments/ENCSR035SKV/,ENCSR035SKV,RNA-seq,total RNA-seq,Homo sapiens gastroesophageal sphincter tissue...,gastroesophageal sphincter,GEO:GSE87978,The libraries contained in this Experiment com...,"Thomas Gingeras, CSHL",ENCODE,...,,,,,,,,/replicates/8c825033-47e6-4659-8b02-0b399fab0435/,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1006,/experiments/ENCSR672JUF/,ENCSR672JUF,polyA plus RNA-seq,polyA plus RNA-seq,Homo sapiens foreskin keratinocyte male newborn,foreskin keratinocyte,GEO:GSM958177,,"Joseph Costello, UCSF",Roadmap,...,,,,,,,,/replicates/dd125c50-da07-473b-90cf-6be56b7fe968/,,
1007,/experiments/ENCSR999CPT/,ENCSR999CPT,polyA plus RNA-seq,polyA plus RNA-seq,Homo sapiens placental basal plate tissue fema...,placental basal plate,GEO:GSM1127098,,"Joseph Costello, UCSF",Roadmap,...,,,,,,,,/replicates/d92f45a2-c13f-42d4-a52d-2f974f6e7f41/,,
1008,/experiments/ENCSR634LOX/,ENCSR634LOX,polyA plus RNA-seq,polyA plus RNA-seq,Homo sapiens foreskin melanocyte male newborn,foreskin melanocyte,GEO:GSM958174,,"Joseph Costello, UCSF",Roadmap,...,,,,,,,,/replicates/f0421981-9298-4ade-a256-59aa55666f27/,,
1009,/experiments/ENCSR714QAF/,ENCSR714QAF,polyA plus RNA-seq,polyA plus RNA-seq,Homo sapiens mole tissue female,mole,GEO:GSM1582478,The RNA-Seq libraries contained in this experi...,"Joseph Costello, UCSF",Roadmap,...,,,,,,,,/replicates/919a47f8-b23e-431c-9ea0-9dcb6990df63/,,


In [137]:
region = []
platform = []

for val in list(report['Description']):
    if val is np.nan:
        region.append('')
        platform.append('')
        continue
    if "cytosol" in val or "cytosolic" in val:
        region.append('cytosol')
    elif "nucleolus" in val:
        region.append('nucleolus')
    elif "nucleus" in val:
        region.append('nucleus')
    elif "whole cell" in val:
        region.append('whole')
    elif "chromatin" in val:
        region.append('chromatin')
    elif "cytoplasmic" in val:
        region.append('cytoplasm')
    else:
        region.append(None)
    if "Hi-Seq" in val:
        platform.append('illumina Hi-Seq 2000')
    elif "Illumina GAIIx" in val:
        platform.append('illumina gaiix')
    else:
        platform.append('')

In [138]:
report.loc[report['Cellular component'].isna(), 'Cellular component'] = np.array(region)[report[report['Cellular component'].isna()].index]

In [139]:
report['sequencer'] = platform

In [140]:
report['Files'] = report['Files'].str.split(',').apply(lambda x: [i.split('/')[-2] for i in x])

In [141]:
report = report[report['Cellular component'].isna() | (report['Cellular component']=="nucleus")]
report = report[['Assay title', 'Biosample summary', 'Biosample term name', 'Description', 'Lab', 'Project', 'Files', 'Biosample accession', 'Organism', 'Life stage', 'Biosample age', 'sequencer', 'Biosample treatment ontology ID', 'Biosample treatment amount',
'Biosample treatment amount units', 'Biosample treatment duration',
'Biosample treatment duration units']].rename(columns=rename)

NameError: name 'rename' is not defined

In [145]:
set(report.platform)

{'', 'illumina Hi-Seq 2000', 'illumina gaiix'}

In [None]:
match = {}
for i, val in report['Files'].iteritems():
    for j in val:
        match[j] = i

In [115]:
todl = h.fileToList('data/encode_rna.txt')

In [116]:
todl

['',
 'https://www.encodeproject.org/files/ENCFF490WXZ/@@download/ENCFF490WXZ.tsv',
 'https://www.encodeproject.org/files/ENCFF056CEY/@@download/ENCFF056CEY.tsv',
 'https://www.encodeproject.org/files/ENCFF076NNR/@@download/ENCFF076NNR.tsv',
 'https://www.encodeproject.org/files/ENCFF502KTX/@@download/ENCFF502KTX.tsv',
 'https://www.encodeproject.org/files/ENCFF222PXI/@@download/ENCFF222PXI.tsv',
 'https://www.encodeproject.org/files/ENCFF064QSR/@@download/ENCFF064QSR.tsv',
 'https://www.encodeproject.org/files/ENCFF005FVC/@@download/ENCFF005FVC.tsv',
 'https://www.encodeproject.org/files/ENCFF118WDU/@@download/ENCFF118WDU.tsv',
 'https://www.encodeproject.org/files/ENCFF577KCY/@@download/ENCFF577KCY.tsv',
 'https://www.encodeproject.org/files/ENCFF285NIX/@@download/ENCFF285NIX.tsv',
 'https://www.encodeproject.org/files/ENCFF461MFQ/@@download/ENCFF461MFQ.tsv',
 'https://www.encodeproject.org/files/ENCFF316BTV/@@download/ENCFF316BTV.tsv',
 'https://www.encodeproject.org/files/ENCFF682J

In [None]:
encode_ann = pd.DataFrame(columns=report.columns)
for val in todl:
    val = val.split('/')[-1].split('.')[0]
    if val not in match:
        print(val)
    else:
        encode_ann.loc[val] = report.loc[match[val]].values

In [None]:
encode_ann['sequencer'] = ""# MISSING
encode_ann['reference']= "hg38"
encode_ann['ends']="paired-end"
encode_ann['align'] = "STAR"
encode_ann['dataset']="encode"

In [17]:
res = []
for val in encode_ann.index:
    vl = pd.read_csv('https://www.encodeproject.org/files/'+val+'/@@download/'+val+'.tsv', sep='\t')
    try:
        vl = vl.set_index('gene_id', drop=True)['TPM'].rename(val)
    except KeyError:
        vl = vl.set_index('gene_ID', drop=True)[vl.columns[-1]].rename(val)
    res.append(vl)
encode = pd.concat(res, axis=1)

ValueError: cannot reindex from a duplicate axis

## ICGC

In [70]:
icgclist = ["https://dcc.icgc.org/api/v1/download?fn=/current/Projects/BOCA-FR/exp_seq.BOCA-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/BPLL-FR/exp_seq.BPLL-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/BRCA-KR/exp_seq.BRCA-KR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/LICA-FR/exp_seq.LICA-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/LIRI-JP/exp_seq.LIRI-JP.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/ORCA-IN/exp_seq.ORCA-IN.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/OV-AU/exp_seq.OV-AU.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/PACA-AU/exp_seq.PACA-AU.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/PACA-CA/exp_seq.PACA-CA.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/PRAD-CA/exp_seq.PRAD-CA.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/PRAD-FR/exp_seq.PRAD-FR.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/CLLE-ES/exp_seq.CLLE-ES.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/MALY-DE/exp_seq.MALY-DE.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/PAEN-AU/exp_seq.PAEN-AU.tsv.gz",
"https://dcc.icgc.org/api/v1/download?fn=/current/Projects/RECA-EU/exp_seq.RECA-EU.tsv.gz"]

In [69]:
res = []
for val in icgc:
    val = pd.read_csv(val, sep='\t')
    print(set(val['platform']), set(val['gene_model']), set(val['experimental_protocol']), set(val['assembly_version']), set(val['alignment_algorithm']), set(val['normalization_algorithm']))
    res.append(pd.concat([val.loc[val['icgc_sample_id']==i, ['gene_id', 'normalized_read_count']].set_index('gene_id').rename(columns={'normalized_read_count':i}) for i in list(set(val['icgc_sample_id']))], axis=1))
icgc = pd.concat(res, axis=1)

'download\\?fn\\=/current/Projects/BOCA-FR/exp_seq.BOCA-FR.tsv.gz'

In [49]:
donor = pd.read_csv("data/donor.tsv", sep="\t", index_col=0)
donor.columns

Index(['project_code', 'study_donor_involved_in', 'submitted_donor_id',
       'donor_sex', 'donor_vital_status', 'disease_status_last_followup',
       'donor_relapse_type', 'donor_age_at_diagnosis',
       'donor_age_at_enrollment', 'donor_age_at_last_followup',
       'donor_relapse_interval', 'donor_diagnosis_icd10',
       'donor_tumour_staging_system_at_diagnosis',
       'donor_tumour_stage_at_diagnosis',
       'donor_tumour_stage_at_diagnosis_supplemental', 'donor_survival_time',
       'donor_interval_of_last_followup', 'prior_malignancy',
       'cancer_type_prior_malignancy', 'cancer_history_first_degree_relative'],
      dtype='object')

In [51]:
more = pd.read_csv('data/donors_more_csv', index_col=0)
donor.loc[more.index, 'Primary Site'] = more['Primary Site']

In [52]:
donor = donor[["donor_sex", "donor_relapse_type", "donor_age_at_enrollment", "donor_diagnosis_icd10", "donor_tumour_stage_at_diagnosis", 'Primary Site']]

In [53]:
specimen = pd.read_csv("data/specimen.tsv", sep="\t", index_col=0)
specimen.iloc[0]

project_code                                                                     CLLE-ES
study_specimen_involved_in                                                           NaN
submitted_specimen_id                                                      060-0123-01TD
icgc_donor_id                                                                    DO51966
submitted_donor_id                                                                    60
specimen_type                          Primary tumour - blood derived (peripheral blood)
specimen_type_other                                                                  NaN
specimen_interval                                                                 3586.0
specimen_donor_treatment_type                                               no treatment
specimen_donor_treatment_type_other                                                  NaN
specimen_processing                                                                fresh
specimen_processing_o

In [54]:
specimen = specimen[["icgc_donor_id", "specimen_type", "specimen_storage", "specimen_processing"]]

In [55]:
for i, val in specimen.iterrows():
    specimen.loc[i, donor.columns] = donor.loc[val.icgc_donor_id].values 

In [59]:
sample = pd.read_csv("data/sample.tsv", sep="\t", index_col=0)
sample

Unnamed: 0_level_0,project_code,submitted_sample_id,icgc_specimen_id,submitted_specimen_id,icgc_donor_id,submitted_donor_id,analyzed_sample_interval,percentage_cellularity,level_of_cellularity,study
icgc_sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SA564770,CLLE-ES,060-0123-01TD,SP130360,060-0123-01TD,DO51966,60,315.0,97.0,,
SA564151,CLLE-ES,060-01-2ND,SP130410,060-01-2ND,DO51966,60,,98.0,,
SA602505,CLLE-ES,060-0123-02ND,SP199735,060-0123-02ND,DO51966,60,,,,
SA564150,CLLE-ES,060-02-01ND,SP130409,060-02-01ND,DO51966,60,,99.0,,
SA538928,CLLE-ES,060-0123-03TR,SP114999,060-0123-03TR,DO51966,60,315.0,,,
...,...,...,...,...,...,...,...,...,...,...
SA607197,BPLL-FR,14_3,SP202799,B-PLL_32_tumor,DO233963,B-PLL_32,,,,
SA607150,BPLL-FR,14_2,SP202986,B-PLL_32_control,DO233963,B-PLL_32,,,,
SA538993,CLLE-ES,356-01-8ND,SP115086,356-01-8ND,DO7084,356,,100.0,,PCAWG
SA86542,CLLE-ES,356-01-4TR,SP15992,356-01-4TR,DO7084,356,7.0,95.0,,PCAWG


In [60]:
icgcann = sample[['project_code', 'submitted_sample_id', 'icgc_specimen_id', 'percentage_cellularity']]

In [62]:
for i, val in icgcann.iterrows():
    icgcann.loc[i, specimen.columns] = specimen.loc[val.icgc_specimen_id]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, v, pi)


In [None]:
icgcann['sequencer'] = "" # missing
icgcann['method'] = "bulk"
icgcann['reference']= "hg38"
icgcann['ends']="paired end"

## st jude

In [None]:
pd.read_csv('') # 40,000 x 3500


## DUOS datasets

In [None]:
#https://duos.broadinstitute.org/dataset_catalog

## hartwig

## Pancreas from Sri

## Neurosphere from Keith

## affymatrix

### NCI 60

### Sanger Cell lines

In [10]:
counts = pd.read_csv('gs://ccle_default_params/celligner_ex/CosmicCLP_RawGeneExpression.tsv.gz', sep='\t')

In [56]:
conv = CCLE_annotation.drop_duplicates('arxspan_id').set_index('arxspan_id')

In [33]:
CCLE_expression.columns = [i.split(' (')[0] for i in CCLE_expression.columns]

In [41]:
#sanger = pd.DataFrame()
for val in list(set(counts.SAMPLE_NAME))[679:]:
    c = counts[counts.SAMPLE_NAME==val].set_index("GENE_NAME")['GENE_EXPRESSION']
    sanger[val] = c

  sanger[val] = c


In [65]:
sanger.columns[100:140]

Index(['MFM-223', 'D-423MG', 'NCI-H1688', 'COR-L321', 'AsPC-1', 'NCI-H1734',
       'HOP-92', 'NCI-N87', 'OCI-AML2', 'OSC-20', 'RT-112', 'DJM-1',
       'BFTC-909', 'HOP-62', 'SNG-M', 'BFTC-905', 'COLO-829', 'HT55', 'NB17',
       'HT-144', 'HCC1395', 'C-4-I', 'LN-229', 'MDA-MB-330', 'A3-KAW',
       'TCCSUP', 'LB2518-MEL', 'MSTO-211H', 'COLO-792', 'KLE', 'JEKO-1',
       'ROS-50', 'CFPAC-1', 'HCC2998', 'M14', 'JVM-3', 'OE33', 'ST486',
       'Hs746T', 'SUP-HD1'],
      dtype='object')

In [47]:
conv = conv[~conv.isna()]

In [72]:
new = []
for val in sanger.columns:
    v = conv[conv.stripped_cell_line_name==val.replace('-','').replace('/','').replace('(','').replace(')','').upper()]
    if len(v)==0:
        print(val)
        new.append(val)
        continue
    new.append(v.index[0])
sanger.columns = new

NCI-SNU-5
NCI-H510A
U251
SCC90
COLO-320-HSR
NCI-H322M
BE2-M17
WM793B
HN
EoL-1-cell
786-0
LU-99A
NCI-SNU-16
U-266
H3255
NCI-SNU-1
U031
SJRH30
SR
OVCAR-3


In [80]:
sanger=sanger.T

In [97]:

cl = list((set(CCLE_expression.index) & set(sanger.index))-set(h.dups(sanger.index)))
sanger = sanger.loc[cl]
CCLE_expression = CCLE_expression.loc[cl]

In [98]:
genes = list((set(CCLE_expression.columns)&set(sanger.columns))-set(h.dups(CCLE_expression.columns)))
sanger = sanger[genes]
CCLE_expression = CCLE_expression[genes]

In [99]:
CCLE_expression

Unnamed: 0,MAP3K11,RTN4R,NOL11,CAPN2,TBC1D22A,IL27,DHRSX,RITA1,ZSCAN2,LSG1,...,TPM1,ZNF484,PABPC1L2A,SLC6A16,HUNK,MAPKBP1,OPALIN,TEX264,KDM1A,LDHB
ACH-001336,3.734439,3.111031,4.669594,6.900867,2.541019,0.000000,1.678072,2.771886,1.722466,3.508429,...,8.135709,1.269033,0.0,0.263034,0.739848,1.718088,0.000000,3.195348,4.989593,6.432792
ACH-001399,4.050502,2.887525,5.792074,7.203495,3.207893,0.000000,3.257011,3.852998,2.946731,5.304876,...,5.652200,2.066950,0.0,0.056584,3.867896,2.121015,0.000000,5.416502,5.459432,9.820323
ACH-000633,5.620880,2.970854,5.434962,1.250962,3.968091,0.505891,5.267910,4.466627,4.740388,5.678917,...,8.548013,1.327687,0.0,1.695994,2.422233,1.918386,0.000000,5.144454,5.459104,5.956057
ACH-000935,6.683837,1.042644,4.379898,8.649975,4.410070,0.000000,3.716991,4.653633,4.016140,4.795455,...,8.169574,0.678072,0.0,0.124328,0.443607,3.911692,0.000000,5.834408,5.147714,9.212326
ACH-000443,6.134016,2.746313,5.769507,6.821455,4.241840,0.000000,3.365972,4.602291,2.742006,5.076388,...,7.672850,2.263034,0.0,1.189034,1.970854,3.560715,0.000000,4.756490,5.355439,8.767688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACH-000641,6.184875,0.495695,5.736875,4.222650,3.631104,0.056584,3.886550,3.881665,3.647315,4.838448,...,6.130931,0.847997,0.0,0.516015,0.042644,2.622930,0.000000,5.316146,6.702242,10.115824
ACH-001016,4.963012,0.454176,6.158660,7.723900,2.981853,0.000000,3.787641,3.384050,3.867896,6.093391,...,8.686255,2.408712,0.0,0.678072,1.454176,2.698218,0.000000,5.553361,5.124328,8.716236
ACH-000562,5.746850,2.713696,5.576522,8.679762,3.740928,0.000000,3.894333,4.216455,3.077243,5.002703,...,6.175525,0.594549,0.0,0.189034,0.411426,3.575312,0.000000,5.679480,5.560715,9.298933
ACH-002059,3.734439,1.035624,6.091277,0.250962,2.887525,0.000000,3.633431,2.935460,4.563768,4.862451,...,1.636915,1.232661,0.0,0.014355,3.390943,2.883621,0.070389,5.104337,7.202712,10.737670


In [113]:
val = pd.concat([sanger, CCLE_expression]).T.corr(method='pearson')

In [114]:
val.iloc[683:,:683]

Unnamed: 0,ACH-001336,ACH-001399,ACH-000633,ACH-000935,ACH-000443,ACH-000183,ACH-000425,ACH-001345,ACH-000981,ACH-000004,...,ACH-000244,ACH-000059,ACH-000955,ACH-000810,ACH-000359,ACH-000641,ACH-001016,ACH-000562,ACH-002059,ACH-000611
ACH-001336,0.396181,0.099063,-0.049871,0.113723,-0.081289,0.000394,0.058830,0.116016,-0.144106,-0.223843,...,0.101298,-0.240279,0.034863,-0.010670,-0.083434,-0.012430,0.057825,0.025350,-0.348001,-0.215877
ACH-001399,0.224205,0.263846,-0.023719,0.136632,-0.085474,0.028415,0.054059,0.174483,-0.129783,-0.217597,...,0.081020,-0.241016,0.072724,-0.007113,-0.105373,-0.000235,0.054577,0.024113,-0.330455,-0.203462
ACH-000633,0.152553,0.062051,0.311684,0.086795,-0.107290,0.060567,0.067279,0.171471,-0.093131,-0.176589,...,0.069320,-0.213271,0.019378,0.010632,-0.068447,0.041131,0.056543,-0.019986,-0.305278,-0.184625
ACH-000935,0.222844,0.068089,0.002406,0.345396,-0.112003,0.036665,0.153364,0.070205,-0.119688,-0.213107,...,0.202151,-0.226769,-0.024150,0.038048,-0.024192,-0.016222,0.150562,-0.023979,-0.343879,-0.218307
ACH-000443,0.183042,0.084525,-0.008391,0.103790,0.230282,-0.009603,0.062319,0.156315,-0.172612,-0.216526,...,0.107221,-0.242491,0.028597,-0.022511,-0.055561,-0.026907,0.121117,0.055509,-0.333948,-0.227179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACH-000641,0.120072,0.001930,0.008013,0.070680,-0.156757,0.126368,0.034849,0.115847,0.029541,0.065515,...,0.042297,-0.099898,0.006418,-0.000773,-0.093190,0.369898,0.021961,-0.023932,-0.219472,-0.072080
ACH-001016,0.170826,0.046073,-0.073266,0.206746,-0.109747,-0.006886,0.116968,0.063946,-0.127686,-0.237226,...,0.229108,-0.235185,-0.038654,0.038726,-0.007618,0.002011,0.271896,-0.045174,-0.313442,-0.221034
ACH-000562,0.220138,0.087236,-0.013010,0.097063,-0.047374,0.041002,0.046675,0.161487,-0.126984,-0.198350,...,0.069801,-0.230876,0.070163,-0.027013,-0.076459,0.010534,0.062496,0.185152,-0.341838,-0.195360
ACH-002059,0.120156,0.015056,-0.059444,0.069714,-0.177642,0.134321,0.016100,0.108537,0.074584,-0.101555,...,0.043121,-0.041041,0.006230,-0.007258,-0.108672,0.152833,0.006327,-0.041406,-0.088965,-0.046274


## other

### tumor cell atlas

### other random datasets from SRA

### EBI

In [None]:
"https://docs.google.com/spreadsheets/d/1cLHxd9hVkDf1wtPkVAZpZ0rLgZvsOHkQsv9_txVqFzI"

## single cell

### tumor inf elife

In [13]:
elife_tumorinf = tc.get(name='tumor-infiltration-3307', version=1, file='elife_tumorinf')

In [14]:
elife_tumorinf

Unnamed: 0,Bcells,CAFs,CD4_Tcells,CD8_Tcells,Endothelial,Macrophages,NKcells
A1BG,7.75210,4.47890,4.6702,5.02000,0.0000,3.46370,1.74260
A1BG-AS1,0.27272,28.98800,6.7471,11.76400,0.0000,2.36580,8.92140
A1CF,0.94832,0.03625,0.4978,0.56538,0.2582,0.13695,0.25936
A2M,0.14435,184.45000,2.9531,15.38600,1575.2000,418.67000,8.94110
A2M-AS1,0.00000,0.34500,7.3420,11.59100,8.6042,0.35669,0.21549
...,...,...,...,...,...,...,...
ZYG11A,13.37900,3.64120,18.2580,18.17900,9.0574,7.47320,16.04700
ZYG11B,18.64800,10.59500,15.2930,13.80700,5.1119,5.00460,8.10280
ZYX,31.07300,205.32000,79.5490,64.75700,70.3870,206.18000,102.91000
ZZEF1,5.69700,2.98250,9.9508,14.66500,7.2201,5.78570,16.66700


In [None]:
elife_tumorinf = elife_tumorinf.rename(columns={"Bcells": "B-cell", "CAFs": "CAF", "CD4_Tcells": "CD4_T-cells", "CD8_Tcells": "CD8_T-cells","macrophage": "macrophage", "Endothelial": "endothelial", "NKcells": "NK-cell"})

In [13]:
elife_tumorinf_ann = pd.DataFrame()
elife_tumorinf_ann["cell_type"] = "normal"
elife_tumorinf_ann["tissue_type"] = elife_tumorinf.columns
elife_tumorinf_ann["sample_ID"] = elife_tumorinf.columns
elife_tumorinf_ann['sequencer'] = ""
elife_tumorinf_ann['align'] = "bowtie"
elife_tumorinf_ann['method'] = "singlecell"
elife_tumorinf_ann['reference']= "hg19"
LAU125	59	male	iliac lymph node
LAU355	70	female	iliac-obturator lymph node
LAU1255	87	male	axillary lymph node
LAU1314	81	male	iliac-obturator lymph node
elife_tumorinf_ann['age'] = ""
elife_tumorinf_ann['sex'] = ""
elife_tumorinf_ann['dataset'] = "elife_tumorinf"

### tirosh's melanoma

In [4]:
melanoma = tc.get(name='tirosh-melanoma-scrnaseq-60f0', file='melanoma')

No dataset version provided. Using version 1.


In [19]:
melanoma.columns = [i.replace('-', '_').replace('Cy', "CY").replace('cy', "CY").replace('CY88C', 'CY88_C').replace('CY89A', "CY89_A").replace('CY89C', 'CY89_C').replace('CY89F', 'CY89_F').replace('CY89N', 'CY89_N').replace('CY94C', 'CY94_C') for i in melanoma.columns]

In [20]:
melanoma_ann = pd.DataFrame()

typ={1:"normal", 2:"tumor",0: np.nan}
orig={1:"melanoma", 2:"B-cell", 3: "macrophage", 4: "endothelial", 5: "CAF", 6:"NK-cell", 0: np.nan}

melanoma_ann['age'] = [int(i) for i in melanoma.loc['tumor']]
melanoma_ann["cell_type"] = [typ[int(i)] for i in melanoma.loc['malignant(1=no,2=yes,0=unresolved)']]
melanoma_ann['tissue_type'] = [orig[int(i)] for i in melanoma.loc['non-malignant cell type (1=T,2=B,3=Macro.4=Endo.,5=CAF;6=NK)']]
melanoma_ann['name'] = [i.split('_')[0] for i in melanoma.columns]
melanoma_ann['sample_id'] = melanoma.columns
melanoma_ann['other'] = [i.split('_')[-2] for i in melanoma.columns]
melanoma_ann['sequencer'] = ""
melanoma_ann['method'] = "singlecell"

melanoma_ann['reference']= "hg38"
melanoma_ann['ends']="paired end"
melanoma_ann['dataset'] = "melanoma"

### THEIS LAB scRNAseq datasets

In [None]:
https://theislab.github.io/sfaira-portal/Datasets #50,000x13,000


## L1000 dataset

In [4]:
import subprocess
from anndata import AnnData 

In [25]:
# you will need R > 4.0 
# https://www.charlesbordet.com/en/how-to-upgrade-to-R-4-0-0-on-debian/#the-naive-solution
! R -e "if(!requireNamespace('BiocManager', quietly = TRUE)){install.packages('BiocManager', repos='http://cran.us.r-project.org')};BiocManager::install('cmapR');"


R version 4.1.2 (2021-11-01) -- "Bird Hippie"
Copyright (C) 2021 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> if(!requireNamespace("BiocManager", quietly = TRUE)){install.packages("BiocManager", repos="http://cran.us.r-project.org")};BiocManager::install(c("limma"));
Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
  'lib = "/usr/local/lib/R/site-library"' is not writable
Error in inst

In [31]:
folder = "gs://ccle_default_params/celligner_ex/"
res = []
for val in ["level5_beta_ctl_n58022x12328.gctx",
            "level5_beta_trt_cp_n720216x12328.gctx",
            "level5_beta_trt_misc_n8283x12328.gctx"
            "level5_beta_trt_oe_n34171x12328.gctx",
            "level5_beta_trt_sh_n238351x12328.gctx",
            "level5_beta_trt_xpr_n142901x12328.gctx",]:
    cmd = "gsutil cp " + folder + val + " temp/"
    ! $cmd
    res.append(h.loadGCTXasAnnData('temp/'+val))

Copying gs://ccle_default_params/celligner_ex/level5_beta_trt_misc_n8283x12328.gctx...
==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").


Operation completed over 1 objects/389.6 MiB.                                    


NameError: name 'pg' is not defined

In [None]:
ann['sequencer'] = "L1000"
ann['method'] = "L1000"
ann['reference']= "L1000"
ann['ends']="L1000"
ann['dataset']="L1000"

# QC

## compute correlation

- find unknown duplicates
- find problematic duplicates

In [509]:
import seaborn as sns
from sklearn.decomposition import PCA, IncrementalPCA
import umap.umap_ as umap
from genepy.utils import plot
from bokeh.plotting import output_notebook
output_notebook()

AttributeError: module 'numba.core.types' has no attribute 'misc'

In [None]:
overlap = set(CCLE_expression.columns) & set(TCGA_expression.columns) & set(cclfrna.columns) & set(met500_TPM.columns) & set(pediatric_PDX_TPM.columns) & set(hcmi_ltpm.columns)
print(len(overlap))
total = pd.concat([CCLE_expression, TCGA_expression, cclfrna, met500_TPM, pediatric_PDX_TPM, hcmi_ltpm], axis=0)

In [119]:
len(set(CCLE_expression.columns) | set(TCGA_expression.columns) | set(cclfrna.columns) | set(met500_TPM.columns) | set(pediatric_PDX_TPM.columns) | set(hcmi_ltpm.columns))

64043

In [58]:
total

Unnamed: 0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,...,ENSGR0000264819,ENSGR0000265658,ENSGR0000270726,ENSGR0000275287,ENSGR0000276543,ENSGR0000277120,ENSGR0000280767,ENSGR0000281849,ENSG00000275347,ENSG00000279195
ACH-001113,4.990501,0.000000,7.273702,2.765535,4.480265,0.028569,1.269033,3.058316,6.483171,5.053980,...,,,,,,,,,,
ACH-001289,5.209843,0.545968,7.070604,2.538538,3.510962,0.000000,0.176323,3.836934,4.200850,3.832890,...,,,,,,,,,,
ACH-001339,3.779260,0.000000,7.346425,2.339137,4.254745,0.056584,1.339137,6.724241,3.671293,3.775051,...,,,,,,,,,,
ACH-001538,5.726831,0.000000,7.086189,2.543496,3.102658,0.000000,5.914565,6.099716,4.475733,4.294253,...,,,,,,,,,,
ACH-000242,7.465648,0.000000,6.435462,2.414136,3.864929,0.831877,7.198003,5.452530,7.112492,4.710944,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HCM-BROD-0036-C41-85A,3.357805,0.580937,4.832175,3.557517,3.769348,0.237172,0.091225,0.465813,3.004066,5.164597,...,,,,,,,,,0.000000,0.0
HCM-BROD-0025-C16-85A,3.147936,0.306879,5.881256,3.050155,2.403453,0.000000,0.623771,4.471019,3.966237,3.711254,...,,,,,,,,,0.000000,0.0
HCM-BROD-0002-C71-85A,3.170656,0.000000,4.977356,1.924132,1.864139,0.993773,5.385322,5.471081,3.124767,4.076565,...,,,,,,,,,0.000000,0.0
HCM-BROD-0043-C16-85B,3.708295,0.125722,6.089992,2.473367,2.338759,0.067989,4.073735,6.979186,5.077681,4.491827,...,,,,,,,,,0.000000,0.0


### all log(tpm+1)?

In [62]:
total[total.min(1, skipna=True)<0].min(1, skipna=True)

Series([], dtype: float64)

In [63]:
total[total.max(1, skipna=True)>50].max(1, skipna=True)

Series([], dtype: float64)

In [64]:
total.max(1, skipna=True).max()

18.637356274800002

In [66]:
2**18.637356274800002

407758.74000515806

columns/samples with mostly nans or 0s? 

In [92]:
total = total[total.columns[total.isna().sum(0)< total.shape[0]*0.9]]

In [93]:
total = total[total.columns[((total==0)|total.isna()).sum(0)< total.shape[0]*0.99]]

In [98]:
total.shape[1]*0.6

27534.0

In [525]:
ann.shape

(15064, 27)

In [523]:
ann.loc['ACH-001032']

Unnamed: 0_level_0,collection_site,disease_type,doublingt,method,reference,age,cell_type,ends,sequencer,participant_id,...,tissue_id,sample_type,align,counter,passage_number,contamination,lineage,stage,conservation,cell_format
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACH-001032,bone,ewing_sarcoma,26.0,bulk,hg38,Children,U,paired end,Illumina HiSeq 2000,PT-x8JgnsDI,...,U,model,U,U,U,U,bone,U,fresh,2D
ACH-001032,bone,ewing_sarcoma,26.0,bulk,hg38,Children,U,paired end,Illumina HiSeq 2000,PT-x8JgnsDI,...,U,model,U,U,U,U,bone,U,fresh,2D


In [524]:
ann = ann[~ann.index.duplicated(keep='first')]

In [522]:
h.dups(ann.index)

['ACH-001641',
 'ACH-001694',
 'ACH-000355',
 'ACH-001525',
 'ACH-000462',
 'ACH-000805',
 'ACH-000492',
 'ACH-000996',
 'ACH-000960',
 'ACH-000213',
 'ACH-000594',
 'ACH-000211',
 'ACH-000334',
 'ACH-000129',
 'ACH-000186',
 'ACH-000553',
 'ACH-000773',
 'ACH-001970',
 'ACH-000078',
 'ACH-001645',
 'ACH-000995',
 'ACH-000734',
 'ACH-000768',
 'ACH-000729',
 'ACH-000372',
 'ACH-001611',
 'ACH-000922',
 'ACH-000961',
 'ACH-002460',
 'ACH-001060',
 'ACH-001354',
 'ACH-000262',
 'ACH-000838',
 'ACH-000396',
 'ACH-001795',
 'ACH-000483',
 'ACH-000230',
 'ACH-000873',
 'ACH-000182',
 'ACH-000901',
 'ACH-001395',
 'ACH-000167',
 'ACH-000243',
 'ACH-001536',
 'ACH-001020',
 'ACH-000443',
 'ACH-001960',
 'ACH-001672',
 'ACH-001421',
 'ACH-000013',
 'ACH-001668',
 'ACH-000187',
 'ACH-001353',
 'ACH-000864',
 'ACH-000615',
 'ACH-001030',
 'ACH-000986',
 'ACH-000011',
 'ACH-000983',
 'ACH-000007',
 'ACH-000867',
 'ACH-000008',
 'ACH-000158',
 'ACH-000956',
 'ACH-001376',
 'ACH-000460',
 'ACH-0006

In [516]:
ann = ann.loc[total.index]

In [517]:
total.shape

(15064, 45890)

In [511]:
total = total[total.isna().sum(1) < total.shape[1]*0.7]

In [519]:
ann = ann.fillna('U')

In [526]:
(ann=='U').sum() / len(ann)

collection_site    0.000531
disease_type       0.000000
doublingt          0.927841
method             0.000000
reference          0.000000
age                0.109931
cell_type          0.026753
ends               0.000000
sequencer          0.000000
participant_id     0.028678
sex                0.103359
base_media         0.909453
morphology         0.990773
dataset            0.000000
ethnicity          0.910515
instability        0.948420
hasebv             0.908192
tissue_id          0.177775
sample_type        0.000000
align              0.151221
counter            0.151221
passage_number     0.994025
contamination      0.932488
lineage            0.013741
stage              0.984466
conservation       0.869822
cell_format        0.870486
dtype: float64

In [512]:
total = total[((total==0)|total.isna()).sum(1) < total.shape[1]*0.75]

## analyse  annotation files

- find mismatch annotations
- add missing annotations, need: 
    - sequencer, 
    - expression_type, 
    - media, 
    - origin, 
    - tissue type,
    - disease,
    - sub_disease,
    - cell type, 
    - group, 
    - sex, 
    - age, 
    - contamination
    - organism
- find good set of names for annotations

In [None]:
cclfrna, cclfrna_anno, met500_meta, met500_TPM, pediatric_PDX_ann, pediatric_PDX_TPM, hcmi_ltpm,
hcmi_sample_info, CCLE_expression, TCGA_expression

In [393]:
CCLE_annotation.index.name = "sample_id"
TCGA_annotation.index.name = "sample_id"
cclfrna_anno.index.name = "sample_id"
met500_meta.index.name = "sample_id"
pediatric_PDX_ann.index.name = "sample_id"
hcmi_sample_info.index.name = "sample_id"

In [119]:
hcmi_sample_info = hcmi_sample_info.set_index('sample_id', drop=True)

In [394]:
ann = pd.concat([CCLE_annotation, TCGA_annotation, cclfrna_anno, met500_meta, pediatric_PDX_ann, hcmi_sample_info])

In [395]:
ann.passage_number = ann.passage_number.astype(float)

In [539]:
rep = {np.nan: 'U',
'UNKNOWN': 'U',
'Unknown': 'U',
'unknown': 'U',
'not reported': 'U',
None: 'U',
'C': 'Children',
'Post-treatment': 'post-treatment', 
'Relapse': 'relapse', 
'Progressing Disease': 'progressing', 
'Diagnosis': 'at-diagnosis',
'other':'U',
'Mixed_or_Unknown': 'U', 
'African American': "African", 
'European': 'White', 
'Hispanic or Latino': 'latino',
'Black or African American': "African",
'Illumina Hiseq 2000': 'Illumina HiSeq 2000',
'illlumina Tru-Seq': 'Illlumina TruSeq',
"adrenal_cortex": "adrenal",
"colorectal": "colon",
'thymus': 'thyroid',
'historical_CL; 2D':'historical_cl',
'MSI-high': 'MSI',
"MSI-low": "MSS",
      }

In [397]:
ann = ann.replace(rep)

In [398]:
loc = (ann['age'].apply(lambda x: isinstance(x, str))) & (~ann['age'].isin(['Adult',
'Children',
'Embryo',
'Fetus',
'Pediatric',
'U']))
loc = loc | ann['age'].apply(lambda x: isinstance(x, int))

ann.loc[loc, 'age'] = ann.loc[loc, 'age'].astype(float)

In [399]:
loc = ann['age'].apply(lambda x: isinstance(x, float))
ann.loc[ann[loc][ann[loc].age<0].index, "age"] = "Fetus"
loc = ann['age'].apply(lambda x: isinstance(x, float))
ann.loc[ann[loc][ann[loc].age<6].index, "age"] = "Pediatric"
loc = ann['age'].apply(lambda x: isinstance(x, float))
ann.loc[ann[loc][ann[loc].age<18].index, "age"] = "Children"
loc = ann['age'].apply(lambda x: isinstance(x, float))
ann.loc[ann[loc][ann[loc].age>=18].index, "age"] = "Adult"

In [400]:
cont = []
loc = ann.contamination!='U'
for val in ann[loc].contamination:
    if val > 50:
        cont.append('high')
    elif val > 10:
        cont.append('medium')
    elif val > 1:
        cont.append('low')
    else:
        cont.append('none')
ann.loc[loc, 'contamination'] = cont

In [401]:
loc = ann['lineage']=='U'
ann.loc[loc, 'lineage'] = ann.loc[loc, 'tissue_type']

In [402]:
ann.sex = ann.sex.str.lower()

In [403]:
for val in ['lineage', 'age', "sex", 'disease_type']:  
    loc = ann[val].isin(['unknown', 'not reported', 'Unknown', 'u']) | ann[val].isna()
    try:
        ann.loc[loc, val] = 'U'
    except ValueError:
        ann[val] = ann[val].cat.add_categories('U')
        ann.loc[loc, val] = 'U'

In [404]:
set(['_'.join(v) for v in ann[["sample_type", "metastatic/primary"]].values])

{'Additional - New Primary_Primary',
 'Additional Metastatic_Primary',
 'Metastatic_Primary',
 'Primary Blood Derived Cancer - Peripheral Blood_Primary',
 'Primary Tumor_Primary',
 'Recurrent Tumor_Primary',
 'U_Metastasis',
 'U_Metastatic',
 'U_Pre-malignant',
 'U_Primary',
 'U_Recurrent',
 'U_Secondary',
 'U_U',
 'U_benign_neoplasia',
 'U_metastasis',
 'U_primary',
 'tumor_metastatic'}

In [405]:
rep = {'Additional - New Primary_Primary': 'primary',
 'Additional Metastatic_Primary': 'U',
 'Metastatic_Primary': 'U',
 'Primary Blood Derived Cancer - Peripheral Blood_Primary': 'primary',
 'Primary Blood Derived Cancer - Peripheral Blood_U': 'primary',
 'Primary Tumor_Primary': 'primary',
 'Recurrent Tumor_Primary': 'recurrent',
 'U_Metastasis': 'metastatic',
 'U_metastasis': 'metastatic',
 'U_Metastatic': 'metastatic',
 'Metastatic_U': 'metastatic',
 'U_Pre-malignant': 'premalignant',
 'U_Primary': 'primary',
 'U_primary': 'primary',
 'U_Recurrent': 'metastatic',
 'U_Secondary': 'metastatic',
 'U_U': 'U',
 'Additional - New Primary_U': 'primary', 
 'Additional Metastatic_U': 'metastatic',
 'Recurrent Tumor_U':'recurrent',
 'Primary Tumor_U': 'primary',
 'tumor_metastatic': 'metastatic',
 'U_benign_neoplasia': 'benign_neoplasia',
 'U_normal': 'normal'}

In [406]:
ann['sample_type'] = [rep['_'.join(v)] for v in ann[["sample_type", "metastatic/primary"]].values]

In [407]:
ann = ann.drop(columns=['rin_score', 'tissue_type', "metastatic/primary",'sample_name'])

In [408]:
for k, val in {'verylow':[2.0, 3.0], 'low':[4.0, 5.0], 'medium':[6.0, 7.0, 8.0, 9.0, 10.0], 'high':[11.0, 12.0, 13.0, 14.0, 15.0, 17.0, 20.0]}.items():
    ann.loc[ann['passage_number'].isin(val), 'passage_number'] = k

In [409]:
for val in set(ann.columns)-set(['participant_id', 'tissue_id']):
    print('-----')
    print(val)
    print('\n')
    print(set(ann[~ann[val].isna()][val]))

-----
history


{'Whole Blood:Whole Blood', 'Tissue:Fresh Frozen Tissue', 'U', 'Tissue:Fresh Tissue', 'Tissue:DMSO Cryopreserved Tissue', 'Cells:Pellet frozen', 'Cells:Growing', 'Cells:Cell Line, Viable'}
-----
stage


{'relapse', 'U', 'post-treatment', 'at-diagnosis', 'progressing'}
-----
lineage


{'colon', 'sarcomatoid', 'lung', 'stomach', 'lymphocyte', 'adrenal', 'gall_bladder', 'extrahepatic bile duct', 'engineered_lung', 'endocrine', 'gastric', 'cervix', 'liver', 'central_nervous_system', 'embryo', 'soft_tissue', 'bladder', 'parotid', 'nasopharynx', 'skin', 'testis', 'engineered', 'fibroblast', 'engineered_blood', 'engineered_kidney', 'bronchus and lung', 'engineered_bile_duct', 'ovary', 'peripheral_nervous_system', 'U', 'teratoma', 'germ_cell', 'nerve', 'thyroid', 'engineered_prostate', 'breast', 'engineered_breast', 'prostate', 'ampulla of vater', 'bile_duct', 'small intestine', 'upper_aerodigestive', 'rectum', 'urinary_tract', 'intrahepatic bile duct', 'engineered_ovary', 'blo

{'U', 'low', 'medium', 'none', 'high'}
-----
method


{'bulk'}
-----
reference


{'hg38', 'hg19'}
-----
age


{'U', 'Children', 'Embryo', 'Pediatric', 'Adult', 'Fetus'}
-----
cell_type


{'tumor; None', 'Cell model', 'model; None', 'historical_cl', 'U', 'tumor; 2-D: Adherent', 'model; 2-D: Adherent', 'tumor; 3-D: Other (e.g. neurosphere, air-liquid interface, etc.)', 'tumor; 3-D: Organoid', 'PDX', 'tumor; 2-D: Conditionally reprogrammed cells', 'tumor', 'model; 3-D: Other (e.g. neurosphere, air-liquid interface, etc.)', 'model; 3-D: Organoid', 'model; 2-D: Conditionally reprogrammed cells', 'model; Mixed adherent and suspension', 'tumor; Mixed adherent and suspension', 'Tissue'}
-----
sample_type


{'U', 'primary', 'benign_neoplasia', 'recurrent', 'premalignant', 'metastatic'}
-----
ends


{'paired end'}
-----
hasebv


{False, True, 'U'}
-----
sequencer


{'Illumina HiSeq 2000', 'Illumina HiSeq 4000', 'Illlumina TruSeq'}
-----
disease_type


{'ASPS', 'Melanoma', 'Rhabdomyosarcoma', 'Th

In [None]:
# mediatype is bad and is just in CCLF --> drop for now

In [None]:
doublingt

In [250]:
from collections import Counter

#### do we have non matching things:

- collection_site
- media_type, base_media

In [412]:
mapping = Sheets.from_files(MY_ID, MYSTORAGE_ID).get("https://docs.google.com/spreadsheets/d/1GJkZ30pixBclfBRFa9BnNNJLRmVgSf1Dszy_tXdBkFI").sheets[0].to_frame().set_index('mapping', drop=True)

In [416]:
ann.temp = ann.temp.astype(str)
for val in set(ann.temp):
    ann.loc[ann.temp==val, ['sample_type', 'conservation', 'cell_type', 'cell_format', 'morphology']] = mapping.loc[val].values

In [410]:
ann['temp'] = [(i,j,k) for i,j,k in ann[['sample_type', 'cell_type', 'history']].values]
set(ann['temp'])

{('U', 'Cell model', 'Cells:Cell Line, Viable'),
 ('U', 'PDX', 'U'),
 ('U', 'Tissue', 'Tissue:Fresh Tissue'),
 ('U', 'U', 'U'),
 ('U', 'historical_cl', 'U'),
 ('U', 'model; None', 'U'),
 ('U', 'tumor', 'U'),
 ('U', 'tumor; None', 'U'),
 ('benign_neoplasia', 'historical_cl', 'U'),
 ('metastatic', 'Cell model', 'Cells:Cell Line, Viable'),
 ('metastatic', 'Cell model', 'Cells:Growing'),
 ('metastatic', 'Cell model', 'Cells:Pellet frozen'),
 ('metastatic', 'Cell model', 'Tissue:Fresh Tissue'),
 ('metastatic', 'Tissue', 'Tissue:Fresh Frozen Tissue'),
 ('metastatic', 'Tissue', 'Tissue:Fresh Tissue'),
 ('metastatic', 'historical_cl', 'U'),
 ('metastatic', 'model; 2-D: Adherent', 'U'),
 ('metastatic', 'model; 2-D: Conditionally reprogrammed cells', 'U'),
 ('metastatic', 'model; 3-D: Organoid', 'U'),
 ('metastatic',
  'model; 3-D: Other (e.g. neurosphere, air-liquid interface, etc.)',
  'U'),
 ('metastatic', 'model; Mixed adherent and suspension', 'U'),
 ('metastatic', 'tumor', 'U'),
 ('metasta

In [384]:
set(ann[ann.dataset=='tcga']['temp'])
#fresh / frozen?

set()

In [385]:
set(ann[ann.dataset=='ccle']['temp'])
#fresh / frozen?
# all CCLE are 2D? 
# all CCLE are adherent? not bloods, else yes
# all PDX are fresh? yes
# pdx in which tissue? depends on the tumor (brain->brain, blood->blood, other->flank)
# all from immuno comp mouse? Yes but diffferent version (see file)
# are all PDX primary ? --> yes except in annotation

{('U', 'historical_cl', 'U'),
 ('benign_neoplasia', 'historical_cl', 'U'),
 ('metastatic', 'historical_cl', 'U'),
 ('primary', 'historical_cl', 'U')}

In [386]:
set(ann[ann.dataset=='hcmi']['temp'])

{('U', 'model; None', 'U'),
 ('U', 'tumor; None', 'U'),
 ('metastatic', 'model; 2-D: Adherent', 'U'),
 ('metastatic', 'model; 2-D: Conditionally reprogrammed cells', 'U'),
 ('metastatic', 'model; 3-D: Organoid', 'U'),
 ('metastatic',
  'model; 3-D: Other (e.g. neurosphere, air-liquid interface, etc.)',
  'U'),
 ('metastatic', 'model; Mixed adherent and suspension', 'U'),
 ('metastatic', 'tumor; 2-D: Adherent', 'U'),
 ('metastatic', 'tumor; 2-D: Conditionally reprogrammed cells', 'U'),
 ('metastatic', 'tumor; 3-D: Organoid', 'U'),
 ('metastatic',
  'tumor; 3-D: Other (e.g. neurosphere, air-liquid interface, etc.)',
  'U'),
 ('metastatic', 'tumor; Mixed adherent and suspension', 'U'),
 ('premalignant', 'model; 3-D: Organoid', 'U'),
 ('primary', 'model; 2-D: Adherent', 'U'),
 ('primary', 'model; 2-D: Conditionally reprogrammed cells', 'U'),
 ('primary', 'model; 3-D: Organoid', 'U'),
 ('primary',
  'model; 3-D: Other (e.g. neurosphere, air-liquid interface, etc.)',
  'U'),
 ('primary', 'tu

In [387]:
set(ann[ann.dataset=='met500']['temp'])

{('metastatic', 'tumor', 'U')}

In [None]:
# Tissue, model, cells, cells-growing, blood_derived, PDX
# met, prim, secondary, Pre-malignant, recurrent, normal
# 2D, 3D_orga, 3D_neuro, tissue
# adherent, suspension, air_liquid, no
# frozen, fresh

In [420]:
ann['temp'] = [(i,j,k) for i,j,k in ann[[ 'lineage', 'disease_type', 'disease_subtype']].values]
set(ann['temp'])

{('U', 'ATRT', 'U'),
 ('U', 'Adenocarcinoma', 'U'),
 ('U', 'Alveolar RMS', 'U'),
 ('U', 'Alveolar Rabdomyosarcoma', 'U'),
 ('U', 'Alveolar Rhabdomyosarcoma', 'U'),
 ('U', 'Anaplastic Thyroid Cancer', 'U'),
 ('U', 'Anaplastic astrocytoma', 'U'),
 ('U', 'Angioimmunoblastic T-cell lymphoma', 'U'),
 ('U', 'Angiosarcoma', 'U'),
 ('U', 'Chordoma', 'U'),
 ('U', 'Colon adenocarcinoma', 'U'),
 ('U', 'Colorectal adenocarcinoma', 'U'),
 ('U', 'Desmoid Tumor', 'U'),
 ('U', 'Double-hit diffuse large B cell lymphoma', 'U'),
 ('U', 'Embryonal Rabdomyosarcoma', 'U'),
 ('U', 'Epithelioid sarcoma', 'U'),
 ('U', 'Esophageal Cancer', 'U'),
 ('U', 'Esophogeal carcinoma', 'U'),
 ('U', 'Ewing sarcoma', 'U'),
 ('U', "Ewing's", 'U'),
 ('U', 'Ewings', 'U'),
 ('U', 'GBM', 'U'),
 ('U', 'GBM, gliosarcoma variant', 'U'),
 ('U', 'Ganglioneuroblastoma', 'U'),
 ('U', 'Gastric Adenocarcinoma', 'U'),
 ('U', 'Gastric Cancer', 'U'),
 ('U', 'Gastric adenocarcinoma', 'U'),
 ('U', 'Glioblastoma', 'U'),
 ('U', 'Glioblastoma m

In [446]:
group_rename = {
    ('U', 'ATRT', 'U'): ('U', 'Rhabdoid', 'ATRT'),
     ('U', 'Adenocarcinoma', 'U'): ('U', 'Adenocarcinoma', 'Adenocarcinoma'),
     ('U', 'Alveolar RMS', 'U'): ('U', 'Alveolar RMS', 'rhabdomyosarcoma'),
     ('U', 'Alveolar Rabdomyosarcoma', 'U'): ('U', 'Alveolar Rabdomyosarcoma', 'rhabdomyosarcoma'),
     ('U', 'Alveolar Rhabdomyosarcoma', 'U'): ('U', 'Alveolar Rhabdomyosarcoma', 'rhabdomyosarcoma'),
     ('U', 'Anaplastic Thyroid Cancer', 'U'): ('thyroid', 'Anaplastic Thyroid Cancer', 'Anaplastic Thyroid Cancer'),
     ('U', 'Anaplastic astrocytoma', 'U'): ('U', 'Anaplastic astrocytoma', 'Anaplastic astrocytoma'),
     ('U', 'Angioimmunoblastic T-cell lymphoma', 'U'): ('lymphocyte', 'Angioimmunoblastic T-cell lymphoma', 'T-cell lymphoma'),
     ('U', 'Angiosarcoma', 'U'): ('U', 'Angiosarcoma', 'U'),
     ('U', 'Chordoma', 'U'): ('U', 'Chordoma', 'Chordoma'),
     ('U', 'Colon adenocarcinoma', 'U'): ('colon', 'Colon adenocarcinoma', 'colon_adenocarcinoma'),
     ('U', 'Colorectal adenocarcinoma', 'U'): ('colon', 'Colorectal adenocarcinoma', 'colon_adenocarcinoma'),
     ('U', 'Desmoid Tumor', 'U'): ('U', 'Desmoid Tumor', 'Desmoid Tumor'),
     ('U', 'Double-hit diffuse large B cell lymphoma', 'U'): ('lymphocyte', 'Double-hit diffuse large B cell lymphoma', 'diffuse large B-cell lymphoma'),
     ('U', 'Embryonal Rabdomyosarcoma', 'U'): ('embryonal', 'Embryonal Rabdomyosarcoma', 'U'),
     ('U', 'Epithelioid sarcoma', 'U'): ('U', 'Epithelioid sarcoma', 'Epithelioid sarcoma'),
     ('U', 'Esophageal Cancer', 'U'): ('esophageal', 'Esophageal Cancer', 'U'),
     ('U', 'Esophogeal carcinoma', 'U'): ('esophageal', 'Esophogeal carcinoma', 'Esophageal carcinoma'),
     ('U', 'Ewing sarcoma', 'U'): ('bone', 'Ewing sarcoma', 'Ewing sarcoma'),
     ('U', "Ewing's", 'U'): ('bone', "Ewing's", 'Ewing sarcoma'),
     ('U', 'Ewings', 'U'): ('bone', 'Ewings', 'Ewing sarcoma'),
     ('U', 'GBM', 'U'): ('central_nervous_system', 'GBM', 'glioblastoma'),
     ('U', 'GBM, gliosarcoma variant', 'U'): ('central_nervous_system', 'GBM, gliosarcoma variant', 'glioblastoma'),
     ('U', 'Ganglioneuroblastoma', 'U'): ('central_nervous_system', 'Ganglioneuroblastoma', 'Ganglioneuroblastoma'),
     ('U', 'Gastric Adenocarcinoma', 'U'): ('gastric', 'Gastric Adenocarcinoma', 'gastric_adenocarcinoma'),
     ('U', 'Gastric Cancer', 'U'): ('gastric', 'Gastric Cancer', 'U'),
     ('U', 'Gastric adenocarcinoma', 'U'): ('gastric', 'Gastric adenocarcinoma', 'gastric_adenocarcinoma'),
     ('U', 'Glioblastoma', 'U'): ('central_nervous_system', 'Glioblastoma', 'glioblastoma'),
     ('U', 'Glioblastoma multiforme', 'U'): ('central_nervous_system', 'Glioblastoma multiforme', 'glioblastoma_multiforme'),
     ('U', 'HNSCC', 'U'): ('upper_aerodigestive', 'HNSCC', 'squamous cell carcinoma'),
     ('U', 'Head and Neck squamous cell carcinoma', 'U'): ('upper_aerodigestive', 'Head and Neck squamous cell carcinoma', 'squamous cell carcinoma'),
     ('U', 'High grade glioma', 'U'): ('central_nervous_system', 'High grade glioma', 'High grade glioma'),
     ('U', 'Kidney renal clear cell carcinoma', 'U'): ('kidney', 'Kidney renal clear cell carcinoma', 'clear cell carcinoma'),
     ('U', 'Leg Mass', 'U'): ('U', 'Leg Mass', 'U'),
     ('U', 'Leiomyosarcoma', 'U'): ('U', 'Leiomyosarcoma', 'Leiomyosarcoma'),
     ('U', 'Liposarcoma', 'U'): ('U', 'Liposarcoma', 'Liposarcoma'),
     ('U', 'Lung Adenocarcinoma', 'U'): ('lung', 'Lung Adenocarcinoma', 'Lung Adenocarcinoma'),
     ('U', 'MPNST', 'U'): ('nerve', 'MPNST', 'malignant peripheral nerve sheath tumor'),
     ('U', 'Medullary Thyroid Cancer', 'U'): ('thyroid', 'Medullary Thyroid Cancer', 'Medullary Thyroid Cancer'),
     ('U', 'Melanoma', 'U'): ('skin', 'Melanoma', 'Melanoma'),
     ('U', 'Metastaticastatic Adenocarcinoma Colon', 'U'): ('colon', 'Metastaticastatic Adenocarcinoma Colon', 'Colon Adenocarcinoma'),
     ('U', 'Myofibroblastic neoplasm', 'U'): ('U', 'Myofibroblastic neoplasm', 'Myofibroblastic neoplasm'),
     ('U', 'Neuroblastoma', 'U'): ('central_nervous_system', 'Neuroblastoma', 'Neuroblastoma'),
     ('U', 'Oesophageal carcinoma', 'U'): ('esophagus', 'Oesophageal carcinoma', 'Esophageal carcinoma'),
     ('U', 'Other Cancer', 'U'): ('U', 'Other Cancer', 'U'),
     ('U', 'Ovarian Cancer', 'U'): ('ovaries', 'Ovarian Cancer', 'U'),
     ('U', 'Pancreatic Adenocarcinoma', 'U'): ('pancreas', 'Pancreatic Adenocarcinoma', 'Pancreatic adenocarcinoma'),
     ('U', 'Pancreatic adenocarcinoma', 'U'): ('pancreas', 'Pancreatic adenocarcinoma', 'Pancreatic adenocarcinoma'),
     ('U', 'Pancreatic adenocarcinoma [PAAD]', 'U'): ('pancreas', 'Pancreatic adenocarcinoma [PAAD]', 'Pancreatic adenocarcinoma'),
     ('U', 'Papillary Thyroid Cancer', 'U'): ('thyroid', 'Papillary Thyroid Cancer', 'Papillary Thyroid Cancer'),
     ('U', 'Renal medullary carcinoma', 'U'): ('kidney', 'Renal medullary carcinoma', 'Renal medullary carcinoma'),
     ('U', 'Rhabdoid Tumor', 'U'): ('U', 'Rhabdoid Tumor', 'Rhabdoid'),
     ('U', 'Sarcoma', 'U'): ('U', 'Sarcoma', 'Sarcoma'),
     ('U', 'Secretory Cancer', 'U'): ('U', 'Secretory Cancer', 'secretory'),
     ('U', 'Sertoli-Leydig Cell Tumor', 'U'): ('U', 'Sertoli-Leydig Cell Tumor', 'Sertoli-Leydig Cell Tumor'),
     ('U', 'Solitary Fibrous Tumor', 'U'): ('U', 'Solitary Fibrous Tumor', 'Solitary Fibrous Tumor'),
     ('U', 'Squamous Cell Carcinoma', 'U'): ('U', 'Squamous Cell Carcinoma', 'Squamous Cell Carcinoma'),
     ('U', 'Stomach Adenocarcinoma', 'U'): ('Stomach', 'Stomach Adenocarcinoma', 'Stomach Adenocarcinoma'),
     ('U', 'T-PLL', 'U'): ('U', 'T-PLL', 'TPLL'), 
     ('U', 'Thyroid Cancer', 'U'): ('thyroid', 'Thyroid Cancer', 'U'),
     ('U', 'Undifferentiated Sarcoma', 'U'): ('U', 'Undifferentiated Sarcoma', 'sarcoma'),
     ('U', 'Wilms', 'U'): ('kidney', 'Wilms', 'Wilms'),
     ('U', 'anaplastic thyroid cancer', 'U'): ('thyroid', 'anaplastic thyroid cancer', 'anaplastic thyroid cancer'),
     ('U', 'kidney renal clear cell carcinoma [KIRC]', 'U'): ('kidney', 'kidney renal clear cell carcinoma [KIRC]', 'clear cell carcinoma'),
     ('U', 'renal medullary carcinoma', 'U'): ('kidney', 'renal medullary carcinoma', 'renal medullary carcinoma'),
     ('U', 'thyroid ca', 'U'): ('thyroid', 'thyroid ca', 'U'),
    
    
    ('thyroid', 'Thymoma', 'U'): ('thyroid', 'Thymus Cancer', 'thymoma'),
    ('thyroid', 'Thymic Cancer', 'Carcinoma'): ('thyroid', 'Thymus Cancer', 'thymic carcinoma'),
    ('stomach', 'Stomach adenocarcinoma', 'U'): ('stomach', 'Stomach cancer', 'Adenocarcinoma (NOS)'),
    ('soft_tissue', 'Kidney Cancer', 'malignant_rhabdoid_tumor'): ('soft_tissue', 'Rhabdoid', 'malignant_rhabdoid_tumor'),
     ('central_nervous_system', 'PNET', 'CNS EFT-CIC'): ('central_nervous_system', 'PNET', 'PNET'),
     ('central_nervous_system', 'PNET', 'PNET'): ('central_nervous_system', 'PNET', 'CNS embryonal NOS'),
    ('small intestine', 'Rare cancers', 'Adenocarcinoma'): ('gastric','', 'Adenocarcinoma'),
    ('skin', 'Skin Cutaneous Melanoma', 'U'): ('skin', 'Skin Cutaneous Melanoma', 'melanoma'),
    ('skin', 'Melanoma', '--'): ('skin', 'Melanoma', 'melanoma'),
    ('skin', 'Melanoma', 'Melanoma, NOS'): ('skin', 'Melanoma', 'melanoma'),
    ('skin', 'Secretory Cancer', 'U'): ('skin', 'Secretory Cancer', 'secretory'),
    ('rectum', 'Colorectal cancer', 'Mucinous adenocarcinoma'): ('rectum', 'Colorectal cancer', 'Adenocarcinoma'),
    ('sarcomatoid', 'Sarcoma', 'U'): ('U', 'U', 'Sarcoma'),
    ('prostate', 'Prostate adenocarcinoma', 'U'): ('prostate', 'Prostate adenocarcinoma', 'Adenocarcinoma'),
    ('adrenal', 'adrenocortical carcinoma', 'U'): ('adrenal', 'adrenocortical carcinoma', 'carcinoma'),
    ('bladder', 'Bladder Urothelial Carcinoma', 'U'): ('bladder', 'Bladder Urothelial Carcinoma', 'bladder_carcinoma'),
    ('blood', 'ALL', 'BCP-ALL'): ('blood', 'ALL', 'ALL'),
    ('blood', 'ALL', 'ETP-ALL'): ('blood', 'ALL', 'ALL'),
    ('blood', 'ALL', 'MLL-ALL'): ('blood', 'ALL', 'ALL'),
    ('blood', 'ALL', 'Ph+-ALL'): ('blood', 'ALL', 'ALL'),
    ('blood', 'ALL', 'Ph-likeALL'): ('blood', 'ALL', 'ALL'),
    ('blood', 'ALL', 'T-ALL'): ('blood', 'ALL', 'ALL'),
    ('bone', "Ewing's sarcoma", '--'): ('bone', "Ewing's sarcoma", 'Ewing_sarcoma'),
    ('brain', 'Glioblastoma', 'NOS'): ('brain', 'Glioblastoma', 'Glioblastoma'),
    ('brain', 'Glioblastoma multiforme', 'U'): ('brain', 'Glioblastoma multiforme', 'Glioblastoma'),
    ('bronchus and lung', 'Rhabdomyosarcoma', 'Alveolar'): ('bronchus and lung', 'Rhabdomyosarcoma', 'rhabdomyosarcoma'),
    ('engineered', 'Engineered', 'U'): ('U', 'U', 'U'),
    ('engineered_bile_duct', 'Bile Duct Cancer', 'Engineered'): ('bile_duct', 'Bile Duct Cancer', 'U'),
    ('engineered_blood', 'Engineered', 'CML'): ('blood', 'Engineered', 'CML'),
    ('engineered_breast', 'Engineered', 'U'): ('breast', 'Engineered', 'U'),
    ('engineered_central_nervous_system', 'Engineered', 'U'): ('central_nervous_system', 'Engineered', 'U'),
    ('engineered_kidney', 'Engineered', 'U'): ('kidney', 'Engineered', 'U'),
    ('engineered_lung', 'Engineered', 'U'): ('lung', 'Engineered', 'U'),
    ('engineered_ovary', 'Engineered', 'U'): ('ovary', 'Engineered', 'U'),
    ('engineered_prostate', 'Engineered', 'U'): ('prostate', 'Engineered', 'U'),
    ('urinary_tract', 'Bladder Cancer', 'bladder urothelial carcinoma'): ('bladder', 'Bladder Cancer', 'bladder_carcinoma'),
    ('urinary_tract', 'Bladder Cancer', 'bladder_carcinoma'): ('bladder', 'Bladder Cancer', 'bladder_carcinoma'),
    ('breast', 'Breast Cancer', 'normal'): ('breast', 'Breast Cancer', 'U'),
    ('breast', 'Breast invasive carcinoma', 'U'): ('breast', 'Breast invasive carcinoma', 'breast_invasive_carcinoma'),
    ('parotid', 'Secretory Cancer', 'U'): ('parotid', 'Secretory Cancer', 'secretory'),
    ('pancreas', 'Pancreatic adenocarcinoma', 'U'):  ('pancreas', 'Pancreatic adenocarcinoma', 'adenocarcinoma'),
    ('ovary', 'Ovarian serous cystadenocarcinoma', 'U'): ('ovary', 'Ovarian serous cystadenocarcinoma', 'ovarian serous cystadenocarcinoma'),
    ('oral', 'Secretory Cancer', 'U'):('oral', 'Secretory Cancer', 'secretory'),
    ('oral', 'Skin Cutaneous Melanoma', 'U'): ('oral', 'Skin Cutaneous Melanoma', 'melanoma'),
    ('oral', 'Head and Neck squamous cell carcinoma', 'U'): ('esophagus', 'Head and Neck squamous cell carcinoma', 'squamous cell carcinoma'),
    ('lymphocyte', 'Lymphoma', 'U'):('lymphocyte', 'Lymphoma', 'lymphoma'),
    ('liver', 'Liver hepatocellular carcinoma', 'U'): ('liver', 'Liver hepatocellular carcinoma', 'U'),
    ('gastric', 'Small Cell Carcinoma', 'Small Cell Carcinoma'): ('gastric', 'Small Cell Carcinoma', 'gastric_small_cell'),

    ('fibroblast', 'normal', 'fibroblast_bone'): ('fibroblast_bone', 'normal', 'normal'),
    ('fibroblast', 'normal', 'fibroblast_breast'): ('fibroblast_breast', 'normal', 'normal'),
    ('fibroblast', 'normal', 'fibroblast_colorectal'): ('fibroblast_colorectal', 'normal', 'normal'),
    ('fibroblast', 'normal', 'fibroblast_lung'): ('fibroblast_lung', 'normal', 'normal'),
    ('fibroblast', 'normal', 'fibroblast_lymphocyte'): ('fibroblast_lymphocyte', 'normal', 'normal'),
    ('fibroblast', 'normal', 'fibroblast_skin'): ('fibroblast_skin', 'normal', 'normal'),
    ('fibroblast', 'normal', 'fibroblast_soft_tissue'): ('fibroblast_soft_tissue', 'normal', 'normal'),
    ('fibroblast', 'normal', 'fibroblast_upper_aerodigestive'): ('fibroblast_upper_aerodigestive', 'normal', 'normal'),
    ('fibroblast', 'normal', 'fibroblast_urinary_tract'): ('fibroblast_urinary_tract', 'normal', 'normal'),
    ('esophagus', 'Head and Neck squamous cell carcinoma', 'U'): ('esophagus', 'Head and Neck squamous cell carcinoma', 'esophagus_squamous'),
    ('esophagus', 'Esophageal carcinoma', 'U'): ('esophagus', 'Esophageal carcinoma', 'esophageal carcinoma'),
    ('embryo', 'Embryonal Cancer', 'Carcinoma'): ('embryo', 'Embryonal Cancer', 'embryo_carcinoma'),
    ('colon', 'Colorectal cancer', 'Adenocarcinoma'): ('colon', 'Colorectal cancer', 'colorectal_adenocarcinoma'),

 ('soft_tissue', 'Rhabdomyosarcoma', 'Fusion+ RMS'): ('soft_tissue', 'Rhabdomyosarcoma', 'Rhabdomyosarcoma'),
 ('soft_tissue', 'Rhabdomyosarcoma', 'Fusion- RMS'): ('soft_tissue', 'Rhabdomyosarcoma', 'Rhabdomyosarcoma'),
 ('soft_tissue', 'Sarcoma', 'U'): ('soft_tissue', 'Sarcoma', 'sarcoma'),

}

ann = ann.replace({'temp': group_rename})

  op = lambda x: operator.eq(x, b)


TypeError: Cannot compare types 'ndarray(dtype=object)' and 'tuple'

In [450]:
ann.temp = [group_rename[val] if val in group_rename else val for val in ann.temp.values]

In [451]:
ann[['lineage', 'disease_type', 'disease_subtype']] = [[k,v,u] for k,v,u in ann.temp]

In [453]:
name_rename = {
    'Gliosarcoma':'Glioblastoma',
    'ASPS' : 'Rhabdoid',
    'ATRT': 'Rhabdoid',
    'Extracranial Rhabdoid': 'Rhabdoid',
    'Melanoma': 'melanoma',
    'melanocytic': 'melanoma',
    'rectosigmoid junction': 'colon',
    '--': 'U',
    'Ependymoma': 'ependymoma',
    'rectum': 'colon',
    'upper aerodigestive squamous': 'upper_aerodigestive_squamous',
    'uterine carcinosarcoma': 'uterine_carcinosarcoma',
    'thyroid carcinoma': 'thyroid_carcinoma',
    'undifferentiated sarcoma NOS': 'sarcoma',
    'prostate adenocarcinoma': 'prostate_adenocarcinoma',
    'Cholangiocarcinoma': 'cholangiocarcinoma',
    'Acute Lymphoblastic Leukemia (ALL), B-cell': 'ALL',
    'Ewing sarcoma': 'Ewing_sarcoma',
    'Ewing Sarcoma': 'Ewing_sarcoma',
    'Ewings Sarcoma': 'Ewing_sarcoma',
    'acute lymphoblastic leukemia': 'ALL',
    'acute megakaryoblastic leukemia': 'AMegaL',
    'acute myeloid leukemia': 'AML',
    'Carcinoma': 'carcinoma',
    'Breast Ductal Carcinoma': 'breast_ductal_carcinoma',
    'Neuroblastoma': 'neuroblastoma',
    'parotid': 'oral',
    'lymphoma_unspecified': 'lymphoma',
    'hepatocellular carcinoma': 'hepatocellular_carcinoma',
    'undifferentiated hepatic sarcoma': 'sarcoma',
    'Hepatoblastoma': 'hepatoblastoma',
    'Wilms': 'wilms tumor',
    'stomach adenocarcinoma': 'gastric_adenocarcinoma', 
    'uveal melanoma':'uveal_melanoma',
    'Uveal Melanoma':'uveal_melanoma',
    'Esophagus adenocarcinoma (NOS)': 'esophagus_adenocarcinoma',
    'Engineered': 'U',
    'engineered': 'U',
    'caecum_adenocarcinoma': 'colorectal_adenocarcinoma',
    'colon adenocarcinoma': 'colorectal_adenocarcinoma',
    'rectum adenocarcinoma': 'colorectal_adenocarcinoma',
    'Medulloblastoma': 'medulloblastoma',
    'Carcinoma (NOS)': 'carcinoma',
    
    'epithelioid sarcoma': 'epithelioid_sarcoma',
     'fibromatosis': 'fibrosarcoma',
    'INI-deficient soft tissue sarcoma NOS': 'sarcoma',
    'dedifferentiated liposarcoma': "liposarcoma",
    'infantile fibrosarcoma': 'fibrosarcoma',
     'myxofibrosarcoma': 'myofibromatosis',
     'synovial sarcoma': 'synovial_sarcoma',
     'Extracranial Rhabdoid': 'Rhabdoid',
     'ATRT': 'Rhabdoid',
     'atypical teratoid/rhabdoid tumor': 'Rhabdoid',
    'malignant_rhabdoid_tumor': 'Rhabdoid',
    'undifferentiated pleomorphic sarcoma': 'pleomorphic_sarcoma',
    'undifferentiated sarcoma NOS': 'sarcoma',
    'undifferentiated spindle cell sarcoma': 'sarcoma',
    'undifferentiated_sarcoma': 'sarcoma',
    
    'bronchus and lung': 'lung',
}

ann = ann.replace(name_rename)

In [457]:
ann.disease_subtype = ann.disease_subtype.str.lower().str.replace(' ', '_')

In [460]:
#further renaming
ann = ann.replace({'embryonal': 'embryo',
    'esophageal': 'esophagus',
    'Stomach': 'stomach',
    'ovaries': 'ovary',
    'Adenocarcinoma (NOS)': 'Adenocarcinoma',
    'Esophogeal carcinoma': 'Esophageal carcinoma',
    'acute_undifferentiated_leukemia': 'acute_leukemia',
    'adrenal_carcinoma': 'adrenocortical_carcinoma',
    'mpnst':'malignant_peripheral_nerve_sheath_tumor',
    'mmmt':'mullerian_carcinoma'})

In [None]:
set(ann['lineage'])

In [None]:
set(ann['disease_subtype'])

In [464]:
ann.collection_site = ann.collection_site.str.lower().str.replace(' ', '_').str.replace('_mass', '').str.replace('_cavity', '').str.replace('l._', '').str.replace('left_', '').str.replace('r._', '').str.replace('right_', '')

  ann.collection_site = ann.collection_site.str.lower().str.replace(' ', '_').str.replace('_mass', '').str.replace('_cavity', '').str.replace('l._', '').str.replace('left_', '').str.replace('r._', '').str.replace('right_', '')


In [486]:
rename = {'femur': 'bone',
'femur/rib/verebra': 'bone',
'os_frontalis': 'bone',
'esophagus_-_distal_third': 'esophagus',
'oesophagus': 'esophagus',
'distal_femur': 'bone',
'adrenal_gland': 'adrenal',
'abdominal': 'abdomen',
'4th_ventricle': 'heart',
'cecum': 'colon',
'lymph_node(s)': 'lymph_node',
'tumor': 'U',
'Other': 'U',
'other'
'Rectosigmoid junction': 'colon',
'post mortem liver tumor':'liver',
'post mortem blood': 'blood_postmortem',
'pleural': 'pleura',
'pleural_effusion': 'pleura',
'proximal_tibia': 'bone',
'rectum': 'colon',
'smaintestine': 'stomach',
'tibia': 'bone',
'stomach_(nos)': 'stomach'}

In [487]:
ann = ann.replace({'collection_site': rename})

In [475]:
set([val for val in ann.collection_site if '_met' in val])

{'bone_marrow_metastasis',
 'breast_metastasis',
 'liver_metastasis',
 'lung_met',
 'lung_metastasis',
 'lymph_node_met',
 'paracaval_lymph_node_metastasis',
 'pleural_effusion_met'}

In [476]:
ann.loc[ann.collection_site.isin(set([val for val in ann.collection_site if '_met' in val])), 'cell_type']= 'metastatic'

In [477]:
ann = ann.replace({'collection_site': {'bone_marrow_metastasis': 'bone_marrow',
'breast_metastasis': 'breast',
'liver_metastasis': 'liver',
'lung_met': 'lung',
'lung_metastasis': 'lung_',
'lymph_node_met': 'lymph_node',
'paracaval_lymph_node_metastasis': 'paracaval_lymph_node',
'pleural_effusion_met': 'pleura'}})

In [488]:
set(ann.collection_site)

{'U',
 'abdomen',
 'adrenal',
 'ampulof_vater',
 'appendix',
 'arm',
 'ascites',
 'autonomic_ganglia',
 'bilateral',
 'biliatract',
 'bladder',
 'blood',
 'bone',
 'bone_marrow',
 'brain',
 'brain_stem',
 'breast',
 'buttock',
 'central_nervous_system',
 'cerebellum',
 'cerebrum',
 'cervical_node',
 'cervix',
 'colon',
 'common_biduct',
 'embryo',
 'endometrium',
 'esophagus',
 'extrahepatic_biduct',
 'eye',
 'fibroblast',
 'frontal_lobe',
 'gastroesophageal_junction',
 'haematopoietic_and_lymphoid_tissue',
 'head_and_neck_region',
 'heart',
 'humerus',
 'ileum',
 'kidney',
 'large_intestine',
 'leg',
 'liver',
 'lung',
 'lung/pleura',
 'lung_',
 'lymph_node',
 'neck',
 'occipital',
 'oral',
 'orbit',
 'other',
 'ovary',
 'pancreas',
 'pancreatic_head',
 'pancreatic_tail',
 'paracaval_lymph_node',
 'parapharyngeal_space',
 'paraspinal',
 'paratesticular',
 'parietal_lobe',
 'pericardial',
 'perineum',
 'peripheral_blood',
 'peritoneal',
 'peritoneum',
 'placenta',
 'pleura',
 'post_mor

In [492]:
ann[ann.base_media.isin(['organoid',
'Organoid',
'3D/Organoid',
'CM/Organoid',])]

Unnamed: 0_level_0,collection_site,disease_subtype,doublingt,method,reference,age,cell_type,ends,sequencer,participant_id,...,history,passage_number,media_type,contamination,lineage,stage,expansion,temp,conservation,cell_format
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AA22-Tumor-SM-GXH2L,pancreas,pancreatic_adenocarcinoma,U,bulk,hg38,U,primary,paired end,Illlumina TruSeq,AA22,...,Cells:Pellet frozen,U,3D/Organoid,none,pancreas,U,U,"(pancreas, Pancreatic Adenocarcinoma, pancreat...",frozen,2D
AA26-Tumor-SM-AU5YZ,pancreas,pancreatic_adenocarcinoma,U,bulk,hg38,U,primary,paired end,Illlumina TruSeq,AA26,...,Cells:Pellet frozen,U,CM/Organoid,none,pancreas,U,U,"(pancreas, Pancreatic Adenocarcinoma, pancreat...",frozen,2D
AA26-Tumor-SM-GXH2N,pancreas,pancreatic_adenocarcinoma,U,bulk,hg38,U,primary,paired end,Illlumina TruSeq,AA26,...,Cells:Pellet frozen,U,organoid,none,pancreas,U,U,"(pancreas, Pancreatic Adenocarcinoma, pancreat...",frozen,2D
AA29-Tumor-SM-GXH2P,liver,pancreatic_adenocarcinoma,U,bulk,hg38,U,metastatic,paired end,Illlumina TruSeq,AA29,...,Cells:Pellet frozen,U,Organoid,none,pancreas,U,U,"(pancreas, Pancreatic Adenocarcinoma, pancreat...",frozen,2D


In [491]:
loc = ann.base_media=='U' 
ann.loc[loc, 'base_media'] = ann.loc[loc, 'media_type'].values

In [495]:
rename = {'WIT-P-NC: 100.0%': 'WITP',
'XVIVO: 100.0%, Manufactured in BSP: XVIVO: 100.0 %': 'XVIVO',
'AR5: 100.0%': 'AR5',
'AR5_SMGM': 'AR5:SMGM (1:1)',
'CM': 'CM',
'CM/OPAC': 'Pancreas:CM (1:1)',
'CM: 100.0%': 'CM',
'Ham F-12: 25.0%, DMEM: 75.0%': 'DMEM:F12 (3:1)',
'Manufactured in BSP: AR5 Media: 100.0 %, Manufactured in BSP: AR5 Media: RPMI 93.0 %': "AR5",
'Manufactured in BSP: CM + OPAC: Manufactured in BSP: Pancreas Organoid Media 50.0 %, Manufactured in BSP: CM + OPAC: 100.0 %, Manufactured in BSP: CM + OPAC: Manufactured in BSP: CM1:021017 50.0 %': 'Pancreas:CM (1:1)',
'Manufactured in BSP: CM1:021017: 100.0 %, Manufactured in BSP: CM1:021017: CM1:021017 100.0 %': 'CM1',
'Manufactured in BSP: CM1:021017: CM1:021017 95.0 %, Manufactured in BSP: CM1:021017: 100.0 %': "CM1",
'Manufactured in BSP: CM1:022017: 50.0 %, Manufactured in BSP: Pancreas Organoid Media: Manufactured in BSP: Basic Medium 40.0 %, Manufactured in BSP: Pancreas Organoid Media: Wnt3A CM 50.0 %, Manufactured in BSP: CM1:022017: CM1:022017 100.0 %, Manufactured in BSP: Pancreas Organoid Media: RspondinI CM 10.0 %, Manufactured in BSP: Pancreas Organoid Media: 50.0 %': 'Pancreas:CM (1:1)',
'Manufactured in BSP: CM2:050417: 100.0 %, Manufactured in BSP: CM1:051917: 100.0 %, Manufactured in BSP: CM1:051917: CM1:051917 100.0 %, Manufactured in BSP: CM2:050417: CM2:050417 100.0 %': "CM1",
'Manufactured in BSP: CM2:050417: 100.0 %, Manufactured in BSP: CM2:050417: CM2:050417 100.0 %': 'CM2',
'Manufactured in BSP: CM2:062316: CM2:062316 100.0 %, Manufactured in BSP: CM1:051917: CM1:051917 99.0 %, Manufactured in BSP: CM1:051917: 100.0 %, Manufactured in BSP: CM2:062316: 100.0 %': 'CM1',
'Manufactured in BSP: M87_WITP: WIT_P 50.0 %, Manufactured in BSP: M87_WITP: M87 50.0 %, Manufactured in BSP: M87_WITP: 100.0 %, M87_WITP': 'M87:WITP (1:1)',
'Manufactured in BSP: NSA: NeuroCult NS-A Basal Medium 90.0 %, Neuro: 100.0%, Manufactured in BSP: NSA: 100.0 %': 'NSA',
'Manufactured in BSP: NSA: NeuroCult NS-A Basal Medium 90.0 %, Neurosphere: 99.0%, Manufactured in BSP: NSA: 100.0 %': 'NSA',
'Manufactured in BSP: NSA: Neurocult-NSA 98.0 %, Manufactured in BSP: NSA: 100.0 %': 'NSA',
'Manufactured in BSP: NSA: Neurocult-NSA 98.0 %, Manufactured in BSP: NSA: NeuroCult NS-A Basal Medium 90.0 %, Manufactured in BSP: NSA: 100.0 %': 'NSA',
'Manufactured in BSP: Neurocult-NSA Media: Neurocult-NSA 98.0 %, Manufactured in BSP: Neurocult-NSA Media: 100.0 %': 'NSA',
'Manufactured in BSP: Neurocult-NSA: Neurocult-NSA 98.0 %, Manufactured in BSP: Neurocult-NSA: 100.0 %': 'NSA',
'Manufactured in BSP: Pancreas Organoid Medai: Manufactured in BSP: Basic Medium 39.0 %, Manufactured in BSP: Pancreas Organoid Medai: 100.0 %, Manufactured in BSP: Pancreas Organoid Medai: Wnt3A CM 50.0 %': 'Pancreas',
'Manufactured in BSP: Pancreas Organoid Medai: RspondinI CM 10.0 %, Manufactured in BSP: Pancreas Organoid Medai: 100.0 %, Manufactured in BSP: Pancreas Organoid Medai: Wnt3A CM 50.0 %, Manufactured in BSP: Pancreas Organoid Medai: Manufactured in BSP: Basic Medium 40.0 %': 'Pancreas',
'Manufactured in BSP: Pancreas Organoid Medai: RspondinI CM 10.0 %, Manufactured in BSP: Pancreas Organoid Medai: 100.0 %, Manufactured in BSP: Pancreas Organoid Medai: Wnt3A CM 50.0 %, Manufactured in BSP: Pancreas Organoid Media: 100.0 %, Manufactured in BSP: Pancreas Organoid Medai: Manufactured in BSP: Basic Medium 40.0 %': 'Pancreas',
'Manufactured in BSP: Pancreas Organoid Medai: RspondinI CM 10.0 %, Manufactured in BSP: Pancreas Organoid Medai: Manufactured in BSP: Basic Medium 39.0 %, Manufactured in BSP: Pancreas Organoid Medai: 100.0 %, Manufactured in BSP: Pancreas Organoid Medai: Wnt3A CM 50.0 %': 'Pancreas',
'Manufactured in BSP: Pancreas Organoid Medai: RspondinI CM 10.0 %, Manufactured in BSP: Pancreas Organoid Medai: Manufactured in BSP: Basic Medium 39.0 %, Manufactured in BSP: Pancreas Organoid Medai: 100.0 %, Manufactured in BSP: Pancreas Organoid Medai: Wnt3A CM 50.0 %, Manufactured in BSP: Pancreas Organoid Media: 100.0 %': 'Pancreas',
'Manufactured in BSP: Pancreas Organoid Medai: RspondinI CM 10.0 %, Manufactured in BSP: Pancreas Organoid Medai: Manufactured in BSP: Basic Medium 39.0 %, Manufactured in BSP: Pancreas Organoid Medai: 100.0 %, Manufactured in BSP: Pancreas Organoid Medai: Wnt3A CM 50.0 %, Panc. Org.: 100.0%': 'Pancreas',
'Manufactured in BSP: Pancreas Organoid Media: Manufactured in BSP: Basic Medium 40.0 %, Manufactured in BSP: Pancreas Organoid Media: Wnt3A CM 50.0 %, Manufactured in BSP: CM2:022317: CM2:022317 100.0 %, Manufactured in BSP: Pancreas Organoid Media: RspondinI CM 10.0 %, Manufactured in BSP: CM2:022317: 50.0 %, Manufactured in BSP: Pancreas Organoid Media: 50.0 %': 'Pancreas',
'Manufactured in BSP: Pancreas Organoid Media: Manufactured in BSP: Basic Medium 40.0 %, Manufactured in BSP: Pancreas Organoid Media: Wnt3A CM 50.0 %, Manufactured in BSP: Pancreas Organoid Media: RspondinI CM 10.0 %, Manufactured in BSP: Pancreas Organoid Media: 100.0 %': 'Pancreas',
'Manufactured in BSP: Pancreas Organoid Media: Wnt3A CM 50.0 %, Manufactured in BSP: Pancreas Organoid Media: RspondinI CM 10.0 %, Manufactured in BSP: Pancreas Organoid Media: Manufactured in BSP: Basal Media 40.0 %, Manufactured in BSP: Pancreas Organoid Media: 100.0 %': 'Pancreas',
'Manufactured in BSP: RETM w/ supplements: 100.0 %, Manufactured in BSP: RETM w/ supplements: RETM Basal Media 91.0 %': 'RETM',
'Manufactured in BSP: RETM: 100.0 %': 'RETM',
'Manufactured in BSP: RETM: 100.0 %, Manufactured in BSP: RETM: RETM 91.0 %': 'RETM',
'Manufactured in BSP: RETM: 100.0 %, Manufactured in BSP: RETM: RETM 95.0 %': 'RETM',
'Manufactured in BSP: RETM: 100.0 %, Manufactured in BSP: RETM: RETM and supplement 100.0 %': 'RETM',
'Manufactured in BSP: RETM_CM: Manufactured in BSP: RETM 50.0 %, RETM_CM, Manufactured in BSP: RETM_CM: Manufactured in BSP: CM2:051117 50.0 %, Manufactured in BSP: RETM_CM: 100.0 %': 'RETM:CM (1:1)',
'Manufactured in BSP: RETM_M87: M87 50.0 %, RETM_M87, Manufactured in BSP: RETM_M87: Manufactured in BSP: RETM 50.0 %, Manufactured in BSP: RETM_M87: 100.0 %': 'RETM:M87 (1:1)',
'Manufactured in BSP: RPMI 90%: 100.0 %, Manufactured in BSP: RPMI 90%: RPMI-1640 90.0 %': 'RPMI',
'Manufactured in BSP: SMBM: 100.0 %, Manufactured in BSP: SMBM: SMBM 94.0 %, Manufactured in BSP: SMGM: 100.0 %': 'SMBM',
'Manufactured in BSP: SMBM: 100.0 %, Manufactured in BSP: SMBM: SMBM 94.0 %, Manufactured in BSP: SMGM: 100.0 %, Manufactured in BSP: SMGM: SMGM 100.0 %, Manufactured in BSP: SMGM: SMGM-2 single Quots 100.0 %': 'SMBM',
'Manufactured in BSP: SMBM: 100.0 %, Manufactured in BSP: SMGM: SMBM 1.0 %, Manufactured in BSP: SMBM: SMBM 94.0 %, Manufactured in BSP: SMGM: 100.0 %': 'SMBM',
'Manufactured in BSP: SMBM: 100.0 %, Manufactured in BSP: SMGM: SMGM-2 single Quots 1.0 %, Manufactured in BSP: SMGM: SMGM 99.0 %, Manufactured in BSP: SMBM: SMBM 94.0 %, Manufactured in BSP: SMGM: 100.0 %': 'SMBM',
'Manufactured in BSP: SMBM: 100.0 %, Manufactured in BSP: SMGM: SMGM-2 single Quots 99.0 %, Manufactured in BSP: SMBM: SMBM 94.0 %, Manufactured in BSP: SMGM: 100.0 %': 'SMBM',
'NS-A: 100.0%': 'NSA',
'NSA: 100.0%': 'NSA',
'Neurosphere: 99.0%, Manufactured in BSP: NSA: 100.0 %, Manufactured in BSP: NSA: NeuroCult NS-A Basal Medium 90.0 U/L': 'NSA',
'Panc. Org.: 100.0%': 'Pancreas',
'Panc: 100.0%': 'Pancreas',
'Pancreas: 100.0%': 'Pancreas',
'RETM: 100.0%, Manufactured in BSP: RETM: 100.0 %': 'RETM',
'RETM: 95.0%': 'RETM',
'RPMI: 100.0%': 'RPMI'}

In [496]:
ann = ann.replace({'base_media': rename})

In [497]:
ann = ann.drop(columns=['expansion', 'history', 'media_type', 'disease_type', 'temp']).rename(columns={'disease_subtype': 'disease_type'})

In [504]:
ann = ann.replace({"base_media":{"organoid": "Organoid",
'3D/Organoid': 'Organoid',
'EMEM': 'MEM',
'EMEM:F12 (1:1)': 'MEM:F12 (1:1)',
'Neurobasal Media:DMEM:F12 (2:1:1)': 'NSA:DMEM:F12 (2:1:1)',}})

In [None]:
ann.loc[ann.base_media=="XVIVO", ["tissue_type", "conversation", "format", "morphology"] = ["tumor", "U", '', '']


In [502]:
loc = ann.base_media.isin(['CM/Organoid',  'Organoid', 'Organoid'])
ann.loc[loc, 'cell_format'] = "3D"

Unnamed: 0_level_0,base_media,cell_format
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ACH-001719,OCMI-L,2D
ACH-001719,OCMI-L,2D
ACH-001201,Neurobasal Media:DMEM:F12 (2:1:1),2D
ACH-001201,Neurobasal Media:DMEM:F12 (2:1:1),2D
AA22-Tumor-SM-GXH2L,3D/Organoid,2D
AA26-Tumor-SM-AU5YZ,CM/Organoid,2D
AA26-Tumor-SM-GXH2N,organoid,2D
AA29-Tumor-SM-GXH2P,Organoid,2D
AA40-Tumor-SM-BZYSE,Pancreas,2D
AA45-Tumor-SM-EXXLQ,Pancreas:CM (1:1),2D


In [531]:
res = []
for val in ann.doublingt.values:
    if type(val) is float:
        if val<30:
            res.append('vslow')
        elif val < 70:
            res.append('slow')
        elif val < 150:
            res.append('medium')
        else:
            res.append('fast')
    else:
        res.append('U')
ann.doublingt = res

### plotting total

In [18]:
red = umap.UMAP(n_neighbors= 10, min_dist= 0.5).fit_transform(total)

In [506]:
from sklearn.decomposition import PCA

In [507]:
red = PCA(n_components=2).fit_transform(total)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
plot.scatter(red, xname="UMAP1", yname="UMAP2", colors=[1]*len(CCLE_expression)+[0]*len(TCGA_expression), radi=.1, labels={k: list(v) for k, v in ann[['tissue_type', 'disease_type', 'cell_type']].reset_index().T.iterrows()})

### saving

In [537]:
ann=ann.astype(str)

In [538]:
AnnData(X=total, obs=ann).write('temp/celligner_multi.h5ad.gz', compression='gzip')

... storing 'collection_site' as categorical
... storing 'disease_type' as categorical
... storing 'doublingt' as categorical
... storing 'method' as categorical
... storing 'reference' as categorical
... storing 'age' as categorical
... storing 'cell_type' as categorical
... storing 'ends' as categorical
... storing 'sequencer' as categorical
... storing 'participant_id' as categorical
... storing 'sex' as categorical
... storing 'base_media' as categorical
... storing 'morphology' as categorical
... storing 'dataset' as categorical
... storing 'ethnicity' as categorical
... storing 'instability' as categorical
... storing 'hasebv' as categorical
... storing 'tissue_id' as categorical
... storing 'sample_type' as categorical
... storing 'align' as categorical
... storing 'counter' as categorical
... storing 'passage_number' as categorical
... storing 'contamination' as categorical
... storing 'lineage' as categorical
... storing 'stage' as categorical
... storing 'conservation' as cat