## Loading packages

In [7]:
from genepy.utils import helper as h

# to comment in your case
from taigapy import TaigaClient
tc = TaigaClient()

from celligner import Celligner
import pandas as pd
# to comment in your case
from depmapomics import tracker as track
#autoreload
%load_ext autoreload
%autoreload 2
#output
from bokeh.plotting import output_notebook
output_notebook()
from gsheets import Sheets
from celligner.params import TISSUE_COLOR

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading expression files

In [None]:
# load from taiga public (figshare link)
# load internal expression,
# latest version can be found at https://depmap.org/portal/download/
# can also be loaded like so pd.read_csv('gs://ccle_default_params/celligner_ex/CCLE_expression.csv.gz', index_col=0)
CCLE_expression = tc.get(name='internal-22q2-097a',
                         file='CCLE_expression_full')  

# load  TCGA expression
# this dataset was generated from  ,using this script: 
# caan be found here: pd.read_csv('gs://ccle_default_params/celligner_ex/TCGA_expression.csv.gz', index_col=0)
TCGA_expression = tc.get(name='celligner-input-9827',
                         file='tumor_expression')

In [None]:
# subset gene names to ensembl ids only
CCLE_expression = CCLE_expression[CCLE_expression.columns[:-92]]
CCLE_expression.columns = list(map(lambda x: x.split(
    ' (')[1][:-1] if ' (' in x else x, CCLE_expression.columns))

common = set(CCLE_expression.columns).intersection(
    set(TCGA_expression.columns))
CCLE_expression = CCLE_expression[list(common)]
TCGA_expression = TCGA_expression[list(common)]

## Managing annotations

In [8]:
# loading annotations
#CCLE_annotation = track.getTracker() 
# the function uses pygsheets to load this: 
REFSHEET_URL="https://docs.google.com/spreadsheets/d/1Pgb5fIClGnErEqzxpU7qqX6ULpGTDjvzWwDN8XUJKIY"
MY_ID="~/.client_secret.json"
MYSTORAGE_ID="~/.storage.json"
CCLE_annotation = Sheets.from_files(MY_ID, MYSTORAGE_ID).get(REFSHEET_URL).sheets[0].to_frame(index_col=0)
# you can also get it from pd.read_csv('gs://ccle_default_params/celligner_ex/CCLE_annotation.csv.gz', index_col=0)


# can be loaded from 
# pd.read_csv('gs://ccle_default_params/celligner_ex/TCGA_annotation.csv.gz', index_col=0)
TCGA_annotation = tc.get(name='celligner-input-9827',
                         file='tumor_annotations') # generated manually 

No dataset version provided. Using version 1.


In [9]:
# transforming annotations
CCLE_annotation = CCLE_annotation.drop_duplicates('arxspan_id').set_index("arxspan_id")
CCLE_annotation = CCLE_annotation.loc[CCLE_expression.index, ["lineage", 'subtype']].rename(columns={"lineage": "tissue_type", "subtype": 'disease_type'})
CCLE_annotation["cell_type"] = "CCLE cell line"

TCGA_annotation = TCGA_annotation.set_index("sampleID").loc[TCGA_expression.index,["lineage",
"subtype"]].rename(columns={"lineage":"tissue_type", "subtype": 'disease_type'})
TCGA_annotation['cell_type'] = "TCGA tumor"

In [10]:
rename = {np.nan: "unknown", "adrenal_cortex": "adrenal", "colorectal": "colon", 'thymus': 'thyroid', 'meninges':"central_nervous_system", None: "unknown", 'brain': "central_nervous_system"}

In [11]:
# some name are not consistent between the two datasets
CCLE_annotation = CCLE_annotation.replace({"tissue_type": rename})
TCGA_annotation = TCGA_annotation.replace({"tissue_type": rename})

## adding other datasets to celligner

for additional explanation on the dataset pre processing : [DepMap Portal: Celligner multidataset documentation](https://docs.google.com/document/d/1PW5GUwqLKj63pqOpi0r7r-9hXVQokSngPHiGWXp4q20/edit)

In [12]:
# met500 
met500_ann = tc.get(name='met500-fc3c', file='met500_ann')
met500_meta = tc.get(name='met500-fc3c', file='met500_meta')
met500_TPM = tc.get(name='met500-fc3c', file='met500_TPM') #20,979x868 matrix

#Novartis_PDX
Novartis_PDX_ann = tc.get(name='pdx-data-3d29', file='Novartis_PDX_ann')
Novartis_PDX_TPM = tc.get(name='pdx-data-3d29', file='Novartis_PDX_TPM').T # 38,087x445

#pediatric_PDX
pediatric_PDX_ann = tc.get(name='pdx-data-3d29', file='pediatric_PDX_ann')
pediatric_PDX_TPM = tc.get(name='pdx-data-3d29', file='pediatric_PDX_TPM') #80,000x250

No dataset version provided. Using version 1.
No dataset version provided. Using version 1.
No dataset version provided. Using version 1.
No dataset version provided. Using version 2.
No dataset version provided. Using version 2.
No dataset version provided. Using version 2.
No dataset version provided. Using version 2.


In [13]:
### adding CCLF

In [14]:
import dalmatian as dm

In [28]:
todrop = []
for i in h.dups([i.split('.')[0] for i in cclf.columns]):
    cols = cclf.columns[cclf.columns.str.contains(i)]
    cclf[i] = cclf[cols].mean(1)
    todrop.extend(cols)
cclf = cclf.drop(columns=cols)
cclf.columns = [i.split('.')[0] for i in cclf.columns]

In [None]:
file = dm.WorkspaceManager('nci-mimoun-bi-org/CCLF_RNA_2_0').get_sample_sets().loc['all', 'genes_tpm']
cclf = pd.read_csv(file, sep="\t").drop(columns='transcript_id(s)').set_index('Unnamed: 0', drop=True).T

todrop = []
for i in h.dups([i.split('.')[0] for i in cclf.columns]):
    cols = cclf.columns[cclf.columns.str.contains(i)]
    cclf[i] = cclf[cols].mean(1)
    todrop.extend(cols)
cclf = cclf.drop(columns=cols)
cclf.columns = [i.split('.')[0] for i in cclf.columns]

### Managing annotations

In [10]:
met500_meta["primary_site"] = met500_ann['primary_site'].values
met500_ann = met500_meta.rename(columns={"Sample_id": 'sample_id', 'tissue': 'tissue_type', 'primary_site': "disease_type", "sample_type": "cell_type"}).set_index('sample_id', drop=True)[["tissue_type","disease_type","cell_type"]].replace({"tissue_type":rename, "cell_type": {"tumor": "met500 tumor"}})

In [11]:
pediatric_PDX_ann = pediatric_PDX_ann.rename(columns={"sampleID": 'sample_id', 'lineage': 'tissue_type', 'subtype': "disease_type", "type": "cell_type"}).set_index('sample_id', drop=True)[['cell_type', 'disease_type', 'tissue_type']].replace({"tissue_type":rename})

In [12]:
Novartis_PDX_ann = Novartis_PDX_ann.rename(columns={"sampleID": 'sample_id', 'lineage': 'tissue_type', 'subtype': "disease_type", "type": "cell_type"}).set_index('sample_id', drop=True)[['cell_type', 'disease_type', 'tissue_type']].replace({"tissue_type":rename})

In [13]:
Novartis_PDX_ann = Novartis_PDX_ann.loc[Novartis_PDX_TPM.index]

In [14]:
pediatric_PDX_ann['cell_type'] = "ped PDX"

In [15]:
# HCMI dataset
# Code to generate this dataset can be found here:
# https://github.com/broadinstitute/hcmi-processing/blob/main/hcmi-rna-analysis-210226.ipynb
hcmi_ltpm = tc.get(name='hcmi-data-ac4b', file='hcmi_ltpm').T # 60486 x 157
hcmi_sample_info = tc.get(name='hcmi-data-ac4b', file='hcmi_sample_info')
#sample_info = tc.get(name='hcmi-data-ac4b', file='sample-info')

hcmi_sample_info = hcmi_sample_info[['subtype', 'sampleID','lineage', 'type']].set_index("sampleID", drop=True).rename(columns={'lineage': 'tissue_type', 'subtype': 'disease_type'}).replace({'tissue_type':{
 None: 'NS',
 'ampulla of vater': 'ampulla_of_vater',
 'bronchus and lung': 'lung',
 'extrahepatic bile duct': 'bile_duct',
 'intrahepatic bile duct': 'bile_duct',
 'rectosigmoid junction': 'colon',
 'rectum': 'colon',
 'brain': 'central_nervous_system',
 'small intestine': 'small_intestine',
}})

hcmi_sample_info['cell_type'] = "hcmi models"

hcmi_ltpm = hcmi_ltpm[hcmi_ltpm.columns[~hcmi_ltpm.columns.isna()]]

No dataset version provided. Using version 7.
No dataset version provided. Using version 7.


## Fitting celliner with the CCLE dataset 

In [16]:
tumor_alligner = Celligner()
tumor_alligner.fit(TCGA_expression, TCGA_annotation)

fetching gene names from biomart cache
using only usefull genes
looking at 12236 samples.
found 29567 common genes
creating a fit dataset..
clustering...
         Falling back to preprocessing with `sc.pp.pca` and default params.
doing differential expression analysis on the clusters
running differential expression on 55 clusters
running limmapy on the samples
you need to have R installed with the limma library installed
3.4.5
done


<celligner.Celligner at 0x7f06bdbbfc50>

In [31]:
from genepy import rna

In [37]:
rna.plot(met500_TPM, met500_ann)

PCA...
UMAP...
plot...


In [17]:
tumor_alligner.mnn_kwargs = {'k1': 10, 'k2': 50, 'cosine_norm': False, "fk":5}
# making args that work for smaller datasets
tumor_alligner.neightbors_kwargs = {"n_neighbors": 10, "n_pcs": 70,}
tumor_alligner.louvain_kwargs = {"resolution": 3,}
_ = tumor_alligner.transform(met500_TPM, met500_ann, _doCPCA=False)
tumor_alligner.putAllToFit(redo_diff=False)

# making args that work for smaller datasets
tumor_alligner.neightbors_kwargs = {"n_neighbors": 5, "n_pcs": 30,}
tumor_alligner.louvain_kwargs = {"resolution": 3,}
tumor_alligner.mnn_kwargs = {'k1': 3, 'k2': 50}
_ = tumor_alligner.transform(hcmi_ltpm[hcmi_sample_info['type']=='hcmi_tumor'], hcmi_sample_info[hcmi_sample_info['type']=='hcmi_tumor'], _doCPCA=False)
tumor_alligner.putAllToFit(redo_diff=False)

looking at 868 samples.
found 18218 common genes
creating a transform input..
clustering..
         Falling back to preprocessing with `sc.pp.pca` and default params.
doing differential expression analysis on the clusters..
running differential expression on 34 clusters
running limmapy on the samples
you need to have R installed with the limma library installed
3.4.5
there is 0.388 overlap between the fit and transform dataset in their most variable genes
doing the MNN analysis using Marioni et al. method..
  Looking for MNNs...
  Found 7509 mutual nearest neighbors.
done
clustering...
         Falling back to preprocessing with `sc.pp.pca` and default params.
done
looking at 64 samples.
found 18215 common genes
creating a transform input..
clustering..
         Falling back to preprocessing with `sc.pp.pca` and default params.
doing differential expression analysis on the clusters..
running differential expression on 11 clusters
running limmapy on the samples
you need to have R instal

Compilation is falling back to object mode WITH looplifting enabled because Function "l2_norm" failed type inference due to: [1m[1m[1mNo implementation of function Function(<function norm at 0x7f08944c1378>) found for signature:
 
 >>> norm(x=array(float32, 2d, A), axis=Literal[int](1))
 
There are 2 candidate implementations:
[1m    - Of which 2 did not match due to:
    Overload in function 'norm_impl': File: numba/np/linalg.py: Line 2352.
      With argument(s): '(x=array(float32, 2d, A), axis=int64)':[0m
[1m     Rejected as the implementation raised a specific error:
       TypeError: norm_impl() got an unexpected keyword argument 'x'[0m
  raised from /home/jeremie/miniconda3/lib/python3.7/site-packages/numba/core/typing/templates.py:722
[0m
[0m[1mDuring: resolving callee type: Function(<function norm at 0x7f08944c1378>)[0m
[0m[1mDuring: typing of call at /home/jeremie/celligner/celligner/mnnpy/mnnpy/utils.py (16)
[0m
[1m
File "../celligner/mnnpy/mnnpy/utils.py", lin

  Looking for MNNs...
  Found 192 mutual nearest neighbors.
done
clustering...
         Falling back to preprocessing with `sc.pp.pca` and default params.
done


In [23]:
tumor_alligner.fit_annotations.replace({'disease_type': disease_rename}, inplace=True)
tumor_alligner.fit_annotations.disease_type = tumor_alligner.fit_annotations.disease_type.str.lower().str.replace(' ', '_')
tumor_alligner.fit_annotations.replace({'disease_type':  disease_rename2}, inplace=True)

In [29]:
imp = np.zeros(len(tumor_alligner.fit_annotations))
imp[tumor_alligner.fit_annotations.cell_type.values=='TCGA tumor'] = 3
shape = tumor_alligner.fit_annotations.cell_type.values
tumor_alligner.plot(color_column="tissue_type", colortable=TISSUE_COLOR, rerun=True, radi=5, shape=shape, importance=imp, folder="../temp/22Q1_hcmiMAIN_new_tumoronly")

no corrected data
reducing dimensionality...
making plot...


You need to install Selenium to save the svg!


  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")


In [47]:
imp = np.zeros(len(full_alligner.annotations))
imp[full_alligner.annotations.cell_type.values=='TCGA tumor'] = 2
shape = full_alligner.annotations.cell_type.values
full_alligner.plot(color_column="tissue_type", colortable=TISSUE_COLOR, rerun=False, radi=5, shape=shape, importance=imp, folder="../temp/22Q1_hcmiMAIN_new")

making plot...


You need to install Selenium to save the svg!


  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")


In [67]:
model_alligner = Celligner()
model_alligner.fit(CCLE_expression, CCLE_annotation)

model_alligner.mnn_kwargs = {'k1': 5, 'k2': 5, 'cosine_norm': False, "fk":5}
model_alligner.neightbors_kwargs = {"n_neighbors": 10, "n_pcs": 70,}
model_alligner.louvain_kwargs = {"resolution": 4,}
_= model_alligner.transform(hcmi_ltpm, hcmi_sample_info, _doCPCA=False)
model_alligner.putAllToFit(redo_diff=False)

model_alligner.mnn_kwargs = {'k1': 5, 'k2': 5, 'cosine_norm': False, "fk":5}
model_alligner.neightbors_kwargs = {"n_neighbors": 10, "n_pcs": 70,}
model_alligner.louvain_kwargs = {"resolution": 4,}
_= model_alligner.transform(Novartis_PDX_TPM, Novartis_PDX_ann, _doCPCA=False)
model_alligner.putAllToFit(redo_diff=False)

_= model_alligner.transform(pediatric_PDX_TPM, pediatric_PDX_ann, _doCPCA=False)
model_alligner.putAllToFit(redo_diff=False)

  and should_run_async(code)


fetching gene names from biomart cache
using only usefull genes
looking at 1457 samples.
found 29567 common genes
creating a fit dataset..
clustering...
         Falling back to preprocessing with `sc.pp.pca` and default params.
doing differential expression analysis on the clusters
running differential expression on 34 clusters
running limmapy on the samples
you need to have R installed with the limma library installed
3.4.5
done
looking at 156 samples.
found 29511 common genes
creating a transform input..
clustering..
         Falling back to preprocessing with `sc.pp.pca` and default params.
doing differential expression analysis on the clusters..
running differential expression on 15 clusters
running limmapy on the samples
you need to have R installed with the limma library installed
3.4.5
there is 0.264 overlap between the fit and transform dataset in their most variable genes
doing the MNN analysis using Marioni et al. method..
  Looking for MNNs...


  k_index_1 = cKDTree(data1).query(x=data2, k=k1, n_jobs=n_jobs)[1]
  k_index_2 = cKDTree(data2).query(x=data1, k=k2, n_jobs=n_jobs)[1]


  Found 310 mutual nearest neighbors.


  n_jobs=n_jobs)


done
clustering...
         Falling back to preprocessing with `sc.pp.pca` and default params.
done
looking at 445 samples.
found 25138 common genes
creating a transform input..
clustering..
         Falling back to preprocessing with `sc.pp.pca` and default params.
doing differential expression analysis on the clusters..
running differential expression on 20 clusters
running limmapy on the samples
you need to have R installed with the limma library installed
3.4.5
there is 0.33 overlap between the fit and transform dataset in their most variable genes
doing the MNN analysis using Marioni et al. method..
  Looking for MNNs...


  k_index_1 = cKDTree(data1).query(x=data2, k=k1, n_jobs=n_jobs)[1]
  k_index_2 = cKDTree(data2).query(x=data1, k=k2, n_jobs=n_jobs)[1]


  Found 821 mutual nearest neighbors.


  n_jobs=n_jobs)


done
clustering...
         Falling back to preprocessing with `sc.pp.pca` and default params.
done
looking at 244 samples.
found 25138 common genes
creating a transform input..
clustering..
         Falling back to preprocessing with `sc.pp.pca` and default params.
doing differential expression analysis on the clusters..
running differential expression on 17 clusters
running limmapy on the samples
you need to have R installed with the limma library installed
3.4.5
there is 0.372 overlap between the fit and transform dataset in their most variable genes
doing the MNN analysis using Marioni et al. method..
  Looking for MNNs...


  k_index_1 = cKDTree(data1).query(x=data2, k=k1, n_jobs=n_jobs)[1]
  k_index_2 = cKDTree(data2).query(x=data1, k=k2, n_jobs=n_jobs)[1]


  Found 469 mutual nearest neighbors.


  n_jobs=n_jobs)


done
clustering...
         Falling back to preprocessing with `sc.pp.pca` and default params.
done


In [68]:
model_alligner.fit_annotations.replace({'disease_type': disease_rename}, inplace=True)
model_alligner.fit_annotations.disease_type = model_alligner.fit_annotations.disease_type.str.lower().str.replace(' ', '_')
model_alligner.fit_annotations.replace({'disease_type':  disease_rename2}, inplace=True)

  and should_run_async(code)


In [69]:
shape = model_alligner.fit_annotations.cell_type.values
model_alligner.plot(color_column="tissue_type", colortable=TISSUE_COLOR, rerun=True, radi=5, shape=shape, folder = "../temp/22Q1_hcmiMAIN_new_modelonly")

no corrected data
reducing dimensionality...


  and should_run_async(code)


making plot...


You need to install Selenium to save the svg!


  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")


In [70]:
full_alligner = Celligner(make_plots=False)
full_alligner.neightbors_kwargs = {"n_neighbors": 20, "n_pcs": 70,}
full_alligner.louvain_kwargs = {"resolution": 4,}
full_alligner.fit(tumor_alligner.fit_input, tumor_alligner.fit_annotations)
full_alligner.mnn_kwargs = {'k1': 5, 'k2': 50, 'cosine_norm': False, "fk":5}
full_alligner.neightbors_kwargs = {"n_neighbors": 15, "n_pcs": 70,}
full_alligner.louvain_kwargs = {"resolution": 5,}
_= full_alligner.transform(model_alligner.fit_input, model_alligner.fit_annotations)

  and should_run_async(code)


fetching gene names from biomart cache
using only usefull genes
looking at 13168 samples.
found 18215 common genes
creating a fit dataset..
clustering...
         Falling back to preprocessing with `sc.pp.pca` and default params.
doing differential expression analysis on the clusters
running differential expression on 54 clusters
running limmapy on the samples
you need to have R installed with the limma library installed
3.4.5
done
looking at 2302 samples.
found 18046 common genes
creating a transform input..
clustering..
         Falling back to preprocessing with `sc.pp.pca` and default params.
doing differential expression analysis on the clusters..
running differential expression on 48 clusters
running limmapy on the samples
you need to have R installed with the limma library installed
3.4.5
there is 0.384 overlap between the fit and transform dataset in their most variable genes
doing cPCA..
transform
regressing out the cPCA components..
doing the MNN analysis using Marioni et al.

  k_index_1 = cKDTree(data1).query(x=data2, k=k1, n_jobs=n_jobs)[1]
  k_index_2 = cKDTree(data2).query(x=data1, k=k2, n_jobs=n_jobs)[1]


  Found 7149 mutual nearest neighbors.


  n_jobs=n_jobs)


done


In [73]:
imp = np.zeros(len(full_alligner.fit_annotations))
imp[full_alligner.fit_annotations.cell_type.values=='TCGA tumor'] = 2
imp = np.hstack([imp, np.zeros(len(full_alligner.transform_annotations))])
shape = np.hstack([full_alligner.fit_annotations.cell_type.values, full_alligner.transform_annotations.cell_type.values])
full_alligner.plot(color_column="tissue_type", colortable=TISSUE_COLOR, rerun=True, radi=5, shape=shape, importance=imp, folder="../temp/22Q1_hcmiMAIN_new")

  and should_run_async(code)


reducing dimensionality...
making plot...


  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")


You need to install Selenium to save the svg!


In [50]:
import scib
from anndata import AnnData

  and should_run_async(code)


In [52]:
overlap = list(set(hcmi_ltpm.columns) & set(met500_TPM.columns) & set(CCLE_expression.columns) & set(Novartis_PDX_TPM.columns) & set(pediatric_PDX_TPM.columns) & set(TCGA_expression.columns))

In [55]:
dat = pd.concat([hcmi_ltpm[overlap], met500_TPM[overlap], CCLE_expression[overlap], Novartis_PDX_TPM[overlap], pediatric_PDX_TPM[overlap], TCGA_expression[overlap]])

In [56]:
dat.shape

  and should_run_async(code)


(15406, 18047)

In [58]:
newdataset.shape

  and should_run_async(code)


(13168, 18215)

In [60]:
set(newdataset.obs.cell_type)

  and should_run_async(code)


{'TCGA tumor', 'hcmi models', 'met500 tumor'}

In [54]:
dataset = AnnData(pd.concat([hcmi_ltpm[overlap], met500_TPM[overlap], CCLE_expression[overlap], Novartis_PDX_TPM[overlap], pediatric_PDX_TPM[overlap], TCGA_expression[overlap]]), obs = full_alligner.fit_annotations.append(full_alligner.transform_annotations))

  and should_run_async(code)


ValueError: Index of obs must match index of X.

In [63]:
full_alligner

  and should_run_async(code)


<celligner.Celligner at 0x7f043a3efd68>

In [62]:
full_alligner.corrected.shape

  and should_run_async(code)


AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
full_alligner.fit_input.append(full_alligner.transform_input)

In [57]:
newdataset = AnnData(full_alligner.fit_input.append(full_alligner.transform_input), obs = full_alligner.fit_annotations.append(full_alligner.transform_annotations))

  and should_run_async(code)


In [None]:
dataset = dataset.loc[newdataset.obs.index]

In [65]:
dataset.obs.cell_type = dataset.obs.cell_type.astype('category')
dataset.obs.tissue_type = dataset.obs.tissue_type.astype('category')
newdataset.adata_latent.obs.cell_type = evaluator.adata_latent.obs.cell_type.astype('category')
evaluator.adata_latent.obs.tissue_type = evaluator.adata_latent.obs.tissue_type.astype('category')

In [71]:
evaluator.adata_latent.obsm['X_pca'] = evaluator.adata_latent.X

ValueError: Value passed for key 'X_pca' is of incorrect shape. Values of obsm must match dimensions (0,) of parent. Value had shape (14418, 16) while it should have had (14692,).

In [72]:
scib.metrics.metrics(dataset[evaluator.adata_latent[loc].obs.index], evaluator.adata_latent[loc], batch_key="cell_type", label_key='tissue_type', cell_cycle_=False, organism='human', trajectory_=False, isolated_labels_asw_=True, silhouette_=True, hvg_score_=False, graph_conn_=True, pcr_=True, isolated_labels_f1_=False, nmi_=True, ari_=True, kBET_=True, ilisi_=True, clisi_=True,)

NMI...
ARI...
Silhouette score...
PC regression...
Isolated labels ASW...
Graph connectivity...
kBET...
cLISI score...
/tmp/lisi_f_sfja7t/input.mtx /tmp/lisi_f_sfja7t/
iLISI score...
/tmp/lisi_90rg02kf/input.mtx /tmp/lisi_90rg02kf/


Unnamed: 0,0
NMI_cluster/label,0.831564
ARI_cluster/label,0.701115
ASW_label,0.560188
ASW_label/batch,0.845711
PCR_batch,0.926795
cell_cycle_conservation,
isolated_label_F1,
isolated_label_silhouette,0.482754
graph_conn,0.83428
kBET,


In [22]:
disease_rename = {'--': 'U',
 'ALL': 'ALL',
 'AML': 'AML',
 'ASPS': 'ASPS',
 'ATL': 'ATL',
 'ATRT': 'ATRT',
 'Acinar adenocarcinoma': 'adenocarcinoma',
 'Acute Lymphoblastic Leukemia (ALL), B-cell': 'ALL',
 'Adenocarcinoma': 'adenocarcinoma',
 'Adenocarcinoma (NOS)': 'adenocarcinoma',
 'Adenocarcinoma ductal type': 'adenocarcinoma ductal',
 'Alveolar': 'alveolar',
 'B-cell': 'leukemia',
 'B-cell, Non-Hodgkins, Burkitts': 'leukemia',
 'Breast Ductal Carcinoma': 'ductal carcinoma',
 'CLL': 'CLL',
 'CML': 'CML',
 'CNS germinoma': 'CNS germinoma',
 'Carcinoma': 'carcinoma',
 'Carcinoma (NOS)': 'carcinoma',
 'Cholangiocarcinoma': 'Cholangiocarcinoma',
 'Clear Cell Sarcoma': 'Clear Cell Sarcoma',
 'Colon Carcinoma': 'carcinoma',
 'DIPG': 'DIPG',
 'ETMR': 'ETMR',
 'Engineered': 'U',
 'Ependymoblastoma': 'Ependymoblastoma',
 'Ependymoma': 'Ependymoma',
 'Epithelioid sarcoma': 'Epithelioid sarcoma',
 'Esophagus adenocarcinoma (NOS)': 'adenocarcinoma',
 'Ewing Sarcoma': 'Ewing_sarcoma',
 'Ewing sarcoma': 'Ewing_sarcoma',
 'Ewing_sarcoma': 'Ewing_sarcoma',
 'Ewings Sarcoma': 'Ewing_sarcoma',
 'Extracranial Rhabdoid': 'Extracranial Rhabdoid',
 'Extrahepatic cholangiocarcinoma': 'Extrahepatic cholangiocarcinoma',
 'Glioblastoma': 'Glioblastoma',
 'Gliosarcoma': 'Gliosarcoma',
 'HER2-enriched': 'U',
 'Hepatoblastoma': 'Hepatoblastoma',
 'High-grade glioma': 'High-grade glioma',
 'INI-deficient soft tissue sarcoma NOS': 'INI-deficient soft tissue sarcoma',
 'Intrahepatic cholangiocarcinoma': 'Intrahepatic cholangiocarcinoma',
 'MMMT': 'MMMT',
 'MPNST': 'MPNST',
 'Medulloblastoma': 'Medulloblastoma',
 'Melanoma': 'Melanoma',
 'Melanoma, NOS': 'Melanoma',
 'Merkel_cell_carcinoma': 'Merkel_cell_carcinoma',
 'Mesothelioma': 'Mesothelioma',
 'Metaplastic ductal carcinoma': 'Metaplastic ductal carcinoma',
 'Mucinous adenocarcinoma': 'Mucinous adenocarcinoma',
 'NOS': 'U',
 'NS': 'U',
 'NSCLC': 'NSCLC',
 'NUT midline carcinoma': 'NUT midline carcinoma',
 'Neuroblastoma': 'Neuroblastoma',
 None: 'U',
 'Osteosarcoma': 'Osteosarcoma',
 'PNET': 'PNET',
 'Pleomorphic Xanthoastrocytoma': 'Pleomorphic Xanthoastrocytoma',
 'Rhabdomyosarcoma': 'Rhabdomyosarcoma',
 'SCCOHT': 'SCCOHT',
 'SCLC': 'SCLC',
 'SN': 'SN',
 'Sessile serrated adenoma': 'Sessile serrated adenoma',
 'Small Cell Carcinoma': 'Small Cell Carcinoma',
 'Squamous Cell Carcinoma': 'Squamous',
 'Squamous cell carcinoma': 'Squamous',
 'Tubulovillous adenoma': 'Tubulovillous adenoma',
 'Uveal Melanoma': 'Uveal_Melanoma',
 'Wilms': 'Wilms',
 'acinar cell carcinoma': 'acinar cell carcinoma',
 'acute leukemia': 'leukemia',
 'acute lymphoblastic leukemia': 'ALL',
 'acute megakaryoblastic leukemia': 'acute megakaryoblastic leukemia',
 'acute myeloid leukemia': 'AML',
 'acute undifferentiated leukemia': 'leukemia',
 'adenocarcinoma': 'adenocarcinoma',
 'adenosquamous_carcinoma': 'adenosquamous_carcinoma',
 'adrenal': 'U',
 'adrenal_carcinoma': 'carcinoma',
 'adrenocortical adenoma': 'adrenocortical adenoma',
 'adrenocortical carcinoma': 'adrenocortical carcinoma',
 'alveolar rhabdomyosarcoma': 'alveolar rhabdomyosarcoma',
 'alveolar sarcoma': 'alveolar sarcoma',
 'anaplastic': 'anaplastic',
 'angiosarcoma': 'angiosarcoma',
 'astrocytoma_Grade_IV': 'astrocytoma_Grade_IV',
 'atypical teratoid/rhabdoid tumor': 'atypical teratoid/rhabdoid tumor',
 'basal': 'basal',
 'basal_cell_carcinoma': 'basal_cell_carcinoma',
 'basaloid_carcinoma': 'basaloid_carcinoma',
 'biphasic': 'biphasic',
 'bladder': 'U',
 'bladder urothelial carcinoma': 'urothelial carcinoma',
 'bladder_carcinoma': 'carcinoma',
 'brain': 'U',
 'breast': 'U',
 'breast_adenocarcinoma': 'adenocarcinoma',
 'breast_carcinoma': 'carcinoma',
 'breast_ductal_carcinoma': 'ductal_carcinoma',
 'brenner_tumor': 'brenner_tumor',
 'caecum_adenocarcinoma': 'adenocarcinoma',
 'carcinosarcoma-malignant_mesodermal_mixed_tumour': 'carcinosarcoma-malignant_mesodermal_mixed_tumour',
 'cervical_adenocarcinoma': 'adenocarcinoma',
 'cervical_carcinoma': 'carcinoma',
 'cervical_squamous': 'squamous',
 'cholangiocarcinoma': 'cholangiocarcinoma',
 'chondroblastic': 'chondroblastic',
 'chondrosarcoma': 'chondrosarcoma',
 'choriocarcinoma': 'choriocarcinoma',
 'choroid plexus carcinoma': 'choroid plexus carcinoma',
 'clear cell carcinoma of cervix': 'clear_cell_carcinoma',
 'clear_cell_carcinoma': 'clear_cell_carcinoma',
 'clear_cell_renal_cell_carcinoma': 'clear_cell_carcinoma',
 'colon': 'U',
 'colon adenocarcinoma': 'adenocarcinoma',
 'colorectal_adenocarcinoma': 'adenocarcinoma',
 'craniopharyngioma': 'craniopharyngioma',
 'dedifferentiated': 'U',
 'dedifferentiated liposarcoma': 'liposarcoma',
 'desmoplastic small round cell tumor': 'desmoplastic small round cell tumor',
 'diffuse large B-cell lymphoma': 'diffuse_lymphoma',
 'diffuse_adenocarcinoma': 'diffuse_adenocarcinoma',
 'diffuse_large_B_cell_lymphoma': 'diffuse_lymphoma',
 'ductal_carcinoma': 'ductal_carcinoma',
 'ductal_carcinoma_in_situ': 'ductal_carcinoma',
 'duodenal_adenocarcinoma': 'duodenal_adenocarcinoma',
 'dysembryoplastic neuroepithelial tumor': 'dysembryoplastic neuroepithelial tumor',
 'embryo_carcinoma': 'carcinoma',
 'embryonal rhabdomyosarcoma': 'rhabdomyosarcoma',
 'embryonal tumor with multilayer rosettes': 'tumor with multilayer rosettes',
 'endocrine': 'endocrine',
 'endometrial_adenocarcinoma': 'adenocarcinoma',
 'endometrial_adenosquamous': 'adenosquamous',
 'endometrial_squamous': 'squamous',
 'endometrial_stromal_sarcoma': 'stromal_sarcoma',
 'endometrioid_carcinoma': 'carcinoma',
 'ependymoma': 'ependymoma',
 'ependymoma_Grade_III-IV': 'ependymoma_Grade_III-IV',
 'epithelioid hemangioendothelioma': 'hemangioendothelioma',
 'epithelioid sarcoma': 'sarcoma',
 'epithelioid_sarcoma': 'sarcoma',
 'esophageal carcinoma': 'carcinoma',
 'esophagus': 'esophagus',
 'esophagus_adenocarcinoma': 'adenocarcinoma',
 'esophagus_squamous': 'squamous',
 'exocrine': 'exocrine',
 'fibroblast_bone': 'fibroblast',
 'fibroblast_breast': 'fibroblast',
 'fibroblast_colorectal': 'fibroblast',
 'fibroblast_lung': 'fibroblast',
 'fibroblast_lymphocyte': 'fibroblast',
 'fibroblast_skin': 'fibroblast',
 'fibroblast_soft_tissue': 'fibroblast',
 'fibroblast_upper_aerodigestive': 'fibroblast',
 'fibroblast_urinary_tract': 'fibroblast',
 'fibrolamellar hepatocellular carcinoma': 'fibrolamellar hepatocellular carcinoma',
 'fibromatosis': 'fibromatosis',
 'fibrosarcoma': 'fibrosarcoma',
 'follicular': 'follicular',
 'gall_bladder': 'U',
 'gallbladder_adenocarcinoma': 'adenocarcinoma',
 'gastric_adenocarcinoma': 'adenocarcinoma',
 'gastric_small_cell': 'small_cell',
 'gastrointestinal stromal tumor': 'stromal_tumor',
 'glassy_cell_carcinoma': 'glassy_cell_carcinoma',
 'glioblastoma multiforme': 'glioblastoma multiforme',
 'glioma': 'glioma',
 'gliomatosis cerebri': 'gliomatosis cerebri',
 'haemangiopericytic': 'haemangiopericytic',
 'hepatoblastoma': 'hepatoblastoma',
 'hepatocellular carcinoma': 'carcinoma',
 'hepatocellular_carcinoma': 'carcinoma',
 'hodgkin_lymphoma': 'hodgkin_lymphoma',
 'in collision with cholangicarcinoma, no review in spectrum': 'cholangicarcinoma',
 'infantile fibrosarcoma': 'fibrosarcoma',
 'juvenile myelomonocytic leukemia': 'myelomonocytic leukemia',
 'kidney': 'U',
 'kidney chromophobe': 'chromophobe',
 'kidney clear cell carcinoma': 'clear cell carcinoma',
 'krukenberg_tumor': 'krukenberg_tumor',
 'large_cell_carcinoma': 'large_cell_carcinoma',
 'leiomyosarcoma': 'leiomyosarcoma',
 'liposarcoma': 'liposarcoma',
 'liver': 'U',
 'lobular_carcinoma': 'lobular_carcinoma',
 'luminal A': 'luminal A',
 'luminal B': 'luminal B',
 'lung': 'U',
 'lung adenocarcinoma': 'adenocarcinoma',
 'lung squamous cell carcinoma': 'squamous cell carcinoma',
 'lung_carcinoid': 'carcinoid',
 'lymphoma_unspecified': 'lymphoma_unspecified',
 'malignant peripheral nerve sheath tumor': 'malignant peripheral nerve sheath tumor',
 'malignant_fibrous_histiocytoma': 'malignant_fibrous_histiocytoma',
 'malignant_rhabdoid_tumor': 'malignant_rhabdoid_tumor',
 'medulloblastoma': 'medulloblastoma',
 'melanocytic': 'melanocytic',
 'melanoma': 'melanoma',
 'melanotic neuroectodermal tumor': 'melanotic neuroectodermal tumor',
 'meningioma': 'meningioma',
 'merkel_cell_carcinoma': 'merkel_cell_carcinoma',
 'mesothelioma': 'mesothelioma',
 'metaplastic_carcinoma': 'metaplastic_carcinoma',
 'mixed_adenosquamous_carcinoma': 'mixed_adenosquamous_carcinoma',
 'mixed_carcinoma': 'mixed_carcinoma',
 'mixed_germ_cell': 'mixed_germ_cell',
 'monophasic': 'monophasic',
 'mucinous_carcinoma': 'mucinous_carcinoma',
 'mullerian_carcinoma': 'mullerian_carcinoma',
 'multiple_myeloma': 'multiple_myeloma',
 'myeloid neoplasm NOS': 'myeloid neoplasm',
 'myoepithelial carcinoma': 'myoepithelial carcinoma',
 'myofibromatosis': 'myofibromatosis',
 'myxofibrosarcoma': 'myxofibrosarcoma',
 'myxoid': 'myxoid',
 'myxoid_round cell': 'myxoid_round cell',
 np.nan: 'U',
 'nasopharyngeal carcinoma': 'nasopharyngeal carcinoma',
 'neural crest-like': 'neural crest-like',
 'neuroblastoma': 'neuroblastoma',
 'neuroendocrine': 'neuroendocrine carcinoma',
 'neuroendocrine carcinoma': 'neuroendocrine carcinoma',
 'neurofibroma': 'neurofibroma',
 'nodular': 'nodular',
 'non-seminoma': 'non-seminoma',
 'non_hodgkin_lymphoma': 'non_hodgkin_lymphoma',
 'non_small_cell_carcinoma': 'non_small_cell_carcinoma',
 'normal': 'normal',
 'oral': 'U',
 'osteosarcoma': 'osteosarcoma',
 'other': 'U',
 'ovarian serous cystadenocarcinoma': 'serous cystadenocarcinoma',
 'ovary': 'U',
 'ovary_adenocarcinoma': 'adenocarcinoma',
 'ovary_carcinoma': 'carcinoma',
 'pancreas': 'U',
 'papillary renal cell carcinoma': 'papillary renal cell carcinoma',
 'parotid': 'U',
 'perivascular epithelioid cell neoplasms': 'perivascular epithelioid cell neoplasms',
 'pheochromocytoma & paraganglioma': 'pheochromocytoma & paraganglioma',
 'pineal parenchymal tumor': 'pineal parenchymal tumor',
 'pleomorphic': 'pleomorphic_sarcoma',
 'pleomorphic_sarcoma': 'pleomorphic_sarcoma',
 'pleuropulmonary blastoma': 'blastoma',
 'prostate': 'U',
 'prostate adenocarcinoma': 'adenocarcinoma',
 'prostate_adenocarcinoma': 'adenocarcinoma',
 'prostate_hyperplasia': 'hyperplasia',
 'prostate_small_cell': 'small_cell',
 'rectum adenocarcinoma': 'rectum adenocarcinoma',
 'renal_cell_carcinoma': 'renal_cell_carcinoma',
 'retinoblastoma': 'retinoblastoma',
 'rhabdomyosarcoma': 'rhabdomyosarcoma',
 'rosette forming glioneuronal tumor': 'rosette forming glioneuronal tumor',
 'sarcoma': 'sarcoma',
 'sarcomatoid': 'sarcomatoid_carcinoma',
 'sarcomatoid_carcinoma': 'sarcomatoid_carcinoma',
 'seminoma': 'seminoma',
 'serous_carcinoma': 'serous_carcinoma',
 'skin': 'U',
 'skin_squamous': 'squamous',
 'spindle': 'sclerosing rhabdomyosarcoma',
 'spindle cell/sclerosing rhabdomyosarcoma': 'sclerosing rhabdomyosarcoma',
 'squamous_cell_carcinoma': 'squamous',
 'stomach': 'U',
 'stomach adenocarcinoma': 'adenocarcinoma',
 'storiform_pleomorphic': 'storiform_pleomorphic',
 'supratentorial embryonal tumor NOS': 'supratentorial embryonal tumor',
 'synovial sarcoma': 'synovial_sarcoma',
 'synovial_sarcoma': 'synovial_sarcoma',
 'teratoma': 'teratoma',
 'testis': 'U',
 'thymic carcinoma': 'carcinoma',
 'thymoma': 'thymoma',
 'thymus': 'U',
 'thyroid': 'U',
 'thyroid carcinoma': 'carcinoma',
 'thyroid_carcinoma': 'carcinoma',
 'thyroid_sarcoma': 'sarcoma',
 'thyroid_squamous': 'squamous',
 'transitional_cell_carcinoma': 'transitional_cell_carcinoma',
 'transitory': 'transitional_cell_carcinoma',
 'undifferentiated': 'U',
 'undifferentiated hepatic sarcoma': 'sarcoma',
 'undifferentiated pleomorphic sarcoma': 'pleomorphic_sarcoma',
 'undifferentiated sarcoma NOS': 'sarcoma',
 'undifferentiated spindle cell sarcoma': 'spindle cell sarcoma',
 'undifferentiated_sarcoma': 'sarcoma',
 'unspecified_leukemia': 'leukemia',
 'upper aerodigestive squamous': 'squamous',
 'upper_aerodigestive_carcinoma': 'carcinoma',
 'upper_aerodigestive_squamous': 'squamous',
 'uterine carcinosarcoma': 'carcinosarcoma',
 'uterine endometrioid': 'uterine_endometrioid',
 'uterine_carcinosarcoma': 'carcinosarcoma',
 'uterine_sarcoma': 'sarcoma',
 'uveal melanoma': 'uveal_melanoma',
 'uveal_melanoma': 'uveal_melanoma',
 'well_differentiated': 'U',
 'wilms tumor': 'wilms_tumor',
 'yolk_sac_tumour': 'yolk_sac_tumor' }

disease_rename2 = {'acinar_cell_carcinoma': 'acinar_cell_carcinoma',
 'acute_megakaryoblastic_leukemia': 'acute_megakaryoblastic_leukemia',
 'adenocarcinoma': 'adenocarcinoma',
 'adenocarcinoma_ductal': 'adenocarcinoma_ductal',
 'adenosquamous': 'adenosquamous_carcinoma',
 'adenosquamous_carcinoma': 'adenosquamous_carcinoma',
 'adrenocortical_adenoma': 'adrenocortical_adenoma',
 'adrenocortical_carcinoma': 'adrenocortical_carcinoma',
 'all': 'all',
 'alveolar': 'alveolar',
 'alveolar_rhabdomyosarcoma': 'alveolar_rhabdomyosarcoma',
 'alveolar_sarcoma': 'alveolar_sarcoma',
 'aml': 'aml',
 'anaplastic': 'anaplastic',
 'angiosarcoma': 'angiosarcoma',
 'asps': 'asps',
 'astrocytoma_grade_iv': 'astrocytoma_grade_iv',
 'atl': 'atl',
 'atrt': 'atrt',
 'atypical_teratoid/rhabdoid_tumor': 'atrt',
 'basal': 'basal',
 'basal_cell_carcinoma': 'basal_cell_carcinoma',
 'basaloid_carcinoma': 'basaloid_carcinoma',
 'biphasic': 'biphasic',
 'blastoma': 'blastoma',
 'brenner_tumor': 'brenner_tumor',
 'carcinoid': 'carcinoma',
 'carcinoma': 'carcinoma',
 'carcinosarcoma': 'carcinosarcoma',
 'carcinosarcoma-malignant_mesodermal_mixed_tumour': 'carcinosarcoma-malignant_mesodermal_mixed_tumour',
 'cholangicarcinoma': 'cholangiocarcinoma',
 'cholangiocarcinoma': 'cholangiocarcinoma',
 'chondroblastic': 'chondroblastic',
 'chondrosarcoma': 'chondrosarcoma',
 'choriocarcinoma': 'choriocarcinoma',
 'choroid_plexus_carcinoma': 'choroid_plexus_carcinoma',
 'chromophobe': 'chromophobe',
 'clear_cell_carcinoma': 'clear_cell_carcinoma',
 'clear_cell_sarcoma': 'clear_cell_sarcoma',
 'cll': 'cll',
 'cml': 'cml',
 'cns_germinoma': 'cns_germinoma',
 'craniopharyngioma': 'craniopharyngioma',
 'desmoplastic_small_round_cell_tumor': 'desmoplastic_small_round_cell_tumor',
 'diffuse_adenocarcinoma': 'diffuse_adenocarcinoma',
 'diffuse_lymphoma': 'diffuse_lymphoma',
 'dipg': 'dipg',
 'ductal_carcinoma': 'ductal_carcinoma',
 'duodenal_adenocarcinoma': 'duodenal_adenocarcinoma',
 'dysembryoplastic_neuroepithelial_tumor': 'dysembryoplastic_neuroepithelial_tumor',
 'endocrine': 'endocrine',
 'ependymoblastoma': 'ependymoblastoma',
 'ependymoma': 'ependymoma',
 'ependymoma_grade_iii-iv': 'ependymoma_grade_iii-iv',
 'epithelioid_sarcoma': 'epithelioid_sarcoma',
 'esophagus': 'u',
 'etmr': 'etmr',
 'ewing_sarcoma': 'ewing_sarcoma',
 'exocrine': 'exocrine',
 'extracranial_rhabdoid': 'extracranial_rhabdoid',
 'extrahepatic_cholangiocarcinoma': 'extrahepatic_cholangiocarcinoma',
 'fibroblast': 'fibroblast',
 'fibrolamellar_hepatocellular_carcinoma': 'fibrolamellar_hepatocellular_carcinoma',
 'fibromatosis': 'fibromatosis',
 'fibrosarcoma': 'fibrosarcoma',
 'follicular': 'follicular',
 'glassy_cell_carcinoma': 'glassy_cell_carcinoma',
 'glioblastoma': 'glioblastoma',
 'glioblastoma_multiforme': 'glioblastoma_multiforme',
 'glioma': 'glioma',
 'gliomatosis_cerebri': 'gliomatosis_cerebri',
 'gliosarcoma': 'gliosarcoma',
 'haemangiopericytic': 'haemangiopericytic',
 'hemangioendothelioma': 'hemangioendothelioma',
 'hepatoblastoma': 'hepatoblastoma',
 'high-grade_glioma': 'high-grade_glioma',
 'hodgkin_lymphoma': 'hodgkin_lymphoma',
 'hyperplasia': 'hyperplasia',
 'ini-deficient_soft_tissue_sarcoma': 'ini-deficient_soft_tissue_sarcoma',
 'intrahepatic_cholangiocarcinoma': 'intrahepatic_cholangiocarcinoma',
 'krukenberg_tumor': 'krukenberg_tumor',
 'large_cell_carcinoma': 'large_cell_carcinoma',
 'leiomyosarcoma': 'leiomyosarcoma',
 'leukemia': 'leukemia',
 'liposarcoma': 'liposarcoma',
 'lobular_carcinoma': 'lobular_carcinoma',
 'luminal_a': 'luminal_a',
 'luminal_b': 'luminal_b',
 'lymphoma_unspecified': 'lymphoma',
 'malignant_fibrous_histiocytoma': 'malignant_fibrous_histiocytoma',
 'malignant_peripheral_nerve_sheath_tumor': 'malignant_peripheral_nerve_sheath_tumor',
 'malignant_rhabdoid_tumor': 'malignant_rhabdoid_tumor',
 'medulloblastoma': 'medulloblastoma',
 'melanocytic': 'melanocytic',
 'melanoma': 'melanoma',
 'melanotic_neuroectodermal_tumor': 'melanotic_neuroectodermal_tumor',
 'meningioma': 'meningioma',
 'merkel_cell_carcinoma': 'merkel_cell_carcinoma',
 'mesothelioma': 'mesothelioma',
 'metaplastic_carcinoma': 'metaplastic_carcinoma',
 'metaplastic_ductal_carcinoma': 'metaplastic_ductal_carcinoma',
 'mixed_adenosquamous_carcinoma': 'mixed_adenosquamous_carcinoma',
 'mixed_carcinoma': 'mixed_carcinoma',
 'mixed_germ_cell': 'mixed_germ_cell',
 'mmmt': 'mmmt',
 'monophasic': 'monophasic',
 'mpnst': 'mpnst',
 'mucinous_adenocarcinoma': 'mucinous_adenocarcinoma',
 'mucinous_carcinoma': 'mucinous_carcinoma',
 'mullerian_carcinoma': 'mullerian_carcinoma',
 'multiple_myeloma': 'multiple_myeloma',
 'myeloid_neoplasm': 'myeloid_neoplasm',
 'myelomonocytic_leukemia': 'myelomonocytic_leukemia',
 'myoepithelial_carcinoma': 'myoepithelial_carcinoma',
 'myofibromatosis': 'myofibromatosis',
 'myxofibrosarcoma': 'myxofibrosarcoma',
 'myxoid': 'myxoid',
 'myxoid_round_cell': 'myxoid_round_cell',
 'nasopharyngeal_carcinoma': 'nasopharyngeal_carcinoma',
 'neural_crest-like': 'neural_crest-like',
 'neuroblastoma': 'neuroblastoma',
 'neuroendocrine_carcinoma': 'neuroendocrine_carcinoma',
 'neurofibroma': 'neurofibroma',
 'nodular': 'nodular',
 'non-seminoma': 'non-seminoma',
 'non_hodgkin_lymphoma': 'non_hodgkin_lymphoma',
 'non_small_cell_carcinoma': 'non_small_cell_carcinoma',
 'normal': 'normal',
 'nsclc': 'nsclc',
 'nut_midline_carcinoma': 'nut_midline_carcinoma',
 'osteosarcoma': 'osteosarcoma',
 'papillary_renal_cell_carcinoma': 'papillary_renal_cell_carcinoma',
 'perivascular_epithelioid_cell_neoplasms': 'perivascular_epithelioid_cell_neoplasms',
 'pheochromocytoma_&_paraganglioma': 'pheochromocytoma_&_paraganglioma',
 'pineal_parenchymal_tumor': 'pineal_parenchymal_tumor',
 'pleomorphic_sarcoma': 'pleomorphic_sarcoma',
 'pleomorphic_xanthoastrocytoma': 'pleomorphic_xanthoastrocytoma',
 'pnet': 'pnet',
 'rectum_adenocarcinoma': 'rectum_adenocarcinoma',
 'renal_cell_carcinoma': 'renal_cell_carcinoma',
 'retinoblastoma': 'retinoblastoma',
 'rhabdomyosarcoma': 'rhabdomyosarcoma',
 'rosette_forming_glioneuronal_tumor': 'rosette_forming_glioneuronal_tumor',
 'sarcoma': 'sarcoma',
 'sarcomatoid_carcinoma': 'sarcomatoid_carcinoma',
 'sccoht': 'sccoht',
 'sclc': 'sclc',
 'sclerosing_rhabdomyosarcoma': 'sclerosing_rhabdomyosarcoma',
 'seminoma': 'seminoma',
 'serous_carcinoma': 'serous_carcinoma',
 'serous_cystadenocarcinoma': 'serous_cystadenocarcinoma',
 'sessile_serrated_adenoma': 'sessile_serrated_adenoma',
 'small_cell': 'small_cell',
 'small_cell_carcinoma': 'small_cell_carcinoma',
 'sn': 'sn',
 'spindle_cell_sarcoma': 'spindle_cell_sarcoma',
 'squamous': 'squamous_cell_carcinoma',
 'squamous_cell_carcinoma': 'squamous_cell_carcinoma',
 'storiform_pleomorphic': 'storiform_pleomorphic',
 'stromal_sarcoma': 'stromal_sarcoma',
 'stromal_tumor': 'stromal_tumor',
 'supratentorial_embryonal_tumor': 'supratentorial_embryonal_tumor',
 'synovial_sarcoma': 'synovial_sarcoma',
 'teratoma': 'teratoma',
 'thymoma': 'thymoma',
 'transitional_cell_carcinoma': 'transitional_cell_carcinoma',
 'tubulovillous_adenoma': 'tubulovillous_adenoma',
 'tumor_with_multilayer_rosettes': 'tumor_with_multilayer_rosettes',
 'u': 'u',
 'urothelial_carcinoma': 'urothelial_carcinoma',
 'uterine_endometrioid': 'uterine_endometrioid',
 'uveal_melanoma': 'uveal_melanoma',
 'wilms': 'wilms_tumor',
 'wilms_tumor': 'wilms_tumor',
 'yolk_sac_tumor': 'yolk_sac_tumor'}

## saving and exporting

In [None]:
my_alligner.save('../temp/22Q1')

In [None]:
my_alligner = Celligner()
my_alligner.load('../temp/22Q1')

In [None]:
def exportCellignerToPortal(my_alligner):
    return pd.concat([my_alligner.annotations, pd.DataFrame(data=np.hstack([my_alligner.umap_reduced, np.array([my_alligner.clusts]).T]), index=my_alligner.annotations.index, columns=['umap1', 'umap2', 'clusts']),], axis=1)

In [None]:
exportCellignerToPortal(my_alligner).to_csv('../temp/22Q1/celligner_aligned_all.csv')

In [None]:
new_dataset_id = tc.update_dataset(
    "celligner-multi-dataset-alignment-5403",
    changes_description="new version for 22Q1 processed with the python celligner. columns are now [umap1, umap2, clusts, tissue_type, disease_type, cell_type]",
    upload_files=[
        {
            "path": "../temp/22Q1/celligner_aligned_all.csv",
            "format": "TableCSV", # or "NumericMatrixCSV" or "TableCSV"
            "encoding": "utf-8" # optional (but recommended), will use iso-8859-1 if not provided
        }
    ],
    add_all_existing_files=False, # If True, will add all files from the base dataset version, except files with the same names as those in upload_files or add_taiga_ids
)

## making it portal-like

In [None]:
my_alligner = Celligner()
my_alligner.load('../temp/22Q1')

In [None]:
imp = np.zeros(len(my_alligner.umap_reduced))
imp[my_alligner.annotations.cell_type=="TCGA tumor"] = 2
my_alligner.plot(rerun=False, importance = imp)

In [None]:
my_alligner.plot(color_column="tissue_type", colortable=TISSUE_COLOR, rerun=False, shape=my_alligner.annotations.cell_type.values)

In [None]:
nn = my_alligner.getKNN(ofcell="CCLE cell line", incell="TCGA tumor")

## computing metrics 

In [None]:
import scib
from anndata import AnnData

In [None]:
adata = AnnData(my_alligner.fit_input.append(my_alligner.transform_input), my_alligner.fit_annotations.append(my_alligner.transform_annotations))

In [None]:
common = set(CCLE_expression.columns) & set(met500_TPM.columns) & set(Novartis_PDX_TPM.columns) & set(pediatric_PDX_TPM.columns)
CCLE_expression = CCLE_expression[list(common)]
TCGA_expression = TCGA_expression[list(common)]
met500_TPM = met500_TPM[list(common)]
Novartis_PDX_TPM = Novartis_PDX_TPM[list(common)]
pediatric_PDX_TPM = pediatric_PDX_TPM[list(common)]
len(common)

In [None]:
adata_unfit = AnnData(pd.concat([CCLE_expression, TCGA_expression, met500_TPM, Novartis_PDX_TPM, pediatric_PDX_TPM]), my_alligner.fit_annotations.append(my_alligner.transform_annotations))

In [None]:
adata.obs['cell_type'] = adata.obs['cell_type'].astype("category")
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype("category")

In [None]:
scib.metrics.metrics(adata_unfit, adata, batch_key="cell_type", label_key='tissue_type', cell_cycle_=False, organism='human', trajectory_=False, isolated_labels_asw_=True, silhouette_=True, hvg_score_=False, graph_conn_=True, pcr_=True, isolated_labels_f1_=False, nmi_=True, ari_=True, kBET_=True, ilisi_=True, clisi_=True,)