# Obtaining robust cell population markers, and redefining/reassessing the biased cell populations

**TODO**
* Save adatas
* Export analysis to 4h notebook

## imports

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr

In [None]:
!pip install cellassign

In [None]:
from cellassign import assign_cats
%store -r dict_colors
%store -r seed
%store -r magma
%store -r data_dir

In [None]:
# Render every figure of the notebook at 150 dpi.
mpl.rcParams.update({'figure.dpi': 150})

**IMPORTANT: I am running this analysis on a computer with ~500 GB of RAM. I will load many datasets at once, which might be too much for some computers. I took this decision consciously, to have as much information available at any time as possible. If you cannot run the whole analysis at once, you can run it in parts.**

## Anndata loading

In [None]:
# Deng 2021: folder under data_dir and the stored AnnData file inside it.
deng_dir = f'{data_dir}/deng_2021'
adata_deng_2021_scar_fb = sc.read(f'{deng_dir}/adata_deng_scar_fb.h5')

In [None]:
# Gao 2021: folder under data_dir and the stored AnnData file inside it.
gao_dir = f'{data_dir}/gao_2021'
adata_gao_fb = sc.read(f'{gao_dir}/gao_2021_fb.h5')

In [None]:
# Gaydosik 2020: folder under data_dir and the stored AnnData file inside it.
gaydosik_dir = f'{data_dir}/gaydosik_2020'
adata_gaydosik_HC_fb = sc.read(f'{gaydosik_dir}/gaydosik_2020_HC_fb.h5ad')

In [None]:
# He 2020: folder under data_dir and the stored AnnData file inside it.
he_dir = f'{data_dir}/He_2020'
adata_he_fb = sc.read(f'{he_dir}/He2020_fb.h5')

In [None]:
# Hughes 2020: folder under data_dir and the stored AnnData file inside it.
hughes_dir = f'{data_dir}/hughes_2020'
adata_hughes_fb = sc.read(f'{hughes_dir}/hughes_2020_fb.h5')

In [None]:
# Kim 2020: folder under data_dir and the stored AnnData file inside it.
kim_dir_2020 = f'{data_dir}/Kim_2020'
adata_kim_2020_fb = sc.read(f'{kim_dir_2020}/Kim_2020_fb.h5ad')

In [None]:
# Kim 2021: folder under data_dir and the stored AnnData file inside it.
kim_dir_2021 = f'{data_dir}/kim_2021'
adata_kim_2021_fb = sc.read(f'{kim_dir_2021}/kim_2021_fb.h5')

In [None]:
# Liu 2021: folder under data_dir and the stored AnnData file inside it.
liu_dir = f'{data_dir}/liu_2021'
adata_liu_ctrl_fb = sc.read(f'{liu_dir}/liu_2021_fb.h5')

In [None]:
# Mirizio 2020: folder under data_dir and the stored AnnData file inside it.
mirizio_dir = f'{data_dir}/mirizio_2020'
adata_mirizio_fb = sc.read(f'{mirizio_dir}/Mirizio_2020_fb.h5ad')

In [None]:
# Reynolds 2021: folder under data_dir and the stored AnnData file inside it.
reynolds_dir = f'{data_dir}/reynolds_2021'
adata_reynolds_healthy_fb = sc.read(f'{reynolds_dir}/adata_reynolds_healthy_fb.h5')

In [None]:
# Solé-Boldo 2020: folder under data_dir and the stored AnnData file inside it.
sole_dir = f'{data_dir}/Sole-Boldo_2020'
adata_sole_young_fb = sc.read(f'{sole_dir}/SB2020_fb.h5')

In [None]:
# Tabib 2018: folder under data_dir and the stored AnnData file inside it.
tabib_dir = f'{data_dir}/Tabib_2018'
adata_tabib_fb = sc.read(f'{tabib_dir}/tabib_2018_fb.h5')

In [None]:
# Tabib 2021: folder under data_dir and the stored AnnData file inside it.
tabib_2021_dir = f'{data_dir}/Tabib_2021'
adata_tabib_2021_ctrl_fb = sc.read(f'{tabib_2021_dir}/adata_tabib_2021_ctrl_fb.h5')

In [None]:
# Tabula Sapiens Consortium 2021: folder under data_dir and the stored AnnData file inside it.
tsc_dir = f'{data_dir}/Tabula_Sapiens_Consortium_2021'
adata_tsc_fb = sc.read(f'{tsc_dir}/adata_tsc_fb.h5')

In [None]:
# Theocharidis 2020: folder under data_dir and the stored AnnData file inside it.
# The folder string already ends with '/', so the file name is joined with
# os.path.join to avoid the doubled separator ('...2020//adata_...').
theo_dir_2020 = data_dir + '/Theocharidis_2020/'
adata_theo_2020_healthy_fb = sc.read(os.path.join(theo_dir_2020, 'adata_theo_healthy_fb.h5'))

In [None]:
# Theocharidis 2021: folder under data_dir and the stored AnnData file inside it.
# The folder string already ends with '/', so the file name is joined with
# os.path.join to avoid the doubled separator ('...2021//adata_...').
theo_dir_2021 = data_dir + '/Theocharidis_2021/'
adata_theo_2021_healthy_fb = sc.read(os.path.join(theo_dir_2021, 'adata_theo_2021_fb.h5'))

In [None]:
# Vorstandlechner 2020: folder under data_dir and the stored AnnData file inside it.
vors_dir = f'{data_dir}/Vorstandlechner_2020'
adata_vors_fb = sc.read(f'{vors_dir}/adata_vors_fb_2020.h5')

## Evaluate marker score for adatas
All the commented-out AnnData objects are excluded from marker selection because they do not show enough marker consistency and might bias the result.

In [None]:
# Maps the variable name of each AnnData to [author, condition, year].
# Commented-out entries are datasets deliberately left out of marker selection.
dict_datasets = {
    'adata_deng_2021_scar_fb': ['Deng', 'Scar', '2021'],
    'adata_gao_fb': ['Gao', 'Healthy', '2021'],
    'adata_gaydosik_HC_fb': ['Gaydosik', 'Healthy', '2020'],
    # 'adata_he_fb': ['He', 'Healthy', '2020'],
    'adata_hughes_fb': ['Hughes', 'Healthy', '2020'],
    # 'adata_kim_2020_fb': ['Kim', 'Healthy', '2020'],
    'adata_liu_ctrl_fb': ['Liu', 'Healthy', '2021'],
    'adata_mirizio_fb': ['Mirizio', 'Healthy', '2020'],
    # 'adata_reynolds_healthy_fb': ['Reynolds', 'Healthy', '2021'],
    'adata_sole_young_fb': ['Solé-Boldo', 'Young', '2020'],
    'adata_tabib_fb': ['Tabib', 'Healthy', '2018'],
    'adata_tabib_2021_ctrl_fb': ['Tabib', 'Healthy', '2021'],
    # 'adata_tsc_fb': ['Tabula Sapiens', 'Healthy', '2021'],
    'adata_theo_2020_healthy_fb': ['Theocarditis', 'Healthy', '2020'],
    # 'adata_theo_2021_healthy_fb': ['Theocarditis', 'Healthy', '2021'],
    'adata_vors_fb': ['Vorstandlechner', 'Healthy', '2020'],
}

In [None]:
# Cluster labels for which markers are computed, one row per leading letter.
list_accepted_clusters = [
    'A1', 'A2', 'A3', 'A4',
    'B1', 'B2', 'B3',
    'C1', 'C2', 'C3', 'C5',
    'D1', 'D2', 'D3',
    'T1',
]

In [None]:
# 'manual_axis' is the first character of each cell's 'cluster' label (e.g. 'A1' -> 'A').
for adata_name in dict_datasets:
    adata_current = eval(adata_name)
    adata_current.obs['manual_axis'] = [label[0] for label in adata_current.obs['cluster']]

In [None]:
# Show floats in displayed tables with thousands separators and two decimals.
pd.set_option('display.float_format', "{:,.2f}".format)

In [None]:
# Gene-name prefixes that are never accepted as markers.
_EXCLUDED_GENE_PREFIXES = ('RPS', 'RPL', 'MT-', 'S100A', 'MYL', 'EIF', 'MALAT1')
# Fields of adata.uns['rank_genes_groups'] that can be used directly as reference values.
_RANK_VALUE_FIELDS = ('scores', 'pvals_adj', 'logfoldchanges')


def _select_marker_genes(rank_results, cluster, select_method, select_thres, select_by):
    """Return the genes of `cluster` that pass the selection criterion (step 1)."""
    names = rank_results['names'][cluster]

    if select_method == 'pval':
        # Significant AND over-expressed in the cluster.
        mask = (rank_results['pvals_adj'][cluster] < select_thres) & (rank_results['logfoldchanges'][cluster] > 0)
    elif select_by == 'less':
        mask = rank_results[select_method][cluster] < select_thres
    else:
        mask = rank_results[select_method][cluster] > select_thres

    return [gene for gene in names[mask] if not str(gene).startswith(_EXCLUDED_GENE_PREFIXES)]


def _reference_values(adata, cluster, value_ref):
    """Return (genes, values) for `cluster`, with negative values clipped to 0 (step 2)."""
    rank_results = adata.uns['rank_genes_groups']
    genes = rank_results['names'][cluster]

    if value_ref in _RANK_VALUE_FIELDS:
        # Copy: indexing the record array yields a view, and clipping a view
        # would overwrite the values stored in adata.uns.
        values = np.array(rank_results[value_ref][cluster], dtype=float)
    else:
        expression = adata[:, genes].X
        if value_ref == 'expression_group':
            # '@' is a matrix product for both sparse and dense operands.
            expression = adata.obsp['distances'] @ expression
        # np.asarray(...).ravel() accepts the np.matrix returned by sparse sums and plain ndarrays.
        values = np.asarray(expression.sum(0)).ravel() / adata.n_obs

    return genes, np.clip(values, 0, None)  # negatives are set to 0, as in the original scoring


def make_gene_scoring(dict_datasets=dict_datasets, calculate_DEGs = True, group_name = 'cluster', value_ref = 'scores', select_method = 'pval', 
                      select_thres = 0.01, select_by='less', list_clusters=None):
    """Score candidate marker genes of each cluster across several datasets.

    A two-step procedure is applied to every cluster:

    1) Gene selection. For each dataset, the genes that fulfil the selection criterion are
       collected, and the union over all datasets becomes the gene set of the cluster. Genes
       whose name starts with one of `_EXCLUDED_GENE_PREFIXES` are dropped.
    2) Scoring. A reference value (`value_ref`) is assigned to every gene in every dataset
       (0 when the gene was not tested, negative values clipped to 0). Each dataset is
       weighted by the square root of the sum of its values, so that an odd dataset does not
       dominate, and a weighted mean and weighted standard deviation are computed per gene.

    Parameters
    ----------
    dict_datasets : dict
        Maps the *variable name* of an AnnData to a list of strings; the strings are joined
        with '-' to build the column name. Names are resolved with `eval`, so they must be
        variables visible from this module (never pass untrusted strings). The default is the
        module-level `dict_datasets` as bound when this function is defined.
    calculate_DEGs : bool
        If True, run `sc.tl.rank_genes_groups` (t-test_overestim_var) on every dataset first.
    group_name : str
        Column of `adata.obs` used as `groupby`.
    value_ref : str
        'scores', 'pvals_adj' or 'logfoldchanges' (read from rank_genes_groups), a name ending
        in 'expression' (summed expression divided by the number of cells), or
        'expression_group' (same, after multiplying by `adata.obsp['distances']`).
    select_method : str
        'pval' selects genes with adjusted p-value < `select_thres` and positive log fold
        change; any other value is read as a field of rank_genes_groups and compared to
        `select_thres` according to `select_by`.
    select_thres : float
        Selection threshold.
    select_by : str
        'less' keeps values below the threshold; anything else keeps values above it.
    list_clusters : list or None
        Clusters to score. If None, every group found in any dataset is used.

    Returns
    -------
    dict
        cluster -> DataFrame indexed by gene, with one column per dataset plus 'mean'
        (weighted mean), 'dev' (cube root of the weighted standard deviation) and
        'CV' ('mean' / 'dev').

    Raises
    ------
    ValueError
        If `value_ref` is not one of the supported options.
    """
    value_ref_is_supported = (value_ref in _RANK_VALUE_FIELDS
                              or value_ref.endswith('expression')
                              or value_ref == 'expression_group')
    if not value_ref_is_supported:
        raise ValueError(f"Unsupported value_ref {value_ref!r}: use one of {_RANK_VALUE_FIELDS}, "
                         "a name ending in 'expression', or 'expression_group'.")

    # Resolve every dataset once and calculate DEGs if necessary.
    adatas = {}
    for adata_name in dict_datasets:
        adatas[adata_name] = eval(adata_name)
        if calculate_DEGs:
            sc.tl.rank_genes_groups(adatas[adata_name], groupby=group_name, method='t-test_overestim_var')

    if list_clusters is None:
        list_clusters = sorted({group for adata in adatas.values()
                                for group in adata.uns['rank_genes_groups']['names'].dtype.names})

    column_names = {adata_name: '-'.join(list_adata) for adata_name, list_adata in dict_datasets.items()}

    dict_scores = {}
    for cluster in list_clusters:
        # Only the datasets in which this cluster was tested take part in either step.
        adatas_with_cluster = {adata_name: adata for adata_name, adata in adatas.items()
                               if cluster in adata.uns['rank_genes_groups']['names'].dtype.names}

        # 1) Create the group of genes.
        group_genes = set()
        for adata in adatas_with_cluster.values():
            group_genes.update(_select_marker_genes(adata.uns['rank_genes_groups'], cluster,
                                                    select_method, select_thres, select_by))
        set_group_genes = sorted(group_genes)

        # 2) Scoring genes. Float fill value, so that float values can be written without upcasting.
        df_cluster_score = pd.DataFrame(0.0, index=set_group_genes, columns=list(column_names.values()))

        for adata_name, adata in adatas_with_cluster.items():
            genes, values = _reference_values(adata, cluster, value_ref)
            df_genes_values = pd.Series(values, index=genes)
            shared_genes = df_genes_values.index.intersection(df_cluster_score.index)
            df_cluster_score.loc[shared_genes, column_names[adata_name]] = df_genes_values.loc[shared_genes]

        # The sqrt is to avoid putting too much weight on some odd dataset.
        vec_score = df_cluster_score.values
        vec_weights = np.sqrt(df_cluster_score.sum().values)
        weight_total = np.sum(vec_weights)

        all_values_mean = np.dot(vec_score, vec_weights) / weight_total
        squared_deviation = (vec_score - all_values_mean[:, None]) ** 2
        all_values_std = np.sqrt(np.dot(squared_deviation, vec_weights) / weight_total)

        df_cluster_score['mean'] = all_values_mean
        df_cluster_score['dev'] = all_values_std ** (1/3)
        df_cluster_score['CV'] = df_cluster_score['mean'] / df_cluster_score['dev']

        dict_scores[cluster] = df_cluster_score

    return dict_scores


def make_gene_scoring_with_expr(value_ref = 'scores', expr_type='expression_group', *, expr_power=0.25, min_expr=0.035, **kwargs):
    """Rank the marker genes of each cluster by score, corrected by expression.

    `make_gene_scoring` is run twice with the same `kwargs`: once with `value_ref` and once
    with `expr_type`. The weighted mean of the second run is attached to the first table as
    'expr', and the final ranking value is

        Z = mean / expr ** expr_power

    When adding information about the gene expression we can select genes that are less
    expressed but localized in small populations, instead of genes expressed throughout the
    dataset, although higher in the population of interest.

    Parameters
    ----------
    value_ref : str
        Reference value of the score table (see `make_gene_scoring`).
    expr_type : str
        Reference value of the expression table (see `make_gene_scoring`).
    expr_power : float
        Dampening exponent applied to 'expr', so that over- or under-expressed genes do not
        disturb the ranking.
    min_expr : float
        Genes with 'expr' below this value are removed (avoids genes with really small expression).
    **kwargs
        Forwarded unchanged to both `make_gene_scoring` calls.

    Returns
    -------
    dict
        cluster -> score DataFrame with the extra columns 'expr', 'expr_pow' and 'Z', sorted
        by decreasing 'Z'.
    """
    # NOTE: both calls receive the same kwargs, so with calculate_DEGs=True the
    # differential expression is computed once per call.
    dict_make_gene_scoring = make_gene_scoring(value_ref=value_ref, **kwargs)
    dict_make_gene_expression = make_gene_scoring(value_ref=expr_type, **kwargs)

    dict_return = {}

    for cluster, df_cluster in dict_make_gene_scoring.items():
        # sort_values returns a new frame, so the table held in dict_make_gene_scoring is not modified.
        df_score = df_cluster.sort_values(by='mean', ascending=False)

        # Column assignment aligns on the gene index, so the expression table needs no sorting.
        df_score['expr'] = dict_make_gene_expression[cluster]['mean']
        df_score['expr_pow'] = df_score['expr'] ** expr_power
        df_score['Z'] = df_score['mean'] / df_score['expr_pow']

        df_score = df_score[df_score['expr'] >= min_expr]

        dict_return[cluster] = df_score.sort_values(by='Z', ascending=False)

    return dict_return

In [None]:
# Marker ranking at cluster level, restricted to the accepted clusters.
dict_make_gene_scoring = make_gene_scoring_with_expr(
    list_clusters=list_accepted_clusters,
    group_name='cluster',
    calculate_DEGs=True,
    value_ref='scores',
    select_method='pval',
    select_thres=0.05,
)

In [None]:
# Ranked marker table for cluster C5.
dict_make_gene_scoring['C5']

In [None]:
# Ranked marker table for cluster D2, ordered by decreasing 'Z'.
dict_make_gene_scoring['D2'].sort_values(by='Z', ascending=False)

In [None]:
# Marker ranking at axis level, grouping cells by the 'manual_axis' column.
dict_make_gene_scoring_axis = make_gene_scoring_with_expr(
    list_clusters=['A', 'B', 'C', 'D'],
    group_name='manual_axis',
    calculate_DEGs=True,
    value_ref='scores',
    select_method='pval',
    select_thres=0.05,
)

## Recalculating clusters in the datasets

In [None]:
# Top-30 genes per cluster and per axis. A notebook cell only echoes its last
# expression, so both dictionaries are printed explicitly.
print({i: np.array(dict_make_gene_scoring[i].index[:30]) for i in dict_make_gene_scoring.keys()})
print({i: np.array(dict_make_gene_scoring_axis[i].index[:30]) for i in dict_make_gene_scoring_axis.keys()})

In [None]:
# Marker genes per cluster, passed as `dict_cats` to the cluster-level assign_cats calls below.
# NOTE(review): these lists look like a frozen copy of the top-30 genes computed above;
# confirm they are still in sync whenever the scoring is re-run.
dict_cats_clusters_robust = {
    'A1': ['CTHRC1', 'WISP2', 'SLPI', 'SEMA3C', 'SEMA3B', 'MATN4', 'FBLN1',
        'ACKR3', 'PI16', 'MFAP5', 'MMP2', 'PDGFRL', 'LOX', 'ABCC9', 'CD70',
        'FAM180B', 'SMOC2', 'TNXB', 'C1QTNF3', 'CD55', 'CPVL', 'AEBP1',
        'DCN', 'QPCT', 'DPP4', 'FBLN2', 'CGREF1', 'THBS2', 'CHRDL1',
        'AOX1'],
     'A2': ['APCDD1', 'COL18A1', 'COL23A1', 'NKD2', 'NPTX2', 'CLEC2A',
            'COL6A5', 'COL13A1', 'HSPB3', 'COL6A1', 'AKAP6', 'PHACTR3',
            'F13A1', 'STC2', 'PTGS1', 'THSD4', 'CDC42EP3', 'NBL1', 'RSPO1',
            'TMEM52', 'AXIN2', 'ROBO2', 'C1orf198', 'TNFRSF19', 'MYO10',
            'NCKAP5', 'WIF1', 'OSBP2', 'LINC00327', 'LEPR'],
     'A3': ['WIF1', 'SFRP2', 'QPCT', 'FBLN1', 'THBS2', 'ACKR4', 'CES1',
            'C1QTNF3', 'CORIN', 'MMP2', 'ANGPTL5', 'SGCA', 'WISP2', 'NBL1',
            'SGCG', 'ITIH5', 'DPP4', 'CTHRC1', 'SOSTDC1', 'ISM1', 'COL14A1',
            'PPIC', 'SMOC2', 'ELN', 'TWIST2', 'THBS4', 'COL5A1', 'DCN',
            'MMP23B', 'LOX'],
     'A4': ['SEMA3C', 'MFAP5', 'C1QTNF3', 'SFRP4', 'FBN1', 'IGFBP6', 'PRG4',
            'CD55', 'TNXB', 'CD70', 'HSD3B7', 'ACKR3', 'PCOLCE2', 'CTHRC1',
            'DBN1', 'DPP4', 'SH3BGRL3', 'FSTL1', 'MATN4', 'CAPG', 'CA12',
            'ADAMTS2', 'FAM180B', 'FABP3', 'SSC5D', 'ELN', 'ACE', 'CD248',
            'LINC01133', 'ANGPTL2'],
     'B1': ['GEM', 'C11orf96', 'CXCL2', 'SOD2', 'SPSB1', 'TNFSF14', 'KDM6B',
            'CXCL3', 'ADAMTS4', 'ICAM1', 'IL6', 'REL', 'CDKN1A', 'NFKBIA',
            'APOE', 'CYGB', 'ELL2', 'MYC', 'NFIL3', 'ERRFI1', 'NAMPT',
            'ZC3H12A', 'MAFF', 'TNFAIP6', 'ARID5B', 'MT1A', 'BAZ1A', 'UAP1',
            'KPNA2', 'DDX21'],
     'B2': ['APOE', 'CCL19', 'RBP5', 'TNFSF13B', 'CXCL12', 'SLCO2B1',
            'TMEM150C', 'GGT5', 'C3', 'IGFBP7', 'IGFBP3', 'IL33', 'ADRA2A',
            'UBD', 'C7', 'CTSH', 'SCN4B', 'IRF8', 'ICAM2', 'CLSTN3', 'HLA-F',
            'IL34', 'ANKRD29', 'PTGDS', 'CYP7B1', 'CD74', 'CH25H', 'CYGB',
            'MSC', 'CD200R1'],
     'B3': ['ITM2A', 'EFEMP1', 'GPC3', 'MYOC', 'GDF10', 'NFIB', 'TNNT3',
            'CXCL12', 'CYGB', 'GGT5', 'TXNIP', 'PLA2G2A', 'GSN', 'CFH',
            'ADH1B', 'APOD', 'LSP1', 'MGP', 'HHIP-AS1', 'NTRK2', 'SIX1',
            'RARRES2', 'SULT1A1', 'ABCA8', 'SNCG', 'FGF7', 'CFD', 'C3',
            'MGST1', 'C7'],
     'C1': ['COL11A1', 'DPEP1', 'KIF26B', 'UGT3A2', 'EDNRA', 'MEF2C', 'GPC3',
            'POSTN', 'MME', 'PPP1R14A', 'ACAN', 'HAPLN1', 'WFDC1', 'RBP4',
            'TNMD', 'SLC26A7', 'EVA1A', 'EDIL3', 'F2RL2', 'ALX4', 'DOK6',
            'PTCH1', 'CYYR1', 'RFLNB', 'CFHR1', 'MAGI1', 'COL21A1', 'THSD4',
            'PAFAH1B3', 'BCL11B'],
     'C2': ['COCH', 'ASPN', 'TNN', 'CRABP1', 'SLITRK6', 'MKX', 'OGN', 'PLPP5',
            'NDNF', 'SLC22A16', 'SPARCL1', 'PLXDC1', 'RSPO4', 'CCK', 'TNMD',
            'RHPN1', 'FZD1', 'EMID1', 'CPNE5', 'CHADL', 'FIBIN', 'PXDNL',
            'NECAB1', 'TRPM3', 'CADM2', 'HSPA2', 'COL24A1', 'ARHGAP15',
            'LIMCH1', 'GAP43'],
     'C3': ['POSTN', 'LTBP2', 'F2R', 'ASPN', 'EDNRA', 'PMEPA1', 'LRRC15',
            'ADAM12', 'TNN', 'SDC1', 'F2RL2', 'GPM6B', 'INHBA', 'EDIL3',
            'TENM3', 'ADGRE2', 'MMP11', 'PRSS23', 'WNT10A', 'TNC', 'SGIP1',
            'EMID1', 'HTRA1', 'MDK', 'BGN', 'PPP1R14A', 'SLC5A3', 'RSPO4',
            'LOXL1', 'PLPP4'],
     'C5': ['TFAP2A', 'WNT5A', 'INHBA', 'LEF1', 'SOX18', 'CENPW', 'BMP7',
            'SLC5A3', 'DCXR', 'KRT17', 'EDIL3', 'PTCH1', 'STMN1', 'IGFBP3',
            'MRPS6', 'TRPS1', 'CXCR4', 'DPEP1', 'FOXD1', 'SPON1', 'PTMA',
            'PHLDA1', 'DKK3', 'VGLL4', 'CNST', 'FKBP4', 'CPE', 'MARCKSL1',
            'CNTN4', 'PKP4'],
     'D1': ['ANGPTL7', 'EBF2', 'SCN7A', 'ENTPD2', 'APOD', 'C2orf40', 'TENM2',
            'CYP1B1', 'CLDN1', 'ATP1A2', 'MARCKSL1', 'TM4SF1', 'BAMBI',
            'SCRG1', 'CRISPLD1', 'NRP2', 'PODNL1', 'COL8A1', 'FGFBP2', 'KLK1',
            'SFRP4', 'SPARCL1', 'MCTP1', 'TIAM1', 'P2RY14', 'FOXS1', 'MRAS',
            'SOX9', 'ETV1', 'PLK2'],
     'D2': ['ITGA6', 'SBSPON', 'TM4SF1', 'NGFR', 'SLC2A1', 'CLDN1', 'TAGLN',
            'SLC22A3', 'TNNC1', 'BNC2', 'ITGB4', 'AQP3', 'TENM2', 'KLF5',
            'C2orf40', 'PEAR1', 'CSRP1', 'PALMD', 'NR2F2', 'SFRP4', 'ISYNA1',
            'PLEKHA4', 'IGFBP6', 'SCN7A', 'INMT', 'SDPR', 'HRH1', 'NDUFA4L2',
            'CAV2', 'DUSP5'],
     'D3': ['IGFBP2', 'FGFBP2', 'OLFML2A', 'KLK1', 'SFRP1', 'CPE', 'ALX4',
            'LFNG', 'RAMP1', 'SEMA3G', 'SCN7A', 'SLC22A3', 'HOPX', 'SAMD5',
            'SPON2', 'EGR2', 'PLA2G5', 'PDGFD', 'DDIT4L', 'A2M', 'NECAB1',
            'KIAA1217', 'PGF', 'PLEKHA6', 'LTBP2', 'IGF1', 'APOD', 'RELN',
            'MAP2', 'UNC5B'],
     'T1': ['GPC3', 'PLPP4', 'ASPN', 'PAMR1', 'ASIP', 'IL20RB', 'RAMP2',
            'TCF7L2', 'ABCA8', 'PDGFRL', 'PI16', 'HS3ST6', 'SOD3', 'CFD',
            'CCDC146', 'TMEM176B', 'FOXD1', 'CXCL12', 'ENHO', 'DPT', 'OSR2',
            'GAS1', 'IGFBP5', 'EMP2', 'DKK3', 'F10', 'MGP', 'CST3', 'MAFB',
            'EEF1B2']}

In [None]:
# Marker genes per axis (A-D), passed as `dict_cats` to the axis-level assign_cats calls below.
# NOTE(review): these lists look like a frozen copy of the top-30 axis genes computed above;
# confirm they are still in sync whenever the scoring is re-run.
dict_cats_axes_robust = {
    'A': ['MMP2', 'SFRP2', 'CTHRC1', 'DPP4', 'THBS2', 'NBL1', 'C1QTNF3',
        'WISP2', 'TNXB', 'QPCT', 'COL14A1', 'ELN', 'SEMA3B', 'SMOC2',
        'SEMA3C', 'ACKR3', 'SPARC', 'ACKR4', 'COL1A1', 'WIF1', 'COL1A2',
        'COL5A1', 'PTGIS', 'PPIC', 'AEBP1', 'ISM1', 'MATN4', 'FBLN1',
        'CERCAM', 'CD70'],
     'B': ['APOE', 'GGT5', 'CYGB', 'C7', 'TNFSF13B', 'APOC1', 'RBP5',
            'CXCL12', 'CCL19', 'IGFBP7', 'C3', 'IRF8', 'IL33', 'SCN4B',
            'TNFSF14', 'IGFBP3', 'TMEM150C', 'C1orf54', 'MSC', 'LIFR', 'CCL2',
            'C11orf96', 'CLSTN3', 'RARRES2', 'UBD', 'FMO1', 'EFEMP1',
            'ADAMTS4', 'NFIB', 'CXCL3'],
     'C': ['ASPN', 'TNN', 'EDNRA', 'COL11A1', 'KIF26B', 'TNMD', 'PPP1R14A',
            'CRABP1', 'POSTN', 'GPM6B', 'EMID1', 'HAPLN1', 'SLITRK6', 'COCH',
            'SDC1', 'EDIL3', 'TENM3', 'ACAN', 'MKX', 'RSPO4', 'LEF1', 'UGT3A2',
            'DPEP1', 'SLC22A16', 'COL24A1', 'ALX4', 'ADAMTS6', 'CPNE5', 'FZD1',
            'CYYR1'],
     'D': ['SCN7A', 'CLDN1', 'KLK1', 'TENM2', 'SLC22A3', 'ITGA6', 'C2orf40',
            'TM4SF1', 'ANGPTL7', 'SBSPON', 'FGFBP2', 'PEAR1', 'NGFR', 'SLC2A1',
            'FOXS1', 'OLFML2A', 'A2M', 'KLF5', 'FMO2', 'SFRP4', 'ITGB4',
            'TAGLN', 'NR2F2', 'APOD', 'C19orf33', 'BAMBI', 'DDIT4L', 'INMT',
            'MRAS', 'ETV1']
}

### Deng 2021

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_deng_2021_scar_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.8)
assign_cats(adata_deng_2021_scar_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_deng_2021_scar_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_deng_2021_scar_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_deng_2021_scar_fb, color=['leiden', 'batch', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### Gao 2021

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_gao_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.9)
assign_cats(adata_gao_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_gao_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_gao_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_gao_fb, color=['leiden', 'Patient', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### Gaydosik 2020

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_gaydosik_HC_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.9)
assign_cats(adata_gaydosik_HC_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_gaydosik_HC_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_gaydosik_HC_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_gaydosik_HC_fb, color=['leiden', 'sample', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### He 2020

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_he_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.9)
assign_cats(adata_he_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_he_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_he_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_he_fb, color=['leiden', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### Hughes 2020

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_hughes_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.9)
assign_cats(adata_hughes_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_hughes_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_hughes_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_hughes_fb, color=['leiden', 'batch', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### Kim 2020

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_kim_2020_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.9)
assign_cats(adata_kim_2020_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_kim_2020_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_kim_2020_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_kim_2020_fb, color=['leiden', 'batch', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### Kim 2021

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_kim_2021_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.9)
assign_cats(adata_kim_2021_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_kim_2021_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_kim_2021_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_kim_2021_fb, color=['leiden', 'batch', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### Liu 2021

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_liu_ctrl_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.9)
assign_cats(adata_liu_ctrl_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_liu_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_liu_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_liu_ctrl_fb, color=['leiden', 'Patient', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### Mirizio 2020

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_mirizio_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.95)
assign_cats(adata_mirizio_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_mirizio_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_mirizio_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_mirizio_fb, color=['leiden', 'batch', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### Reynolds 2021

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_reynolds_healthy_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.9)
assign_cats(adata_reynolds_healthy_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_reynolds_healthy_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_reynolds_healthy_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_reynolds_healthy_fb, color=['leiden', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### Solé-Boldo 2020

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_sole_young_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.9)
assign_cats(adata_sole_young_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_sole_young_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_sole_young_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_sole_young_fb, color=['leiden', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### Tabib 2018

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_tabib_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.9)
assign_cats(adata_tabib_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_tabib_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_tabib_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_tabib_fb, color=['leiden', 'batch', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### Tabib 2021

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_tabib_2021_ctrl_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.9)
assign_cats(adata_tabib_2021_ctrl_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_tabib_2021_ctrl_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_tabib_2021_ctrl_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_tabib_2021_ctrl_fb, color=['leiden', 'batch', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### Tabula Sapiens Consortium 2021

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_tsc_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.7)
assign_cats(adata_tsc_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_tsc_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_tsc_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_tsc_fb, color=['leiden', 'batch', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### Theocharidis 2020

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_theo_2020_healthy_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.9)
assign_cats(adata_theo_2020_healthy_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_theo_2020_healthy_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_theo_2020_healthy_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_theo_2020_healthy_fb, color=['leiden', 'batch', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### Theocharidis 2021

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_theo_2021_healthy_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.85)
assign_cats(adata_theo_2021_healthy_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_theo_2021_healthy_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_theo_2021_healthy_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_theo_2021_healthy_fb, color=['leiden', 'batch', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

### Vorstandlechner 2020

In [None]:
# Cluster labels go to 'cluster_robust'; the axis call receives that column via column_groupby.
assign_cats(adata_vors_fb, dict_cats=dict_cats_clusters_robust, key_added='cluster_robust', others_name='U',
            min_score=0.5, quantile_gene_sel=0.8)
assign_cats(adata_vors_fb, dict_cats=dict_cats_axes_robust, key_added='axis_robust', others_name='U',
            column_groupby='cluster_robust', min_score=0.4, intermediate_states=True, diff=0.15)

# One colour per sorted cluster label; labels missing from dict_colors are drawn in grey.
adata_vors_fb.uns['cluster_robust_colors'] = [
    dict_colors.get(label, '#bcbcbc') for label in sorted(set(adata_vors_fb.obs['cluster_robust']))
]
sc.pl.umap(adata_vors_fb, color=['leiden', 'batch', 'axis_robust', 'cluster_robust'], ncols=2,
           use_raw=False, cmap=magma, legend_loc='on data')

## Reevaluate the presence of clusters for each dataset
In this representation we exclude Kim 2021: because it is a nail dataset, it is interesting to study in a separate analysis, but it is out of scope here.

In [None]:
from fb_functions import plot_adata_cluster_properties

In [None]:
# Each row is (variable name, author, status, year, AnnData). The resulting dict maps
# the variable name to [author, status (healthy, young, scar, ...), year, AnnData].
# NOTE(review): some AnnData variable names here (e.g. adata_theo_fb, adata_theo_2021_fb)
# differ from the names used in the annotation cells above
# (adata_theo_2020_healthy_fb, adata_theo_2021_healthy_fb) - confirm they are defined.
_dataset_rows = [
    ('adata_deng_scar_fb', 'Deng', 'Scar', '2021', adata_deng_scar_fb),
    ('adata_gao_fb', 'Gao', 'Healthy', '2021', adata_gao_fb),
    ('adata_gaydosik_HC_fb', 'Gaydosik', 'Healthy', '2020', adata_gaydosik_HC_fb),
    ('adata_he_fb', 'He', 'Healthy', '2020', adata_he_fb),
    ('adata_hughes_fb', 'Hughes', 'Healthy', '2020', adata_hughes_fb),
    ('adata_kim_2020_fb', 'Kim', 'Healthy', '2020', adata_kim_2020_fb),
    ('adata_liu_ctrl_fb', 'Liu', 'Healthy', '2021', adata_liu_ctrl_fb),
    ('adata_mirizio_fb', 'Mirizio', 'Healthy', '2020', adata_mirizio_fb),
    ('adata_reynolds_healthy_fb', 'Reynolds', 'Healthy', '2021', adata_reynolds_healthy_fb),
    ('adata_sole_young_fb', 'Solé-Boldo', 'Young', '2020', adata_sole_young_fb),
    ('adata_tabib_fb', 'Tabib', 'Healthy', '2018', adata_tabib_fb),
    ('adata_tabib_2021_ctrl_fb', 'Tabib', 'Healthy', '2021', adata_tabib_2021_ctrl_fb),
    ('adata_theo_fb', 'Theocarditis', 'Healthy', '2020', adata_theo_fb),
    ('adata_theo_2021_fb', 'Theocarditis', 'Healthy', '2021', adata_theo_2021_fb),
    ('adata_vors_fb', 'Vorstandlechner', 'Healthy', '2020', adata_vors_fb),
]
dict_datasets = {
    key: [author, status, year, adata]
    for key, author, status, year, adata in _dataset_rows
}

In [None]:
# 'presence' view across all datasets in dict_datasets.
# NOTE(review): this reads the 'cluster'/'axis' annotations and dict_cats_clusters, not
# the '*_robust' versions assigned in the cells above - confirm this is intended.
plot_adata_cluster_properties(
    dict_cats_clusters=dict_cats_clusters,
    dict_datasets=dict_datasets,
    what='presence',
    cluster_name='cluster',
    axis_name='axis',
)

In [None]:
# 'score' view across all datasets in dict_datasets.
plot_adata_cluster_properties(
    dict_cats_clusters=dict_cats_clusters,
    dict_datasets=dict_datasets,
    what='score',
    cluster_name='cluster',
    axis_name='axis',
)

In [None]:
# 'percentage' view across all datasets in dict_datasets.
plot_adata_cluster_properties(
    dict_cats_clusters=dict_cats_clusters,
    dict_datasets=dict_datasets,
    what='percentage',
    cluster_name='cluster',
    axis_name='axis',
)

In [None]:
# 'axis' view across all datasets in dict_datasets.
plot_adata_cluster_properties(
    dict_cats_clusters=dict_cats_clusters,
    dict_datasets=dict_datasets,
    what='axis',
    cluster_name='cluster',
    axis_name='axis',
)

## Plotting all Adatas

In [None]:
# AnnData objects shown in the overview grids below, and their display names.
# The two lists are consumed in lockstep with zip(), so they must have the same
# length and the same order.
list_adatas = [
    adata_deng_scar_fb, adata_gao_fb, adata_gaydosik_HC_fb, adata_he_fb,
    adata_hughes_fb, adata_kim_2020_fb, adata_liu_ctrl_fb, adata_mirizio_fb,
    adata_reynolds_healthy_fb, adata_sole_young_fb, adata_tabib_fb,
    adata_tabib_2021_ctrl_fb, adata_theo_healthy_fb, adata_vors_fb,
]
# 'He' was previously missing: with 13 names for 14 AnnDatas, zip() labelled
# adata_he_fb as 'Hughes', shifted every later title by one and dropped the last dataset.
list_names = [
    'Deng', 'Gao', 'Gaydosik', 'He',
    'Hughes', 'Kim', 'Liu', 'Mirizio',
    'Reynolds', 'Solé-Boldo', 'Tabib',
    'Tabib 2021', 'Theocarditis', 'Vorstandlechner',
]

# Fail loudly instead of letting zip() silently truncate and mislabel the panels.
if len(list_adatas) != len(list_names):
    raise ValueError(
        f'list_adatas has {len(list_adatas)} entries but list_names has {len(list_names)}'
    )

In [None]:
# One UMAP panel per dataset on a 3 x 5 grid, coloured by the 'cluster' annotation.
fig, axs = plt.subplots(3, 5, figsize=(5 * 4, 3 * 4))

for idx, (adata, name) in enumerate(zip(list_adatas, list_names)):
    sc.pl.umap(
        adata,
        color=['cluster'],
        legend_loc='on data',
        show=False,
        ax=axs.flat[idx],
        title=name,
        size=15,
        cmap=magma,
    )

In [None]:
# One dendrogram of the 'cluster' groups per dataset on a 3 x 5 grid.
fig, axs = plt.subplots(3, 5, figsize=(5 * 4, 3 * 4))

for idx, (adata, name) in enumerate(zip(list_adatas, list_names)):
    # Drop any cached dendrogram so it is rebuilt from the current 'cluster' labels.
    # pop() with a default (instead of `del`) keeps the cell runnable when no
    # dendrogram has been computed yet.
    adata.uns.pop('dendrogram_cluster', None)
    ax = axs.ravel()[idx]
    sc.pl.dendrogram(adata, groupby='cluster', show=False, ax=ax)
    # Label the panel with its dataset, as done in the UMAP grid.
    ax.set_title(name)

## PAGA 

In [None]:
# PAGA graph over the 'cluster' groups, one panel per dataset on a 3 x 5 grid.
fig, axs = plt.subplots(3, 5, figsize=(5 * 4, 3 * 4))

for idx, (adata, name) in enumerate(zip(list_adatas, list_names)):
    sc.tl.paga(adata, groups='cluster')
    sc.pl.paga(
        adata,
        ax=axs.flat[idx],
        frameon=False,
        show=False,
    )

In [None]:
# Same PAGA grid, drawing the 'connectivities_tree' edges as the solid edges.
fig, axs = plt.subplots(3, 5, figsize=(5 * 4, 3 * 4))

for idx, (adata, name) in enumerate(zip(list_adatas, list_names)):
    sc.tl.paga(adata, groups='cluster')
    sc.pl.paga(
        adata,
        ax=axs.flat[idx],
        frameon=False,
        show=False,
        solid_edges='connectivities_tree',
    )

## Dataset merge [too diverse to make a good result]

We will remove He from the list of datasets to reduce noise. But the merging is still not good.

In [None]:
# Display one of the AnnData objects to inspect its contents before merging.
adata_deng_scar_fb

In [None]:
# Merge the per-dataset fibroblast AnnDatas into one object. The dataset of origin is
# stored in obs['dataset'] (one category per input, in the order given), and the outer
# join keeps the union of variables across datasets.
adata_all = sc.AnnData.concatenate(
    adata_deng_scar_fb, adata_gao_fb, adata_gaydosik_HC_fb,
    adata_he_fb, adata_hughes_fb, adata_kim_2020_fb, adata_liu_ctrl_fb,
    adata_mirizio_fb, adata_reynolds_healthy_fb, adata_sole_young_fb,
    adata_tabib_fb, adata_theo_healthy_fb, adata_vors_fb,
    batch_key='dataset',
    batch_categories=[
        'Deng', 'Gao', 'Gaydosik', 'He', 'Hughes', 'Kim', 'Liu',
        'Mirizio', 'Reynolds', 'Sole-Boldo', 'Tabib', 'Theocarditis', 'Vorstandlechner',
    ],
    join='outer',
)

# Keep only cells with an assigned cluster. The boolean subset is an AnnData view;
# .copy() turns it into a standalone object so later writes to .obs/.obsm do not
# trigger an implicit view-to-copy conversion (and its warning).
adata_all = adata_all[adata_all.obs['cluster'] != 'unassigned'].copy()

In [None]:
# Batch key for integration: the string forms of the dataset, patient, sample,
# sample id and batch columns, concatenated in that order with no separator.
_batch_label = adata_all.obs['dataset'].astype(str)
for _column in ('Patient', 'sample', 'sample_id', 'batch'):
    _batch_label = _batch_label + adata_all.obs[_column].astype(str)
adata_all.obs['dataset_batch'] = _batch_label

In [None]:
# PCA -> Harmony correction on 'dataset_batch' -> neighbour graph on the corrected
# embedding -> triku.
sc.pp.pca(adata_all, n_comps=30, random_state=seed)
sce.pp.harmony_integrate(adata_all, key='dataset_batch', max_iter_harmony=30)
# The number of neighbours grows with the square root of the number of cells.
sc.pp.neighbors(
    adata_all,
    n_neighbors=int(adata_all.n_obs ** 0.5 // 8),
    use_rep='X_pca_harmony',
)
# sce.pp.bbknn(adata_all, metric='angular', batch_key='dataset', neighbors_within_batch=4, use_rep='X_pca_harmony')
tk.tl.triku(adata_all)

In [None]:
# Embed and cluster the merged object; both steps use the shared seed.
sc.tl.umap(
    adata_all,
    min_dist=0.05,
    random_state=seed,
)
sc.tl.leiden(
    adata_all,
    resolution=5,
    random_state=seed,
)

In [None]:
# First pass: write the cluster label of the merged object to obs['clusterall'].
assign_cats(
    adata_all,
    dict_cats=dict_cats_clusters_3,
    min_score=0.5,
    quantile_gene_sel=0.95,
    key_added='clusterall',
)
# Second pass: grouped by 'clusterall', write the axis label to obs['axisall'].
assign_cats(
    adata_all,
    column_groupby='clusterall',
    dict_cats=dict_cats_axes,
    min_score=0.4,
    key_added='axisall',
    intermediate_states=True,
    diff=0.15,
)

In [None]:
# Overview of the merged embedding: dataset/batch of origin, leiden clusters, the
# per-dataset annotations and the annotations computed on the merged object.
sc.pl.umap(
    adata_all,
    color=[
        'dataset', 'dataset_batch',
        'leiden', 'axis', 'cluster3',
        'clusterall', 'axisall',
    ],
    legend_loc='on data',
    cmap=magma,
    use_raw=False,
    ncols=2,
)

In [None]:
# For every distinct cluster label (in order of first appearance), add an obs column
# 'is_<cluster>' holding the label for cells of that cluster and '-' for all others,
# together with a two-colour palette for it.
for cluster in dict.fromkeys(adata_all.obs['cluster'].values):
    flag_key = f'is_{cluster}'
    adata_all.obs[flag_key] = [
        cell_label if cell_label == cluster else '-'
        for cell_label in adata_all.obs['cluster'].values
    ]
    adata_all.uns[f'{flag_key}_colors'] = ['#bcbcbc', '#bc0000']

In [None]:
# One UMAP panel per 'is_<cluster>' flag column, in alphabetical order.
sc.pl.umap(
    adata_all,
    color=sorted(
        f'is_{label}' for label in dict.fromkeys(adata_all.obs['cluster'].values)
    ),
    cmap=magma,
    use_raw=False,
    ncols=2,
)

In [None]:
# Merged embedding coloured by dataset of origin.
sc.pl.umap(
    adata_all,
    color=['dataset'],
    cmap=magma,
    use_raw=False,
)

We see that the integration is not good enough to see the main clusters together.