# Mouse / human comparison

For this notebook **you need to run the 4M and 4H notebooks beforehand!!**

In this notebook we are going to analyse the similarities and differences between mouse and human skin fibroblast populations. We already know that mouse and human skin differ, but we want to know how much of this difference translates into the single-cell transcriptomic realm.

To do this we are going to run two analyses:
* Choose datasets with human and mouse samples from the same lab (e.g., Boothby, Vorstandlechner) and try to find overlaps in the populations.
* Map human/mouse genes between them using a homology database (MGI) and try to find similarities between the lists of markers of human and mouse populations.

## imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
import matplotlib.cm as cm
import networkx as nx

In [None]:
!pip install munkres
from munkres import Munkres

In [None]:
# local imports and imports from other notebooks
from cellassign import assign_cats
from fb_functions import make_gene_scoring_with_expr, plot_score_graph, plot_UMAPS_gene, plot_adata_cluster_properties
%store -r dict_colors_human
%store -r dict_colors_mouse

# Single colour lookup covering both species; on duplicate keys the mouse entry overrides the human one.
dict_colors_human_mouse = {**dict_colors_human , **dict_colors_mouse}

%store -r seed
%store -r magma
%store -r data_dir

In [None]:
%store -r dict_make_gene_scoring_robust
%store -r dict_make_gene_scoring_axis_robust

In [None]:
# Render figures at 120 dpi and show pandas floats with two decimals and thousands separators.
mpl.rcParams['figure.dpi'] = 120
pd.options.display.float_format = "{:,.2f}".format

In [None]:
def join_fbs_adatas(adata_full, adata_fb):
    """Write a combined cell-type annotation into ``adata_full.obs['full_cell_type']``.

    The annotation starts from ``adata_full.obs['assigned_cats']`` (cast to str).
    Every cell that is also present in ``adata_fb`` gets its label replaced by
    ``fibro_<cluster>``, using ``adata_fb.obs['cluster']``. The result is stored
    as a categorical column; ``adata_full`` is modified in place and nothing is
    returned.
    """
    # Cells present in both objects (np.intersect1d returns them sorted and unique).
    shared_cells = np.intersect1d(adata_fb.obs_names, adata_full.obs_names)
    fibro_labels = [f'fibro_{cluster}' for cluster in adata_fb[shared_cells].obs['cluster']]

    combined = adata_full.obs['assigned_cats'].copy().astype(str)
    combined[shared_cells] = fibro_labels
    adata_full.obs['full_cell_type'] = combined.astype('category')

## Creating the mouse-human gene homology dictionary

In [None]:
# Download the MGI homology report into results/ (wget only runs if `cd results` succeeds, so the directory must exist).
!cd results && wget http://www.informatics.jax.org/downloads/reports/HOM_AllOrganism.rpt

In [None]:
# Load the MGI homology report and keep only the human and laboratory-mouse rows,
# with the three columns needed to pair symbols by homology class.
df = pd.read_csv('results/HOM_AllOrganism.rpt', sep='\t')
df = df[df['Common Organism Name'].isin(['mouse, laboratory', 'human'])][['DB Class Key', 'Common Organism Name', 'Symbol']].reset_index(drop=True)

list_DB = set(df['DB Class Key'].values)

# Build the mouse symbol -> human symbol map from one-to-one homology classes only.
# groupby visits each class once instead of re-filtering the whole table per class.
dict_mouse_human = {}
for _, df_sub in tqdm(df.groupby('DB Class Key', sort=False)):
    if len(df_sub) != 2:
        continue
    # After sorting by organism name, 'human' comes before 'mouse, laboratory'.
    df_sub = df_sub.sort_values(by='Common Organism Name')
    # Two rows are not enough: both could belong to the same species. Require
    # exactly one human and one mouse gene before recording the pair.
    if df_sub['Common Organism Name'].tolist() != ['human', 'mouse, laboratory']:
        continue
    dict_mouse_human[df_sub.iloc[1, 2]] = df_sub.iloc[0, 2]

# Comparison of UMAPs of populations

In this section we are going to compare mouse and human datasets of 
* Datasets from the same laboratory
* Datasets from different laboratories (more confirmatory)

To do this we are going to translate the mouse into human genes and get the subset of genes that have homology and are present in the human adata.

### Boothby

In [None]:
# Boothby 2021 control samples: fibroblast objects (their obs['cluster'] labels are plotted below) ...
boothby_2021_dir = data_dir + '/boothby_2021'
boothby_2021_ctrl_mouse_fb = sc.read(boothby_2021_dir + '/boothby_2021_ctrl_mouse_fb_robust.h5')
boothby_2021_ctrl_human_fb = sc.read(boothby_2021_dir + '/boothby_2021_ctrl_human_fb_robust.h5')

# ... and the *_raw objects whose .X replaces the fibroblast .X in the following cell.
# NOTE(review): the mouse file is 'boothby_2021_mouse_ctrl_mouse.h5' while the human one is
# 'boothby_2021_ctrl_human.h5' — confirm the mouse file name is not a typo.
boothby_2021_ctrl_mouse_fb_raw = sc.read(boothby_2021_dir + '/boothby_2021_mouse_ctrl_mouse.h5')
boothby_2021_ctrl_human_fb_raw = sc.read(boothby_2021_dir + '/boothby_2021_ctrl_human.h5')

In [None]:
# Overwrite .X of each fibroblast object with the matrix of the matching *_raw object,
# restricted to the same cells and genes (presumably unprocessed counts — TODO confirm).
boothby_2021_ctrl_mouse_fb.X = boothby_2021_ctrl_mouse_fb_raw[boothby_2021_ctrl_mouse_fb.obs_names, boothby_2021_ctrl_mouse_fb.var_names].X.copy()
boothby_2021_ctrl_human_fb.X = boothby_2021_ctrl_human_fb_raw[boothby_2021_ctrl_human_fb.obs_names, boothby_2021_ctrl_human_fb.var_names].X.copy()

In [None]:
adata_mouse_genes, adata_human_genes = boothby_2021_ctrl_mouse_fb.var_names, boothby_2021_ctrl_human_fb.var_names
mouse_selected_genes, human_homolog, human_mouse_gene = [], [], []

# Keep each mouse gene whose human homolog is present in the human object, using every
# human gene at most once (the first mouse gene that maps to it wins). The three lists
# stay index-aligned: mouse symbol, human symbol, and the combined 'HUMAN | Mouse' label.
human_genes_available = set(adata_human_genes)
human_genes_used = set()  # set membership keeps the duplicate check O(1) per gene

for mouse_gene in adata_mouse_genes:
    if mouse_gene not in dict_mouse_human:
        continue
    human_gene = dict_mouse_human[mouse_gene]
    if human_gene in human_genes_available and human_gene not in human_genes_used:
        human_genes_used.add(human_gene)
        mouse_selected_genes.append(mouse_gene)
        human_homolog.append(human_gene)
        human_mouse_gene.append(f'{human_gene} | {mouse_gene}')

In [None]:
# Keep only the homologous genes, in matching order. The slices are materialised with .copy()
# so that renaming var_names acts on real objects rather than on views of the originals.
boothby_2021_ctrl_human_fb, boothby_2021_ctrl_mouse_fb = boothby_2021_ctrl_human_fb[:, human_homolog].copy(), boothby_2021_ctrl_mouse_fb[:, mouse_selected_genes].copy()
# Give both species the same 'HUMAN | Mouse' gene labels so the genes line up on concatenation.
boothby_2021_ctrl_human_fb.var_names, boothby_2021_ctrl_mouse_fb.var_names = human_mouse_gene, human_mouse_gene
# Stack the cells of both species; obs['organism'] records where each cell came from.
boothby_2021_ctrl_human_mouse_fb = sc.AnnData.concatenate(boothby_2021_ctrl_human_fb, boothby_2021_ctrl_mouse_fb, batch_categories=['human', 'mouse'], batch_key='organism')

In [None]:
# Log-transform the concatenated matrix in place.
# NOTE(review): no normalisation step is visible before this call — confirm .X is in the intended scale.
sc.pp.log1p(boothby_2021_ctrl_human_mouse_fb)

In [None]:
# 50-component PCA, seeded for reproducibility.
sc.pp.pca(boothby_2021_ctrl_human_mouse_fb, random_state=seed, n_comps=50)
# Harmony batch correction keyed on the per-sample identifier column.
sce.pp.harmony_integrate(boothby_2021_ctrl_human_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
# Neighbour graph on the 'X_pca_harmony' embedding; n_neighbors = int((0.5 * sqrt(n_cells)) // 4).
sc.pp.neighbors(boothby_2021_ctrl_human_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(boothby_2021_ctrl_human_mouse_fb) ** 0.5 // 4), metric='cosine')
# NOTE(review): triku runs after PCA/neighbours and neither is recomputed afterwards in this
# section, so the UMAP below is built from the all-gene embedding — confirm this is intended.
tk.tl.triku(boothby_2021_ctrl_human_mouse_fb)

In [None]:
# Show the genes flagged in var['highly_variable'] (presumably written by the triku call above).
boothby_2021_ctrl_human_mouse_fb.var_names[boothby_2021_ctrl_human_mouse_fb.var['highly_variable'] == True]

In [None]:
# UMAP embedding of the merged human + mouse object, seeded for reproducibility.
sc.tl.umap(boothby_2021_ctrl_human_mouse_fb, min_dist=0.2, random_state=seed)

In [None]:
# Make 'cluster' categorical, then assign one colour per category in category order;
# clusters without an entry in the shared palette fall back to grey.
boothby_2021_ctrl_human_mouse_fb.obs['cluster'] = boothby_2021_ctrl_human_mouse_fb.obs['cluster'].astype('category')
boothby_2021_ctrl_human_mouse_fb.uns['cluster_colors'] = [
    dict_colors_human_mouse.get(category, '#bcbcbc')
    for category in boothby_2021_ctrl_human_mouse_fb.obs['cluster'].cat.categories
]

In [None]:
# fraction=1 keeps every cell but shuffles their order in place — presumably so that
# neither organism is systematically drawn on top of the other in the scatter plot.
sc.pp.subsample(boothby_2021_ctrl_human_mouse_fb, fraction=1, random_state=0, copy=False)
sc.pl.umap(boothby_2021_ctrl_human_mouse_fb, color=['cluster'], legend_loc='on data')

In [None]:
# Same embedding, one organism at a time (mouse first, then human).
sc.pl.umap(boothby_2021_ctrl_human_mouse_fb[boothby_2021_ctrl_human_mouse_fb.obs['organism']=='mouse'], color=['cluster'], legend_loc='on data')
sc.pl.umap(boothby_2021_ctrl_human_mouse_fb[boothby_2021_ctrl_human_mouse_fb.obs['organism']=='human'], color=['cluster'], legend_loc='on data')

### Vorstandlechner

In [None]:
# Vorstandlechner 2021 control samples: fibroblast objects (their obs['cluster'] labels are plotted below) ...
# NOTE(review): the human file is prefixed 'vors_2021_' while the mouse one is 'vorstandlechner_2021_' — confirm.
vorstandlechner_2021_dir = data_dir + '/Vorstandlechner_2021'
vorstandlechner_2021_ctrl_human_fb = sc.read(f"{vorstandlechner_2021_dir}/vors_2021_ctrl_human_fb_robust.h5")
vorstandlechner_2021_ctrl_mouse_fb = sc.read(f"{vorstandlechner_2021_dir}/vorstandlechner_2021_ctrl_mouse_fb_robust.h5")

# ... and the *_raw objects whose .X replaces the fibroblast .X in the following cell.
vorstandlechner_2021_ctrl_human_fb_raw = sc.read(f"{vorstandlechner_2021_dir}/vorstandlechner_2021_ctrl_human.h5")
vorstandlechner_2021_ctrl_mouse_fb_raw = sc.read(f"{vorstandlechner_2021_dir}/vorstandlechner_2021_ctrl_mouse.h5")

In [None]:
# Overwrite .X of each fibroblast object with the matrix of the matching *_raw object,
# restricted to the same cells and genes (presumably unprocessed counts — TODO confirm).
vorstandlechner_2021_ctrl_human_fb.X = vorstandlechner_2021_ctrl_human_fb_raw[vorstandlechner_2021_ctrl_human_fb.obs_names, vorstandlechner_2021_ctrl_human_fb.var_names].X.copy()
vorstandlechner_2021_ctrl_mouse_fb.X = vorstandlechner_2021_ctrl_mouse_fb_raw[vorstandlechner_2021_ctrl_mouse_fb.obs_names, vorstandlechner_2021_ctrl_mouse_fb.var_names].X.copy()

In [None]:
adata_mouse_genes, adata_human_genes = vorstandlechner_2021_ctrl_mouse_fb.var_names, vorstandlechner_2021_ctrl_human_fb.var_names
mouse_selected_genes, human_homolog, human_mouse_gene = [], [], []

# Keep each mouse gene whose human homolog is present in the human object, using every
# human gene at most once (the first mouse gene that maps to it wins). The three lists
# stay index-aligned: mouse symbol, human symbol, and the combined 'HUMAN | Mouse' label.
human_genes_available = set(adata_human_genes)
human_genes_used = set()  # set membership keeps the duplicate check O(1) per gene

for mouse_gene in adata_mouse_genes:
    if mouse_gene not in dict_mouse_human:
        continue
    human_gene = dict_mouse_human[mouse_gene]
    if human_gene in human_genes_available and human_gene not in human_genes_used:
        human_genes_used.add(human_gene)
        mouse_selected_genes.append(mouse_gene)
        human_homolog.append(human_gene)
        human_mouse_gene.append(f'{human_gene} | {mouse_gene}')

In [None]:
# Keep only the homologous genes, in matching order. The slices are materialised with .copy()
# so that renaming var_names acts on real objects rather than on views of the originals.
vorstandlechner_2021_ctrl_human_fb, vorstandlechner_2021_ctrl_mouse_fb = vorstandlechner_2021_ctrl_human_fb[:, human_homolog].copy(), vorstandlechner_2021_ctrl_mouse_fb[:, mouse_selected_genes].copy()
# Give both species the same 'HUMAN | Mouse' gene labels so the genes line up on concatenation.
vorstandlechner_2021_ctrl_human_fb.var_names, vorstandlechner_2021_ctrl_mouse_fb.var_names = human_mouse_gene, human_mouse_gene
# Stack the cells of both species; obs['organism'] records where each cell came from.
vorstandlechner_2021_ctrl_human_mouse_fb = sc.AnnData.concatenate(vorstandlechner_2021_ctrl_human_fb, vorstandlechner_2021_ctrl_mouse_fb, batch_categories=['human', 'mouse'], batch_key='organism')

In [None]:
# Log-transform the concatenated matrix in place.
# NOTE(review): no normalisation step is visible before this call — confirm .X is in the intended scale.
sc.pp.log1p(vorstandlechner_2021_ctrl_human_mouse_fb)

In [None]:
# 50-component PCA, seeded for reproducibility.
sc.pp.pca(vorstandlechner_2021_ctrl_human_mouse_fb, random_state=seed, n_comps=50)
# Harmony batch correction keyed on the per-sample identifier column.
sce.pp.harmony_integrate(vorstandlechner_2021_ctrl_human_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
# Neighbour graph on the 'X_pca_harmony' embedding; n_neighbors = int((0.5 * sqrt(n_cells)) // 4).
sc.pp.neighbors(vorstandlechner_2021_ctrl_human_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(vorstandlechner_2021_ctrl_human_mouse_fb) ** 0.5 // 4), metric='cosine')
# NOTE(review): triku runs after PCA/neighbours and neither is recomputed afterwards in this
# section, so the UMAP below is built from the all-gene embedding — confirm this is intended.
tk.tl.triku(vorstandlechner_2021_ctrl_human_mouse_fb)

In [None]:
# Show the genes flagged in var['highly_variable'] (presumably written by the triku call above).
vorstandlechner_2021_ctrl_human_mouse_fb.var_names[vorstandlechner_2021_ctrl_human_mouse_fb.var['highly_variable'] == True]

In [None]:
# UMAP embedding of the merged human + mouse object, seeded for reproducibility.
sc.tl.umap(vorstandlechner_2021_ctrl_human_mouse_fb, min_dist=0.2, random_state=seed)

In [None]:
# Make 'cluster' categorical, then assign one colour per category in category order;
# clusters without an entry in the shared palette fall back to grey.
vorstandlechner_2021_ctrl_human_mouse_fb.obs['cluster'] = vorstandlechner_2021_ctrl_human_mouse_fb.obs['cluster'].astype('category')
vorstandlechner_2021_ctrl_human_mouse_fb.uns['cluster_colors'] = [
    dict_colors_human_mouse.get(category, '#bcbcbc')
    for category in vorstandlechner_2021_ctrl_human_mouse_fb.obs['cluster'].cat.categories
]

In [None]:
# fraction=1 keeps every cell but shuffles their order in place — presumably so that
# neither organism is systematically drawn on top of the other in the scatter plot.
sc.pp.subsample(vorstandlechner_2021_ctrl_human_mouse_fb, fraction=1, random_state=0, copy=False)
sc.pl.umap(vorstandlechner_2021_ctrl_human_mouse_fb, color=['cluster'], legend_loc='on data')

In [None]:
# Same embedding, one organism at a time (mouse first, then human).
sc.pl.umap(vorstandlechner_2021_ctrl_human_mouse_fb[vorstandlechner_2021_ctrl_human_mouse_fb.obs['organism']=='mouse'], color=['cluster'], legend_loc='on data')
sc.pl.umap(vorstandlechner_2021_ctrl_human_mouse_fb[vorstandlechner_2021_ctrl_human_mouse_fb.obs['organism']=='human'], color=['cluster'], legend_loc='on data')

### Conclusions
Datasets do not match. When reading the methods, the authors point out that they use a restricted set of genes to build the UMAP. I don't feel comfortable with that strategy, so I prefer to do a more "unbiased" analysis based on the markers.

# Comparison of gene patterns between populations
In this section we are going to take the gene markers from human and mouse, and do human-mouse, mouse-mouse and human-human comparisons between the sets of markers. From this we expect to find similarities between both species and establish a homology model of the skin fibroblasts.

To do this analysis we are going to:
* Get the top N markers for each population in mouse and human (also in human-human and mouse-mouse) and compute the Jaccard index between them. 
    * To find the "best" N, for each N in a range of values we are going to compute the Jaccard matrix and use the Hungarian algorithm to find the one-to-one assignment of clusters with the highest total overlap. The sum of the overlaps in that assignment is stored. A higher sum would imply a generally higher overlap between markers and, therefore, a more relevant analysis. From the array of values, we are going to get the best "rounded" N.
* With the top N we are going to get the Jaccard matrix and plot it with a heatmap and a clustergram.
* From there, we are going to select the best matches and analyze the relationship between human and mouse populations

We do the human-human and mouse-mouse comparisons to get a "control" view. We expect more gene overlap between clusters of the same axis than of different ones. Also, we get a sense of the expected overlaps, to see how this is reflected in mouse.

To make the joining as "easy" as possible, we are going to use the subset of genes that have a mouse-human homology only. This might limit the extent of the analysis but, in general, we find a fair overlap, and this will make the dataset mapping and batch effect correction much easier.

In [None]:
%store -r dict_make_gene_scoring_cluster_robust_human
%store -r dict_make_gene_scoring_cluster_robust_mouse

In [None]:
def get_df_overlap(dict_1, dict_2, N=100, translate=True):
    """Jaccard index between the top-N marker genes of every pair of clusters.

    Parameters
    ----------
    dict_1, dict_2 : dict
        Cluster name -> DataFrame whose index holds the marker genes; the first
        ``N`` index entries are taken as that cluster's markers.
    N : int
        Number of leading markers used per cluster.
    translate : bool
        When True, genes from ``dict_2`` are mapped through the module-level
        ``dict_mouse_human`` (genes without an entry are kept unchanged).
        ``dict_1`` genes are never translated.

    Returns
    -------
    pandas.DataFrame
        Float matrix with ``dict_1`` clusters as rows and ``dict_2`` clusters as
        columns. A Jaccard index of exactly 1 is reported as 0, which blanks the
        diagonal when a dictionary is compared with itself.
    """
    # Build each cluster's marker set once instead of once per pair.
    markers_1 = {name: set(cluster_df.index[:N]) for name, cluster_df in dict_1.items()}
    if translate:
        markers_2 = {
            name: {dict_mouse_human.get(gene, gene) for gene in cluster_df.index[:N]}
            for name, cluster_df in dict_2.items()
        }
    else:
        markers_2 = {name: set(cluster_df.index[:N]) for name, cluster_df in dict_2.items()}

    # Initialise with a float so that storing fractional overlaps never changes the dtype.
    df_overlap = pd.DataFrame(0.0, index=list(dict_1.keys()), columns=list(dict_2.keys()))

    for cluster_name_1, genes_1 in markers_1.items():
        for cluster_name_2, genes_2 in markers_2.items():
            union = genes_1 | genes_2
            # Two empty marker sets share nothing; avoid dividing by zero.
            overlap = len(genes_1 & genes_2) / len(union) if union else 0.0
            if overlap == 1:
                overlap = 0.0

            df_overlap.loc[cluster_name_1, cluster_name_2] = overlap

    return df_overlap

In [None]:
def plot_best_N(dict_1, dict_2, N_min=10, N_max=300, translate=True):
    """Plot, for each N, the total overlap of the best one-to-one cluster matching.

    For every N in ``range(N_min, N_max)`` (``N_max`` excluded) the Jaccard matrix
    from ``get_df_overlap`` is computed and the Hungarian algorithm (Munkres)
    picks the assignment of rows to columns that maximises the summed overlap.
    That sum is plotted against N on the current matplotlib axes.

    Parameters
    ----------
    dict_1, dict_2 : dict
        Marker dictionaries, forwarded to ``get_df_overlap``.
    N_min, N_max : int
        Range of N values to scan.
    translate : bool
        Forwarded to ``get_df_overlap``.
    """
    list_N, list_total = [], []
    for N in tqdm(range(N_min, N_max)):
        df_jaccard_1_2 = get_df_overlap(dict_1, dict_2, N=N, translate=translate)
        similarity = df_jaccard_1_2.values

        # Munkres minimises cost, so the similarity is turned into 1 - similarity.
        # It is handed over as plain nested lists: the library pads non-square
        # matrices by list concatenation, which is not what `+=` does on ndarray rows.
        m = Munkres()
        indexes = m.compute((1 - similarity).tolist())
        total = sum(similarity[row][column] for row, column in indexes)

        list_N.append(N)
        list_total.append(total)

    plt.plot(list_N, list_total)
    plt.xlabel('N (top markers per cluster)')
    plt.ylabel('Summed Jaccard index of best assignment')

In [None]:
def plot_heatmap(df_jaccard_1_2, dict_colors_1, dict_colors_2, figsize=(12,8), ticklabelsize=15, diag=False):
    """Draw an annotated heatmap of an overlap matrix with coloured tick labels.

    Parameters
    ----------
    df_jaccard_1_2 : pandas.DataFrame
        Matrix to plot; passed straight to ``sns.heatmap``.
    dict_colors_1 : dict
        Tick-label text -> colour for the x axis (the columns).
    dict_colors_2 : dict
        Tick-label text -> colour for the y axis (the rows).
    figsize : tuple
        Figure size handed to ``plt.subplots``.
    ticklabelsize : int
        Font size of the tick labels.
    diag : bool
        When True the main diagonal is masked and the axes background is set
        to grey.
    """
    fig, ax = plt.subplots(1, 1, figsize=figsize)

    heatmap_kwargs = dict(annot=True, fmt='.2f', ax=ax)
    if diag:
        heatmap_kwargs['mask'] = np.eye(df_jaccard_1_2.shape[0], dtype=bool)
    sns.heatmap(df_jaccard_1_2, **heatmap_kwargs)
    if diag:
        ax.set_facecolor("#989898")

    # Colour every tick label with the colour registered for its cluster name.
    for axis, palette in ((ax.xaxis, dict_colors_1), (ax.yaxis, dict_colors_2)):
        for label in axis.get_ticklabels():
            label.set_color(palette[label.get_text()])

    ax.set_xticklabels(ax.get_xticklabels(), weight='bold', size=ticklabelsize)
    ax.set_yticklabels(ax.get_yticklabels(), weight='bold', size=ticklabelsize, va='center')
    
def plot_clustermap(df_jaccard_1_2, dict_colors_1, dict_colors_2, figsize=(12,8), ticklabelsize=15):
    """Draw a hierarchically clustered heatmap of an overlap matrix with coloured tick labels.

    Parameters
    ----------
    df_jaccard_1_2 : pandas.DataFrame
        Matrix to plot; passed to ``sns.clustermap``.
    dict_colors_1 : dict
        Tick-label text -> colour for the x axis (the columns).
    dict_colors_2 : dict
        Tick-label text -> colour for the y axis (the rows).
    figsize : tuple
        Overall figure size.
    ticklabelsize : int
        Font size of the tick labels.
    """
    # Forward figsize so the size requested by the caller is actually applied.
    cg = sns.clustermap(df_jaccard_1_2, figsize=figsize)
    ax = cg.ax_heatmap

    # Colour every tick label with the colour registered for its cluster name.
    for axis, palette in ((ax.xaxis, dict_colors_1), (ax.yaxis, dict_colors_2)):
        for label in axis.get_ticklabels():
            label.set_color(palette[label.get_text()])

    ax.set_xticklabels(ax.get_xticklabels(), weight='bold', size=ticklabelsize)
    ax.set_yticklabels(ax.get_yticklabels(), weight='bold', size=ticklabelsize, va='center')

In [None]:
def print_common_genes(dict_1, dict_2, cluster_1, cluster_2, translate=True, N=150):
    """Return the sorted genes shared by the top-N markers of two clusters.

    ``dict_1[cluster_1]`` and ``dict_2[cluster_2]`` are DataFrames whose index
    holds the marker genes; the first ``N`` entries of each are compared. With
    ``translate=True`` the genes of the second cluster are first mapped through
    the module-level ``dict_mouse_human`` (unmapped genes are kept as they are).
    Despite its name, the function returns the list and prints nothing.
    """
    markers_1 = set(dict_1[cluster_1].index[:N])

    markers_2 = set()
    for gene in dict_2[cluster_2].index[:N]:
        markers_2.add(dict_mouse_human.get(gene, gene) if translate else gene)

    return sorted(markers_1.intersection(markers_2))

### Human-human comparison

In [None]:
# Both sides are human markers, so no mouse -> human symbol translation must be applied
# (translation only touches the second dictionary and would make the comparison asymmetric).
plot_best_N(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_human, N_min=10, N_max=300, translate=False)

In [None]:
# Human-human Jaccard matrix on the top 125 markers. Both sides are human symbols, so the
# mouse -> human translation is switched off, matching the mouse-mouse comparison.
df_jaccard_human_human = get_df_overlap(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_human, N=125, translate=False)
# diag=True masks the self-comparison cells on the diagonal.
plot_heatmap(df_jaccard_human_human, dict_colors_human, dict_colors_human, figsize=(12,7), ticklabelsize=15, diag=True)

In [None]:
# Hierarchically clustered view of the human-human overlap matrix.
plot_clustermap(df_jaccard_human_human, dict_colors_human, dict_colors_human, figsize=(12,7), ticklabelsize=15)

#### Comments on human-human
* It is clear that A3 is a bridge between A1-A4 and A2. A2 and A1 are clearly different clusters, and A3 is more "favorable" to A1 than to A2.
* A2 has a "shared" transcriptomic profile with C and E axis! It is also apparent from UMAPs and connectivity graphs. However, its relatively low similarity profile with C1, C3 and E1 makes it a somewhat "independent" population.
    * A2-C1: APELA, CCDC3, **COL21A1**, **COL3A1**, **COL7A1**, CPXM2, **DKK3**, DUSP4, **LAMC3**, ROBO2, **SEMA5A**, **SPON1**, STMN1, THSD4, **TMEM119**
    * A2-C3: ANTXR1, APELA, C4orf48, **COL3A1**, COL5A1, COL6A3, **COL7A1**, COMP, DKK3, HMGB3, **LAMC3**, LOXL2, MFAP2, PTK7, RAB31, **SEMA5A**, **SPON1**, **TCF4**, **TMEM119**, TNC
    * A2-E1: CMKLR1, **DKK3**, ID1, IGFBP2, LAMC3, MAP2, PDGFRA, RGCC, SEMA5A, **SPON1**, SPRY1, **TCF4**, TNFRSF21
* B axis seems quite independent from the rest of axes, although it is related to T1 and, also, B1 with D1. 
    * It is also clear that B3 acts as a bridge between B1, B2 and B4. B1 and B2 are not related, and B4 is slightly related to B2, but not B1 (although they appear more related in UMAPs and graphs).
    * B1, similar to A2, shows an independent transcriptomic profile with its neighbour B clusters.
* The C, D and E axes, although they have a substructure, they are more correlated between them than A or B axes.
    * C1 and C3 are quite related, and C3 specially is the bridge cluster among the C axis. 
    * D1 and D2 are also quite related.

### Mouse-mouse comparison

In [None]:
# Scan N for the mouse-mouse comparison; translate=False keeps the mouse symbols untouched on both sides.
plot_best_N(dict_make_gene_scoring_cluster_robust_mouse, dict_make_gene_scoring_cluster_robust_mouse, N_min=10, N_max=300, translate=False)

In [None]:
# Mouse-mouse Jaccard matrix on the top 150 markers, without symbol translation.
df_jaccard_mouse_mouse = get_df_overlap(dict_make_gene_scoring_cluster_robust_mouse, dict_make_gene_scoring_cluster_robust_mouse, N=150, translate=False)
# diag=True masks the self-comparison cells on the diagonal.
plot_heatmap(df_jaccard_mouse_mouse, dict_colors_mouse, dict_colors_mouse, figsize=(12,7), ticklabelsize=15, diag=True)

#### Comments on mouse-mouse
* a1 and a2 clusters are clearly related. 
* The a/d cluster shows a little resemblance to the *d* axis, but only for a few genes
    * a/d - d1: *Ptma*, *Tpm2*, *Tuba1c*
    * a/d - d2: *Ptma*, *Tpm2*, *H1fx*
    * a/d - d3: *Ptma*, *Tpm2*, *Bok*, *Hnrnpa1*, *Stmn1*, *Tubb5*
* b axis is interconnected, as shown in the PAGA graph. The most relevant similarities are b1 and b2 with b4; and b2 with b3.
    * b/c cluster is a bridge between b6 and c1, although b6 and c1 have almost no resemblance.
    * Interestingly, b6 does share a good set of genes with a1. Maybe b6 is some sort of bridge between *a* and *b*: *Adgrd1*, *Ccl2*, *Ccl7*, *Clic4*, *Csrnp1*, *Cxcl1*, *Cxcl2*, *Errfi1*, *Fosl1*, *Gfpt2*, *Has1*, *Hk2*, *Ifi205*, *Ifrd1*, *Il6*, *Kdm6b*, *Maff*, *Mt2*, *Myc*, *Nfkb1*, *Nfkbia*, *Nfkbiz*, *Nr4a3*, *Ptgs2*, *Ptx3*, *Tnfaip2*, *Tnfaip3*, *Tnfaip6*, *Txnrd1*, *Ugdh*, *Zc3h12a*
* There is a good internal cohesion within the *d* axis.
* *e1* cluster, although it is slightly related to *b5*, is separate from the rest of the clusters.

In [None]:
# Hierarchically clustered view of the mouse-mouse overlap matrix.
plot_clustermap(df_jaccard_mouse_mouse, dict_colors_mouse, dict_colors_mouse, figsize=(12,7), ticklabelsize=15)

### Mouse-human comparison

In [None]:
# Scan N for the human-mouse comparison; translate defaults to True, so mouse markers are mapped to human symbols.
plot_best_N(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, N_min=10, N_max=300)

In [None]:
# Genes shared by the top-150 markers of human B2 and mouse b3 (mouse symbols translated to human).
print(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'B2', 'b3', translate=True, N=150))



#### Comments on human-mouse
* *A1*, *A3*, and *A4* clusters are tightly related to *a1*, *a2*, *b/c*, *b2*, *b3*, *c1* and *c2*. This is somewhat expected based on the result from Joost et al. 2020, where the *a* and *b* axes were related to the "common" dermis, and *c* to the hypodermis.
    * *A4* is more related to *a2*, and since a2 is not as relevant as a1 in plots, there might be a fair enough relation between them. Some genes that are related are *SFRP4*, *WNT2*, *IGFBP6*, *SEMA3E*, *SEMA3C*, which are good *A4* markers.
        * A4 - a1 - a2: *ACKR3*, **AIF1L**, *ALDH1A3*, *BASP1*, **CD248**, *CD55*, *CHRDL1*, *CREB5*, **DBN1**, *DPP4*, **EMILIN2**, *FNDC1*, *GAP43*, *HEG1*, **IGFBP6**, *MFAP5*, *NPR1*, *PRSS23*, *SCARA5*, **SEMA3C**, **SEMA3E**, *TIMP2*, **WNT2**
        * A4 - a1: *ADGRD1*, *GFPT2*, *HAS1*, *METRNL*, *TNFAIP6*, *UGDH*, *VASN*
        * A4 - a2: *ADAMTSL4*, *ADGRG2*, *CLEC3B*, *IGFBP5*, *ISLR*, *LIMS2*, **PAMR1**, *PPP1R14B*, **PTGIS**, **RAB32**, **SFRP4**
    * *A1* is related to *b3* and *b/c* mainly, which might be a sign that *A1* is more widely expressed and, therefore, is more relatable to other clusters.
        * A1 (+A3/A4) - b3: **ABCC9**, *ANGPTL1*, **C1QTNF3**, **CCN5**, *CD151*, **CLU**, *COL12A1*, **ELN**, **FBLN1**, **FBLN2**, *FGL2*, *GPX3*, *ITGBL1*, **LGR5**, **LOX**, *MFAP4*, *MGP*, *OMD*, *PAM*, *PCOLCE*, **PDGFRL**, *PODN*, **PTGIS**, *SFRP2*, *SMOC2*
        * A1 (+A3/A4) - b/c: **ABCC9**, *AEBP1*, *ANGPTL1*, **C1QTNF3**, *CADM3*, *CCDC80*, **CD34**, *COL12A1*, *CPZ*, **CTSK**, *CYP4B1*, **DCN**, *GPX3*, *HPGD*, **LGR5**, **LOX**, *MGST1*, *MMP27*, *PCOLCE*, **PDGFRL**, *PLTP*, **SEMA3B**, **SERPINF1**, **THBS2**, *THBS3*, **TNXB**
        * A1 (+A3/A4) - c1: **AEBP1**, *ANGPTL1*, **C1QTNF3**, *COL1A1*, **COL1A2**, *CPZ*, **CTSK**, **CYBRD1**, **DCN**, **ELN**, *HPGD*, *ITGBL1*, *MFAP4*, *MMP27*, **PCOLCE2**, **PDGFRL**, **SCARA5**, **SEMA3B**, *SPARC*
* *A2* is more related to *c2*, but it is also related to *d* axis. Since *C* and *d* axis are related, and *A2* and *C* axis are also related, this is an expected result.
    * A2 - c2: **AHRR**, *AXIN2*, **COL13A1**, **COL23A1**, *COL3A1*, *COL5A1*, *COL7A1*, *CYP26B1*, *EMX2*, *F13A1*, *GREM2*, *IGFBP2*, *ISM1*, *KCNK2*, *LAMC3*, *LSAMP*, *MAMDC2*, **NKD1**, **NKD2**, **PREX1**, *PTK7*, **PTPRE**, *RSPO1*, *SCARF2*, *SMIM3*, *SPRY1*, *STC1*, *TCF4*, *TGFBI*, *THSD4*, **TNFRSF19**, *TWIST2*
* *B1* seems to be tightly related to *b6*. It is also related to *B3*, but considering that *B3* is likely a bridge cluster, we keep it with *B1*.
    * B1 - b6: *ADAMTS1*, *ADAMTS4*, *ARID5A*, *ARL5B*, *BCL3*, *BIRC3*, *BTG1*, *CCL2*, *CCNL1*, *CEBPB*, *CSRNP1*, **CXCL2**, **ELL2**, *ERRFI1*, *ETS2*, **FOSL1**, **GCH1**, *GFPT2*, *ICAM1*, *IER3*, *IL1R1*, **IL6**, *IRF1*, *KDM6B*, *LIF*, *MAFF*, *MYC*, *NFKB1*, *NFKBIA*, *NFKBIZ*, *NNMT*, **NR4A3**, *PNRC1*, *PTGS2*, *SLC39A14*, *SOCS3*, *SOD2*, *TMEM88*, *TNFAIP2*, *TNFAIP3*, *TNFAIP6*, *TNFRSF12A*, *UGCG*, *ZC3H12A*
* There is a relationship between *B2*/*B4* and *b1*/*b4*. However, it is hard to set a one-on-one relationship between mouse and human clusters. Mainly, the overlapping genes are repeated within one-on-one comparisons, and many of those are not exclusive of *B2* or *B4*, but are usually shared between *B2* and *B4*, as well as *B3* sometimes.
    * B2 - b4: *ABCA8*, *APOE*, *C1S*, *C3*, **C7**, *COL4A2*, *COL4A4*, *CXCL12*, *CYGB*, *CYP7B1*, *GGT5*, *IGFBP3*, *IGFBP7*, *IL11RA*, *LIFR*, **LPAR4**, *NDRG2*, *NRP1*, *SNED1*, **TMEM176A**, *TMEM176B*, **TNFSF13B**
    * B4 - b1: *BMPER*, *CD36*, *CXCL12*, *CYGB*, *EBF1*, *EFEMP1*, *FABP4*, *FGF10*, *FMO1*, *FZD4*, *GGT5*, *GPC3*, *GPX3*, *ITM2A*, *LGALS3BP*, *MGST1*, *NFIB*, *NOVA1*, *NR1H3*, *PPARG*, *ZFHX4*
    * B4 - b4: **ABCA8**, *ADAMTSL3*, *ADCYAP1R1*, *APOD*, *APOE*, *BMPER*, *C3*, *C7*, *CXCL12*, *CYGB*, *F3*, *FZD4*, *GDF10*, *GGT5*, *GPX3*, *GSN*, *IGFBP3*, *IGFBP7*, *MGP*, *NFIB*, *NRP1*, **NTRK2**, *PPL*, *SERPING1*, *SRPX*, *TMEM176A*, *TNFSF13B*, *TSHZ2*, *TXNIP*, *VIT*
* It is clear that *D1*/*D2* are related to *b5*. Also, *b5* is small, so it is likely that the two sets of genes are related. Maybe *b5* is more related to *D2* than to *D1*, or maybe both *D1* and *D2* coexist as *b5*.
    * D1 - b5: *ABCA8*,  *APOD*, **COL8A1**, **ETV1**, *GPC3*, *ITM2A*, *P2RY14*, *PHLDA1*, *PTCH1*, **SOX9**, *SPARCL1*, *TM4SF1*, *WFDC1*
    * D2 - b5: *BHLHE40*, **BNC2**, *CAV1*, **CAV2**, *CCDC3*,  **CSRP1**, *DDIT4*, *EFNB1*,  *GAB1*, **ITGA6**, **ITGB4**,  **KRT19**, **KLF5**, *MTSS1*, **NGFR**, **PEAR1**, *PHLDA3*, *PKDCC*,  **SLC2A1**, *STXBP6*, *TJP1*, *TM4SF1*
    * D1 - D2 - b5: *AKAP12*, **CLDN1**, **DUSP5**, **EBF2**, **FOXS1**,  *MATN2*, *MEOX2*, **MRAS**,  *NDRG2*,  *NR2F2*, *PLK2*, *TGFBI*, *VIT*
* There is a clear relationship between *C* and *d* axes. The most relevant interactions are:
    * C1 - d3: **ADAMTS18**, *ALX4*, **BCL11B**, *CD200*, *CDH11*, *CNN2*, **COL11A1**, *EDNRA*, *EDNRB*, *EGFL6*, *F2R*, *KIAA1217*, *KIF26B*, *LAMC3*, *MDK*, **MEF2C**, *MICAL2*, *NTRK3*, *PALLD*, **PTCH1**, *ROBO2*, **RUNX2**, *STMN1*, *TAGLN*, **TENM3**, *TMEM119*, *TNS3*, *TPM2*
    * C5 - d1: *ALX4*, **BMP7**, *CRABP1*, **FBXO32**, *IGFBP3*, *INHBA*, *MRPS6*, **PGM2L1**, *PTMA*, **RSPO3**, *SDC1*, *SPON1*, **TFAP2A**, *TNN*, *TRPS1*
    * C2 - c/d: **CCK**, *CHST15*, **COCH**,  **CPNE5**, *CRABP1*, *CYP1B1*, **FIBIN**, *FMOD*, *MEOX2*, **MKX**, *NCAM1*, *NR2F1*, *NRP2*, **PLXDC1**, *PTGFR*, *PTH1R*, *RSPO4*, *SRPX*, *TBXA2R*, *TNMD*, *TRIB2*
    * C3 - c/d: *COL7A1*, *COL8A2*, *EGFLAM*, *F2R*, *HMCN1*, *MAFB*, *MFAP2*, *MMP16*, *NREP*, *NRP2*, **RASL11B**, *RFLNB*, *TPM2*, *TRIL*
    * C2 - C3 - c/d: *ADAMTS9*, *DKK2*, *EDNRA*, **EMID1**, **GPM6B**, *KIF26B*, *MAFB*, *TBX15*, *TCF4*, *TENM3*, *TRPS1*, *TSHZ3*

In [None]:
# Human-mouse Jaccard matrix on the top 100 markers: rows are human clusters, columns are mouse
# clusters, and mouse markers are translated to human symbols (translate defaults to True).
df_jaccard_mouse_human = get_df_overlap(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, N=100)
# First palette colours the x axis (mouse columns), second the y axis (human rows).
plot_heatmap(df_jaccard_mouse_human, dict_colors_mouse, dict_colors_human, figsize=(12,7), ticklabelsize=15)

In [None]:
# Hierarchically clustered view of the human-mouse overlap matrix.
plot_clustermap(df_jaccard_mouse_human, dict_colors_mouse, dict_colors_human, figsize=(12,7), ticklabelsize=15)

In [None]:
# Genes shared by the top-150 markers of human A1 and mouse a2 (mouse symbols translated to human).
print(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'A1', 'a2', translate=True, N=150))

In [None]:
# Genes shared by the top-150 markers of human A4 and mouse a2 (mouse symbols translated to human).
print(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'A4', 'a2', translate=True, N=150))

In [None]:
# Genes shared by the top-150 markers of human A2 and mouse c1 (mouse symbols translated to human).
print(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'A2', 'c1', translate=True, N=150))

In [None]:
# Genes shared by the top-150 markers of human C1 and mouse d3 (mouse symbols translated to human).
print(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'C1', 'd3', translate=True, N=150))

In [None]:
# Genes shared by the top-150 markers of human C5 and mouse d1 (mouse symbols translated to human).
print(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'C5', 'd1', translate=True, N=150))

In [None]:
# Genes shared by the top-150 markers of human D1 and mouse b5 (mouse symbols translated to human).
print(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'D1', 'b5', translate=True, N=150))

In [None]:
# Genes shared by the top-150 markers of human D2 and mouse b5 (mouse symbols translated to human).
print(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'D2', 'b5', translate=True, N=150))

In [None]:
# Genes shared by the top-150 markers of human A2 and mouse d1 (mouse symbols translated to human).
print(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'A2', 'd1', translate=True, N=150))

In [None]:
# Genes shared by the top-150 markers of human B1 and mouse b6 (mouse symbols translated to human).
print(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'B1', 'b6', translate=True, N=150))

In [None]:
# Genes shared by the top-150 markers of human B2 and mouse b4 (mouse symbols translated to human).
print(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'B2', 'b4', translate=True, N=150))