# Mouse / human comparison

Before running this notebook **you need to run the 4M and 4H notebooks!!**

In this notebook we are going to analyse the similarities and differences between mouse and human skin fibroblast populations. We already know that mouse and human skin differ from each other, but we want to know how much of this difference translates to the single-cell transcriptomic level.

To do this analysis we are going to do two analyses. 
* Choose datasets with human and mouse samples from the same lab (e.g., Boothby, Vorstandlechner) and try to find overlaps in the populations.
* Map human/mouse genes between them using a homology database (MGI) and try to find similarities between the lists of markers of human and mouse populations. 

## imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
import matplotlib.cm as cm
import networkx as nx

In [None]:
!pip install munkres
from munkres import Munkres

In [None]:
# local imports and imports from other notebooks
from cellassign import assign_cats
from fb_functions import make_gene_scoring_with_expr, plot_score_graph, plot_UMAPS_gene, plot_adata_cluster_properties
%store -r dict_colors_human
%store -r dict_colors_mouse

dict_colors_human_mouse = {**dict_colors_human , **dict_colors_mouse}

%store -r seed
%store -r magma
%store -r data_dir

In [None]:
%store -r dict_make_gene_scoring_robust
%store -r dict_make_gene_scoring_axis_robust

In [None]:
# Render matplotlib figures at 120 dpi.
mpl.rcParams['figure.dpi'] = 120
# Show floats in pandas output with a thousands separator and two decimals.
pd.options.display.float_format = "{:,.2f}".format

In [None]:
def join_fbs_adatas(adata_full, adata_fb):
    """Add a ``full_cell_type`` column to ``adata_full.obs``.

    The column starts as a string copy of ``adata_full.obs['assigned_cats']``.
    For every cell name present in both objects the label is replaced by
    ``'fibro_<cluster>'``, where ``<cluster>`` is that cell's value in
    ``adata_fb.obs['cluster']``. The result is stored as a categorical column.

    Parameters
    ----------
    adata_full
        Object whose ``obs`` holds ``'assigned_cats'``; it is modified in place.
    adata_fb
        Object whose ``obs`` holds ``'cluster'``.

    Returns
    -------
    None
    """
    labels = adata_full.obs['assigned_cats'].copy().astype(str)
    # Cell names found in both objects (sorted and unique, as returned by intersect1d).
    shared_cells = np.intersect1d(adata_fb.obs_names, adata_full.obs_names)
    fb_clusters = adata_fb[shared_cells].obs['cluster']
    labels[shared_cells] = [f'fibro_{cluster}' for cluster in fb_clusters]
    adata_full.obs['full_cell_type'] = labels.astype('category')

## Creating the mouse-human gene homology dictionary

In [None]:
# !cd results && wget http://www.informatics.jax.org/downloads/reports/HOM_AllOrganism.rpt

In [None]:
# Load the MGI homology report (tab-separated) and keep only the mouse and human rows,
# with the homology class key, the organism and the gene symbol.
df = pd.read_csv('results/HOM_AllOrganism.rpt', sep='\t')
df = df[df['Common Organism Name'].isin(['mouse, laboratory', 'human'])][['DB Class Key', 'Common Organism Name', 'Symbol']].reset_index(drop=True)

list_DB = set(df['DB Class Key'].values)

# Mouse symbol -> human symbol, restricted to one-to-one homology classes.
dict_mouse_human = {}
# A single groupby pass replaces one boolean filter of the whole table per homology class.
for _, df_sub in tqdm(df.groupby('DB Class Key'), total=df['DB Class Key'].nunique()):
    human_symbols = df_sub.loc[df_sub['Common Organism Name'] == 'human', 'Symbol']
    mouse_symbols = df_sub.loc[df_sub['Common Organism Name'] == 'mouse, laboratory', 'Symbol']
    # Require exactly one gene per species. A class with two genes of the same species
    # also has two rows, but must not be stored as a mouse -> human pair.
    if len(human_symbols) == 1 and len(mouse_symbols) == 1:
        dict_mouse_human[mouse_symbols.iloc[0]] = human_symbols.iloc[0]

# Comparison of UMAPs of populations

In this section we are going to compare mouse and human datasets of 
* Datasets from the same laboratory
* Datasets from different laboratories (more confirmatory)

To do this we are going to translate the mouse into human genes and get the subset of genes that have homology and are present in the human adata.

### Boothby

In [None]:
# Objects read from the `*_fb_robust.h5` files.
# NOTE(review): presumably the fibroblast subsets saved by the 4M / 4H notebooks — confirm.
boothby_2021_dir = data_dir + '/boothby_2021'
boothby_2021_ctrl_mouse_fb = sc.read(boothby_2021_dir + '/boothby_2021_ctrl_mouse_fb_robust.h5')
boothby_2021_ctrl_human_fb = sc.read(boothby_2021_dir + '/boothby_2021_ctrl_human_fb_robust.h5')

# Companion objects used in the next cell as the source of the expression matrix.
# NOTE(review): the mouse file name ('boothby_2021_mouse_ctrl_mouse.h5') does not follow the
# pattern of the human one ('boothby_2021_ctrl_human.h5') — confirm it is the intended file.
boothby_2021_ctrl_mouse_fb_raw = sc.read(boothby_2021_dir + '/boothby_2021_mouse_ctrl_mouse.h5')
boothby_2021_ctrl_human_fb_raw = sc.read(boothby_2021_dir + '/boothby_2021_ctrl_human.h5')

In [None]:
# Replace X of each fibroblast object with the X of the same cells and genes taken from
# the matching `_raw` object.
# NOTE(review): presumably done to recover untransformed counts before the log1p applied
# after concatenation — confirm.
boothby_2021_ctrl_mouse_fb.X = boothby_2021_ctrl_mouse_fb_raw[boothby_2021_ctrl_mouse_fb.obs_names, boothby_2021_ctrl_mouse_fb.var_names].X.copy()
boothby_2021_ctrl_human_fb.X = boothby_2021_ctrl_human_fb_raw[boothby_2021_ctrl_human_fb.obs_names, boothby_2021_ctrl_human_fb.var_names].X.copy()

In [None]:
adata_mouse_genes, adata_human_genes = boothby_2021_ctrl_mouse_fb.var_names, boothby_2021_ctrl_human_fb.var_names
mouse_selected_genes, human_homolog, human_mouse_gene = [], [], []

# Homologs already assigned. A set keeps the membership test constant-time, whereas
# scanning the growing `human_homolog` list made the loop quadratic.
human_homolog_seen = set()

for mouse_gene in adata_mouse_genes:
    if mouse_gene not in dict_mouse_human:
        continue
    human_gene = dict_mouse_human[mouse_gene]
    # Keep the pair only if the homolog is measured in the human dataset and no earlier
    # mouse gene has claimed it (the first mouse gene wins).
    if human_gene in adata_human_genes and human_gene not in human_homolog_seen:
        human_homolog_seen.add(human_gene)
        mouse_selected_genes.append(mouse_gene)
        human_homolog.append(human_gene)
        human_mouse_gene.append(f'{human_gene} | {mouse_gene}')

In [None]:
# Subset both objects to the homologous genes (same order in both) and take explicit copies,
# so that renaming the genes below acts on real objects instead of AnnData views.
boothby_2021_ctrl_human_fb, boothby_2021_ctrl_mouse_fb = boothby_2021_ctrl_human_fb[:, human_homolog].copy(), boothby_2021_ctrl_mouse_fb[:, mouse_selected_genes].copy()
# Give both objects the same "HUMAN | mouse" gene names so that concatenation aligns them.
boothby_2021_ctrl_human_fb.var_names, boothby_2021_ctrl_mouse_fb.var_names = human_mouse_gene, human_mouse_gene
# The species of each cell is recorded in obs['organism'].
boothby_2021_ctrl_human_mouse_fb = sc.AnnData.concatenate(boothby_2021_ctrl_human_fb, boothby_2021_ctrl_mouse_fb, batch_categories=['human', 'mouse'], batch_key='organism')

In [None]:
sc.pp.log1p(boothby_2021_ctrl_human_mouse_fb)

In [None]:
# PCA (50 components) on the concatenated matrix, which was log1p-transformed in the previous cell.
sc.pp.pca(boothby_2021_ctrl_human_mouse_fb, random_state=seed, n_comps=50)
# Harmony integration using obs['Internal sample identifier'] as the batch variable.
sce.pp.harmony_integrate(boothby_2021_ctrl_human_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
# Neighbour graph on the 'X_pca_harmony' representation with cosine distance.
# NOTE(review): `0.5 * n ** 0.5 // 4` parses as `(0.5 * sqrt(n)) // 4`, i.e. roughly sqrt(n) / 8,
# and evaluates to 0 when there are fewer than 64 cells — confirm this is the intended size.
sc.pp.neighbors(boothby_2021_ctrl_human_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(boothby_2021_ctrl_human_mouse_fb) ** 0.5 // 4), metric='cosine')
# triku feature selection; the next cell reads its result from var['highly_variable'].
tk.tl.triku(boothby_2021_ctrl_human_mouse_fb)

In [None]:
boothby_2021_ctrl_human_mouse_fb.var_names[boothby_2021_ctrl_human_mouse_fb.var['highly_variable'] == True]

In [None]:
sc.tl.umap(boothby_2021_ctrl_human_mouse_fb, min_dist=0.2, random_state=seed)

In [None]:
# Make 'cluster' categorical so that one colour per category can be attached in `uns`.
boothby_2021_ctrl_human_mouse_fb.obs['cluster'] = boothby_2021_ctrl_human_mouse_fb.obs['cluster'].astype('category')
# One colour per category, in category order; clusters without an entry in the merged
# human/mouse palette fall back to grey ('#bcbcbc').
boothby_2021_ctrl_human_mouse_fb.uns['cluster_colors'] = [dict_colors_human_mouse[i] if i in dict_colors_human_mouse else '#bcbcbc' for  i in boothby_2021_ctrl_human_mouse_fb.obs['cluster'].cat.categories]

In [None]:
# fraction=1 keeps every cell and copy=False modifies the object in place.
# NOTE(review): presumably used to shuffle the cell order so that no species or cluster is
# systematically drawn on top in the UMAP — confirm.
sc.pp.subsample(boothby_2021_ctrl_human_mouse_fb, fraction=1, random_state=0, copy=False)
# UMAP coloured by the original cluster labels, written on top of the points.
sc.pl.umap(boothby_2021_ctrl_human_mouse_fb, color=['cluster'], legend_loc='on data')

In [None]:
sc.pl.umap(boothby_2021_ctrl_human_mouse_fb[boothby_2021_ctrl_human_mouse_fb.obs['organism']=='mouse'], color=['cluster'], legend_loc='on data')
sc.pl.umap(boothby_2021_ctrl_human_mouse_fb[boothby_2021_ctrl_human_mouse_fb.obs['organism']=='human'], color=['cluster'], legend_loc='on data')

In [None]:
# Leiden clustering of the integrated graph at resolution 0.01.
sc.tl.leiden(boothby_2021_ctrl_human_mouse_fb, resolution=0.01)
sc.pl.umap(boothby_2021_ctrl_human_mouse_fb, color=['leiden'], legend_loc='on data')
# Rank genes per Leiden cluster and show the top 30 of each cluster as a tracks plot.
sc.tl.rank_genes_groups(boothby_2021_ctrl_human_mouse_fb, groupby='leiden')
sc.pl.rank_genes_groups_tracksplot(boothby_2021_ctrl_human_mouse_fb, n_genes=30)

### Vorstandlechner

In [None]:
# Objects read from the `*_fb_robust.h5` files.
# NOTE(review): presumably the fibroblast subsets saved by the 4M / 4H notebooks — confirm.
# NOTE(review): the human file uses the prefix 'vors_2021' while every other file in this
# cell uses 'vorstandlechner_2021' — confirm the file name.
vorstandlechner_2021_dir = data_dir + '/Vorstandlechner_2021'
vorstandlechner_2021_ctrl_human_fb = sc.read(f"{vorstandlechner_2021_dir}/vors_2021_ctrl_human_fb_robust.h5")
vorstandlechner_2021_ctrl_mouse_fb = sc.read(f"{vorstandlechner_2021_dir}/vorstandlechner_2021_ctrl_mouse_fb_robust.h5")

# Companion objects used in the next cell as the source of the expression matrix.
vorstandlechner_2021_ctrl_human_fb_raw = sc.read(f"{vorstandlechner_2021_dir}/vorstandlechner_2021_ctrl_human.h5")
vorstandlechner_2021_ctrl_mouse_fb_raw = sc.read(f"{vorstandlechner_2021_dir}/vorstandlechner_2021_ctrl_mouse.h5")

In [None]:
# Replace X of each fibroblast object with the X of the same cells and genes taken from
# the matching `_raw` object.
# NOTE(review): presumably done to recover untransformed counts before the log1p applied
# after concatenation — confirm.
vorstandlechner_2021_ctrl_human_fb.X = vorstandlechner_2021_ctrl_human_fb_raw[vorstandlechner_2021_ctrl_human_fb.obs_names, vorstandlechner_2021_ctrl_human_fb.var_names].X.copy()
vorstandlechner_2021_ctrl_mouse_fb.X = vorstandlechner_2021_ctrl_mouse_fb_raw[vorstandlechner_2021_ctrl_mouse_fb.obs_names, vorstandlechner_2021_ctrl_mouse_fb.var_names].X.copy()

In [None]:
adata_mouse_genes, adata_human_genes = vorstandlechner_2021_ctrl_mouse_fb.var_names, vorstandlechner_2021_ctrl_human_fb.var_names
mouse_selected_genes, human_homolog, human_mouse_gene = [], [], []

# Homologs already assigned. A set keeps the membership test constant-time, whereas
# scanning the growing `human_homolog` list made the loop quadratic.
human_homolog_seen = set()

for mouse_gene in adata_mouse_genes:
    if mouse_gene not in dict_mouse_human:
        continue
    human_gene = dict_mouse_human[mouse_gene]
    # Keep the pair only if the homolog is measured in the human dataset and no earlier
    # mouse gene has claimed it (the first mouse gene wins).
    if human_gene in adata_human_genes and human_gene not in human_homolog_seen:
        human_homolog_seen.add(human_gene)
        mouse_selected_genes.append(mouse_gene)
        human_homolog.append(human_gene)
        human_mouse_gene.append(f'{human_gene} | {mouse_gene}')

In [None]:
# Subset both objects to the homologous genes (same order in both) and take explicit copies,
# so that renaming the genes below acts on real objects instead of AnnData views.
vorstandlechner_2021_ctrl_human_fb, vorstandlechner_2021_ctrl_mouse_fb = vorstandlechner_2021_ctrl_human_fb[:, human_homolog].copy(), vorstandlechner_2021_ctrl_mouse_fb[:, mouse_selected_genes].copy()
# Give both objects the same "HUMAN | mouse" gene names so that concatenation aligns them.
vorstandlechner_2021_ctrl_human_fb.var_names, vorstandlechner_2021_ctrl_mouse_fb.var_names = human_mouse_gene, human_mouse_gene
# The species of each cell is recorded in obs['organism'].
vorstandlechner_2021_ctrl_human_mouse_fb = sc.AnnData.concatenate(vorstandlechner_2021_ctrl_human_fb, vorstandlechner_2021_ctrl_mouse_fb, batch_categories=['human', 'mouse'], batch_key='organism')

In [None]:
sc.pp.log1p(vorstandlechner_2021_ctrl_human_mouse_fb)

In [None]:
# PCA (50 components) on the concatenated matrix, which was log1p-transformed in the previous cell.
sc.pp.pca(vorstandlechner_2021_ctrl_human_mouse_fb, random_state=seed, n_comps=50)
# Harmony integration using obs['Internal sample identifier'] as the batch variable.
sce.pp.harmony_integrate(vorstandlechner_2021_ctrl_human_mouse_fb, key='Internal sample identifier', max_iter_harmony=50)
# Neighbour graph on the 'X_pca_harmony' representation with cosine distance.
# NOTE(review): `0.5 * n ** 0.5 // 4` parses as `(0.5 * sqrt(n)) // 4`, i.e. roughly sqrt(n) / 8,
# and evaluates to 0 when there are fewer than 64 cells — confirm this is the intended size.
sc.pp.neighbors(vorstandlechner_2021_ctrl_human_mouse_fb, use_rep='X_pca_harmony', n_neighbors=int(0.5 * len(vorstandlechner_2021_ctrl_human_mouse_fb) ** 0.5 // 4), metric='cosine')
# triku feature selection; the next cell reads its result from var['highly_variable'].
tk.tl.triku(vorstandlechner_2021_ctrl_human_mouse_fb)

In [None]:
vorstandlechner_2021_ctrl_human_mouse_fb.var_names[vorstandlechner_2021_ctrl_human_mouse_fb.var['highly_variable'] == True]

In [None]:
sc.tl.umap(vorstandlechner_2021_ctrl_human_mouse_fb, min_dist=0.2, random_state=seed)

In [None]:
# Make 'cluster' categorical so that one colour per category can be attached in `uns`.
vorstandlechner_2021_ctrl_human_mouse_fb.obs['cluster'] = vorstandlechner_2021_ctrl_human_mouse_fb.obs['cluster'].astype('category')
# One colour per category, in category order; clusters without an entry in the merged
# human/mouse palette fall back to grey ('#bcbcbc').
vorstandlechner_2021_ctrl_human_mouse_fb.uns['cluster_colors'] = [dict_colors_human_mouse[i] if i in dict_colors_human_mouse else '#bcbcbc' for  i in vorstandlechner_2021_ctrl_human_mouse_fb.obs['cluster'].cat.categories]

In [None]:
# fraction=1 keeps every cell and copy=False modifies the object in place.
# NOTE(review): presumably used to shuffle the cell order so that no species or cluster is
# systematically drawn on top in the UMAP — confirm.
sc.pp.subsample(vorstandlechner_2021_ctrl_human_mouse_fb, fraction=1, random_state=0, copy=False)
# UMAP coloured by the original cluster labels, written on top of the points.
sc.pl.umap(vorstandlechner_2021_ctrl_human_mouse_fb, color=['cluster'], legend_loc='on data')

In [None]:
sc.pl.umap(vorstandlechner_2021_ctrl_human_mouse_fb[vorstandlechner_2021_ctrl_human_mouse_fb.obs['organism']=='mouse'], color=['cluster'], legend_loc='on data')
sc.pl.umap(vorstandlechner_2021_ctrl_human_mouse_fb[vorstandlechner_2021_ctrl_human_mouse_fb.obs['organism']=='human'], color=['cluster'], legend_loc='on data')

In [None]:
# Leiden clustering of the integrated graph at resolution 0.01.
sc.tl.leiden(vorstandlechner_2021_ctrl_human_mouse_fb, resolution=0.01)
sc.pl.umap(vorstandlechner_2021_ctrl_human_mouse_fb, color=['leiden'], legend_loc='on data')
# Rank genes per Leiden cluster and show the top 30 of each cluster as a tracks plot.
sc.tl.rank_genes_groups(vorstandlechner_2021_ctrl_human_mouse_fb, groupby='leiden')
sc.pl.rank_genes_groups_tracksplot(vorstandlechner_2021_ctrl_human_mouse_fb, n_genes=30)

### Conclusions
Datasets do not match. When reading the methods they point out that they use a restricted set of genes to create the UMAP. I don't feel comfortable with that strategy, so I prefer to do a more "unbiased" analysis based on the markers.

# Comparison of gene patterns between populations
In this section we are going to take the gene markers from human and mouse, and do human-mouse, mouse-mouse and human-human comparisons between the sets of markers. From this we expect to find similarities between both parties and establish a homology model of the skin fibroblasts.

To do this analysis we are going to:
* Get the top N markers for each population in mouse and human (also in human-human and mouse-mouse) and compute the Jaccard index between them. 
    * To find the "best" N, we are going to compute the Jaccard matrix for a range of values of N and, for each one, use the Hungarian algorithm to select the one-to-one assignment between clusters with the highest total Jaccard index. That total is stored. A higher total implies a higher overall overlap between markers and, therefore, a more relevant analysis. From the resulting values, we are going to pick the best "rounded" N.
* With the top N we are going to get the Jaccard matrix and plot it with a heatmap and a clustergram.
* From there, we are going to select the best matches and analyze the relationship between human and mouse populations

We do the human-human and mouse-mouse comparisons to get a "control" view. We expect more gene overlap between clusters of the same axis than of different ones. Also, we get a sense of the expected overlaps, to see how this is reflected in mouse.

To make the joining as "easy" as possible, we are going to use the subset of genes that have a mouse-human homology only. This might limit the extent of the analysis but, in general, we find a fair overlap, and this will make the dataset mapping and batch effect correction much easier.

In [None]:
%store -r dict_make_gene_scoring_cluster_robust_human
%store -r dict_make_gene_scoring_cluster_robust_mouse

In [None]:
def get_df_overlap(dict_1, dict_2, N=100, translate=True):
    """Jaccard index between the top-``N`` genes of every pair of clusters.

    Parameters
    ----------
    dict_1, dict_2 : dict
        Map a cluster name to a DataFrame whose index holds gene names. The first
        ``N`` index entries of each DataFrame are taken as that cluster's markers.
    N : int
        Number of leading index entries used per cluster.
    translate : bool
        If True, each gene of ``dict_2`` that is a key of the global
        ``dict_mouse_human`` is replaced by its mapped value before comparing.
        Genes of ``dict_1`` are never translated.

    Returns
    -------
    pandas.DataFrame
        Float frame with the keys of ``dict_1`` as rows and the keys of ``dict_2``
        as columns. A Jaccard index of exactly 1 is stored as 0, which blanks the
        diagonal when a dictionary is compared with itself. A pair of empty marker
        sets is also stored as 0.
    """
    def _marker_set(cluster_df, do_translate):
        genes = cluster_df.index[:N]
        if do_translate:
            return {dict_mouse_human.get(gene, gene) for gene in genes}
        return set(genes)

    # Build each marker set once instead of once per cluster pair.
    sets_1 = {name: _marker_set(cluster_df, False) for name, cluster_df in dict_1.items()}
    sets_2 = {name: _marker_set(cluster_df, translate) for name, cluster_df in dict_2.items()}

    # Float from the start: filling an integer frame with floats forces a dtype upcast,
    # which recent pandas versions warn about.
    df_overlap = pd.DataFrame(0.0, index=list(sets_1), columns=list(sets_2))

    for cluster_name_1, gene_list_1 in sets_1.items():
        for cluster_name_2, gene_list_2 in sets_2.items():
            union_size = len(gene_list_1 | gene_list_2)
            # Two empty sets have no defined Jaccard index; report no overlap.
            overlap = len(gene_list_1 & gene_list_2) / union_size if union_size else 0.0
            if overlap == 1:
                overlap = 0.0

            df_overlap.loc[cluster_name_1, cluster_name_2] = overlap

    return df_overlap

In [None]:
def plot_best_N(dict_1, dict_2, N_min=10, N_max=300, translate=True):
    """Plot, for each N, the total Jaccard index of the best cluster-to-cluster assignment.

    For every ``N`` in ``range(N_min, N_max)`` (``N_max`` excluded) the Jaccard
    matrix is computed with ``get_df_overlap`` and the Hungarian algorithm
    (``Munkres``) selects the one-to-one assignment of rows to columns with the
    largest summed Jaccard index. That sum is plotted against ``N``.

    Parameters
    ----------
    dict_1, dict_2 : dict
        Passed unchanged to ``get_df_overlap``.
    N_min, N_max : int
        Bounds of the scanned range of ``N``.
    translate : bool
        Passed unchanged to ``get_df_overlap``.

    Returns
    -------
    None
        The curve is drawn on the current matplotlib axes.
    """
    list_N, list_total = [], []
    for N in tqdm(range(N_min, N_max)):
        jaccard = get_df_overlap(dict_1, dict_2, N=N, translate=translate).values

        # Munkres minimises cost, so the similarity is turned into 1 - Jaccard.
        # Plain lists are passed because Munkres pads non-square matrices by extending
        # each row as a list, which does not work on NumPy rows.
        indexes = Munkres().compute((1 - jaccard).tolist())
        total = 0
        for row, column in indexes:
            total += jaccard[row][column]

        list_N.append(N)
        list_total.append(total)

    plt.plot(list_N, list_total)
    plt.xlabel('N (top markers per cluster)')
    plt.ylabel('Total Jaccard index of the optimal assignment')

In [None]:
def plot_heatmap(df_jaccard_1_2, dict_colors_1, dict_colors_2, figsize=(12,8), ticklabelsize=15, diag=False):
    """Draw an annotated heatmap of a Jaccard matrix with coloured tick labels.

    Parameters
    ----------
    df_jaccard_1_2 : pandas.DataFrame
        Matrix to plot; values are annotated with two decimals.
    dict_colors_1 : dict
        Colour for each x tick label, i.e. for each column of the matrix.
    dict_colors_2 : dict
        Colour for each y tick label, i.e. for each row of the matrix.
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    ticklabelsize : int
        Font size of the bold tick labels.
    diag : bool
        If True, the diagonal is masked and shown with a grey background. The mask
        is built from the number of rows, so a square matrix is expected.
    """
    fig, ax = plt.subplots(1, 1, figsize=figsize)

    heatmap_options = dict(annot=True, fmt='.2f', ax=ax)
    if diag:
        heatmap_options['mask'] = np.eye(df_jaccard_1_2.shape[0], dtype=bool)
    sns.heatmap(df_jaccard_1_2, **heatmap_options)
    if diag:
        # Masked cells are not drawn, so they show the axes background colour.
        ax.set_facecolor("#989898")

    for label in ax.xaxis.get_ticklabels():
        label.set_color(dict_colors_1[label.get_text()])
    for label in ax.yaxis.get_ticklabels():
        label.set_color(dict_colors_2[label.get_text()])

    ax.set_xticklabels(ax.get_xticklabels(), weight='bold', size=ticklabelsize)
    ax.set_yticklabels(ax.get_yticklabels(), weight='bold', size=ticklabelsize, va='center')
    
def plot_clustermap(df_jaccard_1_2, dict_colors_1, dict_colors_2, figsize=(12,8), ticklabelsize=15, diag=False):
    """Draw a hierarchically clustered heatmap of a Jaccard matrix with coloured tick labels.

    Parameters
    ----------
    df_jaccard_1_2 : pandas.DataFrame
        Matrix to cluster and plot.
    dict_colors_1 : dict
        Colour for each x tick label, i.e. for each column of the matrix.
    dict_colors_2 : dict
        Colour for each y tick label, i.e. for each row of the matrix.
    figsize : tuple
        Figure size, forwarded to ``sns.clustermap``.
    ticklabelsize : int
        Font size of the bold tick labels.
    diag : bool
        If True, the diagonal is masked and shown with a grey background. The mask
        is built from the number of rows, so a square matrix is expected.
    """
    # `figsize` was previously accepted but never forwarded, so it had no effect.
    clustermap_options = dict(figsize=figsize)
    if diag:
        clustermap_options['mask'] = np.eye(df_jaccard_1_2.shape[0], dtype=bool)
    cg = sns.clustermap(df_jaccard_1_2, **clustermap_options)
    ax = cg.ax_heatmap
    if diag:
        # Masked cells are not drawn, so they show the axes background colour.
        ax.set_facecolor("#989898")

    for label in ax.xaxis.get_ticklabels():
        label.set_color(dict_colors_1[label.get_text()])
    for label in ax.yaxis.get_ticklabels():
        label.set_color(dict_colors_2[label.get_text()])

    ax.set_xticklabels(ax.get_xticklabels(), weight='bold', size=ticklabelsize)
    ax.set_yticklabels(ax.get_yticklabels(), weight='bold', size=ticklabelsize, va='center')

In [None]:
def print_common_genes(dict_1, dict_2, cluster_1, cluster_2, translate=True, N=150):
    """Return the sorted genes shared by the top-``N`` entries of two clusters.

    Despite its name, the function prints nothing; it returns the list.

    Parameters
    ----------
    dict_1, dict_2 : dict
        Map a cluster name to a DataFrame whose index holds gene names.
    cluster_1, cluster_2 : str
        Keys of ``dict_1`` and ``dict_2`` to compare.
    translate : bool
        If True, each gene of ``cluster_2`` that is a key of the global
        ``dict_mouse_human`` is replaced by its mapped value before comparing.
    N : int
        Number of leading index entries used per cluster.

    Returns
    -------
    list
        Alphabetically sorted genes present in both sets.
    """
    top_genes_1 = set(dict_1[cluster_1].index[:N])
    top_genes_2 = dict_2[cluster_2].index[:N]

    if translate:
        top_genes_2 = {dict_mouse_human.get(gene, gene) for gene in top_genes_2}
    else:
        top_genes_2 = set(top_genes_2)

    return sorted(top_genes_1.intersection(top_genes_2))

### Human-human comparison

In [None]:
# Both dictionaries already hold human symbols, so the mouse -> human translation is
# switched off, as in the mouse-mouse comparison.
plot_best_N(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_human, N_min=10, N_max=300, translate=False)

In [None]:
# Both dictionaries already hold human symbols, so the mouse -> human translation is
# switched off, as in the mouse-mouse comparison.
df_jaccard_human_human = get_df_overlap(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_human, N=125, translate=False)
# diag=True masks the self-comparisons on the diagonal.
plot_heatmap(df_jaccard_human_human, dict_colors_human, dict_colors_human, figsize=(12,7), ticklabelsize=15, diag=True)

In [None]:
plot_clustermap(df_jaccard_human_human, dict_colors_human, dict_colors_human, figsize=(12,7), ticklabelsize=15, diag=True)

#### Comments on human-human
* It is clear that A3 is a bridge between A1-A4 and A2. A2 and A1 are clearly different clusters, and A3 is more "favorable" to A1 than to A2.
* A2 has a "shared" transcriptomic profile with C and E axis! It is also apparent from UMAPs and connectivity graphs. However, its relatively low similarity profile with C1, C3 and E1 makes it a somewhat "independent" population.
    * A2-C1: APELA, CCDC3, **COL21A1**, **COL3A1**, **COL7A1**, CPXM2, **DKK3**, DUSP4, **LAMC3**, ROBO2, **SEMA5A**, **SPON1**, STMN1, THSD4, **TMEM119**
    * A2-C3: ANTXR1, APELA, C4orf48, **COL3A1**, COL5A1, COL6A3, **COL7A1**, COMP, DKK3, HMGB3, **LAMC3**, LOXL2, MFAP2, PTK7, RAB31, **SEMA5A**, **SPON1**, **TCF4**, **TMEM119**, TNC
    * A2-E1: CMKLR1, **DKK3**, ID1, IGFBP2, LAMC3, MAP2, PDGFRA, RGCC, SEMA5A, **SPON1**, SPRY1, **TCF4**, TNFRSF21
* B axis seems quite independent from the rest of axes, although it is related to T1 and, also, B1 with D1. 
    * It is also clear that B3 acts as a bridge between B1, B2 and B4. B1 and B2 are not related, and B4 is slightly related to B2, but not B1 (although they appear more related in UMAPs and graphs).
    * B1, similar to A2, shows an independent transcriptomic profile with its neighbour B clusters.
* The C, D and E axes, although they have a substructure, are more correlated between them than the A or B axes.
    * C1 and C3 are quite related, and C3 especially is the bridge cluster among the C axis. 
    * D1 and D2 are also quite related.

### Mouse-mouse comparison

In [None]:
plot_best_N(dict_make_gene_scoring_cluster_robust_mouse, dict_make_gene_scoring_cluster_robust_mouse, N_min=10, N_max=300, translate=False)

In [None]:
# Mouse-vs-mouse Jaccard matrix on the top 150 genes per cluster; no translation is applied
# because both dictionaries are mouse dictionaries.
# NOTE(review): N=150 is presumably read off the curve plotted in the previous cell — confirm.
df_jaccard_mouse_mouse = get_df_overlap(dict_make_gene_scoring_cluster_robust_mouse, dict_make_gene_scoring_cluster_robust_mouse, N=150, translate=False)
# diag=True masks the self-comparisons on the diagonal.
plot_heatmap(df_jaccard_mouse_mouse, dict_colors_mouse, dict_colors_mouse, figsize=(12,7), ticklabelsize=15, diag=True)

#### Comments on mouse-mouse
* a1 and a2 clusters are clearly related. 
* a/d cluster shows a little resemblance to the *d* axis, but only for a few genes
    * a/d - d1: *Ptma*, *Tpm2*, *Tuba1c*
    * a/d - d2: *Ptma*, *Tpm2*, *H1fx*
    * a/d - d3: *Ptma*, *Tpm2*, *Bok*, *Hnrnpa1*, *Stmn1*, *Tubb5*
* b axis is interconnected, as shown in the PAGA graph. The most relevant similarities are b1 and b2 with b4; and b2 with b3.
    * b/c cluster is a bridge between b6 and c1, although b6 and c1 have almost no resemblance.
    * Interestingly, b6 does share a good set of genes with a1. Maybe b6 is some sort of bridge between *a* and *b*: *Adgrd1*, *Ccl2*, *Ccl7*, *Clic4*, *Csrnp1*, *Cxcl1*, *Cxcl2*, *Errfi1*, *Fosl1*, *Gfpt2*, *Has1*, *Hk2*, *Ifi205*, *Ifrd1*, *Il6*, *Kdm6b*, *Maff*, *Mt2*, *Myc*, *Nfkb1*, *Nfkbia*, *Nfkbiz*, *Nr4a3*, *Ptgs2*, *Ptx3*, *Tnfaip2*, *Tnfaip3*, *Tnfaip6*, *Txnrd1*, *Ugdh*, *Zc3h12a*
* There is a good internal cohesion within the *d* axis.
* *e1* cluster, although it is slightly related to *b5*, is separate from the rest of the clusters: *Cldn1*, *Cpe*, *Cspg4*, *Csrp1*, *Ebf2*, *Hmgcs2*, *Klf5*, *Lmo4*, *Nr2f2*, *Olfml2a*, *Phlda3*, *Sox9*.

In [None]:
plot_clustermap(df_jaccard_mouse_mouse, dict_colors_mouse, dict_colors_mouse, figsize=(12,7), ticklabelsize=15, diag=True)

### Mouse-human comparison

In [None]:
plot_best_N(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, N_min=10, N_max=300)

In [None]:
# Number of top genes per cluster used for every human-mouse comparison below.
N_human_mouse = 150
# Rows are human clusters (first dictionary), columns are mouse clusters (second dictionary);
# mouse genes are translated with the default translate=True.
df_jaccard_mouse_human = get_df_overlap(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, N=N_human_mouse)
# plot_heatmap colours the x tick labels (columns) with its first palette and the y tick
# labels (rows) with its second, hence mouse colours first and human colours second.
plot_heatmap(df_jaccard_mouse_human, dict_colors_mouse, dict_colors_human, figsize=(12,7), ticklabelsize=15)

In [None]:
plot_clustermap(df_jaccard_mouse_human, dict_colors_mouse, dict_colors_human, figsize=(12,7), ticklabelsize=15)

#### Comments on human-mouse
Representation of genes: MARKER, **MARKER IN HUMAN**, *MARKER IN MOUSE*, ***MARKER IN HUMAN AND MOUSE***  [1, **3**, *6*, ***3***]  -  13

Of note, the selection of genes is based on the [LOC (fb, UMAP)] column from [this table](https://docs.google.com/spreadsheets/d/1lfI6sgjEyg37BGL7VRMfW7KgwGKwX5QrCtnKYk1DXY4/edit?usp=sharing). This table is based on the UMAPs from the 4H and 4M notebooks. The notation is X ~ Y if two clusters or axes are equally *relevant*, and X > Y if X is *more relevant* than Y. **This notation is subjective** but to some extent necessary.

The logic for a marker on the following list to be chosen **is also subjective** and is based in two factors: either (1) the marker is exclusive of the cluster it is said to be represented (e.g. if it says is of cluster b5 and the UMAP show it is only expressed in cluster b5 -- b5 -- or it is expressed in b5 preferentially -- b5 > X --) or (2) it is a marker of that cluster, among others (b5 ~ X or b ~ X > Y).
    
---

* A4
    * A4 - a1:  ADGRD1, FLNC, VASN, **ACE**, *HAS1*, *METRNL*, *TNFAIP6*, *UGDH*  -  [3, **1**, *4*, ***0***]  -  8
    * A4 - a2: ADAMTSL4,  LIMS2, **CLEC3B**, **IGFBP5**, **ISLR**, **PAMR1**, **PTGIS**, **SFRP4**, *ADGRG2*, ***ACKR3***,  ***RAB32***  -  [2, **6**, *1*, ***2***]  -  11
    * A4 - a1 - a2: CD55, CHRDL1, CREB5, DPP4, GAP43, HEG1, PRSS23, SCARA5, TIMP2, **CD248**, **DBN1**, **EMILIN2**, **SEMA3C**, *ALDH1A3*, *BASP1*, *FNDC1*, *GFPT2*, *MFAP5*, *NPR1*, ***AIF1L***, ***IGFBP6***, ***PCOLCE2***, ***SEMA3E***, ***WNT2***  -  [9, **4**, *6*, ***5***]  -  24
    
    
* A1
    * A1 - a2: DBN1, LIMS2, PAMR1, PTGIS, **CD55**, **CHRDL1**, **CLEC3B**, **ECM1**, **ISLR**, **SCARA5**, **SFRP2**, **TIMP2**, **TUBB4A**, *EMILIN2*, *IGFBP6*, *SEMA3C*, *UCHL1* , ***ACKR3***, ***CD248***, ***DPP4***, ***MFAP5***, ***PI16***, ***PRKG2***  -  [4, **9**, *4*, ***6***]  -  23
    
    * A1 - b3: C1QTNF3, FGL2, GPX3, **CD151**, **FBLN1**, **FBLN2**, **PCOLCE**, **SERPINF1**, **SMOC2**, *COL12A1*, *ELN*, *MFAP4*, *MGP*, *PAM*, *PTGIS*, ***ABCC9***, **ANGPTL1**, ***CCN5***, ***CLU***, ***LGR5***, ***LOX***, ***OMD***, ***PDGFRL***, ***PODN***, ***SFRP2***   -  [3, **7**, *6*, ***9***]  -  25
    
    * A1 - b/c: CCDC80, CD34, COL12A1, GALNT15, GPX3, MGST1, **AEBP1**, **ANGPTL1**, **CADM3**, **CPZ**, **CTSK**, **DCN**, **HPGD**, **PCOLCE**, **PDGFRL**, **SEMA3B**, **SERPINF1**, **THBS3**, **TNXB**, *C1QTNF3*, *CYP4B1*, *MMP27*, *PLTP*, ***ABCC9***, ***LGR5***, ***THBS2***  -  [6, **13**, *4*, ***3***]  -  26
    
    * A1 - c1: COL1A1, SPARC, **AEBP1**, **ANGPTL1**, **COL1A2**, **DCN**, **PDGFRL**, **SCARA5**, *C1QTNF3*, *ELN*, *ITGBL1*, *MFAP4*, *PCOLCE2*, ***CPZ***, ***CTSK***, ***CYBRD1***, ***HPGD***, ***MMP27***, ***SEMA3B***,  -  [2, **6**, *5*, ***6***]  -  19
    
    
* A3
    * A3 - b2: CD81, CD9, COL6A1, COL6A2, CP, ENPP3, FBLN2, LOX, MFAP5, SMOC2, **COL14A1**, **SERPINF1**, **SVEP1**, **THBS3**, *ISLR*, *PODN*, *SSC5D*, ***ITIH5***, ***THBS4***  -  [10, **4**, *3*, ***2***]  -  19
    
    * A3 - c1: AEBP1, COL1A1, COL1A2, COL5A1, COPZ2, DCN, HPGD, HTRA1, KDELR3, P4HA2, PDGFRL, RCN3, RSPO1, SERPINH1, SPARC, TSPAN4, **COL3A1**, *ADAMTS2*, *ALDH3A1*, *C1QTNF3*, *CGREF1*, *CTSK*, *CYBRD1*, *ECM2*, *MFAP4*, *PCOLCE2*, *PPIC*, *SEMA3B*,  ***ELN***, ***MMP27***  -  [16, **1**, *11*, ***2***]  -  30
    
    * A3 - b2 - c1: **ANGPTL1**, **BGN**  -  [0, **2**, *0*, ***0***]  -  2

---
    
* A2
    * A2 - c2: COL3A1, EMX2, LAMC3, PTK7, SCARF2, SMIM3, SPON1, TCF4, TGFBI, ZNF608, **COL23A1**, **CYP26B1**, **NKD1**, **NKD2**, **PREX1**, **RSPO1**, *AXIN2*, *CD9*, *COL7A1*, *HS3ST6*, *IGFBP2*, *MAMDC2*, ***AHRR***, ***F13A1***, ***GREM2***, ***KCNK2***, ***LSAMP***, ***PTPRE***,***SPRY1***, ***STC1***, ***TNFRSF19***, ***TWIST2***   -  [10, **6**, *6*, ***10***]  -  32

---

* B1 and B3 vs b6 : Some genes (CCL2, CXCL12, etc.) are also markers of a1.
    * B1 - b6: ARID5A, ARL5B, BCL3, ETS2, GFPT2, PTGS2, TNFAIP2, **CEBPB**, **ERRFI1**, **FOSL1**, **GCH1**, **IER3**, **KDM6B**, **MAFF**, **NR4A3**, **PNRC1**, *CSRNP1*, *LIF*, ***ELL2***, ***IL6***, ***NFKB1***, ***TNFAIP6***  -  [7, **9**, *2*, ***4***]  -  22
    * B3 - b6: ATF3, BTG2, CSF1, FOSB, GADD45B, OSMR, RARRES2, RNF122, **CXCL12**, **EGR1**, **JUNB**, **TMEM176A**, **TMEM176B**, **VCAM1**, **ZFP36**, ***CYP7B1***  -  [8, **0**, *7*, ***1***]  -  16
    * B1 - B3 - b6: ADAMTS1, BTG1, CCNL1, MYC, NNMT, UGCG, **ADAMTS4**, **CCL2**, **CXCL2**, **ICAM1**, **IRF1**, **SOCS3**, **SOD2**, **TNFAIP3**, *NFKBIZ*, ***BIRC3***, ***NFKBIA***   -  [6, **8**, *1*, ***2***]  -  17

---

* B2 and B4 vs b4
    * B2 - b4: C1S, COL4A2, FRMD6, IL11RA, P2RY14, **IL33**, *AVPR1A*, *ID4*, *NDRG2*, *PTCH2*, *SNED1*, ***COL4A4***, ***TMEM176B***  -  [5, **1**, *5*, ***2***]  -  13
    * B4 - b4: BMPER, FZD4, PPL, SERPING1, SRPX, **GPX3**, **GSN**, **NFIB**, **TSHZ2**, *ADAMTSL3*, *ADCYAP1R1*, *F3*, *VIT*, ***GDF10***, ***MGP***, ***NTRK2***  -  [5, **4**, *4*, ***3***]  -  16
    * B2 - B4 - b4: C3, **GGT5**, *APOE*, *IGFBP7*, *NRP1*, *TNFSF13B*, ***C7***, ***CXCL12***, ***CYGB***, ***TMEM176A***  -  [1, **1**, *4*, ***4***]  -  10


* B4 vs b1 and b4
    * B4 - b1: EBF1, GPC3, LGALS3BP, NOVA1, **EFEMP1**, **FMO1**, **ITM2A**, **MGST1**, *CD36*, *FABP4*, ***C6***, ***PPARG***  -  [4, **4**, *2*, ***2***]  -  12
    * B4 - b4: PPL, SERPING1, SRPX, **GSN**, **NFIB**, *ADAMTSL3*, *ADCYAP1R1*, *APOE*, *C3*, *C7*, *F3*, *NRP1*, *TMEM176A*, *TNFSF13B*, *VIT*, ***GDF10***, ***MGP***, ***NTRK2***, **TSHZ2**  -  [3, **2**, *10*, ***3***]  -  18
    * B4 - b1 - b4: BMPER, FZD4, **GGT5**, **GPX3**, *IGFBP7*, ***CXCL12***, ***CYGB***  -  [2, **3**, *1*, ***2***]  -  8

---

* C1
    * C1 - d3: KIAA1217, MDK, MICAL2, STMN1, TNN, TNS3, **ALX4**, **EDNRA**, **EDNRB**, **LAMC3**, **PTCH1**, **ROBO2**, **RUNX2**, **TENM3**, *CD200*, *EGFL6*, *NTRK3*, *PALLD*, *TAGLN*, *TMEM119*, *TPM2*,  ***ADAMTS18***, ***BCL11B***, ***CDH11***, ***CNN2***, ***COL11A1***, ***F2R***, ***KIF26B***, ***MEF2C***  -  [6, **8**, *7*, ***8***]  -  29



* C5 
    * C5 - d1 (& d2): PTMA, *ALX4*, *CRABP1*, *SDC1*, *SPON1*, *TNN*, ***BMP7***, ***FBXO32***, ***IGFBP3***, ***INHBA***, ***MRPS6***, ***PGM2L1***, ***RSPO3***, ***TFAP2A***, ***TRPS1***  -  [1, **0**, *5*, ***9***]  -  15


* C2 and C3 vs c/d
    * C2 - c/d: NR2F1, PTH1R, SRPX, **CPNE5**, **CRABP1**, **MEOX2**, **NCAM1**, **RSPO4**, *CHST15*, *CYP1B1*, *TBXA2R*, *TRIB2*, ***CCK***, ***COCH***, ***FIBIN***, ***FMOD***, ***MKX***, ***PLXDC1***, ***PTGFR***, ***TNMD***  -  [3, **5**, *4*, ***8***]  -  20
    * C3 - c/d: TPM2, **COL8A2**, **MFAP2**, **TRIL**, *EGFLAM*, *NREP*, *RFLNB*, ***COL7A1***, ***F2R***, ***MMP16***, ***RASL11B***  -  [1, **3**, *3*, ***4***]  -  11
    * C2 - C3 - c/d: COL11A1, KIF26B, NRP2, TRPS1, **EMID1**, **TBX15**, *ADAMTS9*, *EDNRA*, *TSHZ3*, ***DKK2***, ***GPM6B***, ***MAFB***, ***TCF4***, ***TENM3***  -  [4, **2**, *3*, ***5***]  -  14

---

* D1 and D2 vs b5    
    * D1 - b5: CNN3, LUM, **ABCA8**, **ETV1**, **PHLDA1**, *ITM2A*, *SPARCL1*, ***APOD***, ***COL8A1***, ***ENTPD2***, ***GPC3***, ***P2RY14***, ***SOX9***  -  [2, **3**, *2*, ***6***]  -  13
    * D2 - b5: BHLHE40, PHLDA3, **CSRP1**, *DDIT4*, ***SBSPON***,  -  [2, **1**, *1*, ***1***]  -  5
    * D1 - D2 - b5: KLF5, **CCL2**, **MEOX2**, **TGFBI**, **TM4SF1**, *MATN2*, ***CLDN1***, ***EBF2***, ***FOXS1***, ***NR2F2***, ***VIT***  -  [1, **4**, *1*, ***5***]  -  11
    
     
* D1 and D2 vs e1  [Quite a few d2 genes also show up in e1 and b1]
    * D1 - e1: CTNNAL1, PTCH1, *EGR3*, *HMGA1*, ***ETV4***, ***SOX9***, ***TIAM1***  -  [2, **0**, *2*, ***3***]  -  7
    * D2 - e1: CCDC3, MTSS1, **CSRP1**, **LMO7**, **SBSPON**, *CAVIN2*, *FRMD4B*, *PHLDA3*, *SYNE2*, *TPD52*, ***BNC2***, ***CAV1***, ***CAV2***, ***EFNB1***, ***ITGA6***, ***ITGB4***, ***KRT19***, ***PALMD***, ***SLC2A1***  -  [2, **3**, *5*, ***9***]  -  19
    * D1 - D2 - e1: EZR, **NR2F2**, *KLF5*, ***CLDN1***, ***EBF2***, ***NDRG2***, ***TENM2***   -  [1, **1**, *1*, ***4***]  -  7

##### A4

In [None]:
# Genes shared by human A4 with mouse a1 and with mouse a2 (top N_human_mouse genes each).
A4_a1 = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'A4', 'a1', translate=True, N=N_human_mouse)
A4_a2 = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'A4', 'a2', translate=True, N=N_human_mouse)
# Split into genes found in both comparisons and genes exclusive to each one.
A4_a1_a2 = [i for i in A4_a1 if i in A4_a2]
A4_a1 = [i for i in A4_a1 if i not in A4_a1_a2]
A4_a2 = [i for i in A4_a2 if i not in A4_a1_a2]

In [None]:
print(', '.join(A4_a1))

In [None]:
print(', '.join(A4_a2))

In [None]:
print(', '.join(A4_a1_a2))

##### A1

In [None]:
print(', '.join(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'A1', 'a2', translate=True, N=N_human_mouse)))

In [None]:
print(', '.join(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'A1', 'b3', translate=True, N=N_human_mouse)))

In [None]:
print(', '.join(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'A1', 'b/c', translate=True, N=N_human_mouse)))

In [None]:
print(', '.join(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'A1', 'c1', translate=True, N=N_human_mouse)))

##### A3

In [None]:
# Genes shared by human A3 with mouse b2 and with mouse c1 (top N_human_mouse genes each).
A3_b2 = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'A3', 'b2', translate=True, N=N_human_mouse)
A3_c1 = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'A3', 'c1', translate=True, N=N_human_mouse)
# Split into genes found in both comparisons and genes exclusive to each one.
A3_b2_c1 = [i for i in A3_b2 if i in A3_c1]
A3_b2 = [i for i in A3_b2 if i not in A3_b2_c1]
A3_c1 = [i for i in A3_c1 if i not in A3_b2_c1]

In [None]:
print(', '.join(A3_b2))

In [None]:
print(', '.join(A3_c1))

In [None]:
print(', '.join(A3_b2_c1))

##### A2

In [None]:
print(', '.join(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'A2', 'c2', translate=True, N=N_human_mouse)))

##### B1 / B3

In [None]:
# Genes shared by mouse b6 with human B1 and with human B3 (top N_human_mouse genes each).
B1_b6 = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'B1', 'b6', translate=True, N=N_human_mouse)
B3_b6 = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'B3', 'b6', translate=True, N=N_human_mouse)
# Split into genes found in both comparisons and genes exclusive to each one.
B1_B3_b6 = [i for i in B3_b6 if i in B1_b6]
B1_b6 = [i for i in B1_b6 if i not in B1_B3_b6]
B3_b6 = [i for i in B3_b6 if i not in B1_B3_b6]

In [None]:
print(', '.join(B1_b6))

In [None]:
print(', '.join(B3_b6))

In [None]:
print(', '.join(B1_B3_b6))

##### B2 vs B4

In [None]:
# Genes shared by mouse b4 with human B2 and with human B4 (top N_human_mouse genes each).
B2_b4 = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'B2', 'b4', translate=True, N=N_human_mouse)
B4_b4 = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'B4', 'b4', translate=True, N=N_human_mouse)
# Split into genes found in both comparisons and genes exclusive to each one.
B2_B4_b4 = [i for i in B2_b4 if i in B4_b4]
B2_b4 = [i for i in B2_b4 if i not in B2_B4_b4]
B4_b4 = [i for i in B4_b4 if i not in B2_B4_b4]

In [None]:
print(', '.join(B2_b4))

In [None]:
print(', '.join(B4_b4))

In [None]:
print(', '.join(B2_B4_b4))

##### B4

In [None]:
# Genes shared by human B4 with mouse b1 and with mouse b4 (top N_human_mouse genes each).
# NOTE: `B4_b4` is rebound here; after this cell it no longer holds the list produced in
# the "B2 vs B4" section, so the print cells of that section give a different result if
# they are re-run afterwards.
B4_b1 = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'B4', 'b1', translate=True, N=N_human_mouse)
B4_b4 = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'B4', 'b4', translate=True, N=N_human_mouse)
# Split into genes found in both comparisons and genes exclusive to each one.
B4_b1_b4 = [i for i in B4_b1 if i in B4_b4]
B4_b1 = [i for i in B4_b1 if i not in B4_b1_b4]
B4_b4 = [i for i in B4_b4 if i not in B4_b1_b4]

In [None]:
print(', '.join(B4_b1))

In [None]:
print(', '.join(B4_b4))

In [None]:
print(', '.join(B4_b1_b4))

##### C axis

In [None]:
print(', '.join(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'C1', 'd3', translate=True, N=N_human_mouse)))

In [None]:
print(', '.join(print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'C5', 'd1', translate=True, N=N_human_mouse)))

In [None]:
# Genes shared by mouse c/d with human C2 and with human C3 (top N_human_mouse genes each).
C2_cd = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'C2', 'c/d', translate=True, N=N_human_mouse)
C3_cd = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'C3', 'c/d', translate=True, N=N_human_mouse)
# Split into genes found in both comparisons and genes exclusive to each one.
C2_C3_cd = [i for i in C2_cd if i in C3_cd]
C2_cd = [i for i in C2_cd if i not in C2_C3_cd]
C3_cd = [i for i in C3_cd if i not in C2_C3_cd]

In [None]:
print(', '.join(C2_cd))

In [None]:
print(', '.join(C3_cd))

In [None]:
print(', '.join(C2_C3_cd))

##### D vs b5

In [None]:
# Genes shared by mouse b5 with human D1 and with human D2 (top N_human_mouse genes each).
D1_b5 = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'D1', 'b5', translate=True, N=N_human_mouse)
D2_b5 = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'D2', 'b5', translate=True, N=N_human_mouse)
# Split into genes found in both comparisons and genes exclusive to each one.
D1_D2_b5 = [i for i in D1_b5 if i in D2_b5]
D1_b5 = [i for i in D1_b5 if i not in D1_D2_b5]
D2_b5 = [i for i in D2_b5 if i not in D1_D2_b5]

In [None]:
print(', '.join(D1_b5))

In [None]:
print(', '.join(D2_b5))

In [None]:
print(', '.join(D1_D2_b5))

##### D vs e1

In [None]:
# Genes shared by mouse e1 with human D1 and with human D2 (top N_human_mouse genes each).
D1_e1 = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'D1', 'e1', translate=True, N=N_human_mouse)
D2_e1 = print_common_genes(dict_make_gene_scoring_cluster_robust_human, dict_make_gene_scoring_cluster_robust_mouse, 
                   'D2', 'e1', translate=True, N=N_human_mouse)
# Split into genes found in both comparisons and genes exclusive to each one.
D1_D2_e1 = [i for i in D1_e1 if i in D2_e1]
D1_e1 = [i for i in D1_e1 if i not in D1_D2_e1]
D2_e1 = [i for i in D2_e1 if i not in D1_D2_e1]

In [None]:
print(', '.join(D1_e1))

In [None]:
print(', '.join(D2_e1))

In [None]:
print(', '.join(D1_D2_e1))