# Ligand receptor analysis

In this analysis we are going to use CellPhoneDB (>= v3) to analyse interactions (1) between fibroblast subpopulations and (2) between fibroblast subpopulations and the rest of the populations. To save time, we are going to run (2) and then extract (1) from it. We are going to run the analysis on each dataset, and then combine the interactions to get a general picture of the interactions that occur across datasets.

**YOU NEED TO RUN NOTEBOOK 4H FIRST**

## imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import itertools as itl
import os
import shutil

import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import scipy.sparse as spr
import seaborn as sns
import triku as tk
from tqdm.notebook import tqdm

In [None]:
# local imports and imports from other notebooks
from cellassign import assign_cats
from fb_functions import make_gene_scoring_with_expr, plot_score_graph, plot_UMAPS_gene, plot_adata_cluster_properties
%store -r dict_colors
%store -r seed
%store -r magma
%store -r data_dir

In [None]:
%store -r dict_make_gene_scoring_robust
%store -r dict_make_gene_scoring_axis_robust

In [None]:
# Figure resolution used by matplotlib for every plot in this notebook.
mpl.rcParams['figure.dpi'] = 120
# Show floats in pandas output with a thousands separator and two decimals.
pd.options.display.float_format = "{:,.2f}".format

In [None]:
def join_fbs_adatas(adata_full, adata_fb):
    """Annotate ``adata_full.obs['full_cell_type']`` in place.

    Every cell keeps its ``assigned_cats`` label (converted to string), except the
    cells that are also present in ``adata_fb``: those are relabelled
    ``fibro_<cluster>`` using the ``cluster`` column of ``adata_fb.obs``.
    The new column is stored as a categorical; nothing is returned.
    """
    # Cells present in both objects (np.intersect1d returns them sorted and unique).
    shared_cells = np.intersect1d(adata_fb.obs_names, adata_full.obs_names)
    fibro_labels = [f'fibro_{cluster}' for cluster in adata_fb[shared_cells].obs['cluster']]

    labels = adata_full.obs['assigned_cats'].copy().astype(str)
    labels[shared_cells] = fibro_labels
    adata_full.obs['full_cell_type'] = labels.astype('category')

## Anndata loading

In [None]:
# Parallel lists filled by the loading cells below: position i of each list refers to the same dataset
# (fibroblast-only object and full object, respectively).
list_adatas_fb, list_adatas_full = [], []

In [None]:
# Ahlers 2022: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
ahlers_2022_dir = data_dir + '/ahlers_2022'
ahlers_2022_young_human_fb = sc.read(ahlers_2022_dir + '/ahlers_2022_young_human_fb_robust.h5')
ahlers_2022_young_human = sc.read(ahlers_2022_dir + '/ahlers_2022_young_human_processed.h5')
join_fbs_adatas(ahlers_2022_young_human, ahlers_2022_young_human_fb)
list_adatas_fb.append(ahlers_2022_young_human_fb); list_adatas_full.append(ahlers_2022_young_human)

In [None]:
# Boothby 2021: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
boothby_2021_dir = data_dir + '/boothby_2021'
boothby_2021_ctrl_human_fb = sc.read(boothby_2021_dir + '/boothby_2021_ctrl_human_fb_robust.h5')
boothby_2021_ctrl_human = sc.read(boothby_2021_dir + '/boothby_2021_ctrl_human_processed.h5')
join_fbs_adatas(boothby_2021_ctrl_human, boothby_2021_ctrl_human_fb)
list_adatas_fb.append(boothby_2021_ctrl_human_fb); list_adatas_full.append(boothby_2021_ctrl_human)

In [None]:
# Deng 2021: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
deng_2021_dir = data_dir + '/deng_2021'
deng_2021_scar_fb = sc.read(deng_2021_dir + '/deng_2021_scar_fb_robust.h5')
deng_2021_scar = sc.read(deng_2021_dir + '/deng_2021_scar_processed.h5')
join_fbs_adatas(deng_2021_scar, deng_2021_scar_fb)
list_adatas_fb.append(deng_2021_scar_fb); list_adatas_full.append(deng_2021_scar)

In [None]:
# Gao 2021: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
gao_2021_dir = data_dir + '/gao_2021'
gao_2021_ctrl_human_fb = sc.read(gao_2021_dir + '/gao_2021_ctrl_human_fb_robust.h5')
gao_2021_ctrl_human = sc.read(gao_2021_dir + '/gao_2021_ctrl_human_processed.h5')
join_fbs_adatas(gao_2021_ctrl_human, gao_2021_ctrl_human_fb)
list_adatas_fb.append(gao_2021_ctrl_human_fb); list_adatas_full.append(gao_2021_ctrl_human)

In [None]:
# Gaydosik 2020: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
gaydosik_2020_dir = data_dir + '/gaydosik_2020'
gaydosik_2020_ctrl_human_fb = sc.read(gaydosik_2020_dir + '/gaydosik_2020_ctrl_human_fb_robust.h5')
gaydosik_2020_ctrl_human = sc.read(gaydosik_2020_dir + '/gaydosik_2020_ctrl_human_processed.h5')
join_fbs_adatas(gaydosik_2020_ctrl_human, gaydosik_2020_ctrl_human_fb)
list_adatas_fb.append(gaydosik_2020_ctrl_human_fb); list_adatas_full.append(gaydosik_2020_ctrl_human)

In [None]:
# Gur 2022: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
gur_2022_dir = data_dir + '/gur_2022'
gur_2022_ctrl_human_fb = sc.read(gur_2022_dir + '/gur_2022_ctrl_human_fb_robust.h5')
gur_2022_ctrl_human = sc.read(gur_2022_dir + '/gur_2022_ctrl_human_processed.h5')
join_fbs_adatas(gur_2022_ctrl_human, gur_2022_ctrl_human_fb)
list_adatas_fb.append(gur_2022_ctrl_human_fb); list_adatas_full.append(gur_2022_ctrl_human)

In [None]:
# He 2020: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
he_2020_dir = data_dir + '/He_2020'
he_2020_ctrl_human_fb = sc.read(he_2020_dir + '/he_2020_ctrl_human_fb_robust.h5')
he_2020_ctrl_human = sc.read(he_2020_dir + '/he_2020_ctrl_human_processed.h5')
join_fbs_adatas(he_2020_ctrl_human, he_2020_ctrl_human_fb)
list_adatas_fb.append(he_2020_ctrl_human_fb); list_adatas_full.append(he_2020_ctrl_human)

In [None]:
# Hughes 2020: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
hughes_2020_dir = data_dir + '/hughes_2020'
hughes_2020_ctrl_human_fb = sc.read(hughes_2020_dir + '/hughes_2020_ctrl_human_fb_robust.h5')
hughes_2020_ctrl_human = sc.read(hughes_2020_dir + '/hughes_2020_ctrl_human_processed.h5')
join_fbs_adatas(hughes_2020_ctrl_human, hughes_2020_ctrl_human_fb)
list_adatas_fb.append(hughes_2020_ctrl_human_fb); list_adatas_full.append(hughes_2020_ctrl_human)

In [None]:
# Kim 2020: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
kim_2020_dir = data_dir + '/Kim_2020'
kim_2020_ctrl_human_fb = sc.read(kim_2020_dir + '/kim_2020_ctrl_human_fb_robust.h5')
kim_2020_ctrl_human = sc.read(kim_2020_dir + '/kim_2020_ctrl_human_processed.h5')
join_fbs_adatas(kim_2020_ctrl_human, kim_2020_ctrl_human_fb)
list_adatas_fb.append(kim_2020_ctrl_human_fb); list_adatas_full.append(kim_2020_ctrl_human)

In [None]:
# Liu 2021: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
liu_2021_dir = data_dir + '/liu_2021'
liu_2021_ctrl_human_fb = sc.read(liu_2021_dir + '/liu_2021_ctrl_human_fb_robust.h5')
liu_2021_ctrl_human = sc.read(liu_2021_dir + '/liu_2021_ctrl_human_processed.h5')
join_fbs_adatas(liu_2021_ctrl_human, liu_2021_ctrl_human_fb)
list_adatas_fb.append(liu_2021_ctrl_human_fb); list_adatas_full.append(liu_2021_ctrl_human)

In [None]:
# Mariottoni 2021: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
mariottoni_2021_dir = data_dir + '/mariottoni_2021'
mariottoni_2021_ctrl_human_fb = sc.read(mariottoni_2021_dir + '/mariottoni_2021_ctrl_human_fb_robust.h5')
mariottoni_2021_ctrl_human = sc.read(mariottoni_2021_dir + '/mariottoni_2021_ctrl_human_processed.h5')
join_fbs_adatas(mariottoni_2021_ctrl_human, mariottoni_2021_ctrl_human_fb)
list_adatas_fb.append(mariottoni_2021_ctrl_human_fb); list_adatas_full.append(mariottoni_2021_ctrl_human)

In [None]:
# Mirizio 2020: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
mirizio_2020_dir = data_dir + '/mirizio_2020'
mirizio_2020_scl_human_fb = sc.read(mirizio_2020_dir + '/mirizio_2020_scl_human_fb_robust.h5')
mirizio_2020_scl_human = sc.read(mirizio_2020_dir + '/mirizio_2020_scl_human_processed.h5')
join_fbs_adatas(mirizio_2020_scl_human, mirizio_2020_scl_human_fb)
list_adatas_fb.append(mirizio_2020_scl_human_fb); list_adatas_full.append(mirizio_2020_scl_human)

In [None]:
# Reynolds 2021: only the fibroblast object is loaded, so it is registered in BOTH lists and its
# 'full_cell_type' column is built directly here instead of through join_fbs_adatas.
reynolds_2021_dir = data_dir + '/reynolds_2021'
reynolds_2021_ctrl_human_fb = sc.read(reynolds_2021_dir + '/reynolds_2021_ctrl_human_fb_robust.h5')
# The dataset with the rest of the cells is not available here. It could be used, but since the dataset is not of good quality it is not fully relevant.
reynolds_2021_ctrl_human_fb.obs['full_cell_type'] = [f'fibro_{i}' for i in reynolds_2021_ctrl_human_fb.obs['cluster']]
reynolds_2021_ctrl_human_fb.obs['full_cell_type'] = reynolds_2021_ctrl_human_fb.obs['full_cell_type'].astype('category')
list_adatas_fb.append(reynolds_2021_ctrl_human_fb); list_adatas_full.append(reynolds_2021_ctrl_human_fb)

In [None]:
# Rindler 2021: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
rindler_2021_dir = data_dir + '/rindler_2021'
rindler_2021_ctrl_human_fb = sc.read(rindler_2021_dir + '/rindler_2021_ctrl_human_fb_robust.h5')
rindler_2021_ctrl_human = sc.read(rindler_2021_dir + '/rindler_2021_ctrl_human_processed.h5')
join_fbs_adatas(rindler_2021_ctrl_human, rindler_2021_ctrl_human_fb)
list_adatas_fb.append(rindler_2021_ctrl_human_fb); list_adatas_full.append(rindler_2021_ctrl_human)

In [None]:
# Sole-Boldo 2020: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
sole_2020_dir = data_dir + '/Sole-Boldo_2020'
sole_2020_young_human_fb = sc.read(sole_2020_dir + '/sole_2020_young_human_fb_robust.h5')
sole_2020_young_human = sc.read(sole_2020_dir + '/sole_2020_young_human_processed.h5')
join_fbs_adatas(sole_2020_young_human, sole_2020_young_human_fb)
list_adatas_fb.append(sole_2020_young_human_fb); list_adatas_full.append(sole_2020_young_human)

In [None]:
# Tabib 2018: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
tabib_2018_dir = data_dir + '/Tabib_2018'
tabib_2018_ctrl_human_fb = sc.read(tabib_2018_dir + '/tabib_2018_ctrl_human_fb_robust.h5')
tabib_2018_ctrl_human = sc.read(tabib_2018_dir + '/tabib_2018_ctrl_human_processed.h5')
join_fbs_adatas(tabib_2018_ctrl_human, tabib_2018_ctrl_human_fb)
list_adatas_fb.append(tabib_2018_ctrl_human_fb); list_adatas_full.append(tabib_2018_ctrl_human)

In [None]:
# Tabib 2021: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
tabib_2021_dir = data_dir + '/Tabib_2021'
tabib_2021_ctrl_human_fb = sc.read(tabib_2021_dir + '/tabib_2021_ctrl_human_fb_robust.h5')
tabib_2021_ctrl_human = sc.read(tabib_2021_dir + '/tabib_2021_ctrl_human_processed.h5')
join_fbs_adatas(tabib_2021_ctrl_human, tabib_2021_ctrl_human_fb)
list_adatas_fb.append(tabib_2021_ctrl_human_fb); list_adatas_full.append(tabib_2021_ctrl_human)

In [None]:
# Theocharidis 2020: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
# The directory is written without a trailing slash, like every other dataset, so the file paths
# below do not contain a doubled '//'.
theo_2020_dir = data_dir + '/Theocharidis_2020'
theo_2020_ctrl_human_dm_fb = sc.read(theo_2020_dir + '/theo_2020_ctrl_human_dm_fb_robust.h5')
theo_2020_ctrl_human_dm = sc.read(theo_2020_dir + '/theo_2020_ctrl_human_dm_processed.h5')
join_fbs_adatas(theo_2020_ctrl_human_dm, theo_2020_ctrl_human_dm_fb)
list_adatas_fb.append(theo_2020_ctrl_human_dm_fb); list_adatas_full.append(theo_2020_ctrl_human_dm)

In [None]:
# Theocharidis 2021: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
# The directory is written without a trailing slash, like every other dataset, so the file paths
# below do not contain a doubled '//'.
theo_2021_dir = data_dir + '/Theocharidis_2021'
theo_2021_ctrl_human_fb = sc.read(theo_2021_dir + '/theo_2021_ctrl_human_fb_robust.h5')
theo_2021_ctrl_human = sc.read(theo_2021_dir + '/theo_2021_ctrl_human_processed.h5')
join_fbs_adatas(theo_2021_ctrl_human, theo_2021_ctrl_human_fb)
list_adatas_fb.append(theo_2021_ctrl_human_fb); list_adatas_full.append(theo_2021_ctrl_human)

In [None]:
# Vorstandlechner 2020: read the fibroblast-only and the full object, write 'full_cell_type' into the full
# object (fibroblasts relabelled by cluster) and register both objects.
vors_2020_dir = data_dir + '/Vorstandlechner_2020'
vors_2020_ctrl_human_fb = sc.read(vors_2020_dir + '/vors_2020_ctrl_human_fb_robust.h5')
vors_2020_ctrl_human = sc.read(vors_2020_dir + '/vors_2020_ctrl_human_processed.h5')
join_fbs_adatas(vors_2020_ctrl_human, vors_2020_ctrl_human_fb)
list_adatas_fb.append(vors_2020_ctrl_human_fb); list_adatas_full.append(vors_2020_ctrl_human)

In [None]:
# Vorstandlechner 2021: read the fibroblast-only and the full object, write 'full_cell_type' into the full
# object (fibroblasts relabelled by cluster) and register both objects.
vors_2021_dir = data_dir + '/Vorstandlechner_2021'
vors_2021_ctrl_human_fb = sc.read(vors_2021_dir + '/vors_2021_ctrl_human_fb_robust.h5')
vors_2021_ctrl_human = sc.read(vors_2021_dir + '/vors_2021_ctrl_human_processed.h5')
join_fbs_adatas(vors_2021_ctrl_human, vors_2021_ctrl_human_fb)
list_adatas_fb.append(vors_2021_ctrl_human_fb); list_adatas_full.append(vors_2021_ctrl_human)

In [None]:
# Xu 2021: read the fibroblast-only and the full object, write 'full_cell_type' into the full object
# (fibroblasts relabelled by cluster) and register both objects.
xu_2021_dir = data_dir + '/xu_2021'
xu_2021_ctrl_human_fb = sc.read(xu_2021_dir + '/xu_2021_ctrl_human_fb_robust.h5')
xu_2021_ctrl_human = sc.read(xu_2021_dir + '/xu_2021_ctrl_human_processed.h5')
join_fbs_adatas(xu_2021_ctrl_human, xu_2021_ctrl_human_fb)
list_adatas_fb.append(xu_2021_ctrl_human_fb); list_adatas_full.append(xu_2021_ctrl_human)

In [None]:
# One display name per dataset, "<Author> <Year> human", taken from the first cell's 'Author' and 'Year'
# values. The same names are later lower-cased (spaces -> '_') to build the CellPhoneDB output folders.
list_names = [str(adata.obs['Author'].iloc[0]) + ' ' + str(int(adata.obs['Year'].iloc[0])) + ' human' for adata in list_adatas_fb ]

In [None]:
list_names

# Creating a custom db
We are going to create a custom database with additional LR interactions from other sources.

In [None]:
# Start from a clean database folder. ignore_errors=True makes the cell work on a first run, when the
# folder does not exist yet (plain rmtree raises FileNotFoundError in that case).
shutil.rmtree('results/CellPhoneDB/dbsample', ignore_errors=True)

In [None]:
# Generate the stock CellPhoneDB database files into dbsample/. The cells below read
# protein_generated.csv, gene_generated.csv and interaction_input.csv from that folder.
!cellphonedb database generate  --result-path results/CellPhoneDB/dbsample

In [None]:
# Create from biomart a dataset of mouse info with columns "UniProtKB Gene Name symbol" and "UniProtKB Gene Name ID"

In [None]:
# Tables from the generated database folder, extended by the cells below.
df_proteins = pd.read_csv('results/CellPhoneDB/dbsample/protein_generated.csv')
df_genes = pd.read_csv('results/CellPhoneDB/dbsample/gene_generated.csv')
df_interactions = pd.read_csv('results/CellPhoneDB/dbsample/interaction_input.csv')
# Tab-separated BioMart export; it must contain a 'UniProtKB Gene Name symbol' column (used below).
df_biomart = pd.read_csv('data/mart_export.txt', sep='\t')

In [None]:
# Build gene_input_custom.csv: the generated gene table plus the genes from the BioMart export
# (mouse genes, per the notes in this notebook).
# df_biomart is modified in place on purpose (de-duplicated, re-indexed, renamed): the protein cell
# that follows expects the renamed columns. As a consequence this cell must be run exactly once per
# load of df_biomart.
df_biomart.drop_duplicates(subset='UniProtKB Gene Name symbol', keep='last', ignore_index=True, inplace=True)
# Continue numbering after the existing genes so the appended rows get fresh index labels.
df_biomart.index += len(df_genes)
# Assumes the export has exactly these four columns in this order — TODO confirm against mart_export.txt.
df_biomart.columns = ['gene_name', 'hgnc_symbol', 'uniprot', 'ensembl']
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result on every pandas version.
df_genes = pd.concat([df_genes, df_biomart])

df_genes.to_csv('results/CellPhoneDB/dbsample/gene_input_custom.csv', index=False)

In [None]:
# Build protein_input_custom.csv: the generated protein table plus one "<GENE>_MOUSE" protein per
# UniProt accession found in the BioMart export.
# Requires df_biomart with the column names assigned in the gene cell above ('gene_name', 'uniprot').
df_biomart.drop_duplicates(subset='uniprot', keep='last', ignore_index=True, inplace=True)
# Continue numbering after the existing proteins so index labels equal row positions after the concat.
df_biomart.index += len(df_proteins)
# .copy() makes the subset an independent frame, so the edits below do not trigger SettingWithCopyWarning.
df_biomart = df_biomart[['gene_name', 'uniprot']].copy()
df_biomart.columns = ['protein_name', 'uniprot']
df_biomart['protein_name'] = [str(i).upper() + '_MOUSE' for i in df_biomart['protein_name']]
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result on every pandas version.
df_proteins = pd.concat([df_proteins, df_biomart])

# Set transmembrane, peripheral and secreted to True for the added rows.
# NOTE(review): the columns are addressed by position (2, 3, 4) and the rows by treating index labels
# as positions; both hold only if df_proteins kept the default 0..n-1 index and the generated column
# order — confirm against protein_generated.csv.
df_proteins.iloc[df_biomart.index, [2, 3, 4]] = True

df_proteins.to_csv('results/CellPhoneDB/dbsample/protein_input_custom.csv', index=False)

In [None]:
# Download mouse and human pairs from http://tcm.zju.edu.cn/celltalkdb/download.php

In [None]:
# Lookup protein_name -> uniprot accession over the extended protein table
# (if a protein_name is repeated, the last row wins).
dict_proteins_uniprot = dict(zip(df_proteins['protein_name'].values, df_proteins['uniprot'].values))

In [None]:
# Tab-separated ligand-receptor pair tables (CellTalkDB downloads, see the note above); the cells
# below use their 'ligand_gene_symbol' and 'receptor_gene_symbol' columns.
human_new_pairs_celltalkdb = pd.read_csv('data/human_lr_pair.txt', sep='\t')
mouse_new_pairs_celltalkdb = pd.read_csv('data/mouse_lr_pair.txt', sep='\t')

In [None]:
# Add the human CellTalkDB pairs to the interaction table.
# Gene symbols are turned into "<SYMBOL>_HUMAN" names and looked up in dict_proteins_uniprot; pairs for
# which either partner has no entry are dropped.
# .copy() makes the subset an independent frame, so the edits below do not trigger SettingWithCopyWarning.
human_new_pairs_celltalkdb = human_new_pairs_celltalkdb[['ligand_gene_symbol', 'receptor_gene_symbol']].copy()
human_new_pairs_celltalkdb.columns = ['protein_name_a', 'protein_name_b']
human_new_pairs_celltalkdb += '_HUMAN'
# Series.map leaves NaN where the name is missing from the dict (np.NaN no longer exists in NumPy 2.0).
human_new_pairs_celltalkdb['partner_a'] = human_new_pairs_celltalkdb['protein_name_a'].map(dict_proteins_uniprot)
human_new_pairs_celltalkdb['partner_b'] = human_new_pairs_celltalkdb['protein_name_b'].map(dict_proteins_uniprot)
human_new_pairs_celltalkdb = human_new_pairs_celltalkdb.dropna().reset_index(drop=True)
# Continue numbering after the existing interactions so the appended rows get fresh index labels.
human_new_pairs_celltalkdb.index += len(df_interactions)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result on every pandas version.
df_interactions = pd.concat([df_interactions, human_new_pairs_celltalkdb])

In [None]:
# Add the mouse CellTalkDB pairs to the interaction table.
# Gene symbols are upper-cased and turned into "<SYMBOL>_MOUSE" names (the naming used for the proteins
# added from BioMart) and looked up in dict_proteins_uniprot; pairs for which either partner has no
# entry are dropped.
mouse_new_pairs_celltalkdb = mouse_new_pairs_celltalkdb[['ligand_gene_symbol', 'receptor_gene_symbol']]
mouse_new_pairs_celltalkdb.columns = ['protein_name_a', 'protein_name_b']
mouse_new_pairs_celltalkdb = mouse_new_pairs_celltalkdb.apply(lambda x: x.astype(str).str.upper()) + '_MOUSE'
# Series.map leaves NaN where the name is missing from the dict (np.NaN no longer exists in NumPy 2.0).
mouse_new_pairs_celltalkdb['partner_a'] = mouse_new_pairs_celltalkdb['protein_name_a'].map(dict_proteins_uniprot)
mouse_new_pairs_celltalkdb['partner_b'] = mouse_new_pairs_celltalkdb['protein_name_b'].map(dict_proteins_uniprot)
mouse_new_pairs_celltalkdb = mouse_new_pairs_celltalkdb.dropna().reset_index(drop=True)
# Continue numbering after the existing interactions so the appended rows get fresh index labels.
mouse_new_pairs_celltalkdb.index += len(df_interactions)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result on every pandas version.
df_interactions = pd.concat([df_interactions, mouse_new_pairs_celltalkdb])

In [None]:
# Every row is marked 'curated', the stock interactions as well as the CellTalkDB additions.
df_interactions['annotation_strategy'] = 'curated'
# NOTE(review): unlike the gene and protein exports, this call also writes the index as an unnamed
# first column — confirm that this is intended.
df_interactions.to_csv('results/CellPhoneDB/dbsample/interaction_input_custom.csv')

In [None]:
!cellphonedb database generate  --result-path results/CellPhoneDB/dbsample --user-interactions-only 
--user-gene results/CellPhoneDB/dbsample/gene_input_custom.csv --user-protein results/CellPhoneDB/dbsample/protein_input_custom.csv 
--user-interactions results/CellPhoneDB/dbsample/interaction_input_custom.csv

In [None]:
# Give the freshly generated database a fixed name so the analysis cell can refer to it.
db_dir = 'results/CellPhoneDB/dbsample'
custom_db_path = os.path.join(db_dir, 'customdb.db')

# Only real '.db' files, and never a customdb.db left over from an earlier run.
list_files = [i for i in os.listdir(db_dir) if i.endswith('.db') and i != 'customdb.db']

if list_files:
    # os.listdir returns entries in arbitrary order, so choose the most recently written file explicitly.
    newest_db = max(list_files, key=lambda i: os.path.getmtime(os.path.join(db_dir, i)))
    # os.replace overwrites an existing customdb.db on every platform.
    os.replace(os.path.join(db_dir, newest_db), custom_db_path)
elif not os.path.exists(custom_db_path):
    raise FileNotFoundError(f'No .db file found in {db_dir}; run the database generation cell first.')

# Running CellPhoneDB on each dataset

In [None]:
# Root folder for all CellPhoneDB inputs and outputs (one sub-folder per dataset is created later).
cellphone_dir = 'results/CellPhoneDB/'
os.makedirs(cellphone_dir, exist_ok=True)

In [None]:
for adata, name in zip(list_adatas_full, list_names):
    print(name)
    name_str = name.lower().replace(' ', '_')
    os.makedirs(cellphone_dir + name_str, exist_ok=True)

    df_meta = pd.DataFrame(data={'Cell':list(adata.obs.index), 'cell_type':[ i for i in adata.obs['full_cell_type']]})
    df_meta.set_index('Cell', inplace=True)
    df_meta.to_csv(f'{cellphone_dir}/{name_str}/meta.tsv', sep = '\t')
    
    df_counts = pd.DataFrame(adata.X.toarray()).transpose()
    df_counts.index = adata.var_names
    df_counts.columns = adata.obs_names

    df_counts.to_csv(f'{cellphone_dir}/{name_str}/{name_str}.txt', sep='\t')
    
    !cellphonedb method statistical_analysis  \
    {cellphone_dir}/{name_str}/meta.tsv  \
    {cellphone_dir}/{name_str}/{name_str}.txt \
    --threshold 0.1 --threads 40 \
    --output-path={cellphone_dir}{name_str} --counts-data hgnc_symbol \
    --database results/CellPhoneDB/dbsample/customdb.db

# Loading the results

In [None]:
# Read significant_means.txt for every dataset and reshape it to long format: one row per
# (dataset, gene pair, cell-type pair) with a significant mean.
#   df_unstack        -> every interaction in which at least one partner is a fibroblast cluster
#   df_unstack_fibros -> the subset in which both partners are fibroblast clusters
unstack_columns = ['dataset', 'gene_A', 'gene_B', 'pair_A', 'pair_B', 'value']

# Per-dataset frames are collected and concatenated once at the end. Growing a DataFrame inside the
# loop is quadratic, and starting from an empty frame turns 'value' into an object column.
list_unstack, list_unstack_fibros = [], []

for list_name in list_names:
    # list_names entries are "<Author> <Year> human". The folder name is derived exactly as in the
    # cell that ran CellPhoneDB, and the dataset label is the name without the trailing " human".
    name_str = list_name.lower().replace(' ', '_')
    dataset_label = list_name.rsplit(' ', 1)[0]

    df = pd.read_csv(f'results/CellPhoneDB/{name_str}/significant_means.txt', sep='\t')

    # Cell-type pair columns are named "<type_a>|<type_b>"; keep those involving a fibroblast cluster
    # and the interactions significant in at least one of them.
    select_cols = [i for i in df.columns if 'fibro' in i]
    select_idx = df[select_cols].dropna(how='all').index

    # The first 12 columns are the interaction descriptors (they include gene_a and gene_b).
    df = df.loc[select_idx][list(df.columns[:12]) + select_cols]

    # Row-major flattening: for each interaction, all selected pair columns in order.
    # np.repeat over a list that mixes strings and NaN yields the string 'nan' for missing genes;
    # the processing cells convert it back with .replace('nan', np.nan).
    df_unstack_dataset = pd.DataFrame({'dataset': [dataset_label] * len(select_cols) * len(df),
                                       'gene_A': np.repeat(df['gene_a'].tolist(), len(select_cols)),
                                       'gene_B': np.repeat(df['gene_b'].tolist(), len(select_cols)),
                                       'pair_A': [i.split('|')[0] for i in select_cols] * len(df),
                                       'pair_B': [i.split('|')[1] for i in select_cols] * len(df),
                                       'value': df[select_cols].values.ravel()})

    # Non-significant combinations are NaN in significant_means.txt.
    df_unstack_dataset = df_unstack_dataset.dropna(subset = ['value'])

    both_fibro = (np.array(['fibro' in i for i in df_unstack_dataset['pair_A']]) &
                  np.array(['fibro' in i for i in df_unstack_dataset['pair_B']]))
    df_unstack_fibros_dataset = df_unstack_dataset.loc[both_fibro]

    list_unstack.append(df_unstack_dataset)
    list_unstack_fibros.append(df_unstack_fibros_dataset)

# pd.concat raises on an empty list, so fall back to an empty frame when there are no datasets.
df_unstack = (pd.concat(list_unstack, ignore_index=True, sort=False)
              if list_unstack else pd.DataFrame(columns=unstack_columns))
df_unstack_fibros = (pd.concat(list_unstack_fibros, ignore_index=True, sort=False)
                     if list_unstack_fibros else pd.DataFrame(columns=unstack_columns))

## Extracting information between fibroblasts

In [None]:
# Processing 

df_processed = df_unstack_fibros
%store -r dict_knee_markers

# 1) Remove duplicates by all columns expect value. This is because if pair_A == pair_B in some cases the values are the same. 
# This produces problems later when counting datasets.

pair_gene_cols = ['gene_A', 'gene_B', 'pair_A', 'pair_B']

df_processed = df_processed.groupby(['dataset'] + pair_gene_cols, as_index=False).mean()



# 2) We group the values. For the numeric values, we compute the median across cases. For string values, we join and count them. 

df_grouped = pd.DataFrame(index=df_processed.groupby(pair_gene_cols).sum().index)
df_grouped['value'] = df_processed.groupby(pair_gene_cols).median()['value']
df_grouped['dataset_list'] = df_processed.groupby(pair_gene_cols)['dataset'].apply(', '.join)
df_grouped['dataset_count'] = df_processed.groupby(pair_gene_cols)['dataset'].count()
df_grouped = df_grouped.reset_index()



# 3) We want to keep only the LR pairs that are consistentr among datasets. Therefore, we will establish a threshold of datasets in which 
# that interaction appears. Because some clusters are specific of fewer datasets, we will consider the lower threshold a percentage (60-80%) of
# the less frequent cluster

# 3A) Create the dict with correspondence between clusters and counts
list_clusters = []
for adata in list_adatas_fb:
    list_clusters += adata.obs['cluster_robust'].cat.categories.tolist()
clusters, counts = np.unique(list_clusters, return_counts=True)
dict_cluster_counts = dict(zip(clusters, counts))

# 3B) Apply the dataset filter
thres = 0.3

pair_A_counts = [dict_cluster_counts[i[-2:]] if i[-2:] in dict_cluster_counts.keys() else 100 for i in df_grouped['pair_A'].values]
pair_B_counts = [dict_cluster_counts[i[-2:]] if i[-2:] in dict_cluster_counts.keys() else 100 for i in df_grouped['pair_B'].values]
min_pair_counts = [min(i, j) for i, j in zip(pair_A_counts, pair_B_counts)]

mask_idx = df_grouped['dataset_count'] > np.array(min_pair_counts) * thres


df_grouped = df_grouped.replace('nan', np.nan)
df_masked = df_grouped.loc[mask_idx].dropna(subset=['gene_A', 'gene_B'], how='all').reset_index(drop=True)

# 3C) Apply the gene filter: we are going to keep genes that are "relevant" markers. Other genes that appear as positive in CellPhoneDB tend to 
# be thorougly expressed and are not relevant
pair_A_bool = [((pair[-2:] in dict_knee_markers.keys()) & (gene in dict_knee_markers.get(pair[-2:], []))) 
                 for gene, pair in zip(df_masked['gene_A'].values, df_masked['pair_A'].values)]
pair_B_bool = [((pair[-2:] in dict_knee_markers.keys()) & (gene in dict_knee_markers.get(pair[-2:], []))) 
                 for gene, pair in zip(df_masked['gene_B'].values, df_masked['pair_B'].values)]

mask_bool = np.array(pair_A_bool) & np.array(pair_B_bool)
mask_bool.sum()

df_masked_fibros = df_masked.loc[mask_bool].reset_index(drop=True)

In [None]:
# Heatmap with the number of distinct ligand-receptor gene pairs found between each pair of fibroblast
# clusters, counting both directions. Uses itertools, imported as `itl` at the top of the notebook.
fig, ax = plt.subplots(figsize=(8, 5))
df_alias = df_masked_fibros.copy()
cats = sorted(set(df_alias['pair_A']) | set(df_alias['pair_B']))

df_heatmap = pd.DataFrame(0, index=cats, columns=cats)

for cat_A, cat_B in itl.combinations_with_replacement(cats, 2):
    is_pair = (((df_alias['pair_A'] == cat_A) & (df_alias['pair_B'] == cat_B)) |
               ((df_alias['pair_A'] == cat_B) & (df_alias['pair_B'] == cat_A)))
    sub_df = df_alias[is_pair].drop_duplicates(['gene_A', 'gene_B'])
    # The matrix is symmetric by construction.
    df_heatmap.loc[cat_A, cat_B] = len(sub_df)
    df_heatmap.loc[cat_B, cat_A] = len(sub_df)

sns.heatmap(df_heatmap, annot=True, ax=ax)
ax.set_title('Ligand-receptor pairs between fibroblast clusters');

In [None]:
# Show the retained interactions between two fibroblast clusters, in either direction.
type_A, type_B = 'fibro_A2', 'fibro_A3'

mask_forward = (df_masked_fibros['pair_A'] == type_A) & (df_masked_fibros['pair_B'] == type_B)
mask_reverse = (df_masked_fibros['pair_A'] == type_B) & (df_masked_fibros['pair_B'] == type_A)
df_masked_fibros[mask_forward | mask_reverse]

## Extracting information between fibroblasts and other cell types

In [None]:
# Processing 

df_processed = df_unstack
%store -r dict_knee_markers

# 1) Remove duplicates by all columns expect value. This is because if pair_A == pair_B in some cases the values are the same. 
# This produces problems later when counting datasets.

pair_gene_cols = ['gene_A', 'gene_B', 'pair_A', 'pair_B']

df_processed = df_processed.groupby(['dataset'] + pair_gene_cols, as_index=False).mean()



# 2) We group the values. For the numeric values, we compute the median across cases. For string values, we join and count them. 

df_grouped = pd.DataFrame(index=df_processed.groupby(pair_gene_cols).sum().index)
df_grouped['value'] = df_processed.groupby(pair_gene_cols).median()['value']
df_grouped['dataset_list'] = df_processed.groupby(pair_gene_cols)['dataset'].apply(', '.join)
df_grouped['dataset_count'] = df_processed.groupby(pair_gene_cols)['dataset'].count()
df_grouped = df_grouped.reset_index()



# 3) We want to keep only the LR pairs that are consistentr among datasets. Therefore, we will establish a threshold of datasets in which 
# that interaction appears. Because some clusters are specific of fewer datasets, we will consider the lower threshold a percentage (60-80%) of
# the less frequent cluster

# 3A) Create the dict with correspondence between clusters and counts
list_clusters = []
for adata in list_adatas_fb:
    list_clusters += adata.obs['cluster_robust'].cat.categories.tolist()
clusters, counts = np.unique(list_clusters, return_counts=True)
dict_cluster_counts = dict(zip(clusters, counts))

# 3B) Apply the dataset filter
thres = 0.5

pair_A_counts = [dict_cluster_counts[i[-2:]] if i[-2:] in dict_cluster_counts.keys() else 100 for i in df_grouped['pair_A'].values]
pair_B_counts = [dict_cluster_counts[i[-2:]] if i[-2:] in dict_cluster_counts.keys() else 100 for i in df_grouped['pair_B'].values]
min_pair_counts = [min(i, j) for i, j in zip(pair_A_counts, pair_B_counts)]

mask_idx = df_grouped['dataset_count'] > np.array(min_pair_counts) * thres


df_grouped = df_grouped.replace('nan', np.nan)
df_masked = df_grouped.loc[mask_idx].dropna(subset=['gene_A', 'gene_B'], how='all').reset_index(drop=True)

# 3C) Apply the gene filter: we are going to keep genes that are "relevant" markers. Other genes that appear as positive in CellPhoneDB tend to 
# be thorougly expressed and are not relevant
pair_A_bool = [((pair_A[-2:] in dict_knee_markers.keys()) & (pair_B[-2:] not in dict_knee_markers.keys()) & 
                (gene_A in dict_knee_markers.get(pair_A[-2:], []))) 
                 for gene_A, pair_A, pair_B in zip(df_masked['gene_A'].values, df_masked['pair_A'].values, df_masked['pair_B'].values)]
pair_B_bool = [((pair_B[-2:] in dict_knee_markers.keys()) & (pair_A[-2:] not in dict_knee_markers.keys()) & 
                (gene_B in dict_knee_markers.get(pair_B[-2:], []))) 
                 for gene_B, pair_B, pair_A in zip(df_masked['gene_B'].values, df_masked['pair_B'].values, df_masked['pair_A'].values)]

mask_bool = np.array(pair_A_bool) != np.array(pair_B_bool)
# mask_bool = mask_bool == 1

df_masked = df_masked.loc[mask_bool].reset_index(drop=True)

In [None]:
# Heatmap with the number of distinct ligand-receptor gene pairs between each fibroblast cluster (rows)
# and each other cell type (columns), counting both directions. Uses itertools, imported as `itl` at
# the top of the notebook.
fig, ax = plt.subplots(figsize=(8, 5))
df_alias = df_masked.copy()
cats_all = sorted(set(df_alias['pair_A']) | set(df_alias['pair_B']))
cats_nofibro = [i for i in cats_all if 'fibro' not in i]
cats_fibro = [i for i in cats_all if 'fibro' in i]
df_heatmap = pd.DataFrame(0, index=cats_fibro, columns=cats_nofibro)

for cat_fibro, cat_other in itl.product(cats_fibro, cats_nofibro):
    is_pair = (((df_alias['pair_A'] == cat_fibro) & (df_alias['pair_B'] == cat_other)) |
               ((df_alias['pair_A'] == cat_other) & (df_alias['pair_B'] == cat_fibro)))
    sub_df = df_alias[is_pair].drop_duplicates(['gene_A', 'gene_B'])
    df_heatmap.loc[cat_fibro, cat_other] = len(sub_df)

# NOTE(review): the last fibroblast row is left out of the plot, as in the original cell — confirm why.
sns.heatmap(df_heatmap.iloc[:-1], annot=True, ax=ax)
ax.set_title('Ligand-receptor pairs between fibroblast clusters and other cell types');

In [None]:
# Show the retained interactions between one fibroblast cluster and one other cell type, in either direction.
type_A, type_B = 'fibro_D1', 'peri - CYCS'

mask_forward = (df_masked['pair_A'] == type_A) & (df_masked['pair_B'] == type_B)
mask_reverse = (df_masked['pair_A'] == type_B) & (df_masked['pair_B'] == type_A)
df_masked[mask_forward | mask_reverse]

# Manual curation

Although these lists are interesting and have been filtered of many spurious interactions, they need further improvement. To do that, we have selected the L-R pairs, regardless of cluster, and assigned the clusters manually, so that we account for all possible interactions. This curated list is available at [this link](https://docs.google.com/spreadsheets/d/1lfI6sgjEyg37BGL7VRMfW7KgwGKwX5QrCtnKYk1DXY4/edit?usp=sharing).