# Ligand receptor analysis

In this analysis we are going to use CellPhoneDB (>= v3) to analyse interactions between (1) fibroblast subpopulations and (2) fibroblast subpopulations and the rest of the populations. To save time, we are going to run (2) and then extract (1) from it. We are going to run the analysis on each dataset, and then combine the interactions to get a general picture of the interactions that occur across datasets.

**YOU NEED TO RUN NOTEBOOK 4H FIRST**

## imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import numpy as np
import os
import triku as tk
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from tqdm.notebook import tqdm
import scipy.sparse as spr
import matplotlib.cm as cm
import networkx as nx

In [None]:
# local imports and imports from other notebooks
from cellassign import assign_cats
from fb_functions import make_gene_scoring_with_expr, plot_score_graph, plot_UMAPS_gene, plot_adata_cluster_properties
%store -r dict_colors
%store -r seed
%store -r magma
%store -r data_dir

In [None]:
%store -r dict_make_gene_scoring_robust
%store -r dict_make_gene_scoring_axis_robust

In [None]:
# Render figures at 120 dpi and print pandas floats with thousands
# separators and two decimals.
mpl.rcParams.update({'figure.dpi': 120})
pd.set_option('display.float_format', "{:,.2f}".format)

In [None]:
def join_fbs_adatas(adata_full, adata_fb):
    """Annotate ``adata_full`` in place with fibroblast subcluster labels.

    Writes ``adata_full.obs['full_cell_type']`` as a categorical column:
    cells present in both objects get ``'fibro_<cluster>'`` taken from
    ``adata_fb.obs['cluster']``; every other cell keeps its
    ``obs['assigned_cats']`` value, converted to string.

    Nothing is returned; ``adata_fb`` is not modified.
    """
    # Cell names shared by both objects.
    shared_cells = np.intersect1d(adata_fb.obs_names, adata_full.obs_names)

    # Subset the fibroblast object by those same names so that the labels
    # are produced in the order used for the assignment below.
    fb_clusters = adata_fb[shared_cells].obs['cluster']
    fibro_labels = [f'fibro_{cluster}' for cluster in fb_clusters]

    labels = adata_full.obs['assigned_cats'].copy().astype(str)
    labels[shared_cells] = fibro_labels
    adata_full.obs['full_cell_type'] = labels.astype('category')

## Anndata loading

In [None]:
# Parallel lists filled by the loading cells below: the fibroblast-only
# object of each dataset and the matching full object.
list_adatas_fb = []
list_adatas_full = []

In [None]:
# Ahlers 2022 (young): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
ahlers_2022_dir = f'{data_dir}/ahlers_2022'
ahlers_2022_young_fb = sc.read(f'{ahlers_2022_dir}/ahlers_2022_young_fb_robust.h5')
ahlers_2022_young = sc.read(f'{ahlers_2022_dir}/ahlers_2022_young_processed.h5')
join_fbs_adatas(ahlers_2022_young, ahlers_2022_young_fb)
list_adatas_fb.append(ahlers_2022_young_fb)
list_adatas_full.append(ahlers_2022_young)

In [None]:
# Boothby 2021 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
boothby_2021_dir = f'{data_dir}/boothby_2021'
boothby_2021_ctrl_fb = sc.read(f'{boothby_2021_dir}/boothby_2021_ctrl_fb_robust.h5')
boothby_2021_ctrl = sc.read(f'{boothby_2021_dir}/boothby_2021_ctrl_processed.h5')
join_fbs_adatas(boothby_2021_ctrl, boothby_2021_ctrl_fb)
list_adatas_fb.append(boothby_2021_ctrl_fb)
list_adatas_full.append(boothby_2021_ctrl)

In [None]:
# Deng 2021 (scar): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
deng_2021_dir = f'{data_dir}/deng_2021'
deng_2021_scar_fb = sc.read(f'{deng_2021_dir}/deng_2021_scar_fb_robust.h5')
deng_2021_scar = sc.read(f'{deng_2021_dir}/deng_2021_scar_processed.h5')
join_fbs_adatas(deng_2021_scar, deng_2021_scar_fb)
list_adatas_fb.append(deng_2021_scar_fb)
list_adatas_full.append(deng_2021_scar)

In [None]:
# Gao 2021 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
gao_2021_dir = f'{data_dir}/gao_2021'
gao_2021_ctrl_fb = sc.read(f'{gao_2021_dir}/gao_2021_ctrl_fb_robust.h5')
gao_2021_ctrl = sc.read(f'{gao_2021_dir}/gao_2021_ctrl_processed.h5')
join_fbs_adatas(gao_2021_ctrl, gao_2021_ctrl_fb)
list_adatas_fb.append(gao_2021_ctrl_fb)
list_adatas_full.append(gao_2021_ctrl)

In [None]:
# Gaydosik 2020 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
gaydosik_2020_dir = f'{data_dir}/gaydosik_2020'
gaydosik_2020_ctrl_fb = sc.read(f'{gaydosik_2020_dir}/gaydosik_2020_ctrl_fb_robust.h5')
gaydosik_2020_ctrl = sc.read(f'{gaydosik_2020_dir}/gaydosik_2020_ctrl_processed.h5')
join_fbs_adatas(gaydosik_2020_ctrl, gaydosik_2020_ctrl_fb)
list_adatas_fb.append(gaydosik_2020_ctrl_fb)
list_adatas_full.append(gaydosik_2020_ctrl)

In [None]:
# Gur 2022 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
gur_2022_dir = f'{data_dir}/gur_2022'
gur_2022_ctrl_fb = sc.read(f'{gur_2022_dir}/gur_2022_ctrl_fb_robust.h5')
gur_2022_ctrl = sc.read(f'{gur_2022_dir}/gur_2022_ctrl_processed.h5')
join_fbs_adatas(gur_2022_ctrl, gur_2022_ctrl_fb)
list_adatas_fb.append(gur_2022_ctrl_fb)
list_adatas_full.append(gur_2022_ctrl)

In [None]:
# He 2020 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
he_2020_dir = f'{data_dir}/He_2020'
he_2020_ctrl_fb = sc.read(f'{he_2020_dir}/he_2020_ctrl_fb_robust.h5')
he_2020_ctrl = sc.read(f'{he_2020_dir}/he_2020_ctrl_processed.h5')
join_fbs_adatas(he_2020_ctrl, he_2020_ctrl_fb)
list_adatas_fb.append(he_2020_ctrl_fb)
list_adatas_full.append(he_2020_ctrl)

In [None]:
# Hughes 2020 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
hughes_2020_dir = f'{data_dir}/hughes_2020'
hughes_2020_ctrl_fb = sc.read(f'{hughes_2020_dir}/hughes_2020_ctrl_fb_robust.h5')
hughes_2020_ctrl = sc.read(f'{hughes_2020_dir}/hughes_2020_ctrl_processed.h5')
join_fbs_adatas(hughes_2020_ctrl, hughes_2020_ctrl_fb)
list_adatas_fb.append(hughes_2020_ctrl_fb)
list_adatas_full.append(hughes_2020_ctrl)

In [None]:
# Kim 2020 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
kim_2020_dir = f'{data_dir}/Kim_2020'
kim_2020_ctrl_fb = sc.read(f'{kim_2020_dir}/kim_2020_ctrl_fb_robust.h5')
kim_2020_ctrl = sc.read(f'{kim_2020_dir}/kim_2020_ctrl_processed.h5')
join_fbs_adatas(kim_2020_ctrl, kim_2020_ctrl_fb)
list_adatas_fb.append(kim_2020_ctrl_fb)
list_adatas_full.append(kim_2020_ctrl)

In [None]:
# Liu 2021 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
liu_2021_dir = f'{data_dir}/liu_2021'
liu_2021_ctrl_fb = sc.read(f'{liu_2021_dir}/liu_2021_ctrl_fb_robust.h5')
liu_2021_ctrl = sc.read(f'{liu_2021_dir}/liu_2021_ctrl_processed.h5')
join_fbs_adatas(liu_2021_ctrl, liu_2021_ctrl_fb)
list_adatas_fb.append(liu_2021_ctrl_fb)
list_adatas_full.append(liu_2021_ctrl)

In [None]:
# Mariottoni 2021 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
mariottoni_2021_dir = f'{data_dir}/mariottoni_2021'
mariottoni_2021_ctrl_fb = sc.read(f'{mariottoni_2021_dir}/mariottoni_2021_ctrl_fb_robust.h5')
mariottoni_2021_ctrl = sc.read(f'{mariottoni_2021_dir}/mariottoni_2021_ctrl_processed.h5')
join_fbs_adatas(mariottoni_2021_ctrl, mariottoni_2021_ctrl_fb)
list_adatas_fb.append(mariottoni_2021_ctrl_fb)
list_adatas_full.append(mariottoni_2021_ctrl)

In [None]:
# Mirizio 2020 (scl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
mirizio_2020_dir = f'{data_dir}/mirizio_2020'
mirizio_2020_scl_fb = sc.read(f'{mirizio_2020_dir}/mirizio_2020_scl_fb_robust.h5')
mirizio_2020_scl = sc.read(f'{mirizio_2020_dir}/mirizio_2020_scl_processed.h5')
join_fbs_adatas(mirizio_2020_scl, mirizio_2020_scl_fb)
list_adatas_fb.append(mirizio_2020_scl_fb)
list_adatas_full.append(mirizio_2020_scl)

In [None]:
reynolds_2021_dir = f'{data_dir}/reynolds_2021'
reynolds_2021_ctrl_fb = sc.read(f'{reynolds_2021_dir}/reynolds_2021_ctrl_fb_robust.h5')
# The dataset with the rest of the cells is not available. It could be used,
# but since its quality is not good it is not fully relevant. The fibroblast
# object is therefore labelled directly and also registered as the "full" one.
reynolds_2021_ctrl_fb.obs['full_cell_type'] = pd.Categorical(
    [f'fibro_{i}' for i in reynolds_2021_ctrl_fb.obs['cluster']]
)
list_adatas_fb.append(reynolds_2021_ctrl_fb)
list_adatas_full.append(reynolds_2021_ctrl_fb)

In [None]:
# Rindler 2021 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
rindler_2021_dir = f'{data_dir}/rindler_2021'
rindler_2021_ctrl_fb = sc.read(f'{rindler_2021_dir}/rindler_2021_ctrl_fb_robust.h5')
rindler_2021_ctrl = sc.read(f'{rindler_2021_dir}/rindler_2021_ctrl_processed.h5')
join_fbs_adatas(rindler_2021_ctrl, rindler_2021_ctrl_fb)
list_adatas_fb.append(rindler_2021_ctrl_fb)
list_adatas_full.append(rindler_2021_ctrl)

In [None]:
# Sole-Boldo 2020 (young): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
sole_2020_dir = f'{data_dir}/Sole-Boldo_2020'
sole_2020_young_fb = sc.read(f'{sole_2020_dir}/sole_2020_young_fb_robust.h5')
sole_2020_young = sc.read(f'{sole_2020_dir}/sole_2020_young_processed.h5')
join_fbs_adatas(sole_2020_young, sole_2020_young_fb)
list_adatas_fb.append(sole_2020_young_fb)
list_adatas_full.append(sole_2020_young)

In [None]:
# Tabib 2018 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
tabib_2018_dir = f'{data_dir}/Tabib_2018'
tabib_2018_ctrl_fb = sc.read(f'{tabib_2018_dir}/tabib_2018_ctrl_fb_robust.h5')
tabib_2018_ctrl = sc.read(f'{tabib_2018_dir}/tabib_2018_ctrl_processed.h5')
join_fbs_adatas(tabib_2018_ctrl, tabib_2018_ctrl_fb)
list_adatas_fb.append(tabib_2018_ctrl_fb)
list_adatas_full.append(tabib_2018_ctrl)

In [None]:
# Tabib 2021 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
tabib_2021_dir = f'{data_dir}/Tabib_2021'
tabib_2021_ctrl_fb = sc.read(f'{tabib_2021_dir}/tabib_2021_ctrl_fb_robust.h5')
tabib_2021_ctrl = sc.read(f'{tabib_2021_dir}/tabib_2021_ctrl_processed.h5')
join_fbs_adatas(tabib_2021_ctrl, tabib_2021_ctrl_fb)
list_adatas_fb.append(tabib_2021_ctrl_fb)
list_adatas_full.append(tabib_2021_ctrl)

In [None]:
# Theocharidis 2020 (ctrl_dm): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
# NOTE: the directory keeps its trailing '/', so the file paths contain '//'.
theo_2020_dir = f'{data_dir}/Theocharidis_2020/'
theo_2020_ctrl_dm_fb = sc.read(f'{theo_2020_dir}/theo_2020_ctrl_dm_fb_robust.h5')
theo_2020_ctrl_dm = sc.read(f'{theo_2020_dir}/theo_2020_ctrl_dm_processed.h5')
join_fbs_adatas(theo_2020_ctrl_dm, theo_2020_ctrl_dm_fb)
list_adatas_fb.append(theo_2020_ctrl_dm_fb)
list_adatas_full.append(theo_2020_ctrl_dm)

In [None]:
# Theocharidis 2021 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
# NOTE: the directory keeps its trailing '/', so the file paths contain '//'.
theo_2021_dir = f'{data_dir}/Theocharidis_2021/'
theo_2021_ctrl_fb = sc.read(f'{theo_2021_dir}/theo_2021_ctrl_fb_robust.h5')
theo_2021_ctrl = sc.read(f'{theo_2021_dir}/theo_2021_ctrl_processed.h5')
join_fbs_adatas(theo_2021_ctrl, theo_2021_ctrl_fb)
list_adatas_fb.append(theo_2021_ctrl_fb)
list_adatas_full.append(theo_2021_ctrl)

In [None]:
# Vorstandlechner 2020 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
vors_2020_dir = f'{data_dir}/Vorstandlechner_2020'
vors_2020_ctrl_fb = sc.read(f'{vors_2020_dir}/vors_2020_ctrl_fb_robust.h5')
vors_2020_ctrl = sc.read(f'{vors_2020_dir}/vors_2020_ctrl_processed.h5')
join_fbs_adatas(vors_2020_ctrl, vors_2020_ctrl_fb)
list_adatas_fb.append(vors_2020_ctrl_fb)
list_adatas_full.append(vors_2020_ctrl)

In [None]:
# Vorstandlechner 2021 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
vors_2021_dir = f'{data_dir}/Vorstandlechner_2021'
vors_2021_ctrl_fb = sc.read(f'{vors_2021_dir}/vors_2021_ctrl_fb_robust.h5')
vors_2021_ctrl = sc.read(f'{vors_2021_dir}/vors_2021_ctrl_processed.h5')
join_fbs_adatas(vors_2021_ctrl, vors_2021_ctrl_fb)
list_adatas_fb.append(vors_2021_ctrl_fb)
list_adatas_full.append(vors_2021_ctrl)

In [None]:
# Xu 2021 (ctrl): load the fibroblast subset and the full dataset,
# write obs['full_cell_type'] on the full object, and register both.
xu_2021_dir = f'{data_dir}/xu_2021'
xu_2021_ctrl_fb = sc.read(f'{xu_2021_dir}/xu_2021_ctrl_fb_robust.h5')
xu_2021_ctrl = sc.read(f'{xu_2021_dir}/xu_2021_ctrl_processed.h5')
join_fbs_adatas(xu_2021_ctrl, xu_2021_ctrl_fb)
list_adatas_fb.append(xu_2021_ctrl_fb)
list_adatas_full.append(xu_2021_ctrl)

In [None]:
# One "<Author> <Year>" label per dataset, read from the first cell of each
# fibroblast object (Year is cast to int before formatting).
list_names = [
    f"{adata.obs['Author'].iloc[0]!s} {int(adata.obs['Year'].iloc[0])!s}"
    for adata in list_adatas_fb
]

# Creating a custom db
We are going to create a custom database with additional LR interactions from other sources.

# Running CellPhoneDB on each dataset

In [None]:
# Root folder for every CellPhoneDB input and output; created if missing.
cellphone_dir = 'results/CellPhoneDB/'
os.makedirs(cellphone_dir, exist_ok=True)

In [None]:
for adata, name in zip(list_adatas_full, list_names):
    print(name)
    adata = ahlers_2022_young
    name_str = name.lower().replace(' ', '_')
    os.makedirs(cellphone_dir + name_str, exist_ok=True)

    df_meta = pd.DataFrame(data={'Cell':list(adata.obs.index), 'cell_type':[ i for i in adata.obs['full_cell_type']]})
    df_meta.set_index('Cell', inplace=True)
    df_meta.to_csv(f'{cellphone_dir}/{name_str}/meta.tsv', sep = '\t')
    
    df_counts = pd.DataFrame(adata.X.toarray()).transpose()
    df_counts.index = adata.var_names
    df_counts.columns = adata.obs_names

    df_counts.to_csv(f'{cellphone_dir}/{name_str}/{name_str}.txt', sep='\t')
    
    !cellphonedb method statistical_analysis  \
    {cellphone_dir}/{name_str}/meta.tsv  \
    {cellphone_dir}/{name_str}/{name_str}.txt \
    --threshold 0.1 --threads 40 \
    --output-path={cellphone_dir}{name_str} --counts-data hgnc_symbol