# Clinical analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns  
from bioinfokit import analys, visuz
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA  
from sklearn.manifold import TSNE
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn import metrics
from sklearn.metrics import silhouette_score,adjusted_rand_score,normalized_mutual_info_score
from sklearn.metrics.cluster import contingency_matrix
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Print the top 10 upregulated proteins and draw the volcano plot
def rank_proteins_and_volcano_plot(adata):
    """Print the ranked proteins of a differential test and draw a volcano plot.

    Reads the ``names``, ``logfoldchanges`` and ``pvals_adj`` entries of
    ``adata.uns['rank_genes_groups']`` and keeps, from every record, the value
    at position 1 (presumably the second group of the comparison -- confirm
    against the ``groupby`` used when the ranking was computed).

    Args:
        adata: Object exposing ``uns['rank_genes_groups']`` with the three
            entries listed above already filled in.

    Returns:
        None. The protein names are printed and the plot is produced by
        ``visuz.GeneExpression.volcano``.
    """
    ranking = adata.uns['rank_genes_groups']
    group_pos = 1  # position of the group of interest inside each record

    names_list = [record[group_pos] for record in ranking['names']]
    logfoldchanges_list = [record[group_pos] for record in ranking['logfoldchanges']]
    pvals_adj_list = [record[group_pos] for record in ranking['pvals_adj']]

    print('top 10 upregulated proteins:', names_list[:10])
    print(names_list)

    data = pd.DataFrame({
        'GeneNames': names_list,
        'log2FC': logfoldchanges_list,
        'p-value': pvals_adj_list,
    })

    # Both fold-change cut-offs sit at a 1.5x change.
    lfc_cutoff = np.log2(1.5)
    visuz.GeneExpression.volcano(
        df=data,
        lfc="log2FC",
        pv="p-value",
        geneid="GeneNames",
        lfc_thr=(lfc_cutoff, lfc_cutoff),
        gstyle=2,
        sign_line=True,
        xlm=(-2, 2, 0.5),
        # Must be a tuple: a bare string (other than "deg") is not treated as
        # a list of names, so the top-ranked protein would never be labelled.
        genenames=(names_list[0],),
        color=("#fd625e", "grey", "#01b8aa"),
    )

In [None]:
# Load the two AnnData files used throughout the notebook: `adata_raw` (the
# processed ECCITE-seq data) and `adata` (read from the `emb_old` directory,
# presumably the learned embeddings -- confirm).
# NOTE(review): both paths are absolute and machine-specific.
adata_raw = sc.read_h5ad('/home/chenjn/rna2adt_push/downstream/clinical_proteomic_application/data/ECCITE_seq_processed.h5ad')
adata = sc.read_h5ad("/home/chenjn/rna2adt_push/downstream/clinical_proteomic_application/emb_old/ECCITE_step2.h5ad")
# adata = sc.read_h5ad("./emb/ECCITE_step1.h5ad")

# Human-readable names for the numeric donor_type codes.
mapping_dict = {
    0: 'control',
    1: 'patient'
}
# Store the readable donor label next to the numeric code.
adata_raw.obs['label'] = adata_raw.obs['donor_type'].replace(mapping_dict)

In [None]:
# Build a 15-nearest-neighbour graph directly on `.X` and cluster it.
# NOTE(review): the clustering is Louvain (`sc.tl.louvain`), but the result is
# stored under the obs key "leiden"; later cells read that key.
sc.pp.neighbors(adata_raw, n_neighbors=15,use_rep='X')
sc.tl.louvain(adata_raw,resolution = 0.9,key_added = "leiden")  

# Same procedure on `adata`, with a higher resolution (1.6 instead of 0.9).
sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')
sc.tl.louvain(adata,resolution = 1.6,key_added = "leiden")  

### UMAP plot showing the cells colored by clusters

In [None]:
# Raw data: UMAP of `adata_raw`, coloured by the readable donor label
sc.tl.umap(adata_raw,n_components=2)
sc.pl.umap(adata_raw, color='label', show=False, legend_loc='none')
plt.title("")  # drop the automatic title
# plt.savefig("ECCITE_figure/umap/raw_umap.png")
# plt.savefig("ECCITE_figure/umap/raw_umap.svg")

# Embeddings: the layout is computed once here and reused by both plots below,
# so the donor-type and cluster panels show exactly the same coordinates.
sc.tl.umap(adata,n_components=2)
sc.pl.umap(adata, color='donor_type', show=False, legend_loc='none')
plt.title("")
# plt.savefig("ECCITE_figure/umap/batch_umap.png")
# plt.savefig("ECCITE_figure/umap/batch_umap.svg")

# Clusters stored under the "leiden" key (computed with Louvain), drawn on the
# layout computed above -- no need to run UMAP a second time.
sc.pl.umap(adata, color="leiden", show=False, legend_loc='on data')
plt.title("")
# plt.savefig("ECCITE_figure/umap/leiden_umap.png")
# plt.savefig("ECCITE_figure/umap/leiden_umap.svg")

In [None]:
# List the feature names available in the raw data.
adata_raw.var_names

In [None]:
# Replace the raw-data clustering with the clusters found on `adata`.
# NOTE(review): assigning a Series aligns on the obs index, so this assumes
# both objects carry the same obs names -- confirm.
adata_raw.obs['leiden'] = adata.obs['leiden']
# markers = adata_raw.var.index.values
# markers = ['CD19', 'MHCII', 'PECAM', 'CD11c', 'CD4', 'CD27','CD8','CCR7','PD-L1','CD34','CD62L','CD7']
# Marker panel shown in the matrix plot (alternative panels are kept commented out).
markers = ['PECAM','CD11c','MHCII','CD11b','CD19','CD8','CD16','CD27', 'CD62L', 'CD4']
# markers = ['CD19','MHCII','PECAM','CD11b','CD7','CD8','CD223','CD16','CD27','CD62L','CD26','CD4']

# Dendrogram and per-cluster ranking are prerequisites of the matrix plot below.
sc.tl.dendrogram(adata_raw, groupby='leiden')
sc.tl.rank_genes_groups(adata_raw, 'leiden')
# Log fold change of every marker in every cluster.
# NOTE(review): confirm whether `min_logfoldchange` has any effect when
# `var_names` is passed explicitly.
sc.pl.rank_genes_groups_matrixplot(
    adata_raw,
    values_to_plot="logfoldchanges",
    var_names = markers,
    cmap='Blues',
    vmin=-4,
    vmax=4,
    min_logfoldchange=3,
    colorbar_title='log fold change',
    show=False
)
# plt.savefig("./ECCITE_figure/marker/differential_analysis.png")
# plt.savefig("./ECCITE_figure/marker/differential_analysis.svg")

### Combine similar clusters

We perform subcluster detection at a higher resolution, hoping to find markers that cannot be detected at a lower resolution. We then combine clusters with similar patterns (12, 15, 19 / 14, 6, 3, 8, 9 / 25, 18, 20) and construct a new AnnData object.

In [None]:
# Original cluster id of every cell, as a plain list of strings.
cluster_label = list(adata_raw.obs['leiden'])

# Groups of clusters with a similar marker pattern; each group is merged.
combine_cluster_list1 = ['15','12','19']
combine_cluster_list2 = ['14','6','3','8','9']
combine_cluster_list3 = ['25','18','20']

# Map every member of a group onto that group's representative id. The groups
# are disjoint, so one lookup per cell replaces three sequential passes.
merge_target = {}
for members, representative in (
    (combine_cluster_list1, '12'),
    (combine_cluster_list2, '3'),
    (combine_cluster_list3, '18'),
):
    merge_target.update(dict.fromkeys(members, representative))
new_cluster_label = [merge_target.get(i, i) for i in cluster_label]

# Renumber the merged clusters 0..k-1 in order of first appearance.
# dict.fromkeys keeps first-seen order and avoids calling pd.unique on a plain
# list, which recent pandas versions deprecate.
unique_cluster = list(dict.fromkeys(new_cluster_label))
cluster_rank = {label: str(rank) for rank, label in enumerate(unique_cluster)}
new_cluster_label = [cluster_rank[i] for i in new_cluster_label]

In [None]:
# Merged cluster ids in order of first appearance; the position of an id in
# this list is the new label assigned to it.
unique_cluster

In [None]:
# NOTE(review): this rebinds `new_cluster_label` to the original, un-merged
# `cluster_label`, so everything below works on the original clusters rather
# than the merged ones -- confirm this is intended.
new_cluster_label = cluster_label
# Distinct labels that will be used from here on.
np.unique(new_cluster_label)

In [None]:
# Fresh AnnData holding the raw matrix plus only the annotations used below.
adata_raw_reorder = ad.AnnData(adata_raw.X)
# Carry over the cell and feature names. A new AnnData starts with default obs
# names, and pandas aligns a Series on its index when it is assigned as a
# column, so without matching obs names `donor_type` could silently become NaN.
adata_raw_reorder.obs_names = adata_raw.obs_names
adata_raw_reorder.var_names = adata_raw.var_names
adata_raw_reorder.obs['donor_type'] = adata_raw.obs['donor_type']
# A Categorical has no index, so this assignment is positional.
adata_raw_reorder.obs['new_cluster_label'] = pd.Categorical(new_cluster_label)
adata_raw_reorder.obs['new_cluster_label']

### Heatmap showing the protein levels across new clusters

In [None]:
# Rank features per new cluster, then show the log fold change of the chosen
# markers in every cluster (same plot settings as the earlier matrix plot).
sc.tl.rank_genes_groups(adata_raw_reorder, 'new_cluster_label')
sc.pl.rank_genes_groups_matrixplot(
    adata_raw_reorder,
    values_to_plot="logfoldchanges",
    var_names = markers,
    cmap='Blues',
    vmin=-4,
    vmax=4,
    min_logfoldchange=3,
    colorbar_title='log fold change',
)

In [None]:
# Copy the new cluster labels onto `adata` so they can be drawn on its UMAP.
# NOTE(review): Series assignment aligns on the obs index; this assumes `adata`
# and `adata_raw_reorder` share the same obs names -- confirm.
adata.obs['new_cluster_label'] = adata_raw_reorder.obs['new_cluster_label']
adata

In [None]:
# For every cluster, print the share of its cells whose donor type equals the
# donor type of the cluster's first cell.
for i in np.unique(adata_raw.obs['leiden']):
    tmp = adata_raw[adata_raw.obs['leiden'] == i]
    # Take the first cell by position: `obs` is indexed by cell name, and
    # integer keys on such a Series rely on a deprecated positional fallback.
    first_donor = tmp.obs['donor_type'].iloc[0]
    tmp1 = tmp[tmp.obs['donor_type'] == first_donor]
    print(i, tmp1.shape[0] / tmp.shape[0])

In [None]:
# Same share as above, but per new cluster label. The mask is built from
# `adata.obs` and applied to `adata_raw`.
for i in np.unique(adata.obs['new_cluster_label']):
    tmp = adata_raw[adata.obs['new_cluster_label'] == i]
    # Take the first cell by position: `obs` is indexed by cell name, and
    # integer keys on such a Series rely on a deprecated positional fallback.
    first_donor = tmp.obs['donor_type'].iloc[0]
    tmp1 = tmp[tmp.obs['donor_type'] == first_donor]
    print(i, tmp1.shape[0] / tmp.shape[0])

### UMAP plot showing the cells colored by new clusters
We can then plot the UMAP of the scPROTEIN embeddings with the new combined cluster labels.

In [None]:
# UMAP of `adata`, coloured by the new cluster labels, with labels drawn on the plot.
sc.pl.umap(adata, color="new_cluster_label",legend_loc='on data')

In [None]:
# Show the per-cell labels next to the list of merged cluster ids.
adata_raw_reorder.obs['new_cluster_label'],unique_cluster

### Number of cells from each batch in each cluster

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def make_cluster_pipelines(adata):
    """Plot, per cluster, the number of patient cells and of control cells.

    Cells are counted per (``new_cluster_label``, ``donor_type``) pair.
    Clusters are ordered by decreasing patient count. Patient counts
    (``donor_type == 1``) are drawn above the zero line in red; control counts
    (``donor_type == 0``) are mirrored below it in green, and each point is
    connected to the zero line by a stem of the same colour.

    Args:
        adata: Object whose ``obs`` table has the columns
            ``new_cluster_label`` and ``donor_type`` (with values 0 and 1).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    cluster_summary = (
        adata.obs.groupby(['new_cluster_label', 'donor_type'])
        .size()
        .unstack(fill_value=0)
        .sort_values(by=1, ascending=False)
    )

    clusters = [f"cluster {i}" for i in cluster_summary.index.values]
    # Plain arrays: after sorting, the Series index no longer matches the
    # position, so the values must be addressed positionally.
    patient_counts = cluster_summary[1].to_numpy()
    control_counts = -cluster_summary[0].to_numpy()  # mirrored below zero

    x = np.arange(len(clusters))
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.scatter(x, patient_counts, color='red', label='Num of samples from patient')
    ax.scatter(x, control_counts, color='green', label='Num of samples from control donor')

    # Patient counts are never negative and mirrored control counts are never
    # positive, so every cluster gets one green stem below zero and one red
    # stem above it.
    for pos, below, above in zip(x, control_counts, patient_counts):
        ax.plot([pos, pos], [below, 0], color='green')
        ax.plot([pos, pos], [0, above], color='red')

    ax.axhline(0, color='black', linewidth=0.8, linestyle='--')
    ax.grid(True, which='both', axis='y', linestyle='--', linewidth=0.7)

    ax.set_xticks(x)
    ax.set_xticklabels(clusters, rotation=45, ha='right')
    # ax.set_ylabel('Num of samples')
    ax.legend()

    plt.tight_layout()
    # plt.savefig("./ECCITE_figure/data_analyusis/Clustered_cell_distribution.png")
    # plt.savefig("./ECCITE_figure/data_analyusis/Clustered_cell_distribution.svg")
    plt.show()

# Draw the per-cluster patient/control counts for the relabelled raw data.
make_cluster_pipelines(adata_raw_reorder)

### Cell marker detection within subcluster

Take subcluster 7 as an example to illustrate

In [None]:
# Cluster to inspect. The `_7` suffix of the variable below does not track
# this value; the cells actually selected are those labelled `cluster_id`.
cluster_id = '10'
# .copy() makes the subset an independent AnnData, so rank_genes_groups can
# store its results in `.uns` without writing into a view of adata_raw_reorder.
adata_raw_reorder_7 = adata_raw_reorder[adata_raw_reorder.obs['new_cluster_label']==cluster_id].copy()
# Compare the donor types inside the selected cluster with a t-test.
sc.tl.rank_genes_groups(adata_raw_reorder_7, groupby = 'donor_type', method='t-test')

In [None]:
# Pie chart of the control / patient split inside the selected cluster.
fig, ax = plt.subplots()
labels = ('control', 'patient')
# Count the cells carrying each donor code (0 = control, 1 = patient).
donor_type = adata_raw_reorder_7.obs['donor_type']
sizes = [int((donor_type == code).sum()) for code in (0, 1)]
ax.set_title(f"Cluster{cluster_id}")
ax.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    colors=['#01b8aa', '#fd625e'],
    explode=[0., 0.1],
)
# plt.savefig(f"./ECCITE_figure/pie/{cluster_id}_pie.png")
# plt.savefig(f"./ECCITE_figure/pie/{cluster_id}_pie.svg")

In [None]:
# Print the ranked proteins of the selected cluster and draw its volcano plot.
rank_proteins_and_volcano_plot(adata_raw_reorder_7)