In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import bbknn
import os
from scipy import sparse
import matplotlib.pyplot as plt
# from scanpy_base_moudle_update2 import *
# import scrublet as scr
import datetime
import harmonypy as hm

sc.settings.verbosity = 3
#sc.logging.print_versions()
# Figure resolution and other global plotting style.
sc.settings.set_figure_params(dpi=150, figsize = (4, 3), fontsize=12)

import matplotlib.font_manager
# List the fonts Matplotlib has registered so the availability of 'Arial' can be
# checked. `get_fontconfig_fonts()` is deprecated in newer Matplotlib releases,
# so the font manager's own registry is used instead.
font_entries = matplotlib.font_manager.fontManager.ttflist
flist = [entry.fname for entry in font_entries]   # font file paths
names = [entry.name for entry in font_entries]    # family names, parallel to flist
print(names)

params={
        'font.style':'italic',
        'font.weight':'normal',    # or 'bold'
        }
plt.rcParams.update(params)

plt.rcParams['font.family']='Arial'

# 数据准备

## DEG

### 计算区域间差异

In [None]:
# Load the annotated epithelium object and rebuild it from the `.raw` matrix,
# keeping the original per-cell annotations.
adata = sc.read('/mnt/data/Project2021/Gut_immune_surveillance/data/h5ad_Figure/Human/Epithelium/Postnatal/human_postnatal_epi_ann.h5ad')
adata = sc.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs)

# Keep cells whose `health` is 'Normal'.
obs_index = adata.obs_names[adata.obs["health"].isin(['Normal']).to_numpy()]
adata = adata[obs_index, :]

# Of those, keep the two enterocyte annotations.
obs_index = adata.obs_names[
    adata.obs["ann_for_cpdb"].isin(['APOA1+Enterocytes', 'CA1+Enterocytes']).to_numpy()
]
adata = adata[obs_index, :]

adata_list = []

In [None]:
    # Split the cells by `pan_organ`; any region with more than 5000 cells is
    # randomly down-sampled to 5000 (random_state=1). Smaller regions are kept whole.
    for j in adata.obs['pan_organ'].cat.categories.tolist():

        print(j + ' is being processed ...')
        single_index = adata.obs_names[(adata.obs['pan_organ'] == j).to_numpy()]
        adata_single = adata[single_index, :]

        print('The number of '+ j + ' is '+ str(adata_single.n_obs))
        if adata_single.n_obs > 5000:
            sc.pp.subsample(adata_single, n_obs=5000, random_state=1, copy=False)

        adata_list.append(adata_single)

In [None]:
def change_obs_index_v2(x):
    """
    Return `x` without its last four characters.

    Used to restore barcode names after concatenating samples or datasets,
    where a batch suffix was appended to each barcode.
    """
    return x[:-4]


# AnnData's concatenate appends a batch id to every barcode name. The ids are
# zero-padded to three digits here so the suffix has a fixed length and can be
# stripped afterwards. This only holds while there are fewer than 1000 datasets.
batch_list = [str(position + 1).zfill(3) for position in range(len(adata_list))]

adata_concat = adata_list[0].concatenate(
    adata_list[1:],
    join='outer',
    batch_categories=batch_list,
)
# Remove the batch suffix from the barcode names.
adata_concat.obs.rename(index=change_obs_index_v2, inplace=True)
adata_concat

In [None]:
# Map each cell's `pan_organ` value to an ordered region label.
# A dictionary lookup is used instead of an if/elif chain: with the chain, a value
# matching none of the branches silently reused the label of the previous cell.
region_labels = {
    'Duo-Jej': '1-Proximal_SI',
    'Ileum': '2-Distal_SI',
    'App-Col-Rec': '3-LI',
}

pan_organ_values = list(adata_concat.obs['pan_organ'])
unmapped = sorted({str(value) for value in pan_organ_values if value not in region_labels})
if unmapped:
    raise ValueError('Unexpected pan_organ value(s): ' + ', '.join(unmapped))

DEG_ann_list = [region_labels[value] for value in pan_organ_values]
adata_concat.obs['ann_for_DEG'] = DEG_ann_list

In [None]:
# Rank genes between the `ann_for_DEG` groups using the Wilcoxon method.
sc.tl.rank_genes_groups(adata_concat, 'ann_for_DEG', method='wilcoxon')

In [None]:
# Interactive docstring lookup left over from development; it only prints help text.
help(sc.pl.rank_genes_groups_heatmap)

In [None]:
sc.settings.set_figure_params(dpi=150, fontsize=7)

# Heatmap of the top 25 ranked genes per region (log fold change >= 1),
# genes on rows (swap_axes) and no dendrogram.
heatmap_options = dict(
    n_genes=25,
    groupby='ann_for_DEG',
    min_logfoldchange=1,
    show_gene_labels=True,
    dendrogram=False,
    cmap='RdYlBu_r',
    figsize=(3, 7.5),
    swap_axes=True,
)
sc.pl.rank_genes_groups_heatmap(adata_concat, **heatmap_options)

### 获取每个区域显著差异上调的基因数

**Proximal SI**

In [None]:
result_DEG = sc.get.rank_genes_groups_df(adata_concat,group="1-Proximal_SI")
result_DEG

In [None]:
# Keep significantly up-regulated genes: log fold change >= 1 and adjusted
# p-value <= 0.01. Columns are addressed by name instead of by position, and the
# filter is vectorised instead of a per-row `iloc` loop.
significant_up = (result_DEG['logfoldchanges'] >= 1) & (result_DEG['pvals_adj'] <= 1e-2)
gene_list = result_DEG.loc[significant_up, 'names'].tolist()
len(gene_list)

In [None]:
result_DEG_1 = result_DEG.set_index('names').loc[gene_list,:]
result_DEG_1

In [None]:
result_DEG_1.to_csv('/mnt/data/Project2021/Gut_immune_surveillance/outputs/duodenum_up_enterocytes.csv')

**Distal SI**

In [None]:
result_DEG = sc.get.rank_genes_groups_df(adata_concat,group="2-Distal_SI")

# Keep significantly up-regulated genes: log fold change >= 1 and adjusted
# p-value <= 0.01. Columns are addressed by name instead of by position, and the
# filter is vectorised instead of a per-row `iloc` loop.
significant_up = (result_DEG['logfoldchanges'] >= 1) & (result_DEG['pvals_adj'] <= 1e-2)
gene_list = result_DEG.loc[significant_up, 'names'].tolist()
print(len(gene_list))

result_DEG_1 = result_DEG.set_index('names').loc[gene_list,:]
result_DEG_1.to_csv('/mnt/data/Project2021/Gut_immune_surveillance/outputs/Ileum_up_enterocytes.csv')

**LI (App-Col-Rec)**

In [None]:
result_DEG = sc.get.rank_genes_groups_df(adata_concat,group="3-LI")

# Keep significantly up-regulated genes: log fold change >= 1 and adjusted
# p-value <= 0.01. Columns are addressed by name instead of by position, and the
# filter is vectorised instead of a per-row `iloc` loop.
significant_up = (result_DEG['logfoldchanges'] >= 1) & (result_DEG['pvals_adj'] <= 1e-2)
gene_list = result_DEG.loc[significant_up, 'names'].tolist()
print(len(gene_list))

result_DEG_1 = result_DEG.set_index('names').loc[gene_list,:]
result_DEG_1.to_csv('/mnt/data/Project2021/Gut_immune_surveillance/outputs/LI_up_enterocytes.csv')

## DEG-Regional Celltype

（1）分区域，每个区域随机抽取20000细胞（不足则为所有）;

（2）对任一区域，建立在adata.obs中建立DEG_ann对象，将APOA1+Enterocytes、CA1+Enterocytes、Immature enterocytes合并为Enterocytes；其余细胞谱系合并为Non-enterocytes;

（3）对DEG_ann进行差异表达分析，获取该轮循环的区域的Enterocytes相比于其他所有上皮谱系显著上调的基因列表（for Venn）、获取每个基因的差异倍数（for Figure3Aii）；

（4）对该基因列表与所在区域成熟的吸收系肠细胞与其他区域的成熟吸收系肠细胞的差异上调基因进行Venn（Figure3Aiii）；

In [None]:
# Load the annotated epithelium object and rebuild it from the `.raw` matrix,
# keeping the original per-cell annotations.
adata = sc.read('/mnt/data/Project2021/Gut_immune_surveillance/data/h5ad_Figure/Human/Epithelium/Postnatal/human_postnatal_epi_ann.h5ad')
adata = sc.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs)

# Keep cells whose `health` is 'Normal' (all epithelial cell types are retained here).
obs_index = adata.obs_names[adata.obs["health"].isin(['Normal']).to_numpy()]
adata = adata[obs_index, :]
adata

**分区域，每个区域随机抽取20000细胞**

In [None]:
# Cross-tabulate `pan_organ` against itself to inspect how many cells each region has.
Groups_tab_1 = pd.crosstab(index=adata.obs['pan_organ'],  # rows: region label
                columns=adata.obs['pan_organ'], margins=True)  # columns: the same label; margins=True adds totals
Groups_tab_1

In [None]:
adata_list = []

# Split the cells by `pan_organ`; any region with more than 20000 cells is
# randomly down-sampled to 20000 (random_state=1). Smaller regions are kept whole.
for j in adata.obs['pan_organ'].cat.categories.tolist():

    print(j + ' is being processed ...')
    single_index = adata.obs_names[(adata.obs['pan_organ'] == j).to_numpy()]
    adata_single = adata[single_index, :]

    print('The number of '+ j + ' is '+ str(adata_single.n_obs))
    if adata_single.n_obs > 20000:
        sc.pp.subsample(adata_single, n_obs=20000, random_state=1, copy=False)

    adata_list.append(adata_single)

In [None]:
def change_obs_index_v2(x):
    """
    Return `x` without its last four characters.

    Used to restore barcode names after concatenating samples or datasets,
    where a batch suffix was appended to each barcode.
    """
    return x[:-4]


# AnnData's concatenate appends a batch id to every barcode name. The ids are
# zero-padded to three digits here so the suffix has a fixed length and can be
# stripped afterwards. This only holds while there are fewer than 1000 datasets.
batch_list = [str(position + 1).zfill(3) for position in range(len(adata_list))]

adata_concat = adata_list[0].concatenate(
    adata_list[1:],
    join='outer',
    batch_categories=batch_list,
)
# Remove the batch suffix from the barcode names.
adata_concat.obs.rename(index=change_obs_index_v2, inplace=True)
adata_concat

**细胞类型合并**

In [None]:
adata_concat.obs['ann_for_cpdb'] = adata_concat.obs['ann_for_cpdb'].astype('category')
adata_concat.obs['ann_for_cpdb'].cat.categories

In [None]:
# Collapse the cell-type annotation into two groups: the three enterocyte
# annotations become 'Enterocytes', everything else 'Non-enterocytes'.
enterocyte_types = ('APOA1+Enterocytes', 'CA1+Enterocytes', 'Immature enterocytes')
DEG_ann_list = [
    'Enterocytes' if cell_type in enterocyte_types else 'Non-enterocytes'
    for cell_type in adata_concat.obs['ann_for_cpdb']
]

adata_concat.obs['ann_for_DEG'] = DEG_ann_list
adata_concat

In [None]:
adata_concat.obs['pan_organ'] = adata_concat.obs['pan_organ'].astype('category')
adata_concat.obs['pan_organ'].cat.categories

**分区域用DEG_ann进行差异表达分析**

In [None]:
# For every region, rank genes between 'Enterocytes' and 'Non-enterocytes' and
# export the 'Enterocytes' result table as DEG_<region>.csv.
for i in list(adata_concat.obs['pan_organ'].cat.categories):
    print(i + ' is being processed ...')
    single_index = adata_concat.obs.loc[adata_concat.obs['pan_organ'].isin([i]), :].index
    # Take an explicit copy: rank_genes_groups stores its results on the object,
    # and a plain subset is only a view of `adata_concat`.
    adata_organ = adata_concat[single_index,:].copy()

    sc.tl.rank_genes_groups(adata_organ,
                        groupby = 'ann_for_DEG',
                        method='wilcoxon')

    result_DEG = sc.get.rank_genes_groups_df(adata_organ,group="Enterocytes")
    result_DEG.to_csv('/mnt/data/Project2021/Gut_immune_surveillance/outputs/'+ 'DEG_' + i + '.csv')

## KEGG

In [None]:
# Load the annotated epithelium object and rebuild it from the `.raw` matrix,
# keeping the original per-cell annotations.
adata = sc.read('/mnt/data/Project2021/Gut_immune_surveillance/data/h5ad_Figure/Human/Epithelium/Postnatal/human_postnatal_epi_ann.h5ad')
adata = sc.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs)

# Keep cells whose `health` is 'Normal'.
obs_index = adata.obs_names[adata.obs["health"].isin(['Normal']).to_numpy()]
adata = adata[obs_index, :]

# Of those, keep the two enterocyte annotations.
obs_index = adata.obs_names[
    adata.obs["ann_for_cpdb"].isin(['APOA1+Enterocytes', 'CA1+Enterocytes']).to_numpy()
]
adata = adata[obs_index, :]

In [None]:
adata_list = []

In [None]:
    # Split the cells by `organ`; any organ with more than 2000 cells is
    # randomly down-sampled to 2000 (random_state=1). Smaller organs are kept whole.
    for j in adata.obs['organ'].cat.categories.tolist():

        print(j + ' is being processed ...')
        single_index = adata.obs_names[(adata.obs['organ'] == j).to_numpy()]
        adata_single = adata[single_index, :]

        print('The number of '+ j + ' is '+ str(adata_single.n_obs))
        if adata_single.n_obs > 2000:
            sc.pp.subsample(adata_single, n_obs=2000, random_state=1, copy=False)

        adata_list.append(adata_single)

In [None]:
def change_obs_index_v2(x):
    """
    Return `x` without its last four characters.

    Used to restore barcode names after concatenating samples or datasets,
    where a batch suffix was appended to each barcode.
    """
    return x[:-4]


# AnnData's concatenate appends a batch id to every barcode name. The ids are
# zero-padded to three digits here so the suffix has a fixed length and can be
# stripped afterwards. This only holds while there are fewer than 1000 datasets.
batch_list = [str(position + 1).zfill(3) for position in range(len(adata_list))]

adata_concat = adata_list[0].concatenate(
    adata_list[1:],
    join='outer',
    batch_categories=batch_list,
)
# Remove the batch suffix from the barcode names.
adata_concat.obs.rename(index=change_obs_index_v2, inplace=True)
adata_concat

In [None]:
fn = '/mnt/data/project/qilu_singlecell_1/ref_geneset/Total_kegg1.gmt'
with open(fn) as f:
    sets_raw = f.readlines()

# Each line is tab-separated: field 0 is the pathway name, field 1 is skipped,
# and the remaining fields are gene ids. Blank lines are ignored so they cannot
# produce a pathway with an empty name and no genes.
sets_proc = [line.rstrip('\n').split('\t') for line in sets_raw if line.strip()]

path_name_list = [fields[0] for fields in sets_proc]
gene_ids_list = [fields[2:] for fields in sets_proc]

In [None]:
dataframe_GSVA = pd.DataFrame(index=adata_concat.obs.index, columns=path_name_list)
dataframe_GSVA

In [None]:
# 330 pathway,time cost 99.978 s
import time

time_start = time.time()

# Score every pathway per cell with sc.tl.score_genes, using only genes present
# in the data, and collect each score column into dataframe_GSVA.
for j, (score_name, genesets) in enumerate(zip(path_name_list, gene_ids_list)):
    print(len(genesets))
    genesets = [gene for gene in genesets if gene in adata_concat.var_names]
    print(len(genesets))
    print(j)

    sc.tl.score_genes(
        adata_concat,
        genesets,
        ctrl_size=len(genesets),
        gene_pool=None,
        n_bins=25,
        score_name=score_name,
        random_state=0,
        copy=False,
        use_raw=None,
    )
    dataframe_GSVA[score_name] = adata_concat.obs[score_name]

time_end = time.time()    # stop timing

time_c = time_end - time_start   # elapsed run time
print('time cost', time_c, 's')

In [None]:
dataframe_GSVA

In [None]:
import seaborn as sns

# Density of one pathway score across cells. `fill=True` shades the area under
# the curve; it replaces the deprecated `shade` keyword.
sns.kdeplot(dataframe_GSVA['Glycolysis / Gluconeogenesis'], fill = True)

In [None]:
# Shift every score by +1. Vectorised addition replaces the element-wise
# `applymap`, which is deprecated in recent pandas.
dataframe_GSVA = dataframe_GSVA + 1

In [None]:
# Wrap the score table in an AnnData object: rows (obs) are barcodes,
# columns (var) are pathways.
columns = dataframe_GSVA.columns.to_frame(index=True, name='pathway')
index = dataframe_GSVA.index.to_frame(index=True, name='barcodes')

anndata_GSVA = sc.AnnData(obs=index, var=columns, X=dataframe_GSVA.values)
anndata_GSVA

In [None]:
# Merge the two enterocyte annotations into 'Enterocytes'; other labels are kept as they are.
merged_types = ('APOA1+Enterocytes', 'CA1+Enterocytes')
paga_ann_list = [
    'Enterocytes' if cell_type in merged_types else cell_type
    for cell_type in adata_concat.obs['ann_for_cpdb']
]

adata_concat.obs['ann_for_paga'] = paga_ann_list

In [None]:
# Store the current scores as `.raw`, then copy the cell annotations over
# from the expression object.
anndata_GSVA.raw = anndata_GSVA
for obs_key in ('ann_for_paga', 'ann_for_cpdb', 'organ'):
    anndata_GSVA.obs[obs_key] = adata_concat.obs[obs_key]
anndata_GSVA.obs

In [None]:
sc.pp.scale(anndata_GSVA, max_value=10)

In [None]:
# Pathways shown in the marker plots, grouped by theme.
# 'Primary bile acid biosynthesis' was listed twice in the lipid group; the
# duplicate is removed so the pathway is plotted only once.
markers = [
    # Carbon metabolism
    'Oxidative phosphorylation',
    'Glycolysis / Gluconeogenesis',
    'Citrate cycle (TCA cycle)',
    'Pentose phosphate pathway',
    'Pyruvate metabolism',
    'Butanoate metabolism',
    'Nitrogen metabolism',
    'Starch and sucrose metabolism',
    'Amino sugar and nucleotide sugar metabolism',

    # Lipid metabolism
    'Fatty acid degradation',
    'Synthesis and degradation of ketone bodies',
    'Primary bile acid biosynthesis',
    'alpha-Linolenic acid metabolism',

    # Amino acid metabolism
    'Arginine biosynthesis',
    'Alanine, aspartate and glutamate metabolism',
    'Valine, leucine and isoleucine degradation',
    'Histidine metabolism',
    'Tryptophan metabolism',

    # Signal transduction
    'PI3K-Akt signaling pathway',
    'Wnt signaling pathway',
    'Notch signaling pathway',
    'TGF-beta signaling pathway',
    'TNF signaling pathway',
    'cAMP signaling pathway',
    'mTOR signaling pathway',
    'Toll-like receptor signaling pathway',
    'IL-17 signaling pathway',
    'B cell receptor signaling pathway',
    'T cell receptor signaling pathway',

    # Digestion and absorption
    'Bile secretion',
    'Carbohydrate digestion and absorption',
    'Protein digestion and absorption',
    'Fat digestion and absorption',
    'Cholesterol metabolism',
    'Vitamin digestion and absorption',
    'Mineral absorption',

    # Synapse-related
    'Glutamatergic synapse',
    'GABAergic synapse',
    'Cholinergic synapse',
    'Dopaminergic synapse',
    'Serotonergic synapse',
]

In [None]:
anndata_GSVA

In [None]:
# Map each cell's `organ` value to an ordered label.
# A dictionary lookup is used instead of an if/elif chain: with the chain, a value
# matching none of the branches silently reused the label of the previous cell.
organ_labels = {
    'Duodenum': '1-Duodenum',
    'Jejunum': '2-Jejunum',
    'Ileum': '3-Ileum',
    'Appendix': '4-Appendix',
    'Colon': '5-Colon',
    'Rectum': '6-Rectum',
}

organ_values = list(anndata_GSVA.obs['organ'])
unmapped = sorted({str(value) for value in organ_values if value not in organ_labels})
if unmapped:
    raise ValueError('Unexpected organ value(s): ' + ', '.join(unmapped))

gsva_ann_list = [organ_labels[value] for value in organ_values]
anndata_GSVA.obs['ann_for_gsva'] = gsva_ann_list

### pathway整体差异分析及可视化

In [None]:
# 所有器官，bulk级别
def adata_diff(adata_test):
    """
    Rank pathways between the `ann_for_gsva` groups and draw the overview plots.

    Runs a Wilcoxon `rank_genes_groups` on `adata_test` (results are stored on the
    object), writes the ranked-names table to './outputs/' + `dirfile`, then shows
    a ranked heatmap, three ranked matrix plots, and a matrix plot plus two
    heatmaps of the `markers` pathways.

    Depends on the notebook-level names `dirfile` and `markers`.
    Returns the same `adata_test` object.
    """
    sc.settings.set_figure_params(dpi=100, figsize=(4, 3), fontsize=6)
    sc.tl.rank_genes_groups(adata_test, 'ann_for_gsva', method='wilcoxon')
    ranked_names = pd.DataFrame(adata_test.uns['rank_genes_groups']['names'])
    ranked_names.to_csv('./outputs/'+ dirfile)

    sc.settings.set_figure_params(dpi=150, figsize=(4, 3), fontsize=6)
    sc.pl.rank_genes_groups_heatmap(
        adata_test,
        n_genes=20,
        use_raw=False,
        swap_axes=True,
        vmin=-2,
        vmax=2,
        cmap='bwr',
        figsize=(10, 10),
        show=True,
        dendrogram=False,
        show_gene_labels=True,
    )

    # Ranked matrix plots: (number of top pathways, colour map applied via .style()).
    sc.settings.set_figure_params(dpi=300, figsize=(4, 3), fontsize=20)
    for top_n, palette in ((15, 'RdYlBu_r'), (6, 'RdYlBu_r'), (6, 'bwr')):
        ranked_plot = sc.pl.rank_genes_groups_matrixplot(
            adata_test,
            n_genes=top_n,
            use_raw=False,
            dendrogram=False,
            return_fig=True,
            vmin=-1.7,
            vmax=1.7,
            cmap='bwr',
        )
        ranked_plot.style(edge_color='white', cmap=palette, edge_lw=1.0).show()

    # Matrix plot of the hand-picked marker pathways.
    marker_plot = sc.pl.matrixplot(
        adata_test,
        markers,
        groupby='ann_for_gsva',
        dendrogram=False,
        return_fig=True,
        cmap='bwr',
    )
    marker_plot.style(edge_color='white', cmap='RdYlBu_r', edge_lw=1.0).show()

    # Marker heatmaps: first unscaled, then with per-variable standard scaling.
    sc.settings.set_figure_params(dpi=100, figsize=(3, 5), fontsize=10)
    for extra_options in ({}, {'standard_scale': 'var'}):
        sc.pl.heatmap(
            adata_test,
            markers,
            groupby='ann_for_gsva',
            dendrogram=False,
            swap_axes=True,
            cmap='bwr',
            **extra_options,
        )

    return adata_test

In [None]:
adata_test = anndata_GSVA
dirfile = 'Enterocytes_rank_gsva_kegg_20211209.csv'
adata_diff(adata_test)

### pathway分组差异分析及可视化 

In [None]:
anndata_GSVA

In [None]:
# 读取组别pathway(6组)
kegg_df = pd.read_csv('/mnt/data/project/qilu_singlecell_1/data/outputs/KEGG_list_for_figure2.csv')
kegg_df

In [None]:
# 同样的方法也适用于特定gene list
def enrichment_group_pathway(input_adata, kegg_df, n_show_pathway=10):
    """
    Rank and plot pathways separately for every pathway group in `kegg_df`.

    Parameters
    ----------
    input_adata : AnnData whose variables are pathways and whose `.obs` has an
        `ann_for_gsva` column.
    kegg_df : DataFrame in which each column lists the pathway names of one
        group; missing values are dropped.
    n_show_pathway : number of top-ranked pathways shown per `ann_for_gsva` group.

    For each column, the listed pathways are subset, ranked with a Wilcoxon test
    on the non-raw values, and shown as a ranked matrix plot. Returns None.
    """
    num_col = kegg_df.shape[1]  # shape[1] is the number of columns (pathway groups)

    for i in range(num_col):
        kegg_list = list(kegg_df.iloc[:, i].dropna())  # drop the NaN padding of shorter columns
        # Take an explicit copy: rank_genes_groups stores its results on the
        # object, and a plain subset is only a view of `input_adata`.
        anndata_GSVA_test = input_adata[:, kegg_list].copy()

        sc.tl.rank_genes_groups(anndata_GSVA_test, 'ann_for_gsva', method='wilcoxon', use_raw=False)

        sc.settings.set_figure_params(dpi=400, figsize=(4, 3), fontsize=20)
        # NOTE(review): the original also passed `standard_scale=True`. That is not
        # one of the documented values ('var' / 'group'), so it is dropped here.
        # Confirm whether scaling was intended.
        mp1 = sc.pl.rank_genes_groups_matrixplot(anndata_GSVA_test,
                                            n_genes=n_show_pathway,
                                            use_raw=False,
                                            dendrogram=False,
                                            return_fig=True,
                                            vmin=-1.7,
                                            vmax=1.7,
                                            cmap='bwr')
        mp1.style(edge_color='white', cmap='RdYlBu_r', edge_lw=1.0).show()

In [None]:
enrichment_group_pathway(input_adata=anndata_GSVA, kegg_df=kegg_df, n_show_pathway=10)

## Reactome

In [None]:
# Load the annotated epithelium object and rebuild it from the `.raw` matrix,
# keeping the original per-cell annotations.
adata = sc.read('/mnt/data/Project2021/Gut_immune_surveillance/data/h5ad_Figure/Human/Epithelium/Postnatal/human_postnatal_epi_ann.h5ad')
adata = sc.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs)

# Keep cells whose `health` is 'Normal'.
obs_index = adata.obs_names[adata.obs["health"].isin(['Normal']).to_numpy()]
adata = adata[obs_index, :]

# Of those, keep the two enterocyte annotations.
obs_index = adata.obs_names[
    adata.obs["ann_for_cpdb"].isin(['APOA1+Enterocytes', 'CA1+Enterocytes']).to_numpy()
]
adata = adata[obs_index, :]

In [None]:
adata_list = []

In [None]:
    # Split the cells by `pan_organ`; any region with more than 5000 cells is
    # randomly down-sampled to 5000 (random_state=1). Smaller regions are kept whole.
    for j in adata.obs['pan_organ'].cat.categories.tolist():

        print(j + ' is being processed ...')
        single_index = adata.obs_names[(adata.obs['pan_organ'] == j).to_numpy()]
        adata_single = adata[single_index, :]

        print('The number of '+ j + ' is '+ str(adata_single.n_obs))
        if adata_single.n_obs > 5000:
            sc.pp.subsample(adata_single, n_obs=5000, random_state=1, copy=False)

        adata_list.append(adata_single)

In [None]:
def change_obs_index_v2(x):
    """
    Return `x` without its last four characters.

    Used to restore barcode names after concatenating samples or datasets,
    where a batch suffix was appended to each barcode.
    """
    return x[:-4]


# AnnData's concatenate appends a batch id to every barcode name. The ids are
# zero-padded to three digits here so the suffix has a fixed length and can be
# stripped afterwards. This only holds while there are fewer than 1000 datasets.
batch_list = [str(position + 1).zfill(3) for position in range(len(adata_list))]

adata_concat = adata_list[0].concatenate(
    adata_list[1:],
    join='outer',
    batch_categories=batch_list,
)
# Remove the batch suffix from the barcode names.
adata_concat.obs.rename(index=change_obs_index_v2, inplace=True)
adata_concat

In [None]:
fn = '/mnt/data/Project2021/Gut_immune_surveillance/ref_geneset/c2.cp.reactome.v7.2.symbols.gmt'

with open(fn) as f:
    sets_raw = f.readlines()

# Each line is tab-separated: field 0 is the pathway name, field 1 is skipped,
# and the remaining fields are gene ids. Blank lines are ignored so they cannot
# produce a pathway with an empty name and no genes.
sets_proc = [line.rstrip('\n').split('\t') for line in sets_raw if line.strip()]

path_name_list = [fields[0] for fields in sets_proc]
gene_ids_list = [fields[2:] for fields in sets_proc]

In [None]:
dataframe_GSVA = pd.DataFrame(index=adata_concat.obs.index, columns=path_name_list)
dataframe_GSVA

In [None]:
# 1554 pathway,time cost 856.7754094600677 s
import time

time_start = time.time()

# Score every pathway per cell with sc.tl.score_genes, using only genes present
# in the data, and collect each score column into dataframe_GSVA.
for j, (score_name, genesets) in enumerate(zip(path_name_list, gene_ids_list)):
    print(len(genesets))
    genesets = [gene for gene in genesets if gene in adata_concat.var_names]
    print(len(genesets))
    print(j)

    sc.tl.score_genes(
        adata_concat,
        genesets,
        ctrl_size=len(genesets),
        gene_pool=None,
        n_bins=25,
        score_name=score_name,
        random_state=0,
        copy=False,
        use_raw=None,
    )
    dataframe_GSVA[score_name] = adata_concat.obs[score_name]

time_end = time.time()    # stop timing

time_c = time_end - time_start   # elapsed run time
print('time cost', time_c, 's')

In [None]:
dataframe_GSVA

In [None]:
# Shift every score by +1. Vectorised addition replaces the element-wise
# `applymap`, which is deprecated in recent pandas.
dataframe_GSVA = dataframe_GSVA + 1

In [None]:
import seaborn as sns
# Density of one pathway score across cells. `fill=True` shades the area under
# the curve; it replaces the deprecated `shade` keyword.
sns.kdeplot(dataframe_GSVA['REACTOME_REMOVAL_OF_THE_FLAP_INTERMEDIATE_FROM_THE_C_STRAND'], fill = True)

In [None]:
# Wrap the score table in an AnnData object: rows (obs) are barcodes,
# columns (var) are pathways.
columns = dataframe_GSVA.columns.to_frame(index=True, name='pathway')
index = dataframe_GSVA.index.to_frame(index=True, name='barcodes')

anndata_GSVA = sc.AnnData(obs=index, var=columns, X=dataframe_GSVA.values)
anndata_GSVA

In [None]:
# Merge the two enterocyte annotations into 'Enterocytes'; other labels are kept as they are.
merged_types = ('APOA1+Enterocytes', 'CA1+Enterocytes')
paga_ann_list = [
    'Enterocytes' if cell_type in merged_types else cell_type
    for cell_type in adata_concat.obs['ann_for_cpdb']
]

adata_concat.obs['ann_for_paga'] = paga_ann_list

# Store the current scores as `.raw`, then copy the cell annotations over
# from the expression object.
anndata_GSVA.raw = anndata_GSVA
for obs_key in ('ann_for_paga', 'ann_for_cpdb', 'organ'):
    anndata_GSVA.obs[obs_key] = adata_concat.obs[obs_key]
anndata_GSVA.obs

In [None]:
anndata_GSVA.obs['pan_organ']=adata_concat.obs['pan_organ']
anndata_GSVA.obs

In [None]:
adata.obs['pan_organ'].cat.categories

In [None]:
sc.pp.scale(anndata_GSVA, max_value=10)

In [None]:
# Map each cell's `pan_organ` value to an ordered region label.
# A dictionary lookup is used instead of an if/elif chain: with the chain, a value
# matching none of the branches silently reused the label of the previous cell.
region_labels = {
    'Duo-Jej': '1-Proximal_SI',
    'Ileum': '2-Distal_SI',
    'App-Col-Rec': '3-LI',
}

pan_organ_values = list(anndata_GSVA.obs['pan_organ'])
unmapped = sorted({str(value) for value in pan_organ_values if value not in region_labels})
if unmapped:
    raise ValueError('Unexpected pan_organ value(s): ' + ', '.join(unmapped))

gsva_ann_list = [region_labels[value] for value in pan_organ_values]
anndata_GSVA.obs['ann_for_gsva'] = gsva_ann_list

### pathway整体差异分析及可视化

In [None]:
# 所有器官，bulk级别
def adata_diff(adata_test):
    """
    Rank pathways between the `ann_for_gsva` groups and draw the overview plots.

    Runs a Wilcoxon `rank_genes_groups` on `adata_test` (results are stored on the
    object), writes the ranked-names table to './outputs/' + `dirfile`, then shows
    a ranked heatmap and three ranked matrix plots.

    Depends on the notebook-level name `dirfile`.
    Returns the same `adata_test` object.
    """
    sc.settings.set_figure_params(dpi=100, figsize=(4, 3), fontsize=6)
    sc.tl.rank_genes_groups(adata_test, 'ann_for_gsva', method='wilcoxon')
    ranked_names = pd.DataFrame(adata_test.uns['rank_genes_groups']['names'])
    ranked_names.to_csv('./outputs/'+ dirfile)

    sc.settings.set_figure_params(dpi=150, figsize=(4, 3), fontsize=6)
    sc.pl.rank_genes_groups_heatmap(
        adata_test,
        n_genes=20,
        use_raw=False,
        swap_axes=True,
        vmin=-2,
        vmax=2,
        cmap='bwr',
        figsize=(10, 10),
        show=True,
        dendrogram=False,
        show_gene_labels=True,
    )

    # Ranked matrix plots: (number of top pathways, colour map applied via .style()).
    sc.settings.set_figure_params(dpi=300, figsize=(4, 3), fontsize=20)
    for top_n, palette in ((30, 'bwr'), (6, 'RdYlBu_r'), (6, 'bwr')):
        ranked_plot = sc.pl.rank_genes_groups_matrixplot(
            adata_test,
            n_genes=top_n,
            use_raw=False,
            dendrogram=False,
            return_fig=True,
            vmin=-1.7,
            vmax=1.7,
            cmap='bwr',
        )
        ranked_plot.style(edge_color='white', cmap=palette, edge_lw=1.0).show()

    return adata_test

In [None]:
adata_test = anndata_GSVA
dirfile = 'Enterocytes_rank_gsva_reactome_20211210.csv'
adata_diff(adata_test)

#### Proximal SI

In [None]:
result_Proximal_SI = pd.DataFrame()

pd_Proximal_SI = sc.get.rank_genes_groups_df(adata_test, group="1-Proximal_SI")
pd_Proximal_SI

In [None]:
# 设置横纵轴，横轴为logfoldchanges，纵轴为pvals

# Table for plotting: log fold change and z-score for the x axis,
# -log10 of the adjusted p-value for the y axis.
smooth = 1e-320  # tiny offset, presumably so a zero adjusted p-value does not give log10(0)

result_Proximal_SI = pd.DataFrame({
    'names': pd_Proximal_SI['names'],
    'x:logfoldchanges': pd_Proximal_SI['logfoldchanges'],
    'x:z-score': pd_Proximal_SI['scores'],
    'y:-Log10(pvals_adj)': -np.log10(pd_Proximal_SI['pvals_adj'] + smooth),
})

result_Proximal_SI

In [None]:
result_Proximal_SI.to_csv('/mnt/data/Project2021/Gut_immune_surveillance/outputs/Proximal_SI_enterocytes.csv')

#### Distal SI

In [None]:
pd_Distal_SI = sc.get.rank_genes_groups_df(adata_test, group="2-Distal_SI")

# Table for plotting: log fold change and z-score for the x axis,
# -log10 of the adjusted p-value for the y axis.
smooth = 1e-320  # tiny offset, presumably so a zero adjusted p-value does not give log10(0)
result_Distal_SI = pd.DataFrame({
    'names': pd_Distal_SI['names'],
    'x:logfoldchanges': pd_Distal_SI['logfoldchanges'],
    'x:z-score': pd_Distal_SI['scores'],
    'y:-Log10(pvals_adj)': -np.log10(pd_Distal_SI['pvals_adj'] + smooth),
})
result_Distal_SI

In [None]:
result_Distal_SI.to_csv('/mnt/data/Project2021/Gut_immune_surveillance/outputs/result_Distal_SI_enterocytes.csv')

#### LI

In [None]:
pd_LI = sc.get.rank_genes_groups_df(adata_test, group="3-LI")

# Table for plotting: log fold change and z-score for the x axis,
# -log10 of the adjusted p-value for the y axis.
smooth = 1e-320  # tiny offset, presumably so a zero adjusted p-value does not give log10(0)
result_LI = pd.DataFrame({
    'names': pd_LI['names'],
    'x:logfoldchanges': pd_LI['logfoldchanges'],
    'x:z-score': pd_LI['scores'],
    'y:-Log10(pvals_adj)': -np.log10(pd_LI['pvals_adj'] + smooth),
})
result_LI

In [None]:
result_LI.to_csv('/mnt/data/Project2021/Gut_immune_surveillance/outputs/result_LI_enterocytes.csv')

# 绘图

## Plot_Radar

**1、导入数据**

In [None]:
# 轴长度
zscore_df = pd.read_csv('/mnt/data/Project2021/Gut_immune_surveillance/outputs/zscore.csv')
zscore_df.head()

In [None]:
zscore_df.set_index('names').values.T

In [None]:
# 点颜色
pvalue_df = pd.read_csv('/mnt/data/Project2021/Gut_immune_surveillance/outputs/pvalue.csv')
pvalue_df.head()

**2、设置函数**

In [None]:
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.patches import Circle, RegularPolygon
from matplotlib.path import Path
from matplotlib.projections.polar import PolarAxes
from matplotlib.projections import register_projection
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D


def radar_factory(num_vars, frame='circle'):
    """
    Create a radar chart with `num_vars` axes.

    This function creates a RadarAxes projection and registers it under the
    projection name ``'radar'``. Calling it again replaces the previously
    registered ``'radar'`` projection with one built for the new arguments.

    Parameters
    ----------
    num_vars : int
        Number of variables for radar chart.
    frame : {'circle', 'polygon'}
        Shape of frame surrounding axes.

    Returns
    -------
    theta : numpy.ndarray
        `num_vars` evenly spaced spoke angles in radians, starting at 0 and
        excluding 2*pi.

    Notes
    -----
    An unknown `frame` value is not rejected here; the ``ValueError`` is
    raised when an Axes using the ``'radar'`` projection is created.
    """
    # calculate evenly-spaced axis angles
    theta = np.linspace(0, 2*np.pi, num_vars, endpoint=False)

    class RadarAxes(PolarAxes):

        name = 'radar'
        # use 1 line segment to connect specified points
        RESOLUTION = 1

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # rotate plot such that the first axis is at the top
            self.set_theta_zero_location('N')

        def fill(self, *args, closed=True, **kwargs):
            """Override fill so that line is closed by default"""
            return super().fill(*args, closed=closed, **kwargs)

        def plot(self, *args, **kwargs):
            """Override plot so that line is closed by default"""
            lines = super().plot(*args, **kwargs)
            for line in lines:
                self._close_line(line)
            # Hand the Line2D objects back, as Axes.plot does, so callers can
            # use them (e.g. as explicit legend handles).
            return lines

        def _close_line(self, line):
            """Append the first vertex to the end of `line` if it is not closed yet."""
            x, y = line.get_data()
            # Nothing to close on an empty line; x[0] would raise IndexError.
            if len(x) == 0:
                return
            # FIXME: markers at x[0], y[0] get doubled-up
            if x[0] != x[-1]:
                x = np.append(x, x[0])
                y = np.append(y, y[0])
                line.set_data(x, y)

        def set_varlabels(self, labels):
            """Label the spokes; `labels` needs one entry per spoke angle in `theta`."""
            self.set_thetagrids(np.degrees(theta), labels)

        def _gen_axes_patch(self):
            # The Axes patch must be centered at (0.5, 0.5) and of radius 0.5
            # in axes coordinates.
            if frame == 'circle':
                return Circle((0.5, 0.5), 0.5)
            elif frame == 'polygon':
                return RegularPolygon((0.5, 0.5), num_vars,
                                      radius=.5, edgecolor="k")
            else:
                raise ValueError("Unknown value for 'frame': %s" % frame)

        def _gen_axes_spines(self):
            if frame == 'circle':
                return super()._gen_axes_spines()
            elif frame == 'polygon':
                # spine_type must be 'left'/'right'/'top'/'bottom'/'circle'.
                spine = Spine(axes=self,
                              spine_type='circle',
                              path=Path.unit_regular_polygon(num_vars))
                # unit_regular_polygon gives a polygon of radius 1 centered at
                # (0, 0) but we want a polygon of radius 0.5 centered at (0.5,
                # 0.5) in axes coordinates.
                spine.set_transform(Affine2D().scale(.5).translate(.5, .5)
                                    + self.transAxes)
                return {'polar': spine}
            else:
                raise ValueError("Unknown value for 'frame': %s" % frame)

    register_projection(RadarAxes)
    return theta

**3、设置绘图程序**

In [None]:
if __name__ == '__main__':

    #############################  Parameters and input data  ###############################
    # Pathway names, one per radar spoke
    spoke_labels = list(pvalue_df['names'])

    # Number of pathways (spokes); taken from the data so the chart keeps working
    # when the CSV tables change size.
    N = len(spoke_labels)
    theta = radar_factory(N, frame='polygon')

    # One colour per plotted series
    colors = ['b', 'r', 'g']

    ############################# Plotting ##################################

    fig, ax = plt.subplots(figsize=(25, 25), subplot_kw=dict(projection='radar'))

    # After the transpose each row is one series and each column one pathway.
    # NOTE(review): assumes zscore_df and pvalue_df list the same 'names' in the
    # same order, because points and colours are matched by position - confirm.
    # title = 'Enterocytes'
    case_data = zscore_df.set_index('names').values.T

    pvalue = pvalue_df.set_index('names').values.T

    # Use one colour scale for all series. Without explicit limits every scatter()
    # call autoscales to its own values, so the same colour would stand for a
    # different value in each series.
    n_series = min(len(case_data), len(colors))
    shown_pvalue = np.asarray(pvalue[:n_series], dtype=float)
    color_min, color_max = np.nanmin(shown_pvalue), np.nanmax(shown_pvalue)

    ax.set_rgrids([-100, -50, 0, 50, 100])

    # ax.set_title(title, weight='bold', size='medium', position=(0.5, 1.1),horizontalalignment='center', verticalalignment='center')
    # Read the data row by row (i.e. one series at a time)

    series_lines = []
    for index, (d, color) in enumerate(zip(case_data, colors)):
        ax.plot(theta, d, color=color)
        # Keep the outline just drawn as an explicit legend handle.
        series_lines.append(ax.lines[-1])
        ax.fill(theta, d, facecolor=color, alpha=0.25)

        # Markers coloured by this series' row of the p-value table
        ax.scatter(theta, d, c=shown_pvalue[index], s=50, cmap='viridis',
                   vmin=color_min, vmax=color_max)

    ax.set_varlabels(spoke_labels)

    # Pass the handles explicitly: with labels only, Matplotlib pairs them with
    # artists by order, which can attach a label to a fill or scatter artist.
    labels = ('Proximal SI', 'Distal SI', 'Colorectum')
    legend = ax.legend(series_lines, labels, loc=(0.9, .95), labelspacing=0.1, fontsize='small')

    plt.show()

In [None]:
if __name__ == '__main__':

    #############################  Parameters and input data  ###############################
    # Pathway names, one per radar spoke
    spoke_labels = list(pvalue_df['names'])

    # Number of pathways (spokes); taken from the data so the chart keeps working
    # when the CSV tables change size.
    N = len(spoke_labels)
    theta = radar_factory(N, frame='polygon')

    # One colour per plotted series
    colors = ['b', 'r', 'g']

    ############################# Plotting ##################################

    fig, ax = plt.subplots(figsize=(15, 15), subplot_kw=dict(projection='radar'))

    # After the transpose each row is one series and each column one pathway.
    # NOTE(review): assumes zscore_df and pvalue_df list the same 'names' in the
    # same order, because points and colours are matched by position - confirm.
    # title = 'Enterocytes'
    case_data = zscore_df.set_index('names').values.T

    pvalue = pvalue_df.set_index('names').values.T

    # Use one colour scale for all series. Without explicit limits every scatter()
    # call autoscales to its own values, so the same colour would stand for a
    # different value in each series.
    n_series = min(len(case_data), len(colors))
    shown_pvalue = np.asarray(pvalue[:n_series], dtype=float)
    color_min, color_max = np.nanmin(shown_pvalue), np.nanmax(shown_pvalue)

    ax.set_rgrids([-100, -50, 0, 50, 100], fontsize=20)

    # ax.set_title(title, weight='bold', size='medium', position=(0.5, 1.1),horizontalalignment='center', verticalalignment='center')
    # Read the data row by row (i.e. one series at a time)

    # First pass: outlines and translucent fills.
    series_lines = []
    for d, color in zip(case_data, colors):
        ax.plot(theta, d, color=color)
        # Keep the outline just drawn as an explicit legend handle.
        series_lines.append(ax.lines[-1])
        ax.fill(theta, d, facecolor=color, alpha=0.25)

    # Second pass: markers, added after all fills.
    for index, (d, color) in enumerate(zip(case_data, colors)):
        ax.scatter(theta, d, c=shown_pvalue[index], s=80, cmap='viridis',
                   vmin=color_min, vmax=color_max)

    # Blank spoke labels, one per spoke (replaces a hand-typed list of empty strings).
    ax.set_varlabels([''] * N)

    # Pass the handles explicitly: with labels only, Matplotlib pairs them with
    # artists by order, which can attach a label to a fill or scatter artist.
    labels = ('Proximal SI', 'Distal SI', 'Colorectum')
    legend = ax.legend(series_lines, labels, loc=(0.9, .95), labelspacing=0.1, fontsize='small')

    plt.show()

In [None]:
if __name__ == '__main__':

    #############################  Parameters and input data  ###############################
    # Pathway names, one per radar spoke
    spoke_labels = list(pvalue_df['names'])

    # Number of pathways (spokes); taken from the data so the chart keeps working
    # when the CSV tables change size.
    N = len(spoke_labels)
    theta = radar_factory(N, frame='polygon')

    # One colour per plotted series
    colors = ['b', 'r', 'g']

    ############################# Plotting ##################################

    fig, ax = plt.subplots(figsize=(15, 15), subplot_kw=dict(projection='radar'))

    # After the transpose each row is one series and each column one pathway.
    # NOTE(review): assumes zscore_df and pvalue_df list the same 'names' in the
    # same order, because points and colours are matched by position - confirm.
    # title = 'Enterocytes'
    case_data = zscore_df.set_index('names').values.T

    pvalue = pvalue_df.set_index('names').values.T

    # Use one colour scale for all series. Without explicit limits every scatter()
    # call autoscales to its own values, so the same colour would stand for a
    # different value in each series.
    n_series = min(len(case_data), len(colors))
    shown_pvalue = np.asarray(pvalue[:n_series], dtype=float)
    color_min, color_max = np.nanmin(shown_pvalue), np.nanmax(shown_pvalue)

    # Radial grid lines stay, but the zero font size keeps their tick labels from showing.
    ax.set_rgrids([-100, -50, 0, 50, 100], fontsize= 0)

    # ax.set_title(title, weight='bold', size='medium', position=(0.5, 1.1),horizontalalignment='center', verticalalignment='center')
    # Read the data row by row (i.e. one series at a time)

    # First pass: outlines and translucent fills.
    for d, color in zip(case_data, colors):
        ax.plot(theta, d, color=color)
        ax.fill(theta, d, facecolor=color, alpha=0.25)

    # Second pass: black-edged markers, added after all fills.
    for index, (d, color) in enumerate(zip(case_data, colors)):
        ax.scatter(theta, d, c=shown_pvalue[index], s=200, cmap='viridis',
                   edgecolors='#000000', vmin=color_min, vmax=color_max)

    # Blank spoke labels, one per spoke (replaces a hand-typed list of empty strings).
    ax.set_varlabels([''] * N)

    # This variant carries no legend. The original built a labelled legend and
    # immediately replaced it with an empty, frameless one; only the empty
    # legend is created now.
    labels = ('Proximal SI', 'Distal SI', 'Colorectum')
    legend = ax.legend([], [], frameon=False)

    plt.show()

## Barplot-Figure3Aii

In [None]:

import matplotlib.pyplot as plt
import numpy as np

# Scratch demo: mirrored bar chart drawn from random placeholder data.
n = 12
X = np.arange(n)
# Linearly decreasing envelope, multiplied by a random factor in [0.5, 1.0)
envelope = 1 - X / float(n)
Y1 = envelope * np.random.uniform(0.5, 1.0, n)
Y2 = envelope * np.random.uniform(0.5, 1.0, n)

demo_ax = plt.gca()
demo_ax.bar(X, Y1)    # bars above zero
demo_ax.bar(X, -Y2)   # mirrored bars below zero

demo_ax.set_xlim(-.5, n)
demo_ax.set_ylim(-1.25, 1.25)

# Hide the axis tick numbers
#plt.xticks(())
#plt.yticks(())
# Move the x-axis spine to y = 0
# ax=plt.gca()
# ax.spines['bottom'].set_position(('data',0))

plt.show()


In [None]:
# Scratch demo: a single bar series with mixed-sign, hand-written values.
n = 10
X = np.arange(n)
Y = [-1,1,2,3,-2,5,6,-1,-5,6]

demo_ax = plt.gca()
demo_ax.bar(X, Y)

demo_ax.set_xlim(-.5, n)
demo_ax.set_ylim(-6, 6)

demo_ax.grid(False)
plt.show()

In [None]:
# Gene table for Figure 3Aii; the cells below read its 'names' and 'logfoldchanges' columns.
figure3aii_csv = '/mnt/data/Project2021/Gut_immune_surveillance/outputs/Figure3Aii.csv'
df_gene_list = pd.read_csv(figure3aii_csv)
df_gene_list

In [None]:
# Bar positions (gene names) and the raw fold changes.
X = list(df_gene_list['names'])
Y = list(df_gene_list['logfoldchanges'])

# Split the fold changes into two series of the same length as X, so that
# positive and negative bars can be drawn in different colours:
#   Y1 keeps values >= 0 and holds 0 everywhere else,
#   Y2 keeps values <= 0 and holds 0 everywhere else.
Y1 = [value if value >= 0 else 0 for value in list(df_gene_list['logfoldchanges'])]
Y2 = [value if value <= 0 else 0 for value in list(df_gene_list['logfoldchanges'])]

In [None]:
fig, ax = plt.subplots(figsize=(25, 12))

# Non-negative fold changes in red, non-positive ones in blue.
ax.bar(X, Y1, color=['#a00627'])
ax.bar(X, Y2, color=['#404a9f'])

ax.set_xlim(-1, len(X))
ax.set_ylim(-5, 5)

ax.grid(False)
# Hide the x-axis ticks and their labels
ax.set_xticks(())
# plt.yticks(())
# Move the x-axis spine to y = 0
# ax=plt.gca()
# ax.spines['bottom'].set_position(('data',0))

plt.show()

## Venn

**Proximal SI**

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

# Two-set Venn diagram from hard-coded region sizes, passed to venn2 in this order.
proximal_si_subsets = (1115, 171, 242)
v = venn2(subsets=proximal_si_subsets, set_labels=('', ''))
# Thin grey outlines drawn over the filled regions
c = venn2_circles(subsets=proximal_si_subsets, linewidth=1, color="grey")
# c[0].set_lw(8.0)
# c[0].set_ls('dotted')

# c[1].set_color('#ad1c1c')
# c[0].set_color('#404a9f')

# c=venn2_circles(subsets = (1115, 171, 242), linewidth=1, color="grey")
plt.show()

**Ileum**

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

# Two-set Venn diagram from hard-coded region sizes, passed to venn2 in this order.
ileum_subsets = (224, 310, 196)
v = venn2(subsets=ileum_subsets, set_labels=('', ''))
# Thin grey outlines drawn over the filled regions
c = venn2_circles(subsets=ileum_subsets, linewidth=1, color="grey")

plt.show()

**Colorectum**

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

# Two-set Venn diagram from hard-coded region sizes, passed to venn2 in this order.
colorectum_subsets = (469, 35, 23)
v = venn2(subsets=colorectum_subsets, set_labels=('', ''))
# Thin grey outlines drawn over the filled regions
c = venn2_circles(subsets=colorectum_subsets, linewidth=1, color="grey")

plt.show()

## Bar plot

In [None]:
# Three hard-coded bar heights; the x tick labels are deliberately blank.
height = [0.82, 0.53, 0.95]
bars = (' ', ' ', ' ')
x_pos = np.arange(len(bars))

fig, ax = plt.subplots(figsize=(20, 3))
# One colour per bar
ax.bar(x_pos, height, color=['blue', 'red', 'green'])

# Tick positions and labels (blank names on x, fixed 0-1 scale on y)
plt.xticks(x_pos, bars, fontsize = 30)
plt.yticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0 ], fontsize = 30)

ax.grid(False)

# Print the current axis limits for reference
print(ax.axis())
# ax.axis([-0.54, 2.5400000000000005, 0.0, 1])

# Drop the top and right spines
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

plt.show()