In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
from matplotlib import rcParams
import scanpy as sc
import os
import anndata
from sklearn.mixture import GaussianMixture
from fcsy import DataFrame
import matplotlib
from glob import glob
matplotlib.rcParams['pdf.fonttype']=42
matplotlib.rcParams['ps.fonttype']=42
import warnings
warnings.filterwarnings("ignore")
from igraph import InternalError

# scanpy settings
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=150, frameon=False, figsize=(4, 4)) 
sc._settings.ScanpyConfig.n_jobs=4 # useless

In [2]:
data_path = '/Users/tan/sex-change/data/220915_CyTOF_FtM_1234batches_ComBat_corrected_FlowSOM_3rdAnnotation.csv'
raw = pd.read_csv(data_path)

In [3]:
raw.loc[raw['flowSOM_level2']=='pDC', 'flowSOM_level1']='DC'

In [4]:
raw = raw[~raw['flowSOM_level1'].isna()]
raw = raw[raw['flowSOM_level1']!='unknown']

data = raw.iloc[:,0:48]
label = raw.iloc[:, 48:54]

In [5]:
# filtering and preprocess
label['PAGA_label'] = label['flowSOM_level1']
label.loc[label['flowSOM_level2']=='pDC', 'PAGA_label']='pDC'
label.loc[np.logical_and(label['flowSOM_level1']=='DC', label['flowSOM_level2']!='pDC'), 'PAGA_label']='other DC'
label['Visit']='Visit' + label['Visit'].astype('str')

In [7]:
label.groupby('PAGA_label').count()

Unnamed: 0_level_0,batch,SubjectID,Visit,Subject,flowSOM_level1,flowSOM_level2
PAGA_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bcells,121682,121682,121682,121682,121682,121682
Monocytes,580453,580453,580453,580453,580453,580453
NK,163166,163166,163166,163166,163166,163166
NKT,79206,79206,79206,79206,79206,79206
Tcell_CD4,1046902,1046902,1046902,1046902,1046902,1046902
Tcell_CD8,543327,543327,543327,543327,543327,543327
other DC,99521,99521,99521,99521,99521,99521
pDC,15197,15197,15197,15197,15197,15197


In [8]:
label.groupby(['flowSOM_level1', 'PAGA_label', 'flowSOM_level2']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,batch,SubjectID,Visit,Subject
flowSOM_level1,PAGA_label,flowSOM_level2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bcells,Bcells,Bcells_CD1cCD39,33090,33090,33090,33090
Bcells,Bcells,Bcells_CD39CD22,25186,25186,25186,25186
Bcells,Bcells,Bcells_IgDCD1c,50110,50110,50110,50110
Bcells,Bcells,Bcells_Plasmablast,13296,13296,13296,13296
DC,other DC,DC_CD141DC,77723,77723,77723,77723
DC,other DC,DC_CD1c,21798,21798,21798,21798
DC,pDC,pDC,15197,15197,15197,15197
Monocytes,Monocytes,Monocytes_CM,493909,493909,493909,493909
Monocytes,Monocytes,Monocytes_IM,29128,29128,29128,29128
Monocytes,Monocytes,Monocytes_NCM,57416,57416,57416,57416


In [9]:
nsample = 5000
drop_dic = {'pDC': [], 
            'other DC': [],
            'Bcells': ['CD33', 'CD3e', 'gdTCR', 'Siglec-8', 'CD14', 'CD141', 'CD4'],
            'Tcell_CD4': ['IgD', 'CD1c', 'gdTCR', 'Siglec-8', 'CD20', 'CD14'],
            'Tcell_CD8': ['IgD', 'CD11c', 'CD1c', 'gdTCR', 'Siglec-8', 'CD20', 'CD14'],
            'Monocytes': ['CD57', 'IgD', 'CD25', 'CD20', 'gdTCR', 'CD22', 'CD127'], 
            'NK': [],
            'NKT': []}
# conctruct adata obj
adata_all = anndata.AnnData(data.reset_index(drop=True))
adata_all.obs = label.reset_index(drop=True)

In [10]:
for sub_name in drop_dic:    
    drop_indicator = np.in1d(adata_all.var_names, drop_dic[sub_name])
    # subsampling and drop negative columns
    try:
        sample_index = adata_all.obs[adata_all.obs['PAGA_label']==sub_name].groupby('Subject').apply(lambda x: x.sample(n=nsample, random_state=0) if x.shape[0]>=nsample else x).index.droplevel(level=0)       
        adata = adata_all[sample_index, ~drop_indicator]
    except ValueError as e: 
        # usually when no file has more cells than nsample and thus no subsampling at all.
        # so the results after apply will not be a multiplex index and the droplevel func will fail.
        print(e)
        sample_index = adata_all.obs[adata_all.obs['PAGA_label']==sub_name].groupby('Subject').apply(lambda x: x.sample(n=nsample, random_state=0) if x.shape[0]>=nsample else x).index 
        adata = adata_all[sample_index, ~drop_indicator]
    
    figpath='./figures_2022/' + sub_name + '/'
    os.makedirs(figpath, exist_ok=True)
    sc.settings.figdir=figpath

    print('calculating PAGA...')
    sc.pp.neighbors(adata, n_neighbors=10, use_rep='X')

    # paga process
    sc.tl.leiden(adata, resolution=0.3) 
    sc.tl.paga(adata, groups='leiden')
    
    try:
        sc.pl.paga(adata, color=['leiden'], threshold=0.1, show=False, 
                   save='_' + sub_name + '.pdf')
    except InternalError as e: # maybe there're too little cells
        print(e)
        sc.pl.paga(adata, color=['leiden'], show=False, 
                   save='_' + sub_name + '.pdf')        

    print('embedding with FA...')
    sc.tl.draw_graph(adata, init_pos='paga')

    sc.pl.draw_graph(adata, color=['leiden', 'Visit', 'SubjectID', 'batch'], show=False,
                     save='_' + sub_name + '.pdf')

    sc.pl.draw_graph(adata, color=adata.var.index.values, show=False,
                     save='_' + sub_name + '_markers.pdf')

    print('embedding with density plot...')

    sc.tl.embedding_density(adata, basis='draw_graph_fa', groupby='Visit')
    sc.pl.embedding_density(adata, basis='draw_graph_fa', key='draw_graph_fa_density_Visit', 
                            group=['Visit1', 'Visit2', 'Visit3'], show=False, 
                            save='_' + sub_name + '_Visit_density.pdf')

    print('saving results...')
    os.makedirs('PAGA_result_data_2022/', exist_ok=True)
    adata.write(filename='PAGA_result_data_2022/' + '/' + sub_name + '_sample5000.h5ad', compression = 'gzip')
    print('done!')


Cannot remove 1 levels from an index with 1 levels: at least one level must be left.
calculating PAGA...


... storing 'batch' as categorical
... storing 'SubjectID' as categorical
... storing 'Visit' as categorical
... storing 'Subject' as categorical
... storing 'flowSOM_level1' as categorical
... storing 'flowSOM_level2' as categorical
... storing 'PAGA_label' as categorical


embedding with FA...
embedding with density plot...
saving results...
done!
calculating PAGA...


... storing 'batch' as categorical
... storing 'SubjectID' as categorical
... storing 'Visit' as categorical
... storing 'Subject' as categorical
... storing 'flowSOM_level1' as categorical
... storing 'flowSOM_level2' as categorical
... storing 'PAGA_label' as categorical


embedding with FA...
embedding with density plot...
saving results...
done!
calculating PAGA...


... storing 'batch' as categorical
... storing 'SubjectID' as categorical
... storing 'Visit' as categorical
... storing 'Subject' as categorical
... storing 'flowSOM_level1' as categorical
... storing 'flowSOM_level2' as categorical
... storing 'PAGA_label' as categorical


embedding with FA...
embedding with density plot...
saving results...
done!
calculating PAGA...


... storing 'batch' as categorical
... storing 'SubjectID' as categorical
... storing 'Visit' as categorical
... storing 'Subject' as categorical
... storing 'flowSOM_level1' as categorical
... storing 'flowSOM_level2' as categorical
... storing 'PAGA_label' as categorical


embedding with FA...
embedding with density plot...
saving results...
done!
calculating PAGA...


... storing 'batch' as categorical
... storing 'SubjectID' as categorical
... storing 'Visit' as categorical
... storing 'Subject' as categorical
... storing 'flowSOM_level1' as categorical
... storing 'flowSOM_level2' as categorical
... storing 'PAGA_label' as categorical


embedding with FA...
embedding with density plot...
saving results...
done!
calculating PAGA...


... storing 'batch' as categorical
... storing 'SubjectID' as categorical
... storing 'Visit' as categorical
... storing 'Subject' as categorical
... storing 'flowSOM_level1' as categorical
... storing 'flowSOM_level2' as categorical
... storing 'PAGA_label' as categorical


embedding with FA...
embedding with density plot...
saving results...
done!
calculating PAGA...


... storing 'batch' as categorical
... storing 'SubjectID' as categorical
... storing 'Visit' as categorical
... storing 'Subject' as categorical
... storing 'flowSOM_level1' as categorical
... storing 'flowSOM_level2' as categorical
... storing 'PAGA_label' as categorical


embedding with FA...
embedding with density plot...
saving results...
done!
calculating PAGA...


... storing 'batch' as categorical
... storing 'SubjectID' as categorical
... storing 'Visit' as categorical
... storing 'Subject' as categorical
... storing 'flowSOM_level1' as categorical
... storing 'flowSOM_level2' as categorical
... storing 'PAGA_label' as categorical


embedding with FA...
embedding with density plot...
saving results...
done!
