In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
from matplotlib import rcParams
import scanpy as sc
import os
import anndata
from sklearn.mixture import GaussianMixture
from fcsy import DataFrame
import matplotlib
from glob import glob
matplotlib.rcParams['pdf.fonttype']=42
matplotlib.rcParams['ps.fonttype']=42
import warnings
warnings.filterwarnings("ignore")
from igraph import InternalError

# scanpy settings
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=150, frameon=False, figsize=(4, 4))
# Configure the worker count through the public settings object. The previous
# form assigned to the private `sc._settings.ScanpyConfig` class, which replaces
# the `n_jobs` property on the class itself instead of setting the value.
sc.settings.n_jobs = 4

In [3]:
# Read the sample information table (first sheet of the workbook).
info_path = '/Users/tan/ionctura-collab/data/Sample_info_reformat.xlsx'
# Keep the sample identifier as text. The reformatted sheet names the column
# 'Sample_ID'; the older 'Sample ID' spelling is kept as well so sheets that
# still use it behave as before (a dtype key without a matching column has no effect).
sampleInfo = pd.read_excel(info_path,
                           dtype={'Sample ID': str, 'Sample_ID': str},
                           sheet_name=0)

In [4]:
# Display the loaded sample sheet to check its column names and row count.
sampleInfo

Unnamed: 0,Tumor_Groups,Subject_ID,Tumor_type,Cohort_and_dose,timepoint,Sampling date,Gender,Sample_ID,Sample nr in a 96 well plate,Barcode key,Barcoded Batch,Expt_ID
0,Tumor Group 1,3901-01,Melanoma / Follicular NHL (Stage IV),1 (10mg),AC2D1,2020-03-25,M,100094,1,1,1,EXP-21-DG3637
1,Tumor Group 1,3901-01,Melanoma / Follicular NHL (Stage IV),1 (10mg),AC3D1,2020-04-20,M,100095,2,2,1,EXP-21-DG3637
2,Tumor Group 1,3901-01,Melanoma / Follicular NHL (Stage IV),1 (10mg),AC4D1,2020-05-18,M,100096,3,3,1,EXP-21-DG3637
3,Tumor Group 1,3901-01,Melanoma / Follicular NHL (Stage IV),1 (10mg),AC5D1,2020-06-16,M,100097,4,5,1,EXP-21-DG3637
4,Tumor Group 1,3901-01,Melanoma / Follicular NHL (Stage IV),1 (10mg),AC6D1*,2020-07-14,M,100098,5,6,1,EXP-21-DG3637
...,...,...,...,...,...,...,...,...,...,...,...,...
112,Tumor Group 3,4401-06,Uveal Melanoma (Stage IV),3 (40mg),AC1D1,2020-11-10,M,100158,45,6,5,EXP-21-DG3637
113,Tumor Group 3,4401-06,Uveal Melanoma (Stage IV),3 (40mg),AC1D2,2020-11-11,M,100159,46,8,5,EXP-21-DG3637
114,Tumor Group 3,4401-06,Uveal Melanoma (Stage IV),3 (40mg),AC1D15,2020-11-24,M,100160,47,11,5,EXP-21-DG3637
115,Tumor Group 3,4401-06,Uveal Melanoma (Stage IV),3 (40mg),AC2D1,2020-12-08,M,100161,48,12,5,EXP-21-DG3637


In [None]:
# Use every row of the sample sheet; `selectInfo` is the table the loading cell iterates over.
# Note this is an alias (same object), not a copy.
selectInfo = sampleInfo

In [None]:
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------

# Glob patterns for the raw inputs ('*' stands for the experiment folder).
dataDir = '/Users/tan/cytof_data/*/renamed'
labelDir = '/Users/tan/cytof_data/*/classifiedV3'

# Maximum number of cells kept per sample when subsampling.
nsample = 3000

# Cell population analysed in this run.
sub_name = 'Tregs'

# Channels removed from every FCS file before the arcsinh transform.
drop_columns = [
    # acquisition / event-shape parameters
    'Time', 'Event_length', 'Center', 'Width', 'Residual', 'Offset', 'Amplitude',
    # remaining channels that are not used as markers
    '88Sr', 'CD45', '102Pd', '103Rh', '104Pd', '105Pd', '106Pd', '108Pd',
    '116Cd', '120Sn', '127I', '131Xe', '138Ba', '190BCKG', '191Ir', '193Ir',
    '208Pb',
]

# Markers additionally dropped per population before clustering
# (an empty list means all remaining markers are kept).
drop_dic = {
    'pDC': [],
    'B-cells': [
        'CD33', 'CD3', 'TCRgd', 'Siglec-8', 'CD14', 'CD141', 'CD4',
    ],
    'CD4 T-cells': [
        'IgD', 'CD1c', 'TCRgd', 'Siglec-8', 'CD20', 'CD14',
    ],
    'CD8 T-cells': [
        'IgD', 'CD11c', 'CD1c', 'TCRgd', 'Siglec-8', 'CD20', 'CD14',
    ],
    # 'Eosinophils': ['IgD', 'CD57', 'CD25', 'TCRgd', 'CD14'],
    'Monocytes': [
        'CD57', 'IgD', 'CD25', 'CD20', 'TCRgd', 'CD22', 'CD127',
    ],
    'Neutrophils': [
        'IgD', 'HLA-DR', 'CD57', 'CD25', 'CD22', 'TCRgd', 'CD123', 'CD161',
    ],
    'NK cells': [],
    'Lin Neg': [],
    'gdT': [],
    'Plasmablasts': [],
    'Basophils': [],
    'Tregs': [],
}

# Read and merge the files listed in `selectInfo` into one AnnData object.
# The preprocessed result is cached at `all_data_path`; the raw FCS / label
# files are only parsed when that cache file does not exist yet.
# NOTE(review): the cache name says EXP-21-DG3656 while the sample sheet shown
# above lists Expt_ID EXP-21-DG3637 -- confirm this is the intended cache file.
all_data_path = 'adata/ion_EXP-21-DG3656_V3.h5ad'


def _resolve_column(frame, candidates):
    """Return the first name in `candidates` that is a column of `frame`.

    Lets the loader accept both the reformatted sample sheet headers
    (e.g. 'Sample_ID') and the older ones (e.g. 'Facility barcode/Sample ID').

    Raises:
        KeyError: if none of the candidates is a column of `frame`.
    """
    for name in candidates:
        if name in frame.columns:
            return name
    raise KeyError('none of ' + repr(list(candidates)) +
                   ' found in sample info columns ' + repr(list(frame.columns)))


def _first_match(pattern):
    """Return the first (sorted) path matching the glob `pattern`.

    Raises:
        FileNotFoundError: if nothing matches, naming the pattern that failed.
    """
    matches = sorted(glob(pattern))
    if not matches:
        raise FileNotFoundError('no file matches ' + pattern)
    return matches[0]


#for sub_name in drop_dic:
print('currently working on ' + sub_name)
if not os.path.exists(all_data_path):
    print('adata not found, load and preprocess raw data...')
    # Resolve the sheet headers once; the reformatted names come first.
    sample_col = _resolve_column(selectInfo, ['Sample_ID', 'Facility barcode/Sample ID', 'Sample ID'])
    timepoint_col = _resolve_column(selectInfo, ['timepoint', 'Time point ', 'Time point'])
    subject_col = _resolve_column(selectInfo, ['Subject_ID', 'Patient ID', 'Subject ID'])
    cohort_col = _resolve_column(selectInfo, ['Cohort_and_dose', 'Cohort and dose'])
    tumor_col = _resolve_column(selectInfo, ['Tumor_type', 'Tumor type'])

    all_data_list = []
    all_label_list = []
    for i in range(len(selectInfo)):
        sample_id = str(selectInfo[sample_col].iloc[i])
        timepoint = str(selectInfo[timepoint_col].iloc[i])
        cohort = str(selectInfo[cohort_col].iloc[i])

        label_path = _first_match(labelDir + '/*/' + sample_id + '*.csv')
        data_path = _first_match(dataDir + '/*/' + sample_id + '*.fcs')

        # Per-cell labels, annotated with the sample-level metadata.
        labelTmp = pd.read_csv(label_path)
        labelTmp['Sample ID'] = sample_id
        labelTmp['timepoint'] = timepoint
        labelTmp['Subject ID'] = str(selectInfo[subject_col].iloc[i])
        labelTmp['group'] = cohort
        labelTmp['type'] = str(selectInfo[tumor_col].iloc[i])
        labelTmp['timepoint_group'] = timepoint + '_' + cohort
        # Fourth path component from the end is the experiment folder (EXP-XX-XXXXXX).
        labelTmp['batch'] = data_path.split('/')[-4]

        dataTmp = DataFrame.from_fcs(data_path, channel_type='long')
        # Harmonise the channel name; rename is a no-op when '4-1BB' is absent.
        dataTmp = dataTmp.rename(columns={"4-1BB": "CD137"})

        # Labels and events must describe the same cells, row for row.
        if len(labelTmp) != len(dataTmp):
            raise ValueError('label/data length mismatch for sample ' + sample_id +
                             ': ' + str(len(labelTmp)) + ' labels vs ' +
                             str(len(dataTmp)) + ' events')

        # Filter the cells without a level1 tag (positional mask, so the
        # result does not depend on the two frames sharing an index).
        has_level1 = (labelTmp['level1'] != ' ').to_numpy()
        labelTmp = labelTmp[has_level1]
        # Remove EQBeads and DNA channels, also the negative channels.
        dataTmp = dataTmp[has_level1].drop(columns=drop_columns)
        #dataTmp = dataTmp[select_columns]
        dataTmp = np.arcsinh(dataTmp/5)

        all_data_list.append(dataTmp)
        all_label_list.append(labelTmp)

    all_data = pd.concat(all_data_list, ignore_index=True)
    all_label = pd.concat(all_label_list, ignore_index=True)
    adata = anndata.AnnData(all_data)
    adata.obs = all_label
    #print('batch correction...')
    #sc.pp.combat(adata, key = 'batch', covariates = ['timepoint'])
    print('scaling...')
    sc.pp.scale(adata)
    os.makedirs('adata/', exist_ok=True)
    print('write to h5ad file...')
    adata.write(filename=all_data_path, compression = 'gzip')
    # Release the in-memory copy; the data is re-read from the cache below.
    adata=None
    print('finished!')
print('loading...')
adata_all = sc.read_h5ad(filename = all_data_path)
print('finished!')

In [None]:
# Number of distinct samples present in the merged data.
adata_all.obs['Sample ID'].nunique()

In [None]:
# Allow one displayed row per sample (plus one) so the table below is not truncated.
n_distinct_samples = adata_all.obs['Sample ID'].nunique()
pd.set_option('display.max_rows', n_distinct_samples+1)

# Number of Treg cells (the three level3 Treg labels) found in each sample.
is_treg_cell = adata_all.obs['level3'].isin(['CD39 Memory Tregs', 'Memory Tregs', 'Naive Tregs'])
adata_all.obs.loc[is_treg_cell, 'Sample ID'].value_counts()

In [None]:
# subsampling
# Select the cells of population `sub_name` and keep at most `nsample` of them
# per sample. Samples with fewer than `nsample` cells are kept in full.
print('subsampling...')

obs_all = adata_all.obs

# Work out once which cells belong to the requested population:
# level1 name first, then level2, then the special 'Tregs' union of level3 labels.
if sub_name in obs_all['level1'].unique().tolist():
    in_population = obs_all['level1'] == sub_name
elif sub_name in obs_all['level2'].unique().tolist():
    in_population = obs_all['level2'] == sub_name
elif sub_name == 'Tregs':
    in_population = obs_all['level3'].isin(['CD39 Memory Tregs', 'Memory Tregs', 'Naive Tregs'])
else:
    # Previously this fell through and failed later with a NameError.
    raise ValueError('unknown population: ' + repr(sub_name) +
                     ' (not a level1/level2 label and not "Tregs")')

obs_population = obs_all[in_population]
if obs_population.shape[0] == 0:
    raise ValueError('no cells found for population ' + repr(sub_name))

# Collect the kept cell names sample by sample. Iterating over the groups
# (instead of groupby.apply + droplevel) gives the same index shape whether or
# not any sample exceeds `nsample`, so no exception-driven fallback is needed.
kept_names = []
for _, sample_cells in obs_population.groupby('Sample ID', observed=True):
    if sample_cells.shape[0] >= nsample:
        sample_cells = sample_cells.sample(n=nsample, random_state=0)
    kept_names.append(sample_cells.index.to_numpy())

sample_index = pd.Index(np.concatenate(kept_names))
adata_sample = adata_all[sample_index]
        

In [None]:
# Build a standalone AnnData for the subsample, without the markers that
# `drop_dic` lists for this population; the per-cell annotations are carried over.
marker_frame = adata_sample.to_df().drop(columns = drop_dic[sub_name])
adata = anndata.AnnData(marker_frame)
adata.obs = pd.DataFrame(adata_sample.obs)

# (A commented-out leiden-resolution sweep over [0.3], saving into
#  './optimization/', used to sit here; the run below uses resolution 0.3.)

# Figures go to a per-population sub directory.
figpath='./figures/' + sub_name + '/'
os.makedirs(figpath, exist_ok=True)
sc.settings.figdir=figpath

# File-name suffixes shared by the plots below.
pdf_suffix = '_' + sub_name + '.pdf'
marker_pdf_suffix = '_' + sub_name + '_markers.pdf'

print('calculating PAGA...')
# At most 20 components, and always fewer than the number of cells / markers.
n_comps = min(adata.n_obs, adata.n_vars, 21) - 1
sc.tl.pca(adata, svd_solver='arpack', n_comps=n_comps)
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=n_comps)

# Cluster, then build the PAGA graph on the leiden clusters.
sc.tl.leiden(adata, resolution=0.3)
sc.tl.paga(adata, groups='leiden')

# The PAGA plot is drawn before the layout step because draw_graph below is
# initialised from it (init_pos='paga').
try:
    sc.pl.paga(adata, color=['leiden'], threshold=0.1, show=False,
               save=pdf_suffix)
except InternalError as e:
    # Possibly too few cells: retry without the edge threshold.
    print(e)
    sc.pl.paga(adata, color=['leiden'], show=False,
               save=pdf_suffix)

print('embedding with FA...')
sc.tl.draw_graph(adata, init_pos='paga')

# One panel per annotation column ...
annotation_keys = ['timepoint', 'group', 'leiden', 'batch', 'Subject ID', 'type']
sc.pl.draw_graph(adata, color=annotation_keys, show=False,
                 save=pdf_suffix)

# ... and one panel per marker.
marker_names = adata.var.index.values
sc.pl.draw_graph(adata, color=marker_names, show=False,
                 save=marker_pdf_suffix)

In [None]:
print('embedding with density plot...')

sc.tl.embedding_density(adata, basis='draw_graph_fa', groupby='timepoint')

# Panel order for the density plot.
# NOTE(review): timepoint labels that are in the data but not in this list
# (the sample sheet contains e.g. 'AC6D1*') are not plotted -- confirm that is intended.
timepoint_order = ['AC1D1', 'AC1D2', 'AC1D15', 'AC2D1', 'AC3D1', 'AC4D1',
                   'AC5D1','AC6D1','AC7D1','AC8D1','AC9D1','AC10D1','AC11D1',
                   'AC12D1','AC13D1','AC14D1','AC15D1','AC17D1','AC18D1',]

# Only request timepoints that actually have cells in this subset: a group name
# that is not present in adata.obs['timepoint'] is expected to be rejected by
# scanpy, which would abort the whole figure.
observed_timepoints = set(adata.obs['timepoint'].astype(str).unique())
density_groups = [tp for tp in timepoint_order if tp in observed_timepoints]
missing_timepoints = [tp for tp in timepoint_order if tp not in observed_timepoints]
if missing_timepoints:
    print('timepoints without cells in this subset (not plotted): ' + ', '.join(missing_timepoints))

if density_groups:
    sc.pl.embedding_density(adata, basis='draw_graph_fa', key='draw_graph_fa_density_timepoint',
                            group=density_groups, show=False,
                            save='_' + sub_name + '_timepoint_density.pdf')
else:
    print('none of the listed timepoints is present; density plot skipped')

In [None]:
print('saving results...')
os.makedirs('PAGA_result_data/', exist_ok=True)
# Put the actual subsampling cap in the file name (it was hard-coded as
# 'sample3000'); with nsample = 3000 the resulting path is unchanged.
result_path = 'PAGA_result_data/EXP-21-DG3656_' + sub_name + '_sample' + str(nsample) + '.h5ad'
adata.write(filename=result_path, compression = 'gzip')
print('done!')

In [None]:
# All distinct timepoint labels present in the merged data (useful for checking
# them against the timepoint list used for the density plot).
list(adata_all.obs['timepoint'].unique())