# Set up

In [1]:
import scyan as sy
import os
import glob
import anndata
import re
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import scanpy as sc
import scanpy.external as sce


  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 0


In [2]:
print(sy.__version__)


1.5.1


In [3]:
# define the working path
panel = "PB1"
data_path='/home/jupyter/projects/pre-ra/flow/raw-data/' + panel + '/labelled-expr/cache/'
fig_path = '/home/jupyter/projects/pre-ra/flow/02-clustering/results/' + panel + '_global_subsample'  + "/""/"
proj_name = 'pre-ra_flow_clustering_' + panel
output_path = '/home/jupyter/projects/pre-ra/flow/02-clustering/data/' +panel +'/'

if not os.path.exists(fig_path):
    os.makedirs(fig_path)
    
if not os.path.exists(output_path):
    os.makedirs(output_path)
    
# define scanpy verbose levels
sc.settings.verbosity = 3
sc.settings.figdir = fig_path
sc.settings.n_jobs = -1

# Helper Functions 

In [4]:
# make a function to find files
def get_filepaths_with_glob(root_path: str, file_regex: str):
    return glob.glob(os.path.join(root_path, file_regex))


def subset_files(file_names, substrings):
    # Initialize an empty list to store the matching file names
    matching_files = []

    # Iterate through each file name
    for file_name in file_names:
        # Check if any substring is present in the file name
        if any(substring in file_name for substring in substrings):
            # If yes, add the file name to the matching list
            matching_files.append(file_name)

    return matching_files

In [19]:
# load data from csv
def read_one(file_path):
    #print(file_path)
    adata = sy.read_csv(file_path, marker_regex='^cd|^hla|tcr|ig|^ccr|klrg|^cx',  exclude_markers=None)
    adata.obs["batch"] = re.findall( 'B\\d\\d\\d', file_path)[0]
    adata.obs["panel"] = re.findall( 'PB1|PT1|PM1|PS1', file_path)[0]
    sample_id = re.findall( 'PB\\d\\d\\d\\d\\d...', file_path)[0]
    adata.obs["sample_id"] = sample_id
    adata.obs["sample.sampleKitGuid"] = "KT" + sample_id[2:7]
    return adata


In [6]:
# Function to check if any substring is present in the file name
def contains_substring(file_name, substrings):
    return any(substring in file_name for substring in substrings)

In [7]:
### UMAP helper plot, define a helper function to set a single column for our UMAP figure legends. 
def one_col_lgd(umap):
    legend = umap.legend(bbox_to_anchor=[1.00, 0.5],
    loc='center left', ncol=1, prop={'size': 6})
    legend.get_frame().set_linewidth(0.0)
    for handle in legend.legendHandles:
        handle.set_sizes([25.0])
    return legend

# Load files

In [8]:
flow_files=get_filepaths_with_glob(data_path, '*/*transform_labelled_expr.csv')


In [9]:
### read in meta data and pull only non duplicate samples to anndata

freq_tbl = pd.read_csv('/home/jupyter/projects/pre-ra/flow/01-qc-reports/data/' + panel+ '/freq_tbl_sample_info_' + panel + '.csv')

In [10]:
freq_tbl.head()

Unnamed: 0,full_filename,labels,counts,frequency_live,total_counts,sample_id,barcode,l1_labels,filename,panel,...,file.fileType,file.majorVersion,subject.id,subject.biologicalSex,subject.birthYear,subject.ethnicity,subject.partnerCode,subject.race,subject.subjectGuid,cohort.cohortGuid
0,/home//jupyter//projects/pre-ra//flow//raw-dat...,Unknown,98294,0.42684,230283,PB00052-02,4338bff0484111ee9968aeaf4c3a0897,unknown,Flow_Cyanno_PB1_PB00052-02_summary_frequency_s...,PB1,...,FlowCytometry-summary-frequency-stats,2,f8424819-e5c7-44fd-b19c-12d8a5dd0771,Female,1963,Non-Hispanic origin,CU,Caucasian,CU1009,CU1
1,/home//jupyter//projects/pre-ra//flow//raw-dat...,cd14_monocytes,6869,0.029829,230283,PB00052-02,4338c676484111ee9968aeaf4c3a0897,total_myeloid_cells,Flow_Cyanno_PB1_PB00052-02_summary_frequency_s...,PB1,...,FlowCytometry-summary-frequency-stats,2,f8424819-e5c7-44fd-b19c-12d8a5dd0771,Female,1963,Non-Hispanic origin,CU,Caucasian,CU1009,CU1
2,/home//jupyter//projects/pre-ra//flow//raw-dat...,debris,8211,0.035656,230283,PB00052-02,4338bfc8484111ee9968aeaf4c3a0897,debris,Flow_Cyanno_PB1_PB00052-02_summary_frequency_s...,PB1,...,FlowCytometry-summary-frequency-stats,2,f8424819-e5c7-44fd-b19c-12d8a5dd0771,Female,1963,Non-Hispanic origin,CU,Caucasian,CU1009,CU1
3,/home//jupyter//projects/pre-ra//flow//raw-dat...,naive_b_cells,9403,0.040832,230283,PB00052-02,4338c00e484111ee9968aeaf4c3a0897,total_b_cells,Flow_Cyanno_PB1_PB00052-02_summary_frequency_s...,PB1,...,FlowCytometry-summary-frequency-stats,2,f8424819-e5c7-44fd-b19c-12d8a5dd0771,Female,1963,Non-Hispanic origin,CU,Caucasian,CU1009,CU1
4,/home//jupyter//projects/pre-ra//flow//raw-dat...,nk_cells,33032,0.143441,230283,PB00052-02,4338c14e484111ee9968aeaf4c3a0897,total_nk_cells,Flow_Cyanno_PB1_PB00052-02_summary_frequency_s...,PB1,...,FlowCytometry-summary-frequency-stats,2,f8424819-e5c7-44fd-b19c-12d8a5dd0771,Female,1963,Non-Hispanic origin,CU,Caucasian,CU1009,CU1


In [20]:
# subset flow files based on non duplicate files in meta data
strings = freq_tbl['sample_id'].unique()

# Subset file_names based on matching substrings
matching_files = [file_name for file_name in flow_files if contains_substring(file_name, strings)]

# Print the result
len(matching_files)
len(strings)

138

In [21]:
flow_files[:5]

['/home/jupyter/projects/pre-ra/flow/raw-data/PB1/labelled-expr/cache/3a448e90-a9ee-40e0-9afb-595f7bb92a45/B088_PB1_PB00467-01_live_logical_transform_labelled_expr.csv',
 '/home/jupyter/projects/pre-ra/flow/raw-data/PB1/labelled-expr/cache/06ba97cf-c5c3-4832-b123-2a70468ed4cb/B140_PB1_PB00063-05_live_logical_transform_labelled_expr.csv',
 '/home/jupyter/projects/pre-ra/flow/raw-data/PB1/labelled-expr/cache/aa619715-6657-4913-9dc4-9a01644d3617/B182_PB1_PB04671-001_live_logical_transform_labelled_expr.csv',
 '/home/jupyter/projects/pre-ra/flow/raw-data/PB1/labelled-expr/cache/e9dab109-aa04-4525-a726-31df06430799/B057_PB1_PB00436-02_live_logical_transform_labelled_expr.csv',
 '/home/jupyter/projects/pre-ra/flow/raw-data/PB1/labelled-expr/cache/aeb0ccc0-2b27-498a-bca4-37de84ecca7d/B139_PB1_PB04103-001_live_logical_transform_labelled_expr.csv']

In [22]:

adata = anndata.concat([read_one(p) for p in matching_files], index_unique="-")

In [23]:
len(adata.obs['sample_id'].unique())

138

In [24]:
adata

AnnData object with n_obs × n_vars = 57762311 × 24
    obs: 'Unnamed: 0', 'sample_id', 'cell_id', 'barcode', 'Time', 'SSC-W', 'SSC-H', 'SSC-A', 'FSC-W', 'FSC-H', 'FSC-A', 'SSC-B-W', 'SSC-B-H', 'SSC-B-A', 'Viability_logicle', 'labels', 'batch', 'panel', 'sample.sampleKitGuid'

In [25]:
adata.obs

Unnamed: 0.1,Unnamed: 0,sample_id,cell_id,barcode,Time,SSC-W,SSC-H,SSC-A,FSC-W,FSC-H,FSC-A,SSC-B-W,SSC-B-H,SSC-B-A,Viability_logicle,labels,batch,panel,sample.sampleKitGuid
0-0,0,PB00467-01,1,908e13a82a8411eda605fe22d8c79dde,48450.0,740557.9375,564999.0,6.973574e+05,716652.3750,1189311.0,1420537.500,741975.1875,418228.0,517191.34375,1.698486,Unknown,B088,PB1,KT00467
1-0,1,PB00467-01,2,908e13d02a8411eda605fe22d8c79dde,48450.0,731824.6250,459221.0,5.601154e+05,708948.3750,996820.0,1177823.250,721398.6250,426078.0,512286.81250,1.635329,t_cells,B088,PB1,KT00467
2-0,2,PB00467-01,3,908e13f82a8411eda605fe22d8c79dde,48455.0,845166.8750,495960.0,6.986149e+05,763026.3750,634781.0,807257.750,820858.0000,558711.0,764370.68750,0.916245,Unknown,B088,PB1,KT00467
3-0,3,PB00467-01,4,908e14202a8411eda605fe22d8c79dde,48456.0,757639.6875,362719.0,4.580172e+05,725696.1250,949282.0,1148150.375,741761.0000,343311.0,424424.50000,1.490283,t_cells,B088,PB1,KT00467
4-0,4,PB00467-01,6,908e14702a8411eda605fe22d8c79dde,48460.0,746102.8750,502148.0,6.244234e+05,744195.0625,1243487.0,1542328.125,751597.1250,466484.0,584346.68750,1.774413,Unknown,B088,PB1,KT00467
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304361-178,304361,PB02292-00,347124,b7f2d2a25d9711ee9ec94e89935a78d6,1174455.0,670863.8750,418244.0,4.676413e+05,647108.1250,934053.0,1007388.750,644245.0000,324757.0,348705.12500,1.621018,Unknown,B158,PB1,KT02292
304362-178,304362,PB02292-00,347125,b7f2d2b65d9711ee9ec94e89935a78d6,1174460.0,706157.8125,362969.0,4.271890e+05,724599.0625,1363746.0,1646948.500,703045.1875,329698.0,386321.00000,1.208885,transitional_b_cells,B158,PB1,KT02292
304363-178,304363,PB02292-00,347126,b7f2d2d45d9711ee9ec94e89935a78d6,1174461.0,759415.5625,1179359.0,1.492706e+06,808968.0000,1092078.0,1472426.875,724650.5625,827733.0,999695.25000,2.054986,Unknown,B158,PB1,KT02292
304364-178,304364,PB02292-00,347127,b7f2d2f25d9711ee9ec94e89935a78d6,1174473.0,680054.4375,374818.0,4.248278e+05,699214.1875,1297234.0,1511740.750,649316.6250,300684.0,325398.53125,1.291036,transitional_b_cells,B158,PB1,KT02292


## Merge L1 pop

In [26]:
labels_df = freq_tbl[["labels", "l1_labels"]].drop_duplicates()

In [27]:
adata.obs = adata.obs.join(labels_df.set_index('labels'), on='labels')

In [28]:
adata.obs['l1_labels'].unique()

array(['unknown', 't_cells', 'total_b_cells', 'debris', 'total_nk_cells',
       'total_myeloid_cells'], dtype=object)

In [29]:
adata

AnnData object with n_obs × n_vars = 57762311 × 24
    obs: 'Unnamed: 0', 'sample_id', 'cell_id', 'barcode', 'Time', 'SSC-W', 'SSC-H', 'SSC-A', 'FSC-W', 'FSC-H', 'FSC-A', 'SSC-B-W', 'SSC-B-H', 'SSC-B-A', 'Viability_logicle', 'labels', 'batch', 'panel', 'sample.sampleKitGuid', 'l1_labels'

In [None]:
# save unscaled adata for subsetting 
adata.write_h5ad(output_path + "adata_preprocess_unscaled_" + panel + ".h5ad")

# QC check

In [None]:
adata = sc.read_h5ad(output_path + "adata_preprocess_unscaled_" + panel + ".h5ad")

In [None]:
adata

In [None]:
# Important: data from the Cyanno pipeline are already logcile tranformed. 
# No need to redo the transformation
adata.X


In [None]:
## check for NAs in exp array, PM1 has NAns
np.isnan((np.sum(adata.X)))

In [None]:
adata.to_df().notnull()

# Define highly var markers

In [None]:
#### ==== DEFINE HIGHLY VAR MARKERS BASED ON PANEL DF ==== ####
# load panel info for pt1
panel_df = pd.read_csv('/home/jupyter/projects/pre-ra/flow/raw-data/AIFI_flow_' + panel + '_panel_breakdown.csv')
gating_antigens = panel_df.loc[panel_df['used_for_clustering']=='Yes', 'antigen']

gating_antigens = [s + '_logicle' for s in gating_antigens]


# set up the variable
adata.var['antigens'] = adata.var.index.str.replace('_logicle', '')
adata.var[['gating_antigens']] = False
adata.var.loc[adata.var.index.isin(gating_antigens),'gating_antigens'] = True
adata.var[['highly_variable']] = adata.var[['gating_antigens']]
adata.var


# Subsample

In [None]:
#### ==== SUBSAMPLE ==== ####
sc.pp.subsample(adata,n_obs=3500000, random_state = 123)
print(adata)

In [None]:
## check for NAs in exp array, PM1 has NAns
np.isnan((np.sum(adata.X)))

# Scale

In [None]:
#### ==== SCALE ==== ####
sy.preprocess.scale(adata)
print(adata.X)

In [None]:
cell_labels_to_subset_list = adata.obs['l1_labels'].unique()
print(cell_labels_to_subset_list)

print(pd.crosstab(adata.obs['l1_labels'], adata.obs['panel'], margins=True)) 

# Save Scaled

In [None]:
adata.write_h5ad(output_path  + "adata_preprocess_scaled_global_subsample_3.5mill_" + panel + ".h5ad")


# PCA

In [None]:
# setting highly variable as highly deviant to use scanpy 'use_highly_variable' argument in sc.pp.pca
sc.pp.pca(adata, svd_solver="arpack", use_highly_variable=True)

In [None]:
sce.pp.harmony_integrate(adata, 'batch', adjusted_basis='X_pca_harmony')


In [None]:
# replace new pca slot
adata.obsm['X_pca'] = adata.obsm['X_pca_harmony']

In [None]:
sy.tools.umap(adata, markers=gating_antigens)


In [None]:
p1 = sy.plot.umap(adata, color=['labels','l1_labels', 'batch','panel'], ncols = 2, return_fig = True, size = .5)
p1.set_size_inches(16.5, 16.5)
p1.savefig(fig_path + "global_subsample_3.5mill_" + "_umap_labels_batch_panel_" +panel + ".png",
               dpi=400, bbox_inches='tight')

In [None]:
p1 = sy.plot.umap(adata, color=['batch'], ncols = 2, return_fig = True, size = .5)
p1.savefig(fig_path + "global_subsample_3.5mill_" + "_umap_batch_" +panel + ".png",
               dpi=400, bbox_inches='tight')

In [None]:
p1 = sy.plot.umap(adata, color=['l1_labels'], ncols = 2, return_fig = True, size = .5)
p1.savefig(fig_path + "global_subsample_3.5mill_" + "_umap_labels_l1_" +panel + ".png",
               dpi=400, bbox_inches='tight')

In [None]:
p1 = sy.plot.umap(adata, color=['labels'], ncols = 2, return_fig = True, size = .5)
p1.savefig(fig_path + "global_subsample_3.5mill_" + "_umap_labels_l2_" +panel + ".png",
               dpi=400, bbox_inches='tight')

In [None]:
adata.obs["Unknown"] = np.nan
adata.obs.loc[adata.obs['labels']=='Unknown',"Unknown"] = "Unknown"
p1=sy.plot.umap(adata, color=['Unknown'], return_fig = True)
p1.savefig(fig_path + "global_subsample_3.5mill_" + "unknown_pop_" +panel + ".png",
               dpi=400, bbox_inches='tight')

In [None]:
p1=sy.plot.umap(adata, color=adata.var_names.sort_values(),
             ncols=6, show=False, return_fig=True)
p1.set_size_inches(18.5, 18.5)

p1.savefig(fig_path + "global_subsample_3.5mill_" + "_umap_expression_labels_corrected_panel_" +panel + ".png",
               dpi=400, bbox_inches='tight')

In [None]:
adata

In [None]:
adata.write_h5ad(output_path + "adata_preprocess_global_subsample_3.5mill_" + panel + ".h5ad")