This notebook runs a leave-one-out (LOO) test on the nuclear-seq (nuc-seq) dataset using a baseline linear-regression pipeline.

In [2]:
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import mnnpy

from functools import reduce
from scipy.io import mmread
from scipy.sparse import csr_matrix
from anndata import AnnData
from tqdm import tqdm

  dist[i, j] = np.dot(m[i], n[j])
  dist[i, j] = np.dot(m[i], n[j])
  scale = np.dot(working, grad)
  scale = np.dot(working, grad)
  curproj = np.dot(grad, curcell)
  curproj = np.dot(grad, curcell)


Read nuc-seq data

In [3]:
# Load the pre-processed nuc-seq AnnData objects (non-smokers first, then smokers).
nuc_nonsmoker, nuc_smoker = map(
    sc.read_h5ad,
    (
        "data/step1_nuc_nonsmoker_processed.h5ad",
        "data/step1_nuc_smoker_processed.h5ad",
    ),
)

Remove donors younger than 40 (keep only observations with `Age` >= 40).

In [4]:
# Keep only the observations whose "Age" is at least 40, in both cohorts.
nuc_nonsmoker, nuc_smoker = (
    cohort[cohort.obs["Age"] >= 40,]
    for cohort in (nuc_nonsmoker, nuc_smoker)
)

Define different types of gene sets used as input features

In [5]:
# Gene sets used as input features, keyed by "<all>_<group>_baseline".
# The gene lists are taken from the processed HLCA data files.
test_genes = {
    "all_smoker_baseline": sc.read_h5ad("data/step1_HLCA_smoker_processed.h5ad").var_names.tolist(),
    "all_nonsmoker_baseline": sc.read_h5ad("data/step1_HLCA_nonsmoker_processed.h5ad").var_names.tolist(),
}



Function for extracting expression data

In [6]:
def filter_anndata(anndata, ct_column, cts, donor_column, age_column, marker_genes = None, min_cells = 50):
    """
    Filter an AnnData object by cell types, marker genes and donors.

    Parameters
    ----------
    anndata : object supporting ``[rows, cols]`` subsetting, ``.obs``,
        ``.var_names`` and ``.to_df()`` (an AnnData in this notebook).
    ct_column : name of the ``obs`` column holding the cell-type labels.
    cts : collection of cell-type labels to keep.
    donor_column : name of the ``obs`` column holding the donor/subject IDs.
    age_column : name of the ``obs`` column holding the ages.
    marker_genes : genes to keep. ``None`` (the default) keeps every gene in
        ``anndata.var_names``. Requested genes absent from ``anndata`` are
        dropped (they are not zero-padded).
    min_cells : a donor is kept only if it has at least this many rows left
        after the cell-type filter.

    Returns
    -------
    expr : DataFrame (rows x genes) indexed by donor ID; the gene columns
        follow the sorted order returned by ``np.intersect1d``.
    ages : Series of ages, one per row of ``expr``, sharing ``expr``'s index.
    """
    # A missing gene list means "no gene restriction"; previously None was
    # handed straight to np.intersect1d.
    if marker_genes is None:
        marker_genes = anndata.var_names
    comm_genes = np.intersect1d(anndata.var_names, marker_genes)

    # Keep rows annotated with the requested cell types, then select the
    # shared genes by name (this also puts the columns in comm_genes order).
    anndata = anndata[anndata.obs[ct_column].isin(cts), :]
    anndata = anndata[:, comm_genes]

    # Select the donors that still have at least min_cells rows.
    subjects = anndata.obs[donor_column]
    subjects_count = subjects.groupby(subjects.values).count()
    selected_subjects = subjects_count.loc[subjects_count >= min_cells].index

    # Further subset to the selected donors.
    anndata = anndata[anndata.obs[donor_column].isin(selected_subjects),]

    # Expression matrix and ages, both re-indexed by donor ID.
    expr = anndata.to_df()
    expr.index = anndata.obs[donor_column].values
    # Build a fresh Series rather than re-indexing the one returned by obs,
    # so the obs table of the (subset) object is left untouched.
    ages = pd.Series(anndata.obs[age_column].values, index = expr.index, name = age_column)

    return expr, ages

Cell-type mapping between the two datasets: key (nuc-seq cell type) -> value (list of HLCA cell types).

In [7]:
# Maps each nuc-seq cell-type label to the list of HLCA cell-type labels
# it is paired with.
ct_mapping = {
    "LymphEndo": ["Lymphatic EC", "Lymphatic EC mature"],
    "B.type1": ["B cells"],
    "B.type2": ["B cells"],
    "B.type3": ["B cells"],
    "VEcapillary": ["EC capillary", "EC aerocyte capillary", "EC general capillary"],
    "VEaerocyte": ["EC aerocyte capillary"],
    "VEarterial": ["EC arterial"],
    "VEvenous": ["EC venous pulmonary", "EC venous systemic", "EC venous"],
    "FibroblastAdventitial": ["Adventitial fibroblasts"],
    "FibroblastAlveolar": ["Alveolar fibroblasts"],
    "Basal": ["Suprabasal", "Basal", "Basal resting"],
    "Goblet": ["Goblet"],
    "AlvMac": ["Alveolar macrophages"],
    "AT1": ["AT1"],
    "AT2": ["AT2"],
    "AT2transitional": ["Transitional AT2"],
    "cDC": ["Dendritic cells"],
    "Ciliated": ["Multiciliated lineage"],
    "cMonocyte": ["Classical monocytes"],
    "Mac": ["Macrophages"],
    "Macrophage.type2": ["Macrophages"],
    "Mast": ["Mast cells"],
    "Megakaryocyte": ["Megakaryocytes"],
    "ncMonocyte": ["Non-classical monocytes"],
    "NK": ["Innate lymphoid cell NK", "NK cells"],
    "SMC": ["Smooth muscle"],
    "T": ["T cell lineage"],
    "Tcyto": ["CD8 T cells"],
    "Treg": ["CD4 T cells"],
}

In [8]:
# Build the list of evaluation jobs (`iterator`): one entry per
# (group, nuc-seq cell type, gene set, use_pca, repetition) combination.
import warnings
# Installs an "ignore" filter for every warning category from here on.
warnings.filterwarnings("ignore")

iterator = []
n_rep = 5  # number of repetitions queued per (group, cell type, gene set)
ct_column = "cell.type2"
donor_column = 'Subject.ID'
age_column = "Age"
groups = ["nonsmoker","smoker"]
adatas = [nuc_nonsmoker,nuc_smoker]

# NOTE: the loop variables below stay defined at module level after this cell runs.
for adata,group in zip(adatas, groups):
    for ct_nuc,cts_hlca in ct_mapping.items():
               
        # Extract expression and ages for this cell type, keeping all genes of `adata`
        # and only donors with at least 20 cells; used to count the remaining donors.
        adata_ct = adata[adata.obs[ct_column] == ct_nuc,]
        expr,ages = filter_anndata(adata_ct,
                                 ct_column = ct_column,
                                 cts = [ct_nuc],
                                 donor_column = donor_column,
                                 age_column = age_column,
                                 marker_genes = adata.var_names,
                                 min_cells = 20
                                )

        all_donor = expr.index.unique()
        all_donor_num = all_donor.shape[0]
        del(adata_ct)

        # Only cell types with at least 10 remaining donors are evaluated.
        if all_donor_num >= 10:
            test_gene_types = [f"all_{group}_baseline"]
            for gene_type in test_gene_types:
                # Restrict the columns to the genes shared with the chosen gene set.
                expr_sub = expr.loc[:,np.intersect1d(expr.columns,test_genes[gene_type])].copy()
                for use_pca in [False]:
                    # Every repetition is queued with the same `expr_sub` and `ages` objects.
                    for rep in range(1,n_rep+1):
                        iterator.append([expr_sub,ages,group,cts_hlca,ct_nuc,gene_type,use_pca,rep,all_donor_num])
            del(expr)
        # Printed for every cell type, whether or not jobs were queued for it.
        print(f"Group: {group}; Nuc cell type: {ct_nuc}; HLCA cell types: {cts_hlca} finished.")

Group: nonsmoker; Nuc cell type: LymphEndo; HLCA cell types: ['Lymphatic EC', 'Lymphatic EC mature'] finished.
Group: nonsmoker; Nuc cell type: B.type1; HLCA cell types: ['B cells'] finished.
Group: nonsmoker; Nuc cell type: B.type2; HLCA cell types: ['B cells'] finished.
Group: nonsmoker; Nuc cell type: B.type3; HLCA cell types: ['B cells'] finished.
Group: nonsmoker; Nuc cell type: VEcapillary; HLCA cell types: ['EC capillary', 'EC aerocyte capillary', 'EC general capillary'] finished.
Group: nonsmoker; Nuc cell type: VEaerocyte; HLCA cell types: ['EC aerocyte capillary'] finished.
Group: nonsmoker; Nuc cell type: VEarterial; HLCA cell types: ['EC arterial'] finished.
Group: nonsmoker; Nuc cell type: VEvenous; HLCA cell types: ['EC venous pulmonary', 'EC venous systemic', 'EC venous'] finished.
Group: nonsmoker; Nuc cell type: FibroblastAdventitial; HLCA cell types: ['Adventitial fibroblasts'] finished.
Group: nonsmoker; Nuc cell type: FibroblastAlveolar; HLCA cell types: ['Alveolar 

### Main analysis pipeline

In [9]:
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNet,LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

from functools import partial,reduce
from hyperopt import hp, Trials, fmin, tpe
from hyperopt import space_eval

In [10]:
def compute_features(expr, ages, mean_degree, var_degree, n_components, use_pca, pca_model):
    """
    Turn per-cell expression into per-subject features and per-subject ages.

    expr is a DataFrame whose index holds the subject ID of every row; ages is
    grouped with the same subject labels. When use_pca is true the rows are
    first projected with PCA: a new whitened PCA with n_components is fitted
    if pca_model is None, otherwise the given pca_model is only applied.

    The features are powers 1..mean_degree of each subject's mean and, when
    var_degree > 0, powers 1..var_degree of each subject's variance (missing
    variances are replaced by 0).

    Returns (X, Y, pca_model): the feature table, the mean age per subject,
    and the PCA model that was fitted or passed in.
    """
    if use_pca:
        if pca_model is None:
            pca_model = PCA(n_components=n_components, whiten=True)
            projected = pca_model.fit_transform(expr)
        else:
            projected = pca_model.transform(expr)
        expr = pd.DataFrame(projected, index = expr.index)
        expr.columns = [f"PC{i+1}" for i in range(expr.shape[1])]

    def _powers_frame(per_subject, degree, names):
        # Stack per_subject**1 ... per_subject**degree side by side.
        stacked = np.hstack([per_subject**power for power in range(1, degree+1)])
        return pd.DataFrame(stacked, index = per_subject.index, columns = names)

    subject_labels = expr.index.to_list()

    # Per-subject means and their polynomial terms.
    mean_names = np.hstack([[f"{col}_mean^{deg}" for col in expr.columns] for deg in range(1,mean_degree+1)])
    X = _powers_frame(expr.groupby(subject_labels).mean(), mean_degree, mean_names)

    if var_degree > 0:
        # Per-subject variances and their polynomial terms.
        per_subject_var = expr.groupby(subject_labels).var()
        per_subject_var.fillna(value=0,inplace=True)
        var_names = np.hstack([[f"{col}_var^{deg}" for col in expr.columns] for deg in range(1,var_degree+1)])
        X = pd.concat([X, _powers_frame(per_subject_var, var_degree, var_names)],axis = 1)

    # Target: one mean age per subject.
    Y = ages.groupby(subject_labels).mean()

    return X,Y,pca_model

In [11]:
def train_test_loo(expr, ages, use_pca, param_space, n_hyper_eval, subject_type):
    """
    Leave-one-subject-out evaluation of a plain linear regression predicting age.

    For every subject in ``expr.index`` the model is trained on all other
    subjects and tested on the held-out one. Features come from
    ``compute_features`` with ``mean_degree=1`` and ``var_degree=0`` (per-subject
    mean expression); features and ages are standardized with scalers fitted
    on the training subjects only.

    Parameters
    ----------
    expr : DataFrame of expression values whose index holds the subject ID of each row.
    ages : Series of ages whose index holds the same subject IDs.
    use_pca : forwarded to ``compute_features``; when true, PCA is fitted on the
        training rows and applied to the test rows.
    param_space, n_hyper_eval, subject_type : accepted so the signature matches
        the caller; this baseline does not use them (there is no hyperparameter search).

    Returns
    -------
    r2, MAE : metrics computed on the standardized ages collected over all
        folds (each fold uses its own scaler).
    Y_trues_untrans, Y_preds_untrans : lists with the true and predicted age of
        each held-out subject, mapped back to the original age scale.
    """
    subjects = expr.index.unique()
    Y_trues = []
    Y_preds = []
    Y_trues_untrans = []
    Y_preds_untrans = []

    # Leave-one-out test
    for subj in subjects:

        test_subjects = [subj]
        train_subjects = subjects[~subjects.isin(test_subjects)]

        X_train = expr.loc[expr.index.isin(train_subjects),]
        Y_train = ages.loc[ages.index.isin(train_subjects),]
        X_test = expr.loc[expr.index.isin(test_subjects),]
        Y_test = ages.loc[ages.index.isin(test_subjects),]

        # Note that PCA trained on training data was used on test data.
        X_train, Y_train, pca_model = compute_features(X_train, Y_train, mean_degree=1, var_degree=0, n_components=10, use_pca = use_pca, pca_model=None)
        X_test, Y_test, _ = compute_features(X_test, Y_test, mean_degree=1, var_degree=0, n_components=10, use_pca=use_pca, pca_model = pca_model)

        # Center and scale X using training statistics only
        x_scaler = StandardScaler().fit(X_train)
        X_train = x_scaler.transform(X_train)
        X_test = x_scaler.transform(X_test)

        # Center and scale Y using training statistics only
        Y_train = Y_train.values
        Y_test = Y_test.values

        y_scaler = StandardScaler().fit(Y_train.reshape(-1,1))
        Y_train = y_scaler.transform(Y_train.reshape(-1,1)).ravel()
        Y_test = y_scaler.transform(Y_test.reshape(-1,1)).ravel()

        # Fit the baseline model (ordinary least squares, nothing to tune)
        model = LinearRegression()
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)

        Y_trues.append(Y_test)
        Y_preds.append(Y_pred)

        # Map the held-out subject's values back to the age scale with the
        # scaler itself. StandardScaler divides by scale_, which is 1 (not
        # sqrt(var_) == 0) when all training ages are equal, so undoing the
        # scaling by hand with sqrt(var_) would be wrong in that case.
        Y_trues_untrans.append(y_scaler.inverse_transform(Y_test[:1].reshape(-1,1))[0, 0])
        Y_preds_untrans.append(y_scaler.inverse_transform(Y_pred[:1].reshape(-1,1))[0, 0])

    # Compute R2 and MAE as evaluation metrics
    Y_trues = np.hstack(Y_trues)
    Y_preds = np.hstack(Y_preds)

    r2 = r2_score(Y_trues, Y_preds)
    MAE = mean_absolute_error(Y_trues, Y_preds)

    return r2,MAE,Y_trues_untrans,Y_preds_untrans

### Run the leave-one-out evaluation for all jobs in parallel

In [17]:
from joblib import delayed, Parallel
import warnings

def run_all(expr, ages, group, cts_hlca, ct_nuc, gene_type, use_pca, all_donor_num, n_hyper_eval = 30, rep = None):
    """
    Run the leave-one-out evaluation for one job and return one result row.

    Parameters
    ----------
    expr, ages : expression table and ages passed on to ``train_test_loo``.
    group : group label; forwarded as ``subject_type`` and echoed in the result.
    cts_hlca : list of HLCA cell-type names; joined with ";" in the result.
    ct_nuc, gene_type, use_pca, all_donor_num : echoed in the result
        (``use_pca`` is also forwarded to ``train_test_loo``).
    n_hyper_eval : forwarded to ``train_test_loo``.
    rep : repetition label echoed in the result. Defaults to ``None``; the
        previous default (``rep = rep``) captured whatever module-level ``rep``
        existed when this cell ran and failed on a kernel where it did not exist.

    Returns
    -------
    tuple : (group, HLCA cell types, nuc-seq cell type, gene_type, use_pca, r2,
        rep, all_donor_num, true ages, predicted ages), with the two age lists
        serialized as comma-separated strings.
    """
    # Re-applied here so the filter is also set wherever this function runs.
    warnings.filterwarnings("ignore")

    # Define hyperparameter search space (passed through to train_test_loo)
    param_space = {'n_components' : hp.choice('n_components', [10]),
                 'mean_degree': hp.choice('mean_degree', [2]),
                 'var_degree': hp.choice('var_degree', [2]), 
                 'alpha': hp.choice('alpha', [0.001, 0.01, 0.1, 1, 10, 100]), 
                 'l1_ratio': hp.uniform('l1_ratio', 0.1, 1.0)
        }

    # Run leave-one-out test for current cell type
    r2,MAE,true_age,pred_age = train_test_loo(expr, ages, use_pca, param_space, n_hyper_eval = n_hyper_eval, subject_type = group)

    # Serialize the per-subject ages so each job fits in one table row
    true_age = ",".join([str(age) for age in true_age])
    pred_age = ",".join([str(age) for age in pred_age])

    return group,";".join(cts_hlca),ct_nuc,gene_type,use_pca,r2,rep,all_donor_num,true_age,pred_age

# Evaluate every queued job with up to 10 joblib workers; each entry of
# `iterator` is unpacked into the arguments of run_all.
res = Parallel(n_jobs = 10)(
    delayed(run_all)(
        job_expr, job_ages, job_group, job_cts_hlca, job_ct_nuc,
        job_gene_type, job_use_pca, job_donor_num,
        n_hyper_eval = 30,
        rep = job_rep,
    )
    for (job_expr, job_ages, job_group, job_cts_hlca, job_ct_nuc,
         job_gene_type, job_use_pca, job_rep, job_donor_num) in tqdm(iterator)
)

100%|██████████| 130/130 [07:31<00:00,  3.47s/it]


In [18]:
# Collect the per-job result tuples into one table, highest R2 first.
to_save = pd.DataFrame(
    res,
    columns = [
        "Group",
        "Cell types(HLCA)",
        "Cell type(Nuc-seq)",
        "gene_type",
        "use_pca",
        "R2",
        "rep",
        "donor_num",
        "true_age",
        "pred_age",
    ],
).sort_values(by = "R2", ascending = False)

Save results

In [21]:
# Write the results table to disk.
# `os` is imported here because no earlier cell imports it.
import os

# makedirs also creates the parent "results" directory when it is missing,
# and exist_ok=True makes the cell safe to re-run.
os.makedirs("results/eval", exist_ok = True)
to_save.to_csv("results/eval/performances_nucseq_loo_baselineLR.csv",index = None)