In [4]:
import os
from functools import reduce

import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import sklearn
from tqdm import tqdm

Read HLCA data

In [5]:
# Load the step-1 processed HLCA files, one AnnData object per smoking-status cohort.
hlca_smoker, hlca_nonsmoker = (
    sc.read_h5ad(h5ad_path)
    for h5ad_path in (
        "data/step1_HLCA_smoker_processed.h5ad",
        "data/step1_HLCA_nonsmoker_processed.h5ad",
    )
)



Keep only these columns

In [6]:
# The only cell-level metadata used downstream: donor, age and the five
# annotation levels. Both cohorts are trimmed to exactly these columns.
obs_columns_to_keep = [
    'donor_id',
    'age',
    'ann_level_1',
    'ann_level_2',
    'ann_level_3',
    'ann_level_4',
    'ann_level_5',
]

for hlca_adata in (hlca_smoker, hlca_nonsmoker):
    hlca_adata.obs = hlca_adata.obs.loc[:, obs_columns_to_keep]

Define different types of gene sets used as input features

In [7]:
# Gene sets hard-coded in the notebook.
fridman_genes = [
    "ALDH1A3", "AOPEP", "CCND1", "CD44", "CDKN1A", "CDKN1C", "CDKN2A", "CDKN2B",
    "CDKN2D", "CITED2", "CLTB", "COL1A2", "CREG1", "CRYAB", "CCN2", "CXCL14",
    "CYP1B1", "EIF2S2", "ESM1", "F3", "FILIP1L", "FN1", "GSN", "GUK1",
    "HBS1L", "HPS5", "HSPA2", "HTATIP2", "IFI16", "IFNG", "IGFBP1", "IGFBP2",
    "IGFBP3", "IGFBP4", "IGFBP5", "IGFBP6", "IGFBP7", "IGSF3", "ING1", "IRF5",
    "IRF7", "ISG15", "MAP1LC3B", "MAP2K3", "MDM2", "MMP1", "NDN", "NME2",
    "NRG1", "OPTN", "PEA15", "RAB13", "RAB31", "RAB5B", "RABGGTA", "RAC1",
    "RBL2", "RGL2", "RHOB", "RRAS", "S100A11", "SERPINB2", "SERPINE1", "SMPD1",
    "SMURF2", "SOD1", "SPARC", "STAT1", "TES", "TFAP2A", "TGFB1I1", "THBS1",
    "TNFAIP2", "TNFAIP3", "TP53", "TSPYL5", "VIM", "ALDH1A1", "BMI1", "CCNB1",
    "CDC25B", "CKS1BP7", "COL3A1", "E2F4", "EGR1", "ID1", "LAMA1", "LDB2",
    "MARCKS", "CCN4",
]
sasp2_genes = [
    "VEGFA", "TNFRSF12A", "TNFRSF10C", "TNFRSF10B", "TIMP2", "TIMP1", "TGFB1",
    "SERPINE1", "TNFRSF1A", "PLAUR", "PLAU", "MMP14", "MMP13", "MMP7", "MMP3",
    "MIF", "LMNA", "KITLG", "IL32", "IGFBP7", "IGFBP2", "ICAM1", "FAS", "EREG",
    "CXCL17", "CXCL16", "CXCL8", "CXCL1", "CTSB", "CLU", "CCL20", "CCL2", "BTC",
    "AREG",
]

# Gene sets stored as sheets of one Excel workbook (note the differing
# capitalisation of the symbol column between the two sheets).
senescence_workbook = "data/senescence_list.xlsx"
senmayo_genes = pd.read_excel(senescence_workbook, sheet_name="SenMayo")["symbol"].tolist()
cellage_genes = pd.read_excel(senescence_workbook, sheet_name="CellAge Senescence Genes")["Symbol"].tolist()

test_genes = {
    "fridman": fridman_genes,
    "sasp2": sasp2_genes,
    "senmayo": senmayo_genes,
    "cellage": cellage_genes,
}

# Union of the four sets above; np.union1d yields a sorted, de-duplicated array.
curated_gene_lists = [test_genes[set_name] for set_name in ("fridman", "sasp2", "senmayo", "cellage")]
test_genes["union"] = reduce(np.union1d, curated_gene_lists)

# Every gene present in each cohort's expression matrix.
test_genes["all_smoker"] = hlca_smoker.var_names.tolist()
test_genes["all_nonsmoker"] = hlca_nonsmoker.var_names.tolist()

## Modeling

In [8]:
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

from functools import partial,reduce
from hyperopt import hp, Trials, fmin, tpe
from hyperopt import space_eval

Data filtering by Number of Cells

In [9]:
def filter_anndata_single_ct(anndata, ct_column, ct, donor_column, age_column, marker_genes = None, min_cells = 20):
    '''
    Filter an AnnData object by cell type, marker genes, and subjects.

    Parameters
    ----------
    anndata : AnnData-like
        Must support `anndata[rows, cols]` indexing and expose `.obs`,
        `.var_names` and `.to_df()`.
    ct_column : str
        Column of `.obs` holding the cell-type annotation.
    ct : str
        Cell type to keep.
    donor_column : str
        Column of `.obs` identifying the subject each cell came from.
    age_column : str
        Column of `.obs` holding the subject's age.
    marker_genes : list-like or None
        Genes (entries of `.var_names`) to keep. `None` keeps every gene.
    min_cells : int
        A subject is kept only if it has at least this many cells of type `ct`.

    Returns
    -------
    expr : pandas.DataFrame
        Cells x genes expression matrix, indexed by the donor of each cell.
    ages : pandas.Series
        Age of each retained cell's donor, sharing `expr`'s index.
    '''
    # Keep rows annotated with the current cell type.
    ct_anndata = anndata[anndata.obs[ct_column] == ct, :]

    # Keep columns annotated as marker genes. With the default None,
    # `var_names.isin(None)` raises a TypeError, so None is treated as "no gene filter".
    if marker_genes is not None:
        ct_anndata = ct_anndata[:, ct_anndata.var_names.isin(marker_genes)]

    # Select subjects having at least min_cells cells of this type.
    cells_per_subject = ct_anndata.obs[donor_column].value_counts()
    selected_subjects = cells_per_subject[cells_per_subject >= min_cells].index

    # Further subset the data to the selected subjects.
    ct_anndata = ct_anndata[ct_anndata.obs[donor_column].isin(selected_subjects), :]

    # Build the expression matrix and ages, both indexed by donor id.
    expr = ct_anndata.to_df()
    expr.index = ct_anndata.obs[donor_column].values
    # Copy before re-indexing so the Series taken from `.obs` is never modified.
    ages = ct_anndata.obs[age_column].copy()
    ages.index = expr.index

    return expr, ages

In [10]:
def _power_features(stats, max_degree, label):
    '''Stack stats**1 ... stats**max_degree side by side, naming columns "<col>_<label>^<degree>".'''
    degrees = range(1, max_degree + 1)
    values = np.hstack([(stats ** deg).to_numpy() for deg in degrees])
    names = [f"{col}_{label}^{deg}" for deg in degrees for col in stats.columns]
    return pd.DataFrame(values, index = stats.index, columns = names)


def compute_features(expr, ages, mean_degree, var_degree, n_components, use_pca, pca_model):
    '''
    Turn a cell-level expression matrix into one feature row per subject.

    Parameters
    ----------
    expr : pandas.DataFrame
        Cells x genes matrix whose index holds the subject id of every cell.
    ages : pandas.Series
        Age per cell, in the same row order as `expr`.
    mean_degree : int
        Highest power of the per-subject mean to include (must be >= 1).
    var_degree : int
        Highest power of the per-subject variance to include; 0 disables
        variance features.
    n_components : int
        Number of principal components when a new PCA is fitted. An integer
        larger than min(n_cells, n_genes) is clamped to that bound, because
        PCA cannot be fitted with more components than that.
    use_pca : bool
        Project cells onto principal components before aggregating.
    pca_model : fitted PCA or None
        With `use_pca`, None fits a new whitened PCA on `expr`; otherwise the
        given model is only applied (e.g. to held-out subjects).

    Returns
    -------
    X : pandas.DataFrame
        Subjects x features (polynomials of means, then of variances).
    Y : pandas.Series
        Mean age per subject, in the same subject order as `X`.
    pca_model : PCA or None
        The PCA that was fitted or passed in (None when `use_pca` is False).
    '''
    if use_pca:
        if pca_model is None:
            if isinstance(n_components, (int, np.integer)):
                # Small gene sets can have fewer genes than requested components.
                n_components = min(n_components, *expr.shape)
            pca_model = PCA(n_components=n_components, whiten=True)
            scores = pca_model.fit_transform(expr)
        else:
            scores = pca_model.transform(expr)
        expr = pd.DataFrame(scores, index = expr.index)
        expr.columns = [f"PC{i+1}" for i in range(expr.shape[1])]

    # Per-subject mean expression and its polynomial expansion.
    subjects = expr.index.to_list()
    expr_mean = expr.groupby(subjects).mean()
    expr_mean_poly = _power_features(expr_mean, mean_degree, "mean")

    if var_degree > 0:
        # Per-subject variance; it is NaN for a subject with a single cell,
        # which is replaced by 0.
        expr_var = expr.groupby(subjects).var().fillna(value=0)
        expr_var_poly = _power_features(expr_var, var_degree, "var")

        # Concatenate mean and variance polynomial features.
        X = pd.concat([expr_mean_poly, expr_var_poly], axis = 1)
    else:
        X = expr_mean_poly

    # One target value per subject.
    Y = ages.groupby(subjects).mean()

    return X,Y,pca_model

Generate data for feature extraction and training/testing

In [11]:
import warnings
warnings.filterwarnings("ignore")

# `iterator` collects one entry per model-fitting job:
# [expression subset, ages, cohort, cell type, annotation level, gene set, use_pca, repetition, donor count]
iterator = []
n_rep = 5
donor_types = ["nonsmoker", "smoker"]
adatas = [hlca_nonsmoker, hlca_smoker]

for annLevel in ['ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5']:
    # Cell-type labels found in either cohort at this annotation level.
    c1 = hlca_nonsmoker.obs[annLevel]
    c2 = hlca_smoker.obs[annLevel]
    cell_types = np.union1d(c1, c2)
    selected_cell_types = []

    for adata, group in zip(adatas, donor_types):
        for ct in cell_types:

            # Restrict to this cell type and to donors with at least 20 such cells.
            adata_ct = adata[adata.obs[annLevel] == ct,]
            expr, ages = filter_anndata_single_ct(
                adata_ct,
                ct_column = annLevel,
                ct = ct,
                donor_column = "donor_id",
                age_column = "age",
                marker_genes = adata.var_names,
                min_cells = 20,
            )
            all_donor = expr.index.unique()
            all_donor_num = all_donor.shape[0]
            del adata_ct

            # Skip the "None" label and cell types with fewer than 10 usable donors.
            if all_donor_num < 10 or ct == "None":
                continue

            selected_cell_types.append(ct)
            test_gene_types = ["fridman", "sasp2", "senmayo", "cellage", "union", f"all_{group}"]
            for gene_type in test_gene_types:
                # Keep only the genes of this set that are present in the matrix.
                shared_genes = np.intersect1d(expr.columns, test_genes[gene_type])
                expr_sub = expr.loc[:, shared_genes].copy()
                for use_pca in [True, False]:
                    for rep in range(1, n_rep + 1):
                        iterator.append([expr_sub, ages, group, ct, annLevel, gene_type, use_pca, rep, all_donor_num])
            del expr

Train and test models

In [12]:
def tune(param, expr, ages, subject_type, ct):
    '''
    Objective for the hyperparameter search: negative R2 of an ElasticNet fit.

    `param` must contain 'n_components', 'mean_degree', 'var_degree', 'alpha'
    and 'l1_ratio'; only 'alpha' and 'l1_ratio' affect the model here.
    `expr` (features) and `ages` (targets) are standardized before fitting.
    `subject_type` and `ct` are accepted but not used.

    NOTE(review): the model is scored on the same data it was fitted on, so
    the returned loss is the negative *training* R2.
    '''
    # All five entries are looked up, so a missing key raises KeyError.
    n_components, mean_degree, var_degree = (
        param[key] for key in ('n_components', 'mean_degree', 'var_degree')
    )
    alpha, l1_ratio = param['alpha'], param['l1_ratio']

    # Standardize features (on a copy of the input).
    features = StandardScaler().fit_transform(expr.copy())

    # Center and scale the ages.
    age_column = ages.copy().values.reshape(-1, 1)
    targets = StandardScaler().fit_transform(age_column).ravel()

    # Fit the regularized linear model.
    model = make_pipeline(ElasticNet(max_iter=40000, alpha=alpha, l1_ratio=l1_ratio))
    model.fit(features, targets)

    # Negative R2 of the fitted values, so that minimizing the loss maximizes R2.
    fitted = model.predict(features)
    return -r2_score(targets, fitted)

In [13]:
def train_test_loo(expr, ages, use_pca, param_space, n_hyper_eval, subject_type, ct):
    '''
    Leave-one-subject-out evaluation of the polynomial ElasticNet age model.

    For every subject in `expr.index`, features are built from all other
    subjects, hyperparameters are searched on that training set with
    hyperopt, a model is fitted, and the held-out subject's age is predicted.

    Parameters
    ----------
    expr : pandas.DataFrame
        Cells x genes matrix indexed by subject id.
    ages : pandas.Series
        Age per cell, indexed by subject id like `expr`.
    use_pca : bool
        Forwarded to compute_features.
    param_space : dict
        hyperopt search space; 'alpha' and 'l1_ratio' of the best point are used.
    n_hyper_eval : int
        Number of hyperopt evaluations per fold.
    subject_type, ct :
        Forwarded to tune().

    Returns
    -------
    r2, MAE : float
        Computed on the pooled standardized ages; each fold is standardized
        with its own training mean and standard deviation.
    Y_trues_untrans, Y_preds_untrans : list of 1-element arrays
        True and predicted ages mapped back to the original units, one entry per fold.
    '''
    subjects = expr.index.unique()
    Y_trues = []
    Y_preds = []
    Y_trues_untrans = []
    Y_preds_untrans = []

    # Leave-one-out test
    for subj in subjects:

        test_subjects = [subj]
        train_subjects = subjects[~subjects.isin(test_subjects)]

        X_train = expr.loc[expr.index.isin(train_subjects),]
        Y_train = ages.loc[ages.index.isin(train_subjects),]
        X_test = expr.loc[expr.index.isin(test_subjects),]
        Y_test = ages.loc[ages.index.isin(test_subjects),]

        # The PCA fitted on the training subjects is reused for the test subject.
        # Feature degrees and component count are fixed here, not taken from param_space.
        X_train, Y_train, pca_model = compute_features(X_train, Y_train, mean_degree=2, var_degree=2, n_components=10, use_pca = use_pca, pca_model=None)
        X_test, Y_test, _ = compute_features(X_test, Y_test, mean_degree=2, var_degree=2, n_components=10, use_pca=use_pca, pca_model = pca_model)

        # Partial freezes the data arguments of tune(); hyperopt supplies `param`.
        fmin_objective = partial(
                                tune,
                                expr=X_train,
                                ages=Y_train,
                                subject_type = subject_type,
                                ct = ct
                            )

        # Search for the best hyperparameters on training data
        param_best = fmin(fmin_objective,
                            space = param_space,
                            algo=tpe.suggest,
                            max_evals=n_hyper_eval,
                            verbose = False
                         )
        # fmin returns indices for hp.choice entries; map them back to values.
        param_best = space_eval(param_space, param_best)

        # Standardize features with training statistics only.
        x_scaler = StandardScaler().fit(X_train)
        X_train = x_scaler.transform(X_train)
        X_test = x_scaler.transform(X_test)

        # Center and scale Y with training statistics only.
        Y_train = Y_train.values
        Y_test = Y_test.values

        y_scaler = StandardScaler().fit(Y_train.reshape(-1,1))
        Y_train = y_scaler.transform(Y_train.reshape(-1,1)).ravel()
        Y_test = y_scaler.transform(Y_test.reshape(-1,1)).ravel()

        # Use the best hyperparameters to fit a model on training data
        polyreg = make_pipeline(ElasticNet(max_iter=40000, alpha=param_best['alpha'], l1_ratio=param_best['l1_ratio']))
        polyreg.fit(X_train, Y_train)

        # Get predicted ages for testing set
        Y_trues.append(Y_test)
        Y_preds.append(polyreg.predict(X_test))

        # Map back to original units with the scaler's own inverse. Multiplying
        # by sqrt(var_) is wrong when all training ages are equal: var_ is 0 but
        # the scaler divides by scale_ == 1 in that case.
        Y_trues_untrans.append(y_scaler.inverse_transform(Y_trues[-1].reshape(-1,1)).ravel())
        Y_preds_untrans.append(y_scaler.inverse_transform(Y_preds[-1].reshape(-1,1)).ravel())

    # Compute R2 and MAE as evaluation metrics
    Y_trues = np.hstack(Y_trues)
    Y_preds = np.hstack(Y_preds)

    r2 = r2_score(Y_trues, Y_preds)
    MAE = mean_absolute_error(Y_trues, Y_preds)

    return r2,MAE,Y_trues_untrans,Y_preds_untrans

### Main analysis pipeline

Adjust `n_jobs` based on the number of cores and the amount of memory available on your machine.

In [15]:
from joblib import delayed, Parallel
import warnings

def run_all(expr, ages, group, ct, ct_column, gene_type, use_pca, all_donor_num, n_hyper_eval, rep):
    '''
    Run one leave-one-out evaluation and package its result as a flat tuple.

    `expr`, `ages`, `use_pca` and `n_hyper_eval` drive the evaluation;
    `group` and `ct` are also forwarded to train_test_loo. `ct_column`,
    `gene_type`, `rep` and `all_donor_num` are passed through to the output.

    Returns (ct_column, group, ct, gene_type, use_pca, r2, rep, all_donor_num,
    true ages, predicted ages), where the two age fields are comma-joined
    strings with one entry per held-out subject.
    '''
    # Warnings are silenced inside the function so this also applies in worker processes.
    warnings.filterwarnings("ignore")

    # Hyperparameter search space: only alpha and l1_ratio actually vary.
    param_space = dict(
        n_components = hp.choice('n_components', [10]),
        mean_degree = hp.choice('mean_degree', [2]),
        var_degree = hp.choice('var_degree', [2]),
        alpha = hp.choice('alpha', [0.001, 0.01, 0.1, 1, 10, 100]),
        l1_ratio = hp.uniform('l1_ratio', 0.1, 1.0),
    )

    # Leave-one-out test for the current cell type (the returned MAE is not reported).
    r2, MAE, true_ages, pred_ages = train_test_loo(
        expr,
        ages,
        use_pca,
        param_space,
        n_hyper_eval = n_hyper_eval,
        subject_type = group,
        ct = ct,
    )

    # Serialize the per-subject ages so that each job yields a single table row.
    true_age_str = ",".join(map(str, true_ages))
    pred_age_str = ",".join(map(str, pred_ages))

    return ct_column, group, ct, gene_type, use_pca, r2, rep, all_donor_num, true_age_str, pred_age_str

# One delayed run_all call per entry of `iterator`; tqdm reports how many
# entries have been handed to the worker pool.
jobs = (
    delayed(run_all)(
        expr,
        ages,
        group,
        ct,
        ct_column,
        gene_type,
        use_pca,
        all_donor_num,
        n_hyper_eval = 30,
        rep = rep,
    )
    for expr, ages, group, ct, ct_column, gene_type, use_pca, rep, all_donor_num in tqdm(iterator)
)

# `res` holds one result tuple per job.
res = Parallel(n_jobs = 100)(jobs)

100%|██████████| 5400/5400 [33:28:14<00:00, 22.31s/it]   


Save results

In [57]:
# One row per job; the column order mirrors the tuple returned by run_all.
to_save = pd.DataFrame(res, columns = ["ann_level","group","cell_type","gene_type","use_pca","R2","rep","all_donor_num","true_age","pred_age"])

# Collapse the cohort-specific whole-transcriptome labels into a single "all".
# The gene sets are keyed "all_smoker"/"all_nonsmoker" upstream, so replacing
# only the "common_*" spellings never matched; both spellings are mapped here.
gene_type_labels = {
    "all_smoker": "all",
    "all_nonsmoker": "all",
    "common_smoker": "all",
    "common_nonsmoker": "all",
}
to_save["gene_type"] = to_save["gene_type"].replace(gene_type_labels)

Remove the "[" and "]" characters from the serialized age strings.

In [58]:
# Each age was serialized with str() on a one-element array, which leaves
# "[" and "]" around every number. Strip them column-wise; this needs no `re`
# import and does not depend on the frame having a default integer index.
for age_column in ["true_age", "pred_age"]:
    to_save[age_column] = to_save[age_column].str.replace(r"[\[\]]", "", regex=True)

In [59]:
# Per-configuration summary: mean and standard deviation of R2 across repetitions.
# NOTE(review): `dfs` is built here but is not what gets written to disk below — confirm intent.
dfs = []
for grp,df in to_save.groupby(["ann_level","group","cell_type","gene_type","use_pca"]):
    # Build a new frame instead of modifying the group slice in place.
    summary = (
        df.assign(R2_mean = df["R2"].mean(), R2_std = df["R2"].std())
          .drop(columns = ["R2", "rep"])
          .drop_duplicates()
    )
    dfs.append(summary)
dfs = pd.concat(dfs)

# Best-performing runs first.
to_save = to_save.sort_values(by = "R2", ascending = False)

# makedirs also creates "results" when it is missing and tolerates an existing directory.
os.makedirs("results/eval", exist_ok = True)
to_save.to_csv("results/eval/performances_HLCA_loo_polyEN.csv",index = None)