This notebook performs the CD test on the HLCA dataset and on the Carraro and IPF datasets using the polyEN pipeline.

In [2]:
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
from tqdm import tqdm
from functools import reduce
from anndata import concat

Read HLCA and corrected combined data

In [3]:
# Load the step-1 processed HLCA objects, one AnnData per smoking status.
hlca_smoker, hlca_nonsmoker = map(
    sc.read_h5ad,
    (
        "data/step1_HLCA_smoker_processed.h5ad",
        "data/step1_HLCA_nonsmoker_processed.h5ad",
    ),
)



In [4]:
# Load the corrected combined objects written under results/correction_mnnpy.
combined_smoker = sc.read_h5ad("results/correction_mnnpy/corrected_combined_smoker.h5ad")
combined_nonsmoker = sc.read_h5ad("results/correction_mnnpy/corrected_combined_nonsmoker.h5ad")

# Discard every cell whose batch label is "HLCA".
smoker_not_hlca = combined_smoker.obs["batch"] != "HLCA"
nonsmoker_not_hlca = combined_nonsmoker.obs["batch"] != "HLCA"
combined_smoker = combined_smoker[smoker_not_hlca, :]
combined_nonsmoker = combined_nonsmoker[nonsmoker_not_hlca, :]

# Cell names with their last "-"-separated token removed; these are the keys
# used below to look rows up in the other metadata tables.
smoker_ind = pd.Index(
    ["-".join(tokens[:-1]) for tokens in combined_smoker.obs_names.str.split("-")]
)
nonsmoker_ind = pd.Index(
    ["-".join(tokens[:-1]) for tokens in combined_nonsmoker.obs_names.str.split("-")]
)

# Cell metadata of the step-1 processed combined objects (only .obs is kept).
combined_smoker_obs1 = sc.read_h5ad("data/step1_combined_smoker_processed.h5ad").obs
combined_nonsmoker_obs1 = sc.read_h5ad("data/step1_combined_nonsmoker_processed.h5ad").obs

In [5]:
# `combined_*`, `*_ind` and `*_obs1` are produced by the previous cell.  This
# cell used to repeat those ten lines verbatim, re-reading four h5ad files from
# disk without changing any result, so only the label-transfer merge remains.

# Metadata written by the label-transfer step.
combined_smoker_obs2 = sc.read_h5ad("results/label_transfer/combined_smoker/combined_embedding.h5ad").obs
combined_nonsmoker_obs2 = sc.read_h5ad("results/label_transfer/combined_nonsmoker/combined_embedding.h5ad").obs

# Keep only rows with batch '1' (presumably the query cells -- TODO confirm)
# and drop 'age' / 'dataset' so that the column-wise concat below does not end
# up with two columns of the same name.
combined_smoker_obs2 = (
    combined_smoker_obs2
    .loc[combined_smoker_obs2["batch"] == '1']
    .drop(columns=['age', 'dataset'])
)
combined_nonsmoker_obs2 = (
    combined_nonsmoker_obs2
    .loc[combined_nonsmoker_obs2["batch"] == '1']
    .drop(columns=['age', 'dataset'])
)

# Side-by-side merge of both metadata tables, rows looked up by the stripped
# cell names, installed as the new .obs of the corrected objects.
combined_smoker.obs = pd.concat(
    [combined_smoker_obs1.loc[smoker_ind], combined_smoker_obs2.loc[smoker_ind]],
    axis=1,
)
combined_nonsmoker.obs = pd.concat(
    [combined_nonsmoker_obs1.loc[nonsmoker_ind], combined_nonsmoker_obs2.loc[nonsmoker_ind]],
    axis=1,
)

  utils.warn_names_duplicates("obs")


Keep these columns for cell meta data

In [6]:
# Metadata columns kept for every object.  HLCA objects already use the
# `ann_level_*` names; the combined objects carry `Level_*_transfered_label`
# columns, which are renamed positionally to the HLCA names.
_hlca_obs_columns = [
    'donor_id',
    'age',
    'dataset',
    'ann_level_1',
    'ann_level_2',
    'ann_level_3',
    'ann_level_4',
    'ann_level_5',
]
_transfer_obs_columns = [
    'donor_id',
    'age',
    'dataset',
    'Level_1_transfered_label',
    'Level_2_transfered_label',
    'Level_3_transfered_label',
    'Level_4_transfered_label',
    'Level_5_transfered_label',
]

hlca_smoker.obs = hlca_smoker.obs.loc[:, _hlca_obs_columns]
hlca_nonsmoker.obs = hlca_nonsmoker.obs.loc[:, _hlca_obs_columns]

combined_smoker.obs = combined_smoker.obs.loc[:, _transfer_obs_columns]
combined_nonsmoker.obs = combined_nonsmoker.obs.loc[:, _transfer_obs_columns]

# Same order as the selection above, so the rename is a plain relabelling.
combined_smoker.obs.columns = list(_hlca_obs_columns)
combined_nonsmoker.obs.columns = list(_hlca_obs_columns)

In [7]:
# Non-smokers: train on every HLCA dataset except Banovich_Kropski_2020; test
# on that dataset plus all cells of the combined object.
_nonsmoker_is_bk = hlca_nonsmoker.obs['dataset'] == "Banovich_Kropski_2020"
train_nonsmoker = hlca_nonsmoker[~_nonsmoker_is_bk, :].copy()
test_nonsmoker = concat([hlca_nonsmoker[_nonsmoker_is_bk, :], combined_nonsmoker])

# Smokers: the roles are swapped -- train on Banovich_Kropski_2020 only; test
# on the remaining HLCA datasets plus the "Kaminski" cells of the combined
# object.
# NOTE(review): the `==` / `!=` asymmetry with the non-smoker split looks
# deliberate but is not explained anywhere in the notebook -- confirm.
_smoker_is_bk = hlca_smoker.obs['dataset'] == "Banovich_Kropski_2020"
train_smoker = hlca_smoker[_smoker_is_bk, :].copy()
test_smoker = concat(
    [
        hlca_smoker[~_smoker_is_bk, :],
        combined_smoker[combined_smoker.obs["dataset"] == "Kaminski", :],
    ]
)

Define different types of gene sets used as input features

In [8]:
# Hard-coded gene list stored under the key "fridman".
_fridman_genes = [
    "ALDH1A3", "AOPEP", "CCND1", "CD44", "CDKN1A", "CDKN1C", "CDKN2A", "CDKN2B", "CDKN2D", "CITED2",
    "CLTB", "COL1A2", "CREG1", "CRYAB", "CCN2", "CXCL14", "CYP1B1", "EIF2S2", "ESM1", "F3",
    "FILIP1L", "FN1", "GSN", "GUK1", "HBS1L", "HPS5", "HSPA2", "HTATIP2", "IFI16", "IFNG",
    "IGFBP1", "IGFBP2", "IGFBP3", "IGFBP4", "IGFBP5", "IGFBP6", "IGFBP7", "IGSF3", "ING1", "IRF5",
    "IRF7", "ISG15", "MAP1LC3B", "MAP2K3", "MDM2", "MMP1", "NDN", "NME2", "NRG1", "OPTN",
    "PEA15", "RAB13", "RAB31", "RAB5B", "RABGGTA", "RAC1", "RBL2", "RGL2", "RHOB", "RRAS",
    "S100A11", "SERPINB2", "SERPINE1", "SMPD1", "SMURF2", "SOD1", "SPARC", "STAT1", "TES", "TFAP2A",
    "TGFB1I1", "THBS1", "TNFAIP2", "TNFAIP3", "TP53", "TSPYL5", "VIM", "ALDH1A1", "BMI1", "CCNB1",
    "CDC25B", "CKS1BP7", "COL3A1", "E2F4", "EGR1", "ID1", "LAMA1", "LDB2", "MARCKS", "CCN4",
]

# Hard-coded gene list stored under the key "sasp2".
_sasp2_genes = [
    "VEGFA", "TNFRSF12A", "TNFRSF10C", "TNFRSF10B", "TIMP2", "TIMP1", "TGFB1", "SERPINE1", "TNFRSF1A",
    "PLAUR", "PLAU", "MMP14", "MMP13", "MMP7", "MMP3", "MIF", "LMNA", "KITLG", "IL32",
    "IGFBP7", "IGFBP2", "ICAM1", "FAS", "EREG", "CXCL17", "CXCL16", "CXCL8", "CXCL1", "CTSB",
    "CLU", "CCL20", "CCL2", "BTC", "AREG",
]

# Gene sets used as model input features, keyed by a short name.  The last two
# are read from sheets of data/senescence_list.xlsx (note the different
# capitalisation of the symbol column in the two sheets).
test_genes = {}
test_genes["fridman"] = _fridman_genes
test_genes["sasp2"] = _sasp2_genes
test_genes["senmayo"] = pd.read_excel("data/senescence_list.xlsx", sheet_name="SenMayo")["symbol"].tolist()
test_genes["cellage"] = pd.read_excel("data/senescence_list.xlsx", sheet_name="CellAge Senescence Genes")["Symbol"].tolist()

# Sorted, de-duplicated union of the four sets above.
test_genes["union"] = reduce(
    np.union1d,
    (test_genes[set_name] for set_name in ("fridman", "sasp2", "senmayo", "cellage")),
)

## Modeling

In [9]:
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

from functools import partial
from hyperopt import hp, Trials, fmin, tpe
from hyperopt import space_eval

Data filtering by Number of Cells

In [10]:
def filter_anndata_single_ct(anndata, ct_column, ct, donor_column, age_column, marker_genes = None, min_cells = 20):
    '''
    Subset an AnnData object to one cell type, a set of genes and the donors
    that contribute enough cells, then return expression and ages per cell.

    Parameters
    ----------
    anndata : AnnData-like object supporting `[rows, cols]` indexing, `.obs`,
        `.var_names` and `.to_df()`.
    ct_column : name of the `.obs` column holding the cell-type annotation.
    ct : cell-type label to keep.
    donor_column : name of the `.obs` column holding the donor identifier.
    age_column : name of the `.obs` column holding the donor age.
    marker_genes : list-like of gene names to keep, or None to keep all genes.
    min_cells : a donor is kept only if it has at least this many cells of
        type `ct`.

    Returns
    -------
    expr : DataFrame (cells x genes) indexed by donor id, one row per cell.
    ages : Series of ages aligned with `expr`, also indexed by donor id.
    '''
    # Rows: cells annotated with the requested cell type.
    ct_anndata = anndata[anndata.obs[ct_column] == ct, :]

    # Columns: restrict to the marker genes only when a list was supplied;
    # `var_names.isin(None)` raises, which made the default value unusable.
    if marker_genes is not None:
        ct_anndata = ct_anndata[:, ct_anndata.var_names.isin(marker_genes)]

    # Donors contributing at least `min_cells` cells of this type.
    subjects = ct_anndata.obs[donor_column]
    subjects_count = subjects.groupby(subjects.values).count()
    selected_subjects = subjects_count.loc[subjects_count >= min_cells].index

    # Further subset to the cells of the selected donors.
    ct_anndata = ct_anndata[ct_anndata.obs[donor_column].isin(selected_subjects), :]

    # Expression matrix and ages, both re-indexed by donor id so that later
    # steps can aggregate per donor.  The ages are copied first so the Series
    # taken from `.obs` is not re-indexed in place.
    expr = ct_anndata.to_df()
    expr.index = ct_anndata.obs[donor_column].values
    ages = ct_anndata.obs[age_column].copy()
    ages.index = expr.index

    return expr, ages

Keep only the genes shared by the training and test datasets

In [11]:
# Sorted gene names present in both the training and the test object of each group.
common_smoker_gene, common_nonsmoker_gene = (
    np.intersect1d(train_adata.var_names, test_adata.var_names)
    for train_adata, test_adata in (
        (train_smoker, test_smoker),
        (train_nonsmoker, test_nonsmoker),
    )
)

In [12]:
# Restrict every object to its group's shared genes, in the same gene order.
train_smoker, test_smoker = (
    adata[:, common_smoker_gene] for adata in (train_smoker, test_smoker)
)
train_nonsmoker, test_nonsmoker = (
    adata[:, common_nonsmoker_gene] for adata in (train_nonsmoker, test_nonsmoker)
)

In [13]:
# Register the per-group shared gene lists as two additional gene sets.
test_genes.update(
    common_smoker=common_smoker_gene,
    common_nonsmoker=common_nonsmoker_gene,
)

Generate data

In [14]:
import warnings
warnings.filterwarnings("ignore")

# Build the list of jobs: one entry per
# (group, annotation level, cell type, gene set, use_pca, repetition).
groups = ["smoker","nonsmoker"]
train_data = [train_smoker,train_nonsmoker]
test_data = [test_smoker,test_nonsmoker]
annLevels = ['ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5']
n_rep = 5
iterator = []

for adata_train,adata_test,group in zip(train_data,test_data,groups):

    # Gene sets evaluated for this group; only the last one depends on the group.
    gene_types = ['fridman', 'sasp2', 'senmayo', 'cellage','union',f'common_{group}']

    for annLevel in annLevels:

        # Cell types annotated in both the training and the test object,
        # ignoring the placeholder label "None".
        cts1 = adata_train.obs[annLevel].unique().tolist()
        cts2 = adata_test.obs[annLevel].unique().tolist()
        cell_types = np.intersect1d(cts1,cts2)
        cell_types = cell_types[cell_types != "None"]

        for ct in cell_types:
            for gene_type in gene_types:

                # Training data.  filter_anndata_single_ct subsets to `ct`
                # itself, so the full object is passed in directly.
                expr_train,ages_train = filter_anndata_single_ct(adata_train,
                                         ct_column = annLevel,
                                         ct = ct,
                                         donor_column = "donor_id",
                                         age_column = "age",
                                         marker_genes = np.intersect1d(test_genes[gene_type],adata_train.var_names),
                                         min_cells = 20
                                        )
                donor_num_train = expr_train.index.unique().shape[0]
                # Summary statistics over donors (one age per donor; `ages_*`
                # is indexed by donor id).  These are not stored in `iterator`.
                age_train_by_donor = ages_train.groupby(level=0).mean()
                age_train_mean = age_train_by_donor.mean()
                age_train_std = age_train_by_donor.std()

                # Test data, same filtering.
                expr_test,ages_test = filter_anndata_single_ct(adata_test,
                                         ct_column = annLevel,
                                         ct = ct,
                                         donor_column = "donor_id",
                                         age_column = "age",
                                         marker_genes = np.intersect1d(test_genes[gene_type],adata_test.var_names),
                                         min_cells = 20
                                        )
                donor_num_test = expr_test.index.unique().shape[0]
                age_test_by_donor = ages_test.groupby(level=0).mean()
                age_test_mean = age_test_by_donor.mean()
                age_test_std = age_test_by_donor.std()

                # No donor-count filter here: every combination is queued and
                # run_all decides whether there are enough donors to fit.
                for use_pca in [True,False]:
                    for rep in range(1,n_rep+1):
                        iterator.append([expr_train,ages_train,
                                         expr_test,ages_test,
                                         group,ct,annLevel,gene_type,
                                         use_pca,rep,donor_num_train,donor_num_test]
                                       )
                print(group,annLevel,ct,gene_type,"done")

smoker ann_level_2 Airway epithelium fridman done
smoker ann_level_2 Airway epithelium sasp2 done
smoker ann_level_2 Airway epithelium senmayo done
smoker ann_level_2 Airway epithelium cellage done
smoker ann_level_2 Airway epithelium union done
smoker ann_level_2 Airway epithelium common_smoker done
smoker ann_level_2 Alveolar epithelium fridman done
smoker ann_level_2 Alveolar epithelium sasp2 done
smoker ann_level_2 Alveolar epithelium senmayo done
smoker ann_level_2 Alveolar epithelium cellage done
smoker ann_level_2 Alveolar epithelium union done
smoker ann_level_2 Alveolar epithelium common_smoker done
smoker ann_level_2 Blood vessels fridman done
smoker ann_level_2 Blood vessels sasp2 done
smoker ann_level_2 Blood vessels senmayo done
smoker ann_level_2 Blood vessels cellage done
smoker ann_level_2 Blood vessels union done
smoker ann_level_2 Blood vessels common_smoker done
smoker ann_level_2 Fibroblast lineage fridman done
smoker ann_level_2 Fibroblast lineage sasp2 done
smoker

smoker ann_level_3 SM activated stress response union done
smoker ann_level_3 SM activated stress response common_smoker done
smoker ann_level_3 Secretory fridman done
smoker ann_level_3 Secretory sasp2 done
smoker ann_level_3 Secretory senmayo done
smoker ann_level_3 Secretory cellage done
smoker ann_level_3 Secretory union done
smoker ann_level_3 Secretory common_smoker done
smoker ann_level_3 Smooth muscle FAM83D+ fridman done
smoker ann_level_3 Smooth muscle FAM83D+ sasp2 done
smoker ann_level_3 Smooth muscle FAM83D+ senmayo done
smoker ann_level_3 Smooth muscle FAM83D+ cellage done
smoker ann_level_3 Smooth muscle FAM83D+ union done
smoker ann_level_3 Smooth muscle FAM83D+ common_smoker done
smoker ann_level_3 T cell lineage fridman done
smoker ann_level_3 T cell lineage sasp2 done
smoker ann_level_3 T cell lineage senmayo done
smoker ann_level_3 T cell lineage cellage done
smoker ann_level_3 T cell lineage union done
smoker ann_level_3 T cell lineage common_smoker done
smoker ann

smoker ann_level_4 Non-classical monocytes union done
smoker ann_level_4 Non-classical monocytes common_smoker done
smoker ann_level_4 Peribronchial fibroblasts fridman done
smoker ann_level_4 Peribronchial fibroblasts sasp2 done
smoker ann_level_4 Peribronchial fibroblasts senmayo done
smoker ann_level_4 Peribronchial fibroblasts cellage done
smoker ann_level_4 Peribronchial fibroblasts union done
smoker ann_level_4 Peribronchial fibroblasts common_smoker done
smoker ann_level_4 Pericytes fridman done
smoker ann_level_4 Pericytes sasp2 done
smoker ann_level_4 Pericytes senmayo done
smoker ann_level_4 Pericytes cellage done
smoker ann_level_4 Pericytes union done
smoker ann_level_4 Pericytes common_smoker done
smoker ann_level_4 Plasma cells fridman done
smoker ann_level_4 Plasma cells sasp2 done
smoker ann_level_4 Plasma cells senmayo done
smoker ann_level_4 Plasma cells cellage done
smoker ann_level_4 Plasma cells union done
smoker ann_level_4 Plasma cells common_smoker done
smoker a

nonsmoker ann_level_2 Lymphoid fridman done
nonsmoker ann_level_2 Lymphoid sasp2 done
nonsmoker ann_level_2 Lymphoid senmayo done
nonsmoker ann_level_2 Lymphoid cellage done
nonsmoker ann_level_2 Lymphoid union done
nonsmoker ann_level_2 Lymphoid common_nonsmoker done
nonsmoker ann_level_2 Mesothelium fridman done
nonsmoker ann_level_2 Mesothelium sasp2 done
nonsmoker ann_level_2 Mesothelium senmayo done
nonsmoker ann_level_2 Mesothelium cellage done
nonsmoker ann_level_2 Mesothelium union done
nonsmoker ann_level_2 Mesothelium common_nonsmoker done
nonsmoker ann_level_2 Myeloid fridman done
nonsmoker ann_level_2 Myeloid sasp2 done
nonsmoker ann_level_2 Myeloid senmayo done
nonsmoker ann_level_2 Myeloid cellage done
nonsmoker ann_level_2 Myeloid union done
nonsmoker ann_level_2 Myeloid common_nonsmoker done
nonsmoker ann_level_2 Smooth muscle fridman done
nonsmoker ann_level_2 Smooth muscle sasp2 done
nonsmoker ann_level_2 Smooth muscle senmayo done
nonsmoker ann_level_2 Smooth muscle 

nonsmoker ann_level_3 Submucosal Secretory cellage done
nonsmoker ann_level_3 Submucosal Secretory union done
nonsmoker ann_level_3 Submucosal Secretory common_nonsmoker done
nonsmoker ann_level_3 T cell lineage fridman done
nonsmoker ann_level_3 T cell lineage sasp2 done
nonsmoker ann_level_3 T cell lineage senmayo done
nonsmoker ann_level_3 T cell lineage cellage done
nonsmoker ann_level_3 T cell lineage union done
nonsmoker ann_level_3 T cell lineage common_nonsmoker done
nonsmoker ann_level_4 AT2 proliferating fridman done
nonsmoker ann_level_4 AT2 proliferating sasp2 done
nonsmoker ann_level_4 AT2 proliferating senmayo done
nonsmoker ann_level_4 AT2 proliferating cellage done
nonsmoker ann_level_4 AT2 proliferating union done
nonsmoker ann_level_4 AT2 proliferating common_nonsmoker done
nonsmoker ann_level_4 Adventitial fibroblasts fridman done
nonsmoker ann_level_4 Adventitial fibroblasts sasp2 done
nonsmoker ann_level_4 Adventitial fibroblasts senmayo done
nonsmoker ann_level_4 

nonsmoker ann_level_4 Peribronchial fibroblasts cellage done
nonsmoker ann_level_4 Peribronchial fibroblasts union done
nonsmoker ann_level_4 Peribronchial fibroblasts common_nonsmoker done
nonsmoker ann_level_4 Pericytes fridman done
nonsmoker ann_level_4 Pericytes sasp2 done
nonsmoker ann_level_4 Pericytes senmayo done
nonsmoker ann_level_4 Pericytes cellage done
nonsmoker ann_level_4 Pericytes union done
nonsmoker ann_level_4 Pericytes common_nonsmoker done
nonsmoker ann_level_4 Plasma cells fridman done
nonsmoker ann_level_4 Plasma cells sasp2 done
nonsmoker ann_level_4 Plasma cells senmayo done
nonsmoker ann_level_4 Plasma cells cellage done
nonsmoker ann_level_4 Plasma cells union done
nonsmoker ann_level_4 Plasma cells common_nonsmoker done
nonsmoker ann_level_4 Plasmacytoid DCs fridman done
nonsmoker ann_level_4 Plasmacytoid DCs sasp2 done
nonsmoker ann_level_4 Plasmacytoid DCs senmayo done
nonsmoker ann_level_4 Plasmacytoid DCs cellage done
nonsmoker ann_level_4 Plasmacytoid D

In [15]:
def compute_features(expr, ages, mean_degree, var_degree, n_components, use_pca, pca_model):
    """
    Turn per-cell expression into one feature row per donor.

    Parameters
    ----------
    expr : DataFrame (cells x genes) whose index holds the donor id of each cell.
    ages : Series of ages, positionally aligned with the rows of `expr`.
    mean_degree : highest power of the per-donor mean to include (>= 1).
    var_degree : highest power of the per-donor variance to include;
        0 disables the variance features.
    n_components : number of principal components when a new PCA is fitted.
    use_pca : if True, features are computed on PCA scores instead of genes.
    pca_model : an already fitted PCA to reuse (test time), or None to fit a
        new one on `expr` (training time).  Ignored when `use_pca` is False.

    Returns
    -------
    X : DataFrame (donors x features), columns named
        "<col>_mean^<deg>" followed by "<col>_var^<deg>".
    Y : Series with the mean age of each donor, in the same donor order as X.
    pca_model : the PCA used (the one passed in, the newly fitted one, or None).
    """
    if use_pca:
        if pca_model is None:
            # PCA refuses more components than min(n_samples, n_features);
            # small gene sets can have fewer columns than requested, so clamp.
            if isinstance(n_components, (int, np.integer)):
                n_components = min(n_components, *expr.shape)
            pca_model = PCA(n_components=n_components, whiten=True)
            scores = pca_model.fit_transform(expr)
        else:
            scores = pca_model.transform(expr)
        expr = pd.DataFrame(
            scores,
            index=expr.index,
            columns=[f"PC{i+1}" for i in range(scores.shape[1])],
        )

    # Per-donor mean of every column and its powers 1..mean_degree.
    subjects = expr.index.to_list()
    expr_mean = expr.groupby(subjects).mean()
    mean_names = np.hstack(
        [[f"{col}_mean^{deg}" for col in expr.columns] for deg in range(1, mean_degree + 1)]
    )
    expr_mean_poly = pd.DataFrame(
        np.hstack([expr_mean**deg for deg in range(1, mean_degree + 1)]),
        index=expr_mean.index,
        columns=mean_names,
    )

    if var_degree > 0:
        # Per-donor variance and its powers 1..var_degree.  A donor with a
        # single cell has undefined variance; it is replaced by 0.
        expr_var = expr.groupby(subjects).var().fillna(0)
        var_names = np.hstack(
            [[f"{col}_var^{deg}" for col in expr.columns] for deg in range(1, var_degree + 1)]
        )
        expr_var_poly = pd.DataFrame(
            np.hstack([expr_var**deg for deg in range(1, var_degree + 1)]),
            index=expr_var.index,
            columns=var_names,
        )
        X = pd.concat([expr_mean_poly, expr_var_poly], axis=1)
    else:
        X = expr_mean_poly

    # Target: one age per donor.
    Y = ages.groupby(subjects).mean()

    return X, Y, pca_model

In [16]:
def tune(param, train_subjects, expr, ages, use_pca, ct):
    """
    Hyperopt objective: fit one ElasticNet with the sampled hyperparameters
    and return the negative R2 of its predictions.

    `param` is a dict with the keys 'n_components', 'mean_degree',
    'var_degree', 'alpha' and 'l1_ratio'.  `train_subjects` selects the donors
    (index labels of `expr` / `ages`) to use.  `ct` is accepted but not used
    inside the function.

    NOTE(review): the score is computed on the same donors the model was
    fitted on, so it is an in-sample loss.  Also, the target is left in
    original units here while `train` standardises it before the final fit,
    so `alpha` acts on a different target scale there -- confirm both are
    intended.
    """
    # Cells of the requested donors only.
    donor_expr = expr.loc[train_subjects, :]
    donor_ages = ages.loc[train_subjects]

    # Donor-level feature matrix; a fresh PCA is fitted when use_pca is set.
    features, target, _ = compute_features(
        donor_expr,
        donor_ages,
        param['mean_degree'],
        param['var_degree'],
        param['n_components'],
        use_pca,
        pca_model=None,
    )

    # Features are centred and scaled; the target is not.
    target = target.values
    features = StandardScaler().fit_transform(features)

    regressor = make_pipeline(
        ElasticNet(
            max_iter=40000,
            alpha=param['alpha'],
            l1_ratio=param['l1_ratio'],
            random_state=42,
        )
    )
    fitted_values = regressor.fit(features, target).predict(features)

    # hyperopt minimises, hence the sign flip.
    return -r2_score(target, fitted_values)

In [17]:
def train(expr, ages, use_pca, param_space, n_hyper_eval, ct):
    """
    Search hyperparameters with hyperopt (TPE), then fit the final ElasticNet
    on all donors of `expr`.

    Parameters
    ----------
    expr, ages : per-cell expression and ages, both indexed by donor id.
    use_pca : whether features are computed on PCA scores.
    param_space : hyperopt search space consumed by `tune`.
    n_hyper_eval : number of hyperopt evaluations (`max_evals`).
    ct : forwarded to `tune`; not used otherwise.

    Returns
    -------
    (model, best hyperparameters, feature scaler, target scaler, PCA model),
    i.e. everything `test` needs to score new data the same way.
    """
    # Objective with everything but the sampled hyperparameters frozen.
    objective = partial(
        tune,
        train_subjects=expr.index.unique(),
        expr=expr,
        ages=ages,
        use_pca=use_pca,
        ct=ct,
    )

    # fmin returns indices for hp.choice entries; space_eval maps them back
    # to the actual values.
    best_raw = fmin(
        objective,
        space=param_space,
        algo=tpe.suggest,
        max_evals=n_hyper_eval,
        verbose=False,
    )
    param_best = space_eval(param_space, best_raw)

    # Donor-level features with the selected settings (new PCA fitted here).
    features, target, pca_model = compute_features(
        expr.copy(),
        ages.copy(),
        mean_degree=param_best['mean_degree'],
        var_degree=param_best['var_degree'],
        n_components=param_best['n_components'],
        use_pca=use_pca,
        pca_model=None,
    )

    # Standardise features and target; both scalers are returned for reuse.
    target = target.values.reshape(-1, 1)
    scaler_X = StandardScaler()
    scaler_Y = StandardScaler()
    features_scaled = scaler_X.fit_transform(features)
    target_scaled = scaler_Y.fit_transform(target).ravel()

    # Final model with the selected regularisation.
    polyreg = make_pipeline(
        ElasticNet(
            max_iter=40000,
            alpha=param_best['alpha'],
            l1_ratio=param_best['l1_ratio'],
            random_state=42,
        )
    )
    polyreg.fit(features_scaled, target_scaled)

    return polyreg, param_best, scaler_X, scaler_Y, pca_model

In [18]:
def test(expr, ages, use_pca, param_best, ct, model, scaler_X, scaler_Y, pca_model):
    """
    Score a fitted model on another dataset.

    The features are built with the hyperparameters in `param_best` and the
    already fitted `pca_model`, then transformed with the scalers fitted on
    the training data.  `ct` is accepted but not used inside the function.

    Returns
    -------
    (R2, MAE, true ages, predicted ages).  The two age arrays are mapped back
    to the original scale with scaler_Y's mean and variance.

    NOTE(review): R2 and MAE are computed on the scaler_Y-transformed ages,
    so MAE is in units of the training-age standard deviation rather than in
    the original age units -- confirm this is what downstream tables expect.
    """
    features, target, _ = compute_features(
        expr.copy(),
        ages.copy(),
        mean_degree=param_best['mean_degree'],
        var_degree=param_best['var_degree'],
        n_components=param_best['n_components'],
        use_pca=use_pca,
        pca_model=pca_model,
    )

    # Apply the training-set standardisation.
    features = scaler_X.transform(features)
    target = scaler_Y.transform(target.values.reshape(-1, 1)).ravel()

    pred = model.predict(features)

    # Undo the target standardisation for the reported ages.
    age_sd = np.sqrt(scaler_Y.var_)
    true_ages = target * age_sd + scaler_Y.mean_
    pred_ages = pred * age_sd + scaler_Y.mean_

    return r2_score(target, pred), mean_absolute_error(target, pred), true_ages, pred_ages

### Main analysis pipeline
Adjust n_jobs based on number of cores and memory available in your machine

In [63]:
from joblib import delayed, Parallel
import warnings

def run_all(expr_train,ages_train,expr_test,ages_test,group,ct,annLevel,gene_type,use_pca,rep,donor_num_train,donor_num_test,n_hyper_eval):
    """
    Run one job: tune and fit on the training data, score on the test data.

    The model is only fitted when both sides have at least 5 distinct donors;
    otherwise the metrics are NaN and the age strings are empty.

    Returns a tuple
    (annLevel, group, ct, gene_type, use_pca, R2, MAE, rep,
     donor_num_train, donor_num_test, true ages, predicted ages)
    where R2 / MAE are the values returned by `test` (i.e. measured on the
    test data) and the ages are comma-joined strings.
    """
    # Silence warnings inside the worker as well.
    warnings.filterwarnings("ignore")

    # Search space: only alpha and l1_ratio actually vary.
    param_space = {
        'n_components': hp.choice('n_components', [10]),
        'mean_degree': hp.choice('mean_degree', [2]),
        'var_degree': hp.choice('var_degree', [2]),
        'alpha': hp.choice('alpha', [0.001, 0.01, 0.1, 1, 10, 100]),
        'l1_ratio': hp.uniform('l1_ratio', 0.1, 1.0),
    }

    n_train_donors = expr_train.index.unique().shape[0]
    n_test_donors = expr_test.index.unique().shape[0]

    # Defaults for the "too few donors" case.
    r2 = np.nan
    mae = np.nan
    true_ages = ""
    pred_ages = ""

    if n_train_donors >= 5 and n_test_donors >= 5:
        model, param_best, scaler_X, scaler_Y, pca_model = train(
            expr_train, ages_train, use_pca, param_space,
            n_hyper_eval=n_hyper_eval, ct=ct,
        )
        r2, mae, y_true, y_pred = test(
            expr_test, ages_test, use_pca, param_best,
            ct=ct, model=model, scaler_X=scaler_X, scaler_Y=scaler_Y, pca_model=pca_model,
        )
        true_ages = ",".join(str(age) for age in y_true)
        pred_ages = ",".join(str(age) for age in y_pred)

    return annLevel, group, ct, gene_type, use_pca, r2, mae, rep, donor_num_train, donor_num_test, true_ages, pred_ages
    

# Dispatch every queued job to a worker.  Each `job` is the 12-element list
# built in the data-generation cell, in the positional order run_all expects.
# n_jobs is fixed at 120 here; lower it to match the cores/memory available.
res = Parallel(n_jobs = 120)(
    delayed(run_all)(*job, n_hyper_eval = 30)
    for job in tqdm(iterator)
)



  0%|          | 0/9360 [00:00<?, ?it/s][A[A

  1%|▏         | 120/9360 [00:14<18:56,  8.13it/s][A[A

  3%|▎         | 240/9360 [00:18<10:30, 14.46it/s][A[A

  4%|▍         | 360/9360 [00:26<10:20, 14.51it/s][A[A

  5%|▌         | 480/9360 [00:42<13:50, 10.70it/s][A[A

  6%|▋         | 600/9360 [00:57<15:21,  9.51it/s][A[A

  8%|▊         | 720/9360 [01:13<16:22,  8.80it/s][A[A

  9%|▉         | 840/9360 [01:48<24:19,  5.84it/s][A[A

 10%|█         | 960/9360 [02:00<20:54,  6.70it/s][A[A

 12%|█▏        | 1080/9360 [02:24<22:47,  6.06it/s][A[A

 13%|█▎        | 1200/9360 [03:10<31:51,  4.27it/s][A[A

 14%|█▍        | 1320/9360 [03:35<30:12,  4.44it/s][A[A

 15%|█▌        | 1440/9360 [04:01<29:18,  4.50it/s][A[A

 17%|█▋        | 1560/9360 [04:36<31:32,  4.12it/s][A[A

 18%|█▊        | 1680/9360 [06:42<1:02:17,  2.05it/s][A[A

 19%|█▉        | 1800/9360 [09:44<1:40:30,  1.25it/s][A[A

 21%|██        | 1920/9360 [09:45<1:09:27,  1.79it/s][A[A

 21%|██ 

Save results

In [65]:
# One row per job; column order follows the tuple returned by run_all.
result_columns = [
    "ann_level", "group", "cell_type", "gene_type", "use_pca",
    "R2", "MAE", "rep", "Donor_num_train", "Donor_num_test",
    "true_age", "pred_age",
]
to_save = pd.DataFrame(res, columns = result_columns)

In [66]:
# Best-scoring jobs first; rows with NaN R2 (skipped jobs) end up last.
to_save = to_save.sort_values("R2", ascending = False)

In [73]:
# Both per-group "common" gene sets are reported under the single label "all".
to_save.replace(
    to_replace=["common_smoker", "common_nonsmoker"],
    value="all",
    inplace=True,
)

In [93]:
# `os` is not imported anywhere else in the notebook, so import it here;
# without it this cell fails with NameError on a fresh kernel.
import os

# makedirs also creates "results" if it is missing and is a no-op when the
# directory already exists.
os.makedirs("results/eval", exist_ok=True)
to_save.to_csv("results/eval/performances_CD_polyEN.csv", index=False)