This notebook performs the CD test on the HLCA dataset and on the Carraro and IPF datasets, using the Fleischer et al. pipeline.

In [1]:
import os
from functools import reduce

import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import sklearn
from anndata import concat
from tqdm import tqdm

Import the Fleischer et al. prediction pipeline

In [2]:
# Execute fleischer_age_predictors.py; %run loads the names that script defines into this
# notebook's interactive namespace.
# NOTE(review): `LinearDiscriminantAnalysis` and `subset_genes_ensemble`, used later in
# `train_test`, are not imported anywhere in this notebook - presumably they come from this
# script; confirm.
%run fleischer_age_predictors.py

Read HLCA and corrected combined data

In [3]:
# Load the processed HLCA objects for smokers and nonsmokers.
# Paths are relative to the notebook's working directory.
hlca_smoker = sc.read_h5ad("data/step1_HLCA_smoker_processed.h5ad")
hlca_nonsmoker = sc.read_h5ad("data/step1_HLCA_nonsmoker_processed.h5ad")



In [4]:
# Corrected combined objects, one per smoking group.
combined_smoker = sc.read_h5ad("results/correction_mnnpy/corrected_combined_smoker.h5ad")
combined_nonsmoker = sc.read_h5ad("results/correction_mnnpy/corrected_combined_nonsmoker.h5ad")

# Keep only the cells whose batch is not "HLCA".
smoker_not_hlca = combined_smoker.obs["batch"] != "HLCA"
nonsmoker_not_hlca = combined_nonsmoker.obs["batch"] != "HLCA"
combined_smoker = combined_smoker[smoker_not_hlca,]
combined_nonsmoker = combined_nonsmoker[nonsmoker_not_hlca,]

# Cell names with their last "-"-separated token removed; these are used to look the cells up
# in the metadata tables below.
# NOTE(review): the removed token is presumably a batch suffix added during correction - confirm.
smoker_ind = pd.Index(
    ["-".join(name.split("-")[:-1]) for name in combined_smoker.obs.index]
)
nonsmoker_ind = pd.Index(
    ["-".join(name.split("-")[:-1]) for name in combined_nonsmoker.obs.index]
)

# Cell metadata of the step-1 processed combined objects (only .obs is kept).
combined_smoker_obs1 = sc.read_h5ad("data/step1_combined_smoker_processed.h5ad").obs
combined_nonsmoker_obs1 = sc.read_h5ad("data/step1_combined_nonsmoker_processed.h5ad").obs

In [5]:
# `combined_smoker` / `combined_nonsmoker` (already restricted to non-HLCA batches),
# `smoker_ind` / `nonsmoker_ind` and the `*_obs1` tables are produced by the previous cell.
# This cell used to repeat those statements verbatim, reading both large corrected files a
# second time; it now only adds the label-transfer metadata.

# Label-transfer metadata; keep only the rows whose batch is '1'.
combined_smoker_obs2 = sc.read_h5ad("results/label_transfer/combined_smoker/combined_embedding.h5ad").obs
combined_nonsmoker_obs2 = sc.read_h5ad("results/label_transfer/combined_nonsmoker/combined_embedding.h5ad").obs
combined_smoker_obs2 = combined_smoker_obs2.loc[combined_smoker_obs2["batch"] == '1',]
combined_nonsmoker_obs2 = combined_nonsmoker_obs2.loc[combined_nonsmoker_obs2["batch"] == '1',]

# Drop 'age' and 'dataset' from the label-transfer tables before the column-wise concat
# (presumably because the step-1 tables already provide them - confirm).
# .drop returns a new frame instead of deleting columns from a .loc slice in place.
combined_smoker_obs2 = combined_smoker_obs2.drop(columns=['age', 'dataset'])
combined_nonsmoker_obs2 = combined_nonsmoker_obs2.drop(columns=['age', 'dataset'])

# Replace .obs with the step-1 and label-transfer metadata side by side, both looked up by
# the suffix-stripped cell names.
combined_smoker.obs = pd.concat(
    [combined_smoker_obs1.loc[smoker_ind,], combined_smoker_obs2.loc[smoker_ind,]],
    axis = 1,
)
combined_nonsmoker.obs = pd.concat(
    [combined_nonsmoker_obs1.loc[nonsmoker_ind,], combined_nonsmoker_obs2.loc[nonsmoker_ind,]],
    axis = 1,
)

  utils.warn_names_duplicates("obs")


Keep only these columns as cell metadata

In [6]:
# Metadata columns kept on the HLCA objects; also the final column names of every object.
hlca_obs_columns = ['donor_id', 'age', 'dataset',
                    'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5']

# Columns kept on the combined objects: same leading three, then the transferred labels.
transferred_obs_columns = ['donor_id', 'age', 'dataset',
                           'Level_1_transfered_label',
                           'Level_2_transfered_label',
                           'Level_3_transfered_label',
                           'Level_4_transfered_label',
                           'Level_5_transfered_label']

hlca_smoker.obs = hlca_smoker.obs.loc[:, hlca_obs_columns]
hlca_nonsmoker.obs = hlca_nonsmoker.obs.loc[:, hlca_obs_columns]

combined_smoker.obs = combined_smoker.obs.loc[:, transferred_obs_columns]
combined_nonsmoker.obs = combined_nonsmoker.obs.loc[:, transferred_obs_columns]

# Rename the transferred-label columns positionally so all four objects share one schema.
combined_smoker.obs.columns = hlca_obs_columns
combined_nonsmoker.obs.columns = hlca_obs_columns

In [7]:
# Nonsmokers: train on every HLCA dataset except Banovich_Kropski_2020; test on that dataset
# plus all cells of the corrected combined object.
nonsmoker_is_banovich = hlca_nonsmoker.obs['dataset'] == "Banovich_Kropski_2020"
train_nonsmoker = hlca_nonsmoker[~nonsmoker_is_banovich,].copy()
test_nonsmoker = concat([
    hlca_nonsmoker[nonsmoker_is_banovich,],
    combined_nonsmoker,
])

# Smokers: train on Banovich_Kropski_2020 only; test on the remaining HLCA datasets plus the
# "Kaminski" cells of the corrected combined object.
# NOTE(review): this split runs in the opposite direction to the nonsmoker split above -
# confirm that is intentional.
smoker_is_banovich = hlca_smoker.obs['dataset'] == "Banovich_Kropski_2020"
train_smoker = hlca_smoker[smoker_is_banovich,].copy()
test_smoker = concat([
    hlca_smoker[~smoker_is_banovich,],
    combined_smoker[combined_smoker.obs["dataset"] == "Kaminski",],
])

## Modeling

In [8]:
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNet,LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

from functools import partial
from hyperopt import hp, Trials, fmin, tpe
from hyperopt import space_eval

Data filtering by Number of Cells

In [9]:
def filter_anndata_single_ct(anndata, ct_column, ct, donor_column, age_column, marker_genes = None, min_cells = 20):
    '''
    Filter an AnnData-like object down to one cell type, a gene set, and well-sampled donors.

    Parameters
    ----------
    anndata : object supporting ``[rows, cols]`` indexing, ``.obs``, ``.var_names`` and ``.to_df()``
    ct_column : name of the ``.obs`` column holding the cell-type annotation
    ct : cell type to keep (rows where ``obs[ct_column] == ct``)
    donor_column : name of the ``.obs`` column holding the donor/subject id
    age_column : name of the ``.obs`` column holding the age
    marker_genes : genes to keep; ``None`` (the default) keeps every gene
    min_cells : a donor is kept only if it has at least this many cells of type ``ct``

    Returns
    -------
    expr : DataFrame of cells x genes, indexed by the donor id of each cell
    ages : Series of per-cell ages, with the same donor-id index as ``expr``
    '''
    # Keep rows annotated with the requested cell type.
    ct_anndata = anndata[anndata.obs[ct_column] == ct, :]

    # Keep the requested genes. With the default marker_genes=None the original called
    # isin(None), which raises; None now means "keep all genes".
    if marker_genes is not None:
        ct_anndata = ct_anndata[:, ct_anndata.var_names.isin(marker_genes)]

    # Count cells per donor and keep donors with at least min_cells cells.
    subjects = ct_anndata.obs[donor_column]
    subjects_count = subjects.groupby(subjects.values).count()
    selected_subjects = subjects_count.loc[subjects_count >= min_cells].index

    # Further subset to the cells of the selected donors.
    ct_anndata = ct_anndata[ct_anndata.obs[donor_column].isin(selected_subjects), :]

    # Expression matrix and ages, both indexed by donor id.
    expr = ct_anndata.to_df()
    expr.index = ct_anndata.obs[donor_column].values
    # Copy before re-indexing so the Series taken from .obs is never modified in place.
    ages = ct_anndata.obs[age_column].copy()
    ages.index = expr.index

    return expr, ages

Keep only the genes that are present in both the training and the test datasets

In [10]:
# Genes present in both the training and the test object of each group.
# np.intersect1d returns the sorted, unique common values, so the resulting gene order is the
# sorted order, not the order of either object's var_names.
common_smoker_gene = np.intersect1d(train_smoker.var_names, test_smoker.var_names)
common_nonsmoker_gene = np.intersect1d(train_nonsmoker.var_names, test_nonsmoker.var_names)

In [11]:
# Restrict every object to its group's shared genes, so the train and test objects of a group
# expose the same genes in the same order.
train_smoker = train_smoker[:,common_smoker_gene]
train_nonsmoker = train_nonsmoker[:,common_nonsmoker_gene]
test_smoker = test_smoker[:,common_smoker_gene]
test_nonsmoker = test_nonsmoker[:,common_nonsmoker_gene]

In [12]:
# Gene sets used for modeling, keyed by "all_<group>"; each is the full set of shared genes.
test_genes = {
    "all_smoker": common_smoker_gene,
    "all_nonsmoker": common_nonsmoker_gene,
}

Generate data

In [13]:
import warnings
# Silences all warnings for the rest of the session.
warnings.filterwarnings("ignore")

groups = ["smoker","nonsmoker"]
train_data = [train_smoker,train_nonsmoker]
test_data = [test_smoker,test_nonsmoker]
annLevels = ['ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5']
n_rep = 5
iterator = []


def _donor_level_inputs(adata, annLevel, ct, genes):
    '''Return (expr, ages, n_donors) for cell type `ct` of `adata`, keeping donors with >= 20 cells.'''
    # filter_anndata_single_ct already subsets to `ct`, so no pre-subsetting is needed here.
    expr, ages = filter_anndata_single_ct(adata,
                                          ct_column = annLevel,
                                          ct = ct,
                                          donor_column = "donor_id",
                                          age_column = "age",
                                          marker_genes = np.intersect1d(genes, adata.var_names),
                                          min_cells = 20
                                         )
    return expr, ages, expr.index.unique().shape[0]


# Build one job per (group, annotation level, cell type, gene set, repetition).
for adata_train,adata_test,group in zip(train_data,test_data,groups):
    # Only the group's own "all genes" set is evaluated.
    gene_types = [f'all_{group}']

    for annLevel in annLevels:

        # Cell types annotated in both the training and the test data, excluding "None".
        cts1 = adata_train.obs[annLevel].unique().tolist()
        cts2 = adata_test.obs[annLevel].unique().tolist()
        cell_types = np.intersect1d(cts1,cts2)
        cell_types = cell_types[cell_types != "None"]

        for ct in cell_types:
            for gene_type in gene_types:

                expr_train,ages_train,donor_num_train = _donor_level_inputs(
                    adata_train, annLevel, ct, test_genes[gene_type])
                expr_test,ages_test,donor_num_test = _donor_level_inputs(
                    adata_test, annLevel, ct, test_genes[gene_type])

                # No donor-count filter here: run_all skips jobs with fewer than 5 donors
                # on either side.
                for use_pca in [False]:
                    for rep in range(1,n_rep+1):
                        iterator.append([expr_train,ages_train,
                                         expr_test,ages_test,
                                         group,ct,annLevel,gene_type,
                                         use_pca,rep,donor_num_train,donor_num_test]
                                       )
                print(group,annLevel,ct,gene_type,"done")

smoker ann_level_2 Airway epithelium all_smoker done
smoker ann_level_2 Alveolar epithelium all_smoker done
smoker ann_level_2 Blood vessels all_smoker done
smoker ann_level_2 Fibroblast lineage all_smoker done
smoker ann_level_2 Hematopoietic stem cells all_smoker done
smoker ann_level_2 Lymphatic EC all_smoker done
smoker ann_level_2 Lymphoid all_smoker done
smoker ann_level_2 Mesothelium all_smoker done
smoker ann_level_2 Myeloid all_smoker done
smoker ann_level_2 Smooth muscle all_smoker done
smoker ann_level_3 AT1 all_smoker done
smoker ann_level_3 AT2 all_smoker done
smoker ann_level_3 B cell lineage all_smoker done
smoker ann_level_3 Basal all_smoker done
smoker ann_level_3 Dendritic cells all_smoker done
smoker ann_level_3 EC arterial all_smoker done
smoker ann_level_3 EC capillary all_smoker done
smoker ann_level_3 EC venous all_smoker done
smoker ann_level_3 Fibroblasts all_smoker done
smoker ann_level_3 Innate lymphoid cell NK all_smoker done
smoker ann_level_3 Lymphatic EC 

nonsmoker ann_level_5 Club (non-nasal) all_nonsmoker done
nonsmoker ann_level_5 Goblet (bronchial) all_nonsmoker done
nonsmoker ann_level_5 Goblet (subsegmental) all_nonsmoker done
nonsmoker ann_level_5 Interstitial Mph perivascular all_nonsmoker done
nonsmoker ann_level_5 Monocyte-derived Mph all_nonsmoker done
nonsmoker ann_level_5 Multiciliated (non-nasal) all_nonsmoker done
nonsmoker ann_level_5 SMG serous (bronchial) all_nonsmoker done
nonsmoker ann_level_5 pre-TB secretory all_nonsmoker done


In [14]:
def compute_features(expr, ages, mean_degree, var_degree, n_components, use_pca, pca_model):
    '''
    Turn per-cell expression into one feature row per subject.

    expr         : DataFrame of cells x features whose index holds each cell's subject id
    ages         : per-cell ages, aligned with `expr`
    mean_degree  : highest power of the per-subject mean to include (powers 1..mean_degree)
    var_degree   : highest power of the per-subject variance to include; 0 disables variance features
    n_components : number of PCA components, used only when a new PCA is fitted
    use_pca      : if True, project `expr` with PCA before aggregating
    pca_model    : fitted PCA to reuse; if None and use_pca is True a new one is fitted on `expr`

    Returns (X, Y, pca_model): the per-subject feature table, the per-subject mean age, and the
    PCA model that was used (the argument is passed through unchanged when use_pca is False).
    '''
    if use_pca:
        if pca_model is None:
            # No model supplied: fit a fresh whitened PCA on this data.
            pca_model = PCA(n_components=n_components, whiten=True)
            projected = pca_model.fit_transform(expr)
        else:
            # Reuse the supplied projection.
            projected = pca_model.transform(expr)
        expr = pd.DataFrame(projected, index = expr.index)
        expr.columns = [f"PC{i+1}" for i in range(expr.shape[1])]

    subjects = expr.index.to_list()

    def _polynomial_block(per_subject, degree, tag):
        # Stack per_subject**1 .. per_subject**degree side by side, naming columns "<col>_<tag>^<power>".
        powers = range(1, degree+1)
        stacked = np.hstack([per_subject**power for power in powers])
        names = np.hstack([[f"{col}_{tag}^{power}" for col in expr.columns] for power in powers])
        return pd.DataFrame(stacked, index = per_subject.index, columns = names)

    # Per-subject means and their powers.
    mean_block = _polynomial_block(expr.groupby(subjects).mean(), mean_degree, "mean")

    if var_degree > 0:
        # Per-subject variances; a subject with a single cell yields NaN, which is replaced by 0.
        subject_var = expr.groupby(subjects).var().fillna(value=0)
        var_block = _polynomial_block(subject_var, var_degree, "var")
        X = pd.concat([mean_block, var_block], axis = 1)
    else:
        X = mean_block

    # Target: mean age per subject.
    Y = ages.groupby(subjects).mean()

    return X,Y,pca_model

In [15]:
def train_test(expr_train, ages_train, expr_test, ages_test, use_pca):
    '''
    Fit the ensemble on the training subjects and evaluate it on the test subjects.

    expr_train / expr_test : per-cell expression, indexed by subject id
    ages_train / ages_test : per-cell ages, aligned with the expression tables
    use_pca                : forwarded to compute_features

    Returns (r2, mae, Y_test, Y_preds). Y_test is the list of per-subject mean ages. If fitting
    raises LinAlgError or prediction raises IndexError, r2 and mae are None and Y_preds is a
    list of NaN with the same length as Y_test.

    NOTE(review): `LinearDiscriminantAnalysis` and `subset_genes_ensemble` are not imported in
    this notebook - presumably they are provided by fleischer_age_predictors.py; confirm.
    '''
    from numpy.linalg import LinAlgError

    def _model_frame(X, Y):
        # Subject-level table indexed by (uid, age, meta), with ages cast to int.
        header = pd.DataFrame({"uid":Y.index,
                               "age":Y.values.astype(int),
                               "meta":"healthy"})
        frame = pd.concat([header, X.reset_index(drop = True)], axis = 1)
        return frame.set_index(['uid','age','meta']).astype(float)

    # Feature matrix for the training data (subject means only, no variance features).
    X_train,Y_train,pca_model = compute_features(
                        expr_train.copy(),
                        ages_train.copy(),
                        mean_degree = 1,
                        var_degree = 0,
                        n_components = 10,
                        use_pca = use_pca,
                        pca_model = None
                    )
    df_train = _model_frame(X_train, Y_train)

    # Fit models; a singular-matrix failure disables prediction instead of aborting the run.
    clf = LinearDiscriminantAnalysis(shrinkage='auto',solver='eigen')
    ensemble = subset_genes_ensemble(clf=clf, class_size=20, subset_fold=5,subset_min=0.3,dataxform_log=False,verbose=True)

    try:
        ensemble.fit(df_train, df_train.index.get_level_values('age').values)
    except LinAlgError:
        ensemble = None

    # Feature matrix for the testing data. The PCA fitted on the training data is reused, so
    # test features live in the same space as the training features; passing None here (as the
    # original did) would fit a separate PCA on the test set whenever use_pca is True.
    X_test,Y_test,_ = compute_features(
                        expr_test.copy(),
                        ages_test.copy(),
                        mean_degree = 1,
                        var_degree = 0,
                        n_components = 10,
                        use_pca = use_pca,
                        pca_model = pca_model
                    )
    df_test = _model_frame(X_test, Y_test)

    # True ages are reported as the (untruncated) per-subject means.
    Y_test = Y_test.values.tolist()

    if ensemble is not None:
        try:
            Y_preds = ensemble.predict(df_test)
            return r2_score(Y_test, Y_preds),mean_absolute_error(Y_test, Y_preds),Y_test, Y_preds
        except IndexError:
            # Prediction fails for some cell types; fall through to the NaN result.
            pass

    return None,None,Y_test,[np.nan]*len(Y_test)

### Main analysis pipeline
Adjust `n_jobs` to the number of cores and the amount of memory available on your machine.

In [16]:
from joblib import delayed, Parallel
import warnings

def run_all(expr_train,ages_train,expr_test,ages_test,group,ct,annLevel,gene_type,use_pca,rep,donor_num_train,donor_num_test,n_hyper_eval):
    '''
    Run one train/test job and return a flat result record.

    The job is evaluated only when both the training and the test expression tables contain at
    least 5 distinct subjects; otherwise the metrics are NaN and the age strings are empty.

    n_hyper_eval is accepted for backward compatibility but is not used: the hyperparameter
    search it belonged to is no longer part of this pipeline.

    Returns (annLevel, group, ct, gene_type, use_pca, r2, mae, rep, donor_num_train,
    donor_num_test, Y_trues, Y_preds), where r2 and mae are computed on the test subjects and
    Y_trues / Y_preds are comma-joined strings.
    '''
    # Re-applied here because the function is dispatched through joblib
    # (presumably to separate worker processes - confirm).
    warnings.filterwarnings("ignore")

    train_donor_num = expr_train.index.unique().shape[0]
    test_donor_num = expr_test.index.unique().shape[0]

    if train_donor_num >= 5 and test_donor_num >= 5:
        r2,MAE,Y_trues,Y_preds = train_test(expr_train, ages_train, expr_test, ages_test, use_pca)
        # train_test reports a failed fit/predict as None; use NaN so failed and skipped jobs
        # carry the same missing-value marker.
        r2 = np.nan if r2 is None else r2
        MAE = np.nan if MAE is None else MAE
        Y_trues = ",".join(str(age) for age in Y_trues)
        Y_preds = ",".join(str(age) for age in Y_preds)
    else:
        r2 = np.nan
        MAE = np.nan
        Y_trues = ""
        Y_preds = ""

    return annLevel, group, ct, gene_type, use_pca, r2, MAE,rep,donor_num_train,donor_num_test,Y_trues,Y_preds
    

# Dispatch every prepared job to run_all across 30 workers. Each entry of `iterator` holds the
# twelve positional arguments of run_all in order, so it is unpacked directly.
res = Parallel(n_jobs = 30)(
    delayed(run_all)(*job, n_hyper_eval = 30)
    for job in tqdm(iterator)
)

 62%|██████▏   | 480/780 [02:18<03:03,  1.64it/s]

using 1364 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1528 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 512 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 589 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 504 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 416 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 770 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 890 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1498 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 2521 genes in subset requiring a max FPKM > 0.3 and > 5-fold change betwe

 73%|███████▎  | 570/780 [03:16<02:14,  1.56it/s]

using 728 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 728 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1656 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 


100%|██████████| 780/780 [04:21<00:00,  2.98it/s]


using 931 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 443 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 443 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 443 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 491 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1140 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 953 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1559 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1115 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 526 genes in subset requiring a max FPKM > 0.3 and > 5-fold change betwee

using 1621 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 187 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1020 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 570 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1115 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 526 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1857 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1042 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 2073 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1439 genes in subset requiring a max FPKM > 0.3 and > 5-fold change be

using 1621 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 953 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 512 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 589 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 504 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 854 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1523 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1697 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 2523 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 2572 genes in subset requiring a max FPKM > 0.3 and > 5-fold change betw

using 1633 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1357 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 813 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 888 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 657 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 770 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 890 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1523 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1857 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 802 genes in subset requiring a max FPKM > 0.3 and > 5-fold change betwe

using 1364 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1528 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 657 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 588 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1552 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 2288 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 2556 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1093 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1752 genes in subset requiring a max FPKM > 0.3 and > 5-fold change between max and min samples 
using 1439 genes in subset requiring a max FPKM > 0.3 and > 5-fold change b

Save results

In [17]:
# One row per job, in the order of the tuple returned by run_all.
# "R2" and "MAE" are computed on the test subjects; "true_age" / "pred_age" are comma-joined strings.
to_save = pd.DataFrame(res, columns = ["ann_level","group","cell_type","gene_type","use_pca","R2","MAE","rep","Donor_num_train","Donor_num_test","true_age","pred_age"])

In [18]:
# Best-performing jobs first; rows whose R2 is missing are placed last by sort_values' default.
to_save = to_save.sort_values(by = "R2", ascending = False)

In [20]:
# Create the output directory if it does not exist yet. exist_ok=True replaces the separate
# isdir check, and makedirs also creates any missing parent directory.
os.makedirs("results/eval", exist_ok=True)

# Write the performance table without the row index.
to_save.to_csv("results/eval/performances_CD_fleischer.csv", index=False)