# SHAP feature selection #
## Code to select features with a combination of LightGBM and SHAP ##

## Parameters cell ##

Parameters are overridden by papermill when run inside DVC stages



In [8]:
# Cross-validation layout
number_of_folds = 5  # global setting: number of folds passed to the partitioner
repeats = 10  # how many times the partition + SHAP selection pipeline is repeated

# LightGBM hyper-parameters found in the first round of optimization
lgb_params = {
    "objective": "regression",
    "boosting_type": "gbdt",
    "metric": ["mae", "mse", "huber"],
    "lambda_l1": 0.03674546022666247,
    "lambda_l2": 2.5025758383109715,
    "max_leaves": 22,
    "max_depth": 6,
    "feature_fraction": 0.741996402547356,
    "bagging_fraction": 0.8929839112500518,
    "learning_rate": 0.0892243160116563,
    "min_data_in_leaf": 5,
    "drop_rate": 0.14842616274718734,
}

debug_local = True  # when True, the next cell prefers the local yspecies checkout

In [9]:
from pathlib import Path
import sys
import inspect

local = (Path("..") / "yspecies").resolve()
if debug_local and local.exists():
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '..', '/data/sources/yspecies/notebooks', '/opt/miniconda3/envs/yspecies/lib/python38.zip', '/opt/miniconda3/envs/yspecies/lib/python3.8', '/opt/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/antonkulaga/.ipython']
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
from dataclasses import dataclass, replace
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt

In [11]:
from typing import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.workflow import TupleWith, Repeat
from yspecies.config import *
from yspecies.preprocess import FeatureSelection, DataExtractor
from yspecies.partition import DataPartitioner, PartitionParameters
from yspecies.models import Metrics
from yspecies.selection import ShapSelector

In [12]:
#settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import pprint
pp = pprint.PrettyPrinter(indent=4)

#charts settings
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
%matplotlib auto
plt.ioff()
set_matplotlib_formats('svg')

Using matplotlib backend: TkAgg


### Loading data ###
Let's load data from species/genes/expressions selected by select_samples.py notebook

In [13]:
from pathlib import Path
# Point Locations at the current directory when it contains a `data` folder,
# otherwise at the parent directory.
locations: Locations
if Path("./data").exists():
    locations = Locations("./")
else:
    locations = Locations("../")

In [14]:
# Load the expression dataset from the `interim.selected` folder exposed by `locations`.
data = ExpressionDataset.from_folder(locations.interim.selected)
# Bare expression: shows the dataset summary as the cell output.
data

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(445, 12340)",12340,39,445,"(12340, 2)","(40, 19)"


## Setting up SHAP selection pipeline ##

Deciding on selection parameters (which fields to include, exclude, predict)

In [15]:
# Partitioning settings handed to the selection pipeline. Arguments are positional:
#   number_of_folds -> the fold count from the parameters cell
#   0, 2, []        -> NOTE(review): meanings are not visible in this notebook;
#                      check the PartitionParameters definition in yspecies.partition
#   42              -> presumably the random seed (the repeat step replaces a `seed`
#                      field on this object) — TODO confirm
partition_params = PartitionParameters(number_of_folds, 0, 2, [],  42)


In [16]:
# A single pass: partition the data, attach the LightGBM parameters, run the SHAP selector.
# NOTE(review): TupleWith presumably pairs the upstream output with the given object —
# confirm in yspecies.workflow.
partition_shap_pipe = Pipeline(
    [
        ("partitioner", DataPartitioner()),
        ("prepare_for_partitioning", TupleWith(lgb_params)),
        ("shap_computation", ShapSelector()),
    ]
)

# Run that pass `repeats` times. For repeat `i` the callback keeps x[0] unchanged and
# swaps x[1] for a copy whose `seed` field is set to `i`.
repeated_cv = Repeat(
    partition_shap_pipe,
    repeats,
    lambda x, i: (x[0], replace(x[1], seed=i)),
)

In [17]:
# Full selection workflow: extract the data, attach the partition parameters,
# then run the repeated partition + SHAP step.
selection_pipeline = Pipeline(
    [
        # to extract the data required for ML from the dataset
        ("extractor", DataExtractor()),
        ("prepare_for_partitioning", TupleWith(partition_params)),
        ("partition_shap", repeated_cv),
    ]
)

## Setting up features to select ##

In [18]:
# Base feature-selection settings; the lifespan selection is this same object.
selection = FeatureSelection(
    samples=["tissue", "species"],  # samples metadata to include
    species=[],  # species metadata other than the Y label to include
    exclude_from_training=["species"],  # exclude some fields from LightGBM training
    to_predict="lifespan",  # column to predict
    categorical=["tissue"],
)
select_lifespan = selection

In [19]:
# One selection per target column; every variant differs from the base
# `selection` only in its `to_predict` value.
select_lifespan = selection
select_mass = replace(selection, to_predict="mass_g")
select_gestation = replace(selection, to_predict="gestation")
select_mtgc = replace(selection, to_predict="mtgc")

# First stage selection #

In [20]:
# Run the whole selection pipeline on the dataset with the lifespan selection settings.
# The pipeline ends in the repeated partition + SHAP step, so this trains
# `repeats` rounds of models (see the training log in the cell output).
stage_one_lifespan = selection_pipeline.fit_transform((data, select_lifespan))
# Bare expression: renders the results of every repeat as the cell output.
stage_one_lifespan

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[15]	valid_0's l1: 8.95509	valid_0's l2: 189.944	valid_0's huber: 7.66501
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[138]	valid_0's l1: 3.65986	valid_0's l2: 70.135	valid_0's huber: 2.98298
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[77]	valid_0's l1: 4.57755	valid_0's l2: 31.7151	valid_0's huber: 3.74263
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[99]	valid_0's l1: 5.60866	valid_0's l2: 129.762	valid_0's huber: 4.7072
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 5.86207	valid_0's l2: 103.254	valid_0's huber: 4.92927
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 5.86207	valid_0's l2: 103.254	valid_0's huber: 4.92927


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[186]	valid_0's l1: 4.53548	valid_0's l2: 78.8119	valid_0's huber: 3.78829
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[98]	valid_0's l1: 3.60277	valid_0's l2: 74.9152	valid_0's huber: 2.95564
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[85]	valid_0's l1: 3.01916	valid_0's l2: 31.7315	valid_0's huber: 2.44062
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 5.69179	valid_0's l2: 87.4646	valid_0's huber: 4.81568
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 5.69179	valid_0's l2: 87.4646	valid_0's huber: 4.81568
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[129]	valid_0's l1: 3.38114	valid_0's l2: 39.5009	valid_0's huber: 2.77319


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[166]	valid_0's l1: 21.5694	valid_0's l2: 2027.76	valid_0's huber: 19.086
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[124]	valid_0's l1: 5.60692	valid_0's l2: 83.5029	valid_0's huber: 4.70877
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[103]	valid_0's l1: 2.8651	valid_0's l2: 24.2581	valid_0's huber: 2.25714
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[96]	valid_0's l1: 3.01419	valid_0's l2: 23.3583	valid_0's huber: 2.37319
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[7]	valid_0's l1: 10.193	valid_0's l2: 164.261	valid_0's huber: 8.78566


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[79]	valid_0's l1: 3.48594	valid_0's l2: 35.8019	valid_0's huber: 2.82972
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[144]	valid_0's l1: 3.92069	valid_0's l2: 39.7639	valid_0's huber: 3.21972
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[42]	valid_0's l1: 7.10758	valid_0's l2: 196.611	valid_0's huber: 6.05274
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[56]	valid_0's l1: 3.07037	valid_0's l2: 28.6139	valid_0's huber: 2.43764
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[23]	valid_0's l1: 6.66301	valid_0's l2: 102.957	valid_0's huber: 5.60608


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[189]	valid_0's l1: 2.28791	valid_0's l2: 14.2764	valid_0's huber: 1.76023
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[74]	valid_0's l1: 2.08904	valid_0's l2: 14.1152	valid_0's huber: 1.59454
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[33]	valid_0's l1: 6.15931	valid_0's l2: 118.042	valid_0's huber: 5.16358
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[57]	valid_0's l1: 6.68107	valid_0's l2: 96.0212	valid_0's huber: 5.66345
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[181]	valid_0's l1: 2.65449	valid_0's l2: 18.4261	valid_0's huber: 2.08742


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[15]	valid_0's l1: 9.13463	valid_0's l2: 199.097	valid_0's huber: 7.82101
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[111]	valid_0's l1: 2.35982	valid_0's l2: 22.0276	valid_0's huber: 1.83743
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 2.50547	valid_0's l2: 20.0338	valid_0's huber: 1.95365
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 2.50547	valid_0's l2: 20.0338	valid_0's huber: 1.95365
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[122]	valid_0's l1: 2.75325	valid_0's l2: 18.6402	valid_0's huber: 2.15831
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[181]	valid_0's l1: 3.49998	valid_0's l2: 34.2893	valid_0's huber: 2.87314


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[105]	valid_0's l1: 2.78978	valid_0's l2: 26.0589	valid_0's huber: 2.20182
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[105]	valid_0's l1: 2.23637	valid_0's l2: 14.1022	valid_0's huber: 1.72104
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[48]	valid_0's l1: 8.65115	valid_0's l2: 172.823	valid_0's huber: 7.42385
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[97]	valid_0's l1: 3.53089	valid_0's l2: 34.107	valid_0's huber: 2.86342
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[84]	valid_0's l1: 2.30131	valid_0's l2: 12.9446	valid_0's huber: 1.75607


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[144]	valid_0's l1: 5.78265	valid_0's l2: 114.168	valid_0's huber: 4.87759
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[90]	valid_0's l1: 1.83769	valid_0's l2: 7.96125	valid_0's huber: 1.34201
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[76]	valid_0's l1: 2.66959	valid_0's l2: 17.3894	valid_0's huber: 2.07966
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[15]	valid_0's l1: 11.0221	valid_0's l2: 247.994	valid_0's huber: 9.51794
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[128]	valid_0's l1: 2.83211	valid_0's l2: 15.1987	valid_0's huber: 2.19695


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[36]	valid_0's l1: 8.23239	valid_0's l2: 214.728	valid_0's huber: 7.03237
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[86]	valid_0's l1: 1.77836	valid_0's l2: 13.3965	valid_0's huber: 1.33522
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[15]	valid_0's l1: 11.2635	valid_0's l2: 315.795	valid_0's huber: 9.73304
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[180]	valid_0's l1: 6.21307	valid_0's l2: 129.088	valid_0's huber: 5.27803
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[80]	valid_0's l1: 2.74116	valid_0's l2: 19.5057	valid_0's huber: 2.15102


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[118]	valid_0's l1: 2.69672	valid_0's l2: 24.7225	valid_0's huber: 2.14402
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[128]	valid_0's l1: 1.73385	valid_0's l2: 13.7157	valid_0's huber: 1.29137
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[124]	valid_0's l1: 2.70245	valid_0's l2: 17.2727	valid_0's huber: 2.10128
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[136]	valid_0's l1: 3.11295	valid_0's l2: 31.1112	valid_0's huber: 2.50236
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[33]	valid_0's l1: 5.24486	valid_0's l2: 82.1457	valid_0's huber: 4.34352


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,512897.728,0.04,
ENSG00000083896,YTHDC1,192584.667,0.179,
ENSG00000167515,TRAPPC2L,132820.645,-0.22,
ENSG00000060762,MPC1,111991.538,-0.308,
ENSG00000105672,ETV2,39996.305,0.695,
ENSG00000142541,RPL13A,9135.891,0.007,
ENSG00000065268,WDR18,3898.028,-0.543,
ENSG00000101190,TCFL5,3801.633,0.635,
ENSG00000066923,STAG3,2066.474,0.516,
ENSG00000164879,CA3,1187.993,-0.519,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,512897.728,0.04
ENSG00000083896,YTHDC1,192584.667,0.179
ENSG00000167515,TRAPPC2L,132820.645,-0.22
ENSG00000060762,MPC1,111991.538,-0.308
ENSG00000105672,ETV2,39996.305,0.695
ENSG00000142541,RPL13A,9135.891,0.007
ENSG00000065268,WDR18,3898.028,-0.543
ENSG00000101190,TCFL5,3801.633,0.635
ENSG00000066923,STAG3,2066.474,0.516
ENSG00000164879,CA3,1187.993,-0.519

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.549,189.944,8.955,"[Phascolarctos_cinereus, Mus_musculus]"
1,0.912,70.135,3.66,"[Equus_caballus, Rhinolophus_ferrumequinum]"
2,0.933,31.715,4.578,"[Rattus_norvegicus, Mesocricetus_auratus]"
3,0.769,129.762,5.609,"[Meriones_unguiculatus, Tursiops_truncatus]"
4,0.809,103.254,5.862,"[Tupaia_belangeri, Cavia_aperea]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,590255.085,0.007,
ENSG00000083896,YTHDC1,263567.498,0.165,
ENSG00000060762,MPC1,131514.405,-0.349,
ENSG00000167515,TRAPPC2L,91362.823,-0.279,
ENSG00000119616,FCF1,39439.753,-0.295,
ENSG00000173567,ADGRF3,32865.557,0.435,
ENSG00000105672,ETV2,25675.42,0.672,
ENSG00000136436,CALCOCO2,15916.991,0.679,
ENSG00000188763,FZD9,15048.773,0.042,
ENSG00000129187,DCTD,14465.92,0.692,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,590255.085,0.007
ENSG00000083896,YTHDC1,263567.498,0.165
ENSG00000060762,MPC1,131514.405,-0.349
ENSG00000167515,TRAPPC2L,91362.823,-0.279
ENSG00000119616,FCF1,39439.753,-0.295
ENSG00000173567,ADGRF3,32865.557,0.435
ENSG00000105672,ETV2,25675.42,0.672
ENSG00000136436,CALCOCO2,15916.991,0.679
ENSG00000188763,FZD9,15048.773,0.042
ENSG00000129187,DCTD,14465.92,0.692

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.875,78.812,4.535,"[Sus_scrofa, Gorilla_gorilla]"
1,0.893,74.915,3.603,"[Pan_troglodytes, Bos_taurus]"
2,0.956,31.731,3.019,"[Macaca_nemestrina, Canis_lupus_familiaris]"
3,0.807,87.465,5.692,"[Rhinolophus_ferrumequinum, Monodelphis_domest..."
4,0.931,39.501,3.381,"[Ursus_americanus, Ictidomys_tridecemlineatus]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000167515,TRAPPC2L,129668.686,-0.292,
ENSG00000060762,MPC1,127707.643,-0.313,
ENSG00000105672,ETV2,35316.665,0.685,
ENSG00000188763,FZD9,27301.975,-0.005,
ENSG00000129187,DCTD,10820.332,0.664,
ENSG00000142541,RPL13A,4832.287,0.036,
ENSG00000142002,DPP9,4630.716,-0.542,
ENSG00000165568,AKR1E2,1170.903,-0.359,
ENSG00000156521,TYSND1,535.397,0.097,
ENSG00000015520,NPC1L1,302.362,0.119,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000167515,TRAPPC2L,129668.686,-0.292
ENSG00000060762,MPC1,127707.643,-0.313
ENSG00000105672,ETV2,35316.665,0.685
ENSG00000188763,FZD9,27301.975,-0.005
ENSG00000129187,DCTD,10820.332,0.664
ENSG00000142541,RPL13A,4832.287,0.036
ENSG00000142002,DPP9,4630.716,-0.542
ENSG00000165568,AKR1E2,1170.903,-0.359
ENSG00000156521,TYSND1,535.397,0.097
ENSG00000015520,NPC1L1,302.362,0.119

Unnamed: 0,R^2,MSE,MAE,validation_species
0,-9.345,2027.761,21.569,"[Homo_sapiens, Mus_caroli]"
1,0.531,83.503,5.607,"[Mus_musculus, Monodelphis_domestica]"
2,0.924,24.258,2.865,"[Ailuropoda_melanoleuca, Pan_paniscus]"
3,0.891,23.358,3.014,"[Ictidomys_tridecemlineatus, Sus_scrofa]"
4,-0.493,164.261,10.193,"[Phascolarctos_cinereus, Felis_catus]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,590346.274,0.047,
ENSG00000083896,YTHDC1,230455.733,0.172,
ENSG00000167515,TRAPPC2L,133986.944,-0.236,
ENSG00000060762,MPC1,109575.576,-0.315,
ENSG00000105672,ETV2,39655.152,0.696,
ENSG00000119616,FCF1,19019.885,-0.22,
ENSG00000134489,HRH4,15687.439,0.604,
ENSG00000157343,ARMC12,8731.685,0.59,
ENSG00000142541,RPL13A,7449.272,0.055,
ENSG00000136436,CALCOCO2,6854.902,0.699,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,590346.274,0.047
ENSG00000083896,YTHDC1,230455.733,0.172
ENSG00000167515,TRAPPC2L,133986.944,-0.236
ENSG00000060762,MPC1,109575.576,-0.315
ENSG00000105672,ETV2,39655.152,0.696
ENSG00000119616,FCF1,19019.885,-0.22
ENSG00000134489,HRH4,15687.439,0.604
ENSG00000157343,ARMC12,8731.685,0.59
ENSG00000142541,RPL13A,7449.272,0.055
ENSG00000136436,CALCOCO2,6854.902,0.699

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.94,35.802,3.486,"[Macaca_nemestrina, Cavia_porcellus]"
1,0.935,39.764,3.921,"[Mesocricetus_auratus, Ictidomys_tridecemlinea..."
2,0.695,196.612,7.108,"[Vombatus_ursinus, Tursiops_truncatus]"
3,0.959,28.614,3.07,"[Pan_paniscus, Ursus_americanus]"
4,0.751,102.957,6.663,"[Capra_hircus, Cavia_aperea]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,589154.592,0.013,
ENSG00000083896,YTHDC1,231999.447,0.146,
ENSG00000167515,TRAPPC2L,141851.12,-0.284,
ENSG00000060762,MPC1,127785.303,-0.333,
ENSG00000105672,ETV2,41535.562,0.702,
ENSG00000188763,FZD9,36319.227,0.035,
ENSG00000136436,CALCOCO2,17738.627,0.601,
ENSG00000119616,FCF1,12806.109,-0.315,
ENSG00000173567,ADGRF3,6126.74,0.453,
ENSG00000010219,DYRK4,5540.503,0.215,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,589154.592,0.013
ENSG00000083896,YTHDC1,231999.447,0.146
ENSG00000167515,TRAPPC2L,141851.12,-0.284
ENSG00000060762,MPC1,127785.303,-0.333
ENSG00000105672,ETV2,41535.562,0.702
ENSG00000188763,FZD9,36319.227,0.035
ENSG00000136436,CALCOCO2,17738.627,0.601
ENSG00000119616,FCF1,12806.109,-0.315
ENSG00000173567,ADGRF3,6126.74,0.453
ENSG00000010219,DYRK4,5540.503,0.215

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.981,14.276,2.288,"[Mus_spicilegus, Bos_taurus]"
1,0.979,14.115,2.089,"[Sus_scrofa, Tupaia_belangeri]"
2,0.811,118.042,6.159,"[Cavia_aperea, Aotus_nancymaae]"
3,0.797,96.021,6.681,"[Vombatus_ursinus, Monodelphis_domestica]"
4,0.967,18.426,2.654,"[Meriones_unguiculatus, Macaca_fascicularis]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,514215.849,0.023,
ENSG00000083896,YTHDC1,226555.46,0.196,
ENSG00000167515,TRAPPC2L,125587.761,-0.268,
ENSG00000060762,MPC1,107117.18,-0.322,
ENSG00000105672,ETV2,44182.684,0.666,
ENSG00000188763,FZD9,33113.529,0.006,
ENSG00000119616,FCF1,21406.578,-0.251,
ENSG00000157343,ARMC12,7959.737,0.581,
ENSG00000142541,RPL13A,4115.549,0.019,
ENSG00000120992,LYPLA1,2484.976,-0.304,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,514215.849,0.023
ENSG00000083896,YTHDC1,226555.46,0.196
ENSG00000167515,TRAPPC2L,125587.761,-0.268
ENSG00000060762,MPC1,107117.18,-0.322
ENSG00000105672,ETV2,44182.684,0.666
ENSG00000188763,FZD9,33113.529,0.006
ENSG00000119616,FCF1,21406.578,-0.251
ENSG00000157343,ARMC12,7959.737,0.581
ENSG00000142541,RPL13A,4115.549,0.019
ENSG00000120992,LYPLA1,2484.976,-0.304

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.5,199.097,9.135,"[Phascolarctos_cinereus, Cavia_porcellus]"
1,0.97,22.028,2.36,"[Heterocephalus_glaber, Aotus_nancymaae]"
2,0.974,20.034,2.505,"[Callithrix_jacchus, Pan_paniscus]"
3,0.967,18.64,2.753,"[Macaca_fascicularis, Mus_caroli]"
4,0.941,34.289,3.5,"[Felis_catus, Ailuropoda_melanoleuca]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,589142.567,0.012,
ENSG00000083896,YTHDC1,263460.74,0.158,
ENSG00000167515,TRAPPC2L,137384.68,-0.289,
ENSG00000060762,MPC1,114969.782,-0.397,
ENSG00000105672,ETV2,36444.899,0.684,
ENSG00000136436,CALCOCO2,23430.498,0.658,
ENSG00000066923,STAG3,13499.469,0.49,
ENSG00000119616,FCF1,11220.095,-0.304,
ENSG00000010219,DYRK4,6414.98,0.372,
ENSG00000188747,NOXA1,5101.658,0.714,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,589142.567,0.012
ENSG00000083896,YTHDC1,263460.74,0.158
ENSG00000167515,TRAPPC2L,137384.68,-0.289
ENSG00000060762,MPC1,114969.782,-0.397
ENSG00000105672,ETV2,36444.899,0.684
ENSG00000136436,CALCOCO2,23430.498,0.658
ENSG00000066923,STAG3,13499.469,0.49
ENSG00000119616,FCF1,11220.095,-0.304
ENSG00000010219,DYRK4,6414.98,0.372
ENSG00000188747,NOXA1,5101.658,0.714

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.959,26.059,2.79,"[Pan_paniscus, Mus_musculus]"
1,0.98,14.102,2.236,"[Felis_catus, Otolemur_garnettii]"
2,0.684,172.823,8.651,"[Monodelphis_domestica, Tupaia_belangeri]"
3,0.945,34.107,3.531,"[Mus_spicilegus, Heterocephalus_glaber]"
4,0.978,12.945,2.301,"[Bos_taurus, Macaca_mulatta]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,506271.319,0.041,
ENSG00000083896,YTHDC1,222785.062,0.185,
ENSG00000167515,TRAPPC2L,119593.206,-0.231,
ENSG00000060762,MPC1,117049.26,-0.305,
ENSG00000105672,ETV2,39556.956,0.678,
ENSG00000173567,ADGRF3,30279.173,0.416,
ENSG00000157343,ARMC12,16467.604,0.535,
ENSG00000119616,FCF1,15102.241,-0.254,
ENSG00000136436,CALCOCO2,9896.973,0.712,
ENSG00000184898,RBM43,3517.164,0.241,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,506271.319,0.041
ENSG00000083896,YTHDC1,222785.062,0.185
ENSG00000167515,TRAPPC2L,119593.206,-0.231
ENSG00000060762,MPC1,117049.26,-0.305
ENSG00000105672,ETV2,39556.956,0.678
ENSG00000173567,ADGRF3,30279.173,0.416
ENSG00000157343,ARMC12,16467.604,0.535
ENSG00000119616,FCF1,15102.241,-0.254
ENSG00000136436,CALCOCO2,9896.973,0.712
ENSG00000184898,RBM43,3517.164,0.241

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.83,114.168,5.783,"[Callithrix_jacchus, Gorilla_gorilla]"
1,0.988,7.961,1.838,"[Microcebus_murinus, Sus_scrofa]"
2,0.979,17.389,2.67,"[Pan_paniscus, Macaca_nemestrina]"
3,0.375,247.994,11.022,"[Meriones_unguiculatus, Phascolarctos_cinereus]"
4,0.962,15.199,2.832,"[Rattus_norvegicus, Cavia_porcellus]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,510305.273,0.03,
ENSG00000083896,YTHDC1,194420.685,0.174,
ENSG00000060762,MPC1,113782.944,-0.318,
ENSG00000167515,TRAPPC2L,112217.05,-0.285,
ENSG00000105672,ETV2,33349.012,0.691,
ENSG00000142541,RPL13A,9201.099,0.009,
ENSG00000100650,SRSF5,8606.682,0.419,
ENSG00000129187,DCTD,4974.056,0.697,
ENSG00000065268,WDR18,3939.16,-0.661,
ENSG00000135845,PIGC,2600.141,0.363,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,510305.273,0.03
ENSG00000083896,YTHDC1,194420.685,0.174
ENSG00000060762,MPC1,113782.944,-0.318
ENSG00000167515,TRAPPC2L,112217.05,-0.285
ENSG00000105672,ETV2,33349.012,0.691
ENSG00000142541,RPL13A,9201.099,0.009
ENSG00000100650,SRSF5,8606.682,0.419
ENSG00000129187,DCTD,4974.056,0.697
ENSG00000065268,WDR18,3939.16,-0.661
ENSG00000135845,PIGC,2600.141,0.363

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.612,214.728,8.232,"[Tursiops_truncatus, Cavia_aperea]"
1,0.98,13.397,1.778,"[Tupaia_belangeri, Otolemur_garnettii]"
2,0.325,315.795,11.263,"[Phascolarctos_cinereus, Loxodonta_africana]"
3,0.738,129.088,6.213,"[Meriones_unguiculatus, Pan_troglodytes]"
4,0.964,19.506,2.741,"[Callithrix_jacchus, Capra_hircus]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,590621.579,0.049,
ENSG00000083896,YTHDC1,229566.626,0.209,
ENSG00000167515,TRAPPC2L,137051.984,-0.265,
ENSG00000060762,MPC1,126193.78,-0.322,
ENSG00000105672,ETV2,46622.058,0.685,
ENSG00000188763,FZD9,30842.848,0.009,
ENSG00000119616,FCF1,16856.524,-0.268,
ENSG00000136436,CALCOCO2,9961.501,0.598,
ENSG00000101190,TCFL5,6606.455,0.689,
ENSG00000173567,ADGRF3,6051.844,0.473,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,590621.579,0.049
ENSG00000083896,YTHDC1,229566.626,0.209
ENSG00000167515,TRAPPC2L,137051.984,-0.265
ENSG00000060762,MPC1,126193.78,-0.322
ENSG00000105672,ETV2,46622.058,0.685
ENSG00000188763,FZD9,30842.848,0.009
ENSG00000119616,FCF1,16856.524,-0.268
ENSG00000136436,CALCOCO2,9961.501,0.598
ENSG00000101190,TCFL5,6606.455,0.689
ENSG00000173567,ADGRF3,6051.844,0.473

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.966,24.722,2.697,"[Macaca_nemestrina, Heterocephalus_glaber]"
1,0.983,13.716,1.734,"[Mus_spicilegus, Ailuropoda_melanoleuca]"
2,0.967,17.273,2.702,"[Mus_musculus, Ovis_aries]"
3,0.949,31.111,3.113,"[Ictidomys_tridecemlineatus, Rhinopithecus_bieti]"
4,0.836,82.146,5.245,"[Cavia_aperea, Macaca_mulatta]"


In [21]:
from yspecies.results import FeatureSummary
# Wrap the stage-one results in a FeatureSummary for the cells that follow.
summary = FeatureSummary(stage_one_lifespan)
# Bare expression: shows the first element of the stage-one results.
stage_one_lifespan[0]

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,512897.728,0.04,
ENSG00000083896,YTHDC1,192584.667,0.179,
ENSG00000167515,TRAPPC2L,132820.645,-0.22,
ENSG00000060762,MPC1,111991.538,-0.308,
ENSG00000105672,ETV2,39996.305,0.695,
ENSG00000142541,RPL13A,9135.891,0.007,
ENSG00000065268,WDR18,3898.028,-0.543,
ENSG00000101190,TCFL5,3801.633,0.635,
ENSG00000066923,STAG3,2066.474,0.516,
ENSG00000164879,CA3,1187.993,-0.519,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,512897.728,0.04
ENSG00000083896,YTHDC1,192584.667,0.179
ENSG00000167515,TRAPPC2L,132820.645,-0.22
ENSG00000060762,MPC1,111991.538,-0.308
ENSG00000105672,ETV2,39996.305,0.695
ENSG00000142541,RPL13A,9135.891,0.007
ENSG00000065268,WDR18,3898.028,-0.543
ENSG00000101190,TCFL5,3801.633,0.635
ENSG00000066923,STAG3,2066.474,0.516
ENSG00000164879,CA3,1187.993,-0.519

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.549,189.944,8.955,"[Phascolarctos_cinereus, Mus_musculus]"
1,0.912,70.135,3.66,"[Equus_caballus, Rhinolophus_ferrumequinum]"
2,0.933,31.715,4.578,"[Rattus_norvegicus, Mesocricetus_auratus]"
3,0.769,129.762,5.609,"[Meriones_unguiculatus, Tursiops_truncatus]"
4,0.809,103.254,5.862,"[Tupaia_belangeri, Cavia_aperea]"


In [22]:
# Symbols of the selected features, ordered by the "kendall_tau_0" column, highest first.
# NOTE(review): in the recorded run this cell failed with
# "AttributeError: 'DataFrame' object has no attribute 'selected'".
# Verify `FeatureSummary.selected` and the "kendall_tau_0" column name against the
# current yspecies.results before relying on this cell.
summary.selected.sort_values("kendall_tau_0", ascending=False)["symbol"].values

AttributeError: 'DataFrame' object has no attribute 'selected'

In [73]:
# Evaluates to "gain" when any column name of the first result's selected table
# contains "gain", otherwise to "shap".
"gain" if any("gain" in column for column in summary.results[0].selected.columns) else "shap"

'gain'