# SHAP feature selection #
## Code to select features with a combination of LightGBM and SHAP ##

## Parameters cell ##

Parameters are overridden by papermill when this notebook is run inside DVC stages



In [29]:
# Number of folds, passed as the first argument of PartitionParameters further down.
number_of_folds = 5
# How many times the partition + SHAP pipeline is repeated (second argument of Repeat).
repeats = 10

# LightGBM parameters obtained from the first round of optimization.
# NOTE(review): according to the LightGBM parameter docs `drop_rate` is only used by
# the "dart" booster, so it should have no effect with "gbdt" — confirm it is intended.
lgb_params = {
    "boosting_type": "gbdt",
    "objective": "regression",
    "lambda_l1": 3.5389955270605298,
    "lambda_l2": 1.6581000555478702,
    "metric": ["l1", "l2", "huber"],
    "max_leaves": 20,
    "max_depth": 8,
    "feature_fraction": 0.537151250668454,
    "bagging_fraction": 0.6407512594833191,
    "learning_rate": 0.06627424358150694,
    "min_data_in_leaf": 3,
    "drop_rate": 0.16918140176888039,
}

# When True (and ../yspecies exists) the local checkout of yspecies is put on sys.path.
debug_local = True

In [30]:
from pathlib import Path
import sys
import inspect

local = (Path("..") / "yspecies").resolve()
if debug_local and local.exists():
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '..', '..', '/data/sources/yspecies/notebooks', '/opt/miniconda3/envs/yspecies/lib/python38.zip', '/opt/miniconda3/envs/yspecies/lib/python3.8', '/opt/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/antonkulaga/.ipython']
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
from dataclasses import dataclass, replace
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt

In [32]:
from typing import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.workflow import TupleWith, Repeat
from yspecies.config import *
from yspecies.preprocess import FeatureSelection, DataExtractor
from yspecies.partition import DataPartitioner, PartitionParameters
from yspecies.models import Metrics
from yspecies.selection import ShapSelector

In [33]:
#settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import pprint
pp = pprint.PrettyPrinter(indent=4)

#charts settings
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
%matplotlib auto
plt.ioff()
set_matplotlib_formats('svg')

Using matplotlib backend: TkAgg


### Loading data ###
Let's load the species/genes/expressions data selected by the select_samples.py notebook

In [34]:
from pathlib import Path
# Use the current directory as the base when it contains ./data, otherwise fall back to the parent directory.
locations: Locations
if Path("./data").exists():
    locations = Locations("./")
else:
    locations = Locations("../")

In [35]:
# Load the expression dataset from the folder referenced by locations.interim.selected.
data = ExpressionDataset.from_folder(locations.interim.selected)
# Bare last expression so that the notebook renders the dataset summary.
data

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(445, 12340)",12340,39,445,"(12340, 2)","(40, 19)"


## Setting up SHAP selection pipeline ##

Deciding on selection parameters (which fields to include, exclude, predict)

In [36]:
# Partitioning settings; they are handed to the selection pipeline through TupleWith(partition_params).
# The first argument is number_of_folds from the parameters cell.
# NOTE(review): the remaining positional arguments (0, 2, [], 42) are opaque from here —
# presumably 42 is the seed (the Repeat step replaces a `seed` field on every repeat).
# TODO: confirm the PartitionParameters field names and pass these values as keywords.
partition_params = PartitionParameters(number_of_folds, 0, 2, [],  42)


In [37]:
# Work done in a single repeat: the partitioner step, a TupleWith step that attaches the
# LightGBM parameters, and the ShapSelector step.
partition_shap_pipe = Pipeline(steps=[
    ("partitioner", DataPartitioner()),
    ("prepare_for_partitioning", TupleWith(lgb_params)),
    ("shap_computation", ShapSelector()),
])

# Repeat the pipeline above `repeats` times. Before each run the callback rebuilds the input
# tuple, keeping its first element and replacing `seed` on the second one with the value
# supplied by Repeat (presumably the repeat index — confirm against Repeat).
repeated_cv = Repeat(
    partition_shap_pipe,
    repeats,
    lambda inputs, repeat_index: (inputs[0], replace(inputs[1], seed=repeat_index)),
)

In [38]:
# Full selection pipeline, fed with a (dataset, FeatureSelection) tuple when it is run.
selection_pipeline = Pipeline(steps=[
    # to extract the data required for ML from the dataset
    ("extractor", DataExtractor()),
    # attaches partition_params to the output of the previous step
    ("prepare_for_partitioning", TupleWith(partition_params)),
    # repeated partitioning + SHAP selection (repeated_cv)
    ("partition_shap", repeated_cv),
])

## Setting up features to select ##

In [39]:
# Base feature selection; lifespan is the default target.
selection = FeatureSelection(
    samples=["tissue", "species"],       # samples metadata to include
    species=[],                          # species metadata other than the Y label to include
    exclude_from_training=["species"],   # exclude some fields from LightGBM training
    to_predict="lifespan",               # column to predict
    categorical=["tissue"],
)
select_lifespan = selection

In [40]:
# One FeatureSelection per target: each variant is a copy of `selection` that differs only in `to_predict`.
select_lifespan = selection
select_mass, select_gestation, select_mtgc = (
    replace(selection, to_predict=target)
    for target in ("mass_g", "gestation", "mtgc")
)

# First stage selection #

In [41]:
# Run the whole selection pipeline on the (dataset, lifespan selection) tuple.
# This is the expensive cell: LightGBM is trained for every fold of every repeat.
stage_one_lifespan = selection_pipeline.fit_transform((data, select_lifespan))
# Bare last expression: renders the results of all repeats (long output).
stage_one_lifespan

Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 3.88913	valid_0's l2: 34.4348	valid_0's huber: 3.15574
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 3.88913	valid_0's l2: 34.4348	valid_0's huber: 3.15574
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 5.07552	valid_0's l2: 59.3379	valid_0's huber: 4.23463
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 5.07552	valid_0's l2: 59.3379	valid_0's huber: 4.23463
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 3.49855	valid_0's l2: 33.6041	valid_0's huber: 2.82957
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 3.49855	valid_0's l2: 33.6041	valid_0's huber: 2.82957
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 3.96945	valid_0's l2: 26.2622	valid_0's huber: 3.20037
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 3.96945	valid_0's l

Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[25]	valid_0's l1: 8.27336	valid_0's l2: 152.951	valid_0's huber: 7.04988
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[140]	valid_0's l1: 3.66283	valid_0's l2: 98.9854	valid_0's huber: 3.02306
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 2.58503	valid_0's l2: 18.7909	valid_0's huber: 2.01481
Did not meet early stopping. Best iteration is:
[199]	valid_0's l1: 2.58489	valid_0's l2: 18.7909	valid_0's huber: 2.01475
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 3.33061	valid_0's l2: 26.1408	valid_0's huber: 2.67011
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 3.33061	valid_0's l2: 26.1408	valid_0's huber: 2.67011
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 4.31224	valid_0's l2: 72.2134	valid_0's huber: 3.54395
Did n

Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[30]	valid_0's l1: 8.04201	valid_0's l2: 130.572	valid_0's huber: 6.85058
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 2.12516	valid_0's l2: 18.957	valid_0's huber: 1.63424
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 2.12516	valid_0's l2: 18.957	valid_0's huber: 1.63424
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 3.17125	valid_0's l2: 27.2013	valid_0's huber: 2.55549
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 3.17125	valid_0's l2: 27.2013	valid_0's huber: 2.55549
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 2.06907	valid_0's l2: 12.436	valid_0's huber: 1.54729
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 2.06907	valid_0's l2: 12.436	valid_0's huber: 1.54729
Training until validation scores don't improve for 10 rou

Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[17]	valid_0's l1: 8.51571	valid_0's l2: 126.937	valid_0's huber: 7.27017
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 2.23109	valid_0's l2: 16.7795	valid_0's huber: 1.70396
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 2.23109	valid_0's l2: 16.7795	valid_0's huber: 1.70396
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[19]	valid_0's l1: 8.1363	valid_0's l2: 138.901	valid_0's huber: 6.92427
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 21.2633	valid_0's l2: 1910.42	valid_0's huber: 18.8214
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 21.2633	valid_0's l2: 1910.42	valid_0's huber: 18.8214
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 4.10036	valid_0's l2: 41.3948	valid_0's huber: 3.37018
Did not

Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 2.79092	valid_0's l2: 20.4013	valid_0's huber: 2.20564
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 2.79092	valid_0's l2: 20.4013	valid_0's huber: 2.20564
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 2.86272	valid_0's l2: 25.3078	valid_0's huber: 2.28525
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 2.86272	valid_0's l2: 25.3078	valid_0's huber: 2.28525
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[163]	valid_0's l1: 21.6571	valid_0's l2: 1971.48	valid_0's huber: 19.192
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[98]	valid_0's l1: 4.02662	valid_0's l2: 41.2638	valid_0's huber: 3.28755
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 3.70084	valid_0's l2: 93.2714	valid_0's huber: 3.03278
Did no

Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 2.90341	valid_0's l2: 17.0806	valid_0's huber: 2.26579
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 2.90341	valid_0's l2: 17.0806	valid_0's huber: 2.26579
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[133]	valid_0's l1: 2.06825	valid_0's l2: 15.8185	valid_0's huber: 1.57008
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[182]	valid_0's l1: 3.8271	valid_0's l2: 86.5712	valid_0's huber: 3.14098
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[175]	valid_0's l1: 21.6254	valid_0's l2: 1937.04	valid_0's huber: 19.1391
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 1.99589	valid_0's l2: 10.7237	valid_0's huber: 1.47634
Did not meet early stopping. Best iteration is:
[199]	valid_0's l1: 1.99586	valid_0's l2: 10.

Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[19]	valid_0's l1: 8.64043	valid_0's l2: 173.999	valid_0's huber: 7.38006
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 1.72738	valid_0's l2: 9.64564	valid_0's huber: 1.26057
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 1.72738	valid_0's l2: 9.64564	valid_0's huber: 1.26057
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 3.80407	valid_0's l2: 58.4331	valid_0's huber: 3.10601
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 3.80407	valid_0's l2: 58.4331	valid_0's huber: 3.10601
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 1.92844	valid_0's l2: 10.7455	valid_0's huber: 1.44985
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 1.92844	valid_0's l2: 10.7455	valid_0's huber: 1.44985
Training until validation scores don't improve for 10

Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[25]	valid_0's l1: 7.60956	valid_0's l2: 132.392	valid_0's huber: 6.45782
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[33]	valid_0's l1: 7.91779	valid_0's l2: 122.334	valid_0's huber: 6.73104
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 5.61889	valid_0's l2: 121.038	valid_0's huber: 4.75237
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 5.61889	valid_0's l2: 121.038	valid_0's huber: 4.75237
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[19]	valid_0's l1: 10.5303	valid_0's l2: 216.311	valid_0's huber: 9.0814
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 2.23991	valid_0's l2: 12.9763	valid_0's huber: 1.71447
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 2.23991	valid_0's l2: 12.976

Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[127]	valid_0's l1: 2.92949	valid_0's l2: 45.7961	valid_0's huber: 2.33054
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[187]	valid_0's l1: 2.39544	valid_0's l2: 26.3228	valid_0's huber: 1.86394
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 4.64778	valid_0's l2: 118.561	valid_0's huber: 3.90177
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 4.64778	valid_0's l2: 118.561	valid_0's huber: 3.90177
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[72]	valid_0's l1: 7.75601	valid_0's l2: 139.923	valid_0's huber: 6.62673
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[43]	valid_0's l1: 4.03212	valid_0's l2: 37.2597	valid_0's huber: 3.23924


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[152]	valid_0's l1: 3.51657	valid_0's l2: 83.9322	valid_0's huber: 2.87289
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[21]	valid_0's l1: 10.4004	valid_0's l2: 189.667	valid_0's huber: 8.96228
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[153]	valid_0's l1: 3.82282	valid_0's l2: 37.8346	valid_0's huber: 3.12812
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[176]	valid_0's l1: 3.09031	valid_0's l2: 45.8416	valid_0's huber: 2.4719
Training until validation scores don't improve for 10 rounds
[200]	valid_0's l1: 21.3302	valid_0's l2: 1927.37	valid_0's huber: 18.8882
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 21.3302	valid_0's l2: 1927.37	valid_0's huber: 18.8882


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,595468.075,0.023,
ENSG00000083896,YTHDC1,417638.774,0.173,
ENSG00000060762,MPC1,135515.969,-0.328,
ENSG00000167515,TRAPPC2L,127575.46,-0.318,
ENSG00000204498,NFKBIL1,50323.852,-0.276,
ENSG00000010219,DYRK4,37149.404,0.2,
ENSG00000105672,ETV2,35001.381,0.676,
ENSG00000119616,FCF1,23337.054,-0.217,
ENSG00000129187,DCTD,17696.302,0.65,
ENSG00000188763,FZD9,17617.694,0.054,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,595468.075,0.023
ENSG00000083896,YTHDC1,417638.774,0.173
ENSG00000060762,MPC1,135515.969,-0.328
ENSG00000167515,TRAPPC2L,127575.46,-0.318
ENSG00000204498,NFKBIL1,50323.852,-0.276
ENSG00000010219,DYRK4,37149.404,0.2
ENSG00000105672,ETV2,35001.381,0.676
ENSG00000119616,FCF1,23337.054,-0.217
ENSG00000129187,DCTD,17696.302,0.65
ENSG00000188763,FZD9,17617.694,0.054

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.939,34.435,3.889,"[Cavia_porcellus, Callithrix_jacchus]"
1,0.908,59.338,5.076,"[Meriones_unguiculatus, Pan_paniscus]"
2,0.955,33.604,3.499,"[Heterocephalus_glaber, Mus_spicilegus]"
3,0.936,26.262,3.969,"[Rattus_norvegicus, Mesocricetus_auratus]"
4,0.913,50.24,4.047,"[Tupaia_belangeri, Rhinolophus_ferrumequinum]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,546012.966,0.04,
ENSG00000083896,YTHDC1,355467.405,0.194,
ENSG00000060762,MPC1,193171.061,-0.362,
ENSG00000167515,TRAPPC2L,126474.783,-0.269,
ENSG00000105672,ETV2,42270.382,0.671,
ENSG00000119616,FCF1,22024.219,-0.269,
ENSG00000170835,CEL,19398.409,0.53,
ENSG00000188763,FZD9,17728.049,0.003,
ENSG00000010219,DYRK4,16239.635,0.296,
ENSG00000136436,CALCOCO2,13672.733,0.677,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,546012.966,0.04
ENSG00000083896,YTHDC1,355467.405,0.194
ENSG00000060762,MPC1,193171.061,-0.362
ENSG00000167515,TRAPPC2L,126474.783,-0.269
ENSG00000105672,ETV2,42270.382,0.671
ENSG00000119616,FCF1,22024.219,-0.269
ENSG00000170835,CEL,19398.409,0.53
ENSG00000188763,FZD9,17728.049,0.003
ENSG00000010219,DYRK4,16239.635,0.296
ENSG00000136436,CALCOCO2,13672.733,0.677

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.711,152.951,8.273,"[Phascolarctos_cinereus, Macaca_nemestrina]"
1,0.877,98.985,3.663,"[Loxodonta_africana, Ailuropoda_melanoleuca]"
2,0.972,18.791,2.585,"[Sus_scrofa, Otolemur_garnettii]"
3,0.944,26.141,3.331,"[Felis_catus, Cavia_porcellus]"
4,0.853,72.213,4.312,"[Mus_musculus, Cavia_aperea]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000167515,TRAPPC2L,164428.409,-0.2,
ENSG00000060762,MPC1,145144.968,-0.316,
ENSG00000105672,ETV2,49176.721,0.67,
ENSG00000188763,FZD9,35165.51,0.2,
ENSG00000119616,FCF1,19886.121,-0.258,
ENSG00000136436,CALCOCO2,12799.033,0.707,
ENSG00000170835,CEL,10684.702,0.495,
ENSG00000135845,PIGC,7456.299,0.351,
ENSG00000173567,ADGRF3,5251.937,0.491,
ENSG00000065268,WDR18,4999.188,-0.639,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000167515,TRAPPC2L,164428.409,-0.2
ENSG00000060762,MPC1,145144.968,-0.316
ENSG00000105672,ETV2,49176.721,0.67
ENSG00000188763,FZD9,35165.51,0.2
ENSG00000119616,FCF1,19886.121,-0.258
ENSG00000136436,CALCOCO2,12799.033,0.707
ENSG00000170835,CEL,10684.702,0.495
ENSG00000135845,PIGC,7456.299,0.351
ENSG00000173567,ADGRF3,5251.937,0.491
ENSG00000065268,WDR18,4999.188,-0.639

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.095,130.572,8.042,"[Cavia_aperea, Mesocricetus_auratus]"
1,0.923,18.957,2.125,"[Meriones_unguiculatus, Rhinolophus_ferrumequi..."
2,0.849,27.201,3.171,"[Ailuropoda_melanoleuca, Cavia_porcellus]"
3,0.965,12.436,2.069,"[Pan_paniscus, Suricata_suricatta]"
4,-8.967,2053.806,22.173,"[Tupaia_belangeri, Homo_sapiens]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000167515,TRAPPC2L,171225.465,-0.211,
ENSG00000060762,MPC1,103465.186,-0.286,
ENSG00000105672,ETV2,38127.91,0.69,
ENSG00000119616,FCF1,19887.126,-0.304,
ENSG00000136436,CALCOCO2,10866.15,0.615,
ENSG00000173567,ADGRF3,6446.178,0.515,
ENSG00000109099,PMP22,6063.119,-0.117,
ENSG00000134489,HRH4,3573.307,0.636,
ENSG00000065268,WDR18,3308.434,-0.668,
ENSG00000142541,RPL13A,3260.781,0.006,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000167515,TRAPPC2L,171225.465,-0.211
ENSG00000060762,MPC1,103465.186,-0.286
ENSG00000105672,ETV2,38127.91,0.69
ENSG00000119616,FCF1,19887.126,-0.304
ENSG00000136436,CALCOCO2,10866.15,0.615
ENSG00000173567,ADGRF3,6446.178,0.515
ENSG00000109099,PMP22,6063.119,-0.117
ENSG00000134489,HRH4,3573.307,0.636
ENSG00000065268,WDR18,3308.434,-0.668
ENSG00000142541,RPL13A,3260.781,0.006

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.114,126.937,8.516,"[Cavia_porcellus, Phascolarctos_cinereus]"
1,0.938,16.78,2.231,"[Mus_spicilegus, Ursus_americanus]"
2,0.091,138.901,8.136,"[Oryctolagus_cuniculus, Macaca_mulatta]"
3,-7.254,1910.425,21.263,"[Homo_sapiens, Vombatus_ursinus]"
4,0.815,41.395,4.1,"[Ailuropoda_melanoleuca, Mesocricetus_auratus]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000060762,MPC1,181375.781,-0.337,
ENSG00000167515,TRAPPC2L,129828.188,-0.381,
ENSG00000105672,ETV2,47244.867,0.681,
ENSG00000119616,FCF1,23202.847,-0.265,
ENSG00000101190,TCFL5,12995.384,0.664,
ENSG00000188763,FZD9,12765.669,-0.132,
ENSG00000164879,CA3,12377.505,-0.511,
ENSG00000106554,CHCHD3,9198.35,-0.306,
ENSG00000173567,ADGRF3,8831.89,0.482,
ENSG00000157343,ARMC12,8185.079,0.582,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000060762,MPC1,181375.781,-0.337
ENSG00000167515,TRAPPC2L,129828.188,-0.381
ENSG00000105672,ETV2,47244.867,0.681
ENSG00000119616,FCF1,23202.847,-0.265
ENSG00000101190,TCFL5,12995.384,0.664
ENSG00000188763,FZD9,12765.669,-0.132
ENSG00000164879,CA3,12377.505,-0.511
ENSG00000106554,CHCHD3,9198.35,-0.306
ENSG00000173567,ADGRF3,8831.89,0.482
ENSG00000157343,ARMC12,8185.079,0.582

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.903,20.401,2.791,"[Rhinolophus_ferrumequinum, Meriones_unguicula..."
1,0.897,25.308,2.863,"[Otolemur_garnettii, Felis_catus]"
2,-8.335,1971.484,21.657,"[Tupaia_belangeri, Homo_sapiens]"
3,0.786,41.264,4.027,"[Bos_taurus, Cavia_porcellus]"
4,0.62,93.272,3.701,"[Callithrix_jacchus, Loxodonta_africana]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000060762,MPC1,157524.156,-0.331,
ENSG00000167515,TRAPPC2L,138628.934,-0.27,
ENSG00000105672,ETV2,51924.893,0.683,
ENSG00000119616,FCF1,41328.844,-0.232,
ENSG00000188763,FZD9,16678.387,0.057,
ENSG00000170835,CEL,12363.136,0.601,
ENSG00000173567,ADGRF3,11682.535,0.486,
ENSG00000100650,SRSF5,10655.709,0.473,
ENSG00000101190,TCFL5,10529.023,0.6,
ENSG00000136436,CALCOCO2,8309.594,0.586,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000060762,MPC1,157524.156,-0.331
ENSG00000167515,TRAPPC2L,138628.934,-0.27
ENSG00000105672,ETV2,51924.893,0.683
ENSG00000119616,FCF1,41328.844,-0.232
ENSG00000188763,FZD9,16678.387,0.057
ENSG00000170835,CEL,12363.136,0.601
ENSG00000173567,ADGRF3,11682.535,0.486
ENSG00000100650,SRSF5,10655.709,0.473
ENSG00000101190,TCFL5,10529.023,0.6
ENSG00000136436,CALCOCO2,8309.594,0.586

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.937,17.081,2.903,"[Mus_caroli, Pan_paniscus]"
1,0.931,15.819,2.068,"[Mus_spicilegus, Canis_lupus_familiaris]"
2,0.55,86.571,3.827,"[Otolemur_garnettii, Loxodonta_africana]"
3,-9.986,1937.039,21.625,"[Homo_sapiens, Ailuropoda_melanoleuca]"
4,0.947,10.723,1.996,"[Mus_musculus, Macaca_nemestrina]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,573685.553,0.021,
ENSG00000083896,YTHDC1,378793.284,0.227,
ENSG00000167515,TRAPPC2L,180525.752,-0.203,
ENSG00000060762,MPC1,114829.377,-0.334,
ENSG00000148248,SURF4,68058.803,0.209,
ENSG00000204498,NFKBIL1,52305.045,-0.346,
ENSG00000188763,FZD9,49695.95,0.025,
ENSG00000105672,ETV2,40602.049,0.689,
ENSG00000119616,FCF1,21416.789,-0.261,
ENSG00000136436,CALCOCO2,12523.652,0.602,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,573685.553,0.021
ENSG00000083896,YTHDC1,378793.284,0.227
ENSG00000167515,TRAPPC2L,180525.752,-0.203
ENSG00000060762,MPC1,114829.377,-0.334
ENSG00000148248,SURF4,68058.803,0.209
ENSG00000204498,NFKBIL1,52305.045,-0.346
ENSG00000188763,FZD9,49695.95,0.025
ENSG00000105672,ETV2,40602.049,0.689
ENSG00000119616,FCF1,21416.789,-0.261
ENSG00000136436,CALCOCO2,12523.652,0.602

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.524,173.999,8.64,"[Macaca_mulatta, Oryctolagus_cuniculus]"
1,0.986,9.646,1.727,"[Suricata_suricatta, Ailuropoda_melanoleuca]"
2,0.895,58.433,3.804,"[Mus_musculus, Pan_troglodytes]"
3,0.983,10.745,1.928,"[Canis_lupus_familiaris, Aotus_nancymaae]"
4,0.975,13.364,2.228,"[Mus_spicilegus, Ovis_aries]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,496576.481,0.045,
ENSG00000083896,YTHDC1,314479.171,0.198,
ENSG00000167515,TRAPPC2L,153814.721,-0.229,
ENSG00000060762,MPC1,118483.304,-0.279,
ENSG00000105672,ETV2,44744.608,0.689,
ENSG00000119616,FCF1,21205.233,-0.224,
ENSG00000136436,CALCOCO2,9650.532,0.636,
ENSG00000129187,DCTD,8619.371,0.681,
ENSG00000101190,TCFL5,7301.48,0.601,
ENSG00000170835,CEL,6388.491,0.628,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,496576.481,0.045
ENSG00000083896,YTHDC1,314479.171,0.198
ENSG00000167515,TRAPPC2L,153814.721,-0.229
ENSG00000060762,MPC1,118483.304,-0.279
ENSG00000105672,ETV2,44744.608,0.689
ENSG00000119616,FCF1,21205.233,-0.224
ENSG00000136436,CALCOCO2,9650.532,0.636
ENSG00000129187,DCTD,8619.371,0.681
ENSG00000101190,TCFL5,7301.48,0.601
ENSG00000170835,CEL,6388.491,0.628

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.725,132.392,7.61,"[Phascolarctos_cinereus, Macaca_mulatta]"
1,0.718,122.334,7.918,"[Cavia_aperea, Mesocricetus_auratus]"
2,0.802,121.038,5.619,"[Tursiops_truncatus, Mus_caroli]"
3,0.281,216.311,10.53,"[Pan_paniscus, Oryctolagus_cuniculus]"
4,0.975,12.976,2.24,"[Capra_hircus, Sus_scrofa]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,607474.844,0.019,
ENSG00000083896,YTHDC1,415908.587,0.167,
ENSG00000167515,TRAPPC2L,153930.19,-0.325,
ENSG00000060762,MPC1,148882.597,-0.422,
ENSG00000105672,ETV2,36113.09,0.671,
ENSG00000119616,FCF1,26480.578,-0.282,
ENSG00000010219,DYRK4,22662.972,0.434,
ENSG00000136436,CALCOCO2,21747.492,0.583,
ENSG00000170835,CEL,11764.36,0.616,
ENSG00000173567,ADGRF3,6624.855,0.464,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,607474.844,0.019
ENSG00000083896,YTHDC1,415908.587,0.167
ENSG00000167515,TRAPPC2L,153930.19,-0.325
ENSG00000060762,MPC1,148882.597,-0.422
ENSG00000105672,ETV2,36113.09,0.671
ENSG00000119616,FCF1,26480.578,-0.282
ENSG00000010219,DYRK4,22662.972,0.434
ENSG00000136436,CALCOCO2,21747.492,0.583
ENSG00000170835,CEL,11764.36,0.616
ENSG00000173567,ADGRF3,6624.855,0.464

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.936,45.796,2.929,"[Equus_caballus, Suricata_suricatta]"
1,0.957,26.323,2.395,"[Tupaia_belangeri, Ovis_aries]"
2,0.827,118.561,4.648,"[Loxodonta_africana, Ictidomys_tridecemlineatus]"
3,0.665,139.923,7.756,"[Monodelphis_domestica, Rhinopithecus_bieti]"
4,0.938,37.26,4.032,"[Pan_paniscus, Aotus_nancymaae]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000060762,MPC1,167776.136,-0.333,
ENSG00000167515,TRAPPC2L,147125.371,-0.168,
ENSG00000105672,ETV2,42220.71,0.666,
ENSG00000119616,FCF1,25767.925,-0.246,
ENSG00000136436,CALCOCO2,23899.099,0.694,
ENSG00000106554,CHCHD3,8773.969,-0.415,
ENSG00000188763,FZD9,8711.924,0.046,
ENSG00000170835,CEL,7641.552,0.572,
ENSG00000142541,RPL13A,5083.707,-0.187,
ENSG00000134489,HRH4,3324.311,0.61,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000060762,MPC1,167776.136,-0.333
ENSG00000167515,TRAPPC2L,147125.371,-0.168
ENSG00000105672,ETV2,42220.71,0.666
ENSG00000119616,FCF1,25767.925,-0.246
ENSG00000136436,CALCOCO2,23899.099,0.694
ENSG00000106554,CHCHD3,8773.969,-0.415
ENSG00000188763,FZD9,8711.924,0.046
ENSG00000170835,CEL,7641.552,0.572
ENSG00000142541,RPL13A,5083.707,-0.187
ENSG00000134489,HRH4,3324.311,0.61

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.567,83.932,3.517,"[Aotus_nancymaae, Loxodonta_africana]"
1,-0.247,189.667,10.4,"[Monodelphis_domestica, Phascolarctos_cinereus]"
2,0.864,37.835,3.823,"[Pan_paniscus, Mesocricetus_auratus]"
3,0.8,45.842,3.09,"[Rhinopithecus_bieti, Equus_caballus]"
4,-10.533,1927.374,21.33,"[Homo_sapiens, Felis_catus]"


In [42]:
from yspecies.results import FeatureSummary
# Wrap the results of all repeats in a FeatureSummary for the cells below.
summary = FeatureSummary(stage_one_lifespan)
# Show only the first repeat's result.
stage_one_lifespan[0]

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,595468.075,0.023,
ENSG00000083896,YTHDC1,417638.774,0.173,
ENSG00000060762,MPC1,135515.969,-0.328,
ENSG00000167515,TRAPPC2L,127575.46,-0.318,
ENSG00000204498,NFKBIL1,50323.852,-0.276,
ENSG00000010219,DYRK4,37149.404,0.2,
ENSG00000105672,ETV2,35001.381,0.676,
ENSG00000119616,FCF1,23337.054,-0.217,
ENSG00000129187,DCTD,17696.302,0.65,
ENSG00000188763,FZD9,17617.694,0.054,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,595468.075,0.023
ENSG00000083896,YTHDC1,417638.774,0.173
ENSG00000060762,MPC1,135515.969,-0.328
ENSG00000167515,TRAPPC2L,127575.46,-0.318
ENSG00000204498,NFKBIL1,50323.852,-0.276
ENSG00000010219,DYRK4,37149.404,0.2
ENSG00000105672,ETV2,35001.381,0.676
ENSG00000119616,FCF1,23337.054,-0.217
ENSG00000129187,DCTD,17696.302,0.65
ENSG00000188763,FZD9,17617.694,0.054

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.939,34.435,3.889,"[Cavia_porcellus, Callithrix_jacchus]"
1,0.908,59.338,5.076,"[Meriones_unguiculatus, Pan_paniscus]"
2,0.955,33.604,3.499,"[Heterocephalus_glaber, Mus_spicilegus]"
3,0.936,26.262,3.969,"[Rattus_norvegicus, Mesocricetus_auratus]"
4,0.913,50.24,4.047,"[Tupaia_belangeri, Rhinolophus_ferrumequinum]"


In [15]:
# Gene symbols of the selected features, ordered by the "kendall_tau_0" column, highest first.
# NOTE(review): this cell's execution count (In [15]) is out of sequence with the cells above, and
# "kendall_tau_0" does not appear among the columns of the per-repeat tables shown earlier —
# verify that this still runs after Restart Kernel -> Run All.
summary.selected.sort_values("kendall_tau_0", ascending=False)["symbol"].values

array(['CALCOCO2', 'MPG', 'NOXA1', 'ETV2', 'CEL', 'ARMC12', 'HRH4',
       'DCTD', 'KCNMB3', 'ADGRF3', 'TIMP1', 'TSPAN10', 'STAG3', 'GUCY2C',
       'DNAJB6', 'LIN7B', 'TRAPPC2L', 'MPC1', 'CTBS', 'CA3', 'NDUFA6',
       'LBP', 'PUSL1', 'MPC2', 'SNRPN', 'ADPRM', 'DPP9', 'WDR18'],
      dtype=object)

In [73]:
# Evaluates to "gain" when any column of the first result's `selected` frame has "gain" in its name, else "shap".
"gain" if any("gain" in column for column in summary.results[0].selected.columns) else "shap"

'gain'