# SHAP feature selection #
## Code to select features with a combination of LightGBM and SHAP ##

## Parameters cell ##

Parameters are overridden by papermill when run inside DVC stages



In [1]:
number_of_folds = 5  # fold count handed to PartitionParameters for every partitioning run
repeats = 10  # repetition count handed to Repeat for the partition + SHAP pipeline
# LightGBM hyper-parameters from the first round of optimization.
lgb_params = dict(
    bagging_fraction=0.9522534844058304,
    boosting_type="dart",
    objective="regression",
    feature_fraction=0.42236910941558053,
    lambda_l1=0.020847266580277746,
    lambda_l2=2.8448564854773326,
    learning_rate=0.11484015430016059,
    max_depth=3,
    max_leaves=35,
    min_data_in_leaf=9,
)
debug_local = True  # prefer a local checkout of yspecies when one exists

In [2]:
from pathlib import Path
import sys
import inspect

local = (Path("..") / "yspecies").resolve()
if debug_local and local.exists():
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '/data/sources/yspecies/notebooks', '/opt/miniconda3/envs/yspecies/lib/python38.zip', '/opt/miniconda3/envs/yspecies/lib/python3.8', '/opt/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/antonkulaga/.ipython']


In [3]:
from dataclasses import dataclass, replace
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from typing import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.workflow import TupleWith, Repeat
from yspecies.config import *
from yspecies.preprocess import FeatureSelection, DataExtractor
from yspecies.partition import DataPartitioner, PartitionParameters
from yspecies.models import Metrics
from yspecies.selection import ShapSelector

In [5]:
#settings
# Lift pandas' column and row display limits so displayed frames are never truncated.
# NOTE(review): with max_rows=None any displayed frame is printed in full; prefer .head() on large frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Render every float with three decimal places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import pprint
pp = pprint.PrettyPrinter(indent=4)

#charts settings
# NOTE(review): pyplot is already imported in an earlier cell; this repeat is harmless.
import matplotlib.pyplot as plt
# NOTE(review): set_matplotlib_formats is deprecated in newer IPython releases in favour of
# matplotlib_inline.backend_inline.set_matplotlib_formats — confirm against the pinned IPython version.
from IPython.display import set_matplotlib_formats
# NOTE(review): in the recorded run `auto` selected the TkAgg GUI backend; a headless
# papermill/DVC run may need a non-GUI backend instead — confirm.
%matplotlib auto
# Turn matplotlib's interactive mode off.
plt.ioff()
set_matplotlib_formats('svg')

Using matplotlib backend: TkAgg


### Loading data ###
Let's load the species/genes/expressions data selected by the select_samples.py notebook

In [6]:
from pathlib import Path
# Base path for Locations: the current directory when it contains ./data,
# otherwise the parent directory.
base_path = "./" if Path("./data").exists() else "../"
locations: Locations = Locations(base_path)

In [7]:
# Build the expression dataset from the folder referenced by locations.interim.selected.
data = ExpressionDataset.from_folder(locations.interim.selected)
# Last expression of the cell: display the loaded dataset.
data

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(445, 12340)",12340,39,445,"(12340, 2)","(40, 19)"


## Setting up SHAP selection pipeline ##

Deciding on selection parameters (which fields to include, exclude, predict)

In [8]:
# Partitioning settings fed into the selection pipeline. Arguments are positional:
#   number_of_folds -> fold count from the parameters cell
#   0, 2, []        -> NOTE(review): meaning is not visible in this notebook; check the
#                      PartitionParameters field order and consider keyword arguments
#   42              -> presumably the seed (the Repeat callback later overrides a `seed`
#                      field via dataclasses.replace) — TODO confirm
partition_params = PartitionParameters(number_of_folds, 0, 2, [],  42)


In [9]:
# Steps of a single pass: DataPartitioner, then TupleWith(lgb_params)
# (presumably pairs the partitioned data with the LightGBM parameters — confirm),
# then ShapSelector.
partition_shap_steps = [
    ("partitioner", DataPartitioner()),
    ("prepare_for_partitioning", TupleWith(lgb_params)),
    ("shap_computation", ShapSelector()),
]
partition_shap_pipe = Pipeline(partition_shap_steps)

# Wrap the pass in Repeat with `repeats` as the count. The callback takes (x, i)
# and returns x[0] unchanged alongside a copy of x[1] whose `seed` field is set to i
# (i is presumably the repeat index — confirm against Repeat).
repeated_cv = Repeat(
    partition_shap_pipe,
    repeats,
    lambda x, i: (x[0], replace(x[1], seed=i)),
)

In [10]:
# End-to-end selection workflow: extractor -> TupleWith(partition_params) -> repeated
# partition + SHAP pass.
selection_steps = [
    ("extractor", DataExtractor()),  # to extract the data required for ML from the dataset
    ("prepare_for_partitioning", TupleWith(partition_params)),
    ("partition_shap", repeated_cv),
]
selection_pipeline = Pipeline(selection_steps)

## Setting up features to select ##

In [11]:
# Base feature selection; bound to both `selection` and `select_lifespan`
# because its prediction target is "lifespan".
selection = select_lifespan = FeatureSelection(
    samples = ["tissue","species"], # sample metadata fields to include
    species =  [], # species metadata other than the Y label to include
    exclude_from_training = ["species"],  # fields excluded from LightGBM training
    to_predict = "lifespan", # column to predict
    categorical = ["tissue"]) # NOTE(review): presumably fields treated as categorical features — confirm

In [12]:
# One FeatureSelection per target; each copy made with replace() differs from
# `selection` only in `to_predict`.
select_lifespan = selection  # same object as `selection` (target: "lifespan")
select_mass = replace(selection, to_predict = "mass_g")
select_gestation = replace(selection, to_predict = "gestation")
select_mtgc = replace(selection, to_predict = "mtgc")

# First stage selection #

In [13]:
# Run the full selection pipeline on (dataset, lifespan feature selection).
# NOTE(review): the recorded output shows ten groups of five validation scores plus one
# set of tables per group, so this cell is expensive and prints a lot; consider caching
# the result and trimming the output.
stage_one_lifespan = selection_pipeline.fit_transform((data, select_lifespan))
# Last expression of the cell: display the pipeline result.
stage_one_lifespan

Early stopping is not available in dart mode


[200]	valid_0's l2: 38.7282
[200]	valid_0's l2: 14.7218
[200]	valid_0's l2: 2070.36
[200]	valid_0's l2: 38.694
[200]	valid_0's l2: 123.122


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Early stopping is not available in dart mode


[200]	valid_0's l2: 133.245
[200]	valid_0's l2: 24.8638
[200]	valid_0's l2: 43.6909
[200]	valid_0's l2: 17.2242
[200]	valid_0's l2: 30.18


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Early stopping is not available in dart mode


[200]	valid_0's l2: 74.5903
[200]	valid_0's l2: 40.6323
[200]	valid_0's l2: 91.9006
[200]	valid_0's l2: 19.6218
[200]	valid_0's l2: 28.4661


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Early stopping is not available in dart mode


[200]	valid_0's l2: 34.5766
[200]	valid_0's l2: 24.5847
[200]	valid_0's l2: 346.908
[200]	valid_0's l2: 68.882
[200]	valid_0's l2: 44.7918


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Early stopping is not available in dart mode


[200]	valid_0's l2: 39.3201
[200]	valid_0's l2: 33.3467
[200]	valid_0's l2: 24.3971
[200]	valid_0's l2: 17.3194
[200]	valid_0's l2: 26.1053


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Early stopping is not available in dart mode


[200]	valid_0's l2: 98.8111
[200]	valid_0's l2: 105.233
[200]	valid_0's l2: 112.67
[200]	valid_0's l2: 50.5708
[200]	valid_0's l2: 27.4933


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Early stopping is not available in dart mode


[200]	valid_0's l2: 30.1187
[200]	valid_0's l2: 125.535
[200]	valid_0's l2: 27.0113
[200]	valid_0's l2: 160.354
[200]	valid_0's l2: 106.749


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Early stopping is not available in dart mode


[200]	valid_0's l2: 55.5938
[200]	valid_0's l2: 25.682
[200]	valid_0's l2: 1607.2
[200]	valid_0's l2: 29.9963
[200]	valid_0's l2: 113.973


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Early stopping is not available in dart mode


[200]	valid_0's l2: 22.1786
[200]	valid_0's l2: 59.7035
[200]	valid_0's l2: 38.8822
[200]	valid_0's l2: 79.7684
[200]	valid_0's l2: 156.484


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Early stopping is not available in dart mode


[200]	valid_0's l2: 34.2988
[200]	valid_0's l2: 2058.34
[200]	valid_0's l2: 102.973
[200]	valid_0's l2: 27.8676
[200]	valid_0's l2: 22.3528


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000167515,TRAPPC2L,124386.632,-0.237,
ENSG00000060762,MPC1,115967.76,-0.443,
ENSG00000066923,STAG3,42371.166,0.49,
ENSG00000136436,CALCOCO2,30505.962,0.762,
ENSG00000105672,ETV2,30083.124,0.711,
ENSG00000188763,FZD9,25725.001,-0.209,
ENSG00000119616,FCF1,25614.721,-0.225,
ENSG00000188747,NOXA1,20298.523,0.74,
ENSG00000129988,LBP,18346.884,-0.567,
ENSG00000170835,CEL,15777.882,0.612,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000167515,TRAPPC2L,124386.632,-0.237
ENSG00000060762,MPC1,115967.76,-0.443
ENSG00000066923,STAG3,42371.166,0.49
ENSG00000136436,CALCOCO2,30505.962,0.762
ENSG00000105672,ETV2,30083.124,0.711
ENSG00000188763,FZD9,25725.001,-0.209
ENSG00000119616,FCF1,25614.721,-0.225
ENSG00000188747,NOXA1,20298.523,0.74
ENSG00000129988,LBP,18346.884,-0.567
ENSG00000170835,CEL,15777.882,0.612

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.804,38.728,4.293,"[Suricata_suricatta, Heterocephalus_glaber]"
1,0.93,14.722,2.993,"[Canis_lupus_familiaris, Microcebus_murinus]"
2,-10.989,2070.36,23.301,"[Bos_taurus, Homo_sapiens]"
3,0.791,38.694,4.721,"[Mus_caroli, Mus_musculus]"
4,0.275,123.122,7.866,"[Monodelphis_domestica, Vombatus_ursinus]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,543844.983,0.019,
ENSG00000083896,YTHDC1,310155.474,0.162,
ENSG00000010219,DYRK4,139177.404,0.356,
ENSG00000204590,GNL1,137952.967,-0.21,
ENSG00000167515,TRAPPC2L,110730.029,-0.226,
ENSG00000060762,MPC1,110126.6,-0.42,
ENSG00000204498,NFKBIL1,72786.676,-0.59,
ENSG00000066923,STAG3,66056.117,0.533,
ENSG00000136436,CALCOCO2,37123.411,0.745,
ENSG00000102265,TIMP1,30098.408,0.531,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,543844.983,0.019
ENSG00000083896,YTHDC1,310155.474,0.162
ENSG00000010219,DYRK4,139177.404,0.356
ENSG00000204590,GNL1,137952.967,-0.21
ENSG00000167515,TRAPPC2L,110730.029,-0.226
ENSG00000060762,MPC1,110126.6,-0.42
ENSG00000204498,NFKBIL1,72786.676,-0.59
ENSG00000066923,STAG3,66056.117,0.533
ENSG00000136436,CALCOCO2,37123.411,0.745
ENSG00000102265,TIMP1,30098.408,0.531

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.72,133.245,8.143,"[Macaca_mulatta, Monodelphis_domestica]"
1,0.958,24.864,3.374,"[Sus_scrofa, Ailuropoda_melanoleuca]"
2,0.934,43.691,4.149,"[Mus_spicilegus, Tupaia_belangeri]"
3,0.967,17.224,2.729,"[Ovis_aries, Canis_lupus_familiaris]"
4,0.94,30.18,3.735,"[Otolemur_garnettii, Heterocephalus_glaber]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,533644.07,-0.002,
ENSG00000083896,YTHDC1,306895.246,0.187,
ENSG00000010219,DYRK4,152225.425,0.288,
ENSG00000204590,GNL1,135597.664,-0.021,
ENSG00000060762,MPC1,116085.003,-0.401,
ENSG00000167515,TRAPPC2L,113108.965,-0.149,
ENSG00000204498,NFKBIL1,75163.163,-0.453,
ENSG00000066923,STAG3,67962.652,0.473,
ENSG00000102265,TIMP1,29798.386,0.54,
ENSG00000164879,CA3,26439.911,-0.529,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,533644.07,-0.002
ENSG00000083896,YTHDC1,306895.246,0.187
ENSG00000010219,DYRK4,152225.425,0.288
ENSG00000204590,GNL1,135597.664,-0.021
ENSG00000060762,MPC1,116085.003,-0.401
ENSG00000167515,TRAPPC2L,113108.965,-0.149
ENSG00000204498,NFKBIL1,75163.163,-0.453
ENSG00000066923,STAG3,67962.652,0.473
ENSG00000102265,TIMP1,29798.386,0.54
ENSG00000164879,CA3,26439.911,-0.529

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.883,74.59,4.819,"[Suricata_suricatta, Pan_troglodytes]"
1,0.93,40.632,4.373,"[Heterocephalus_glaber, Ursus_americanus]"
2,0.808,91.901,5.3,"[Mus_musculus, Tursiops_truncatus]"
3,0.957,19.622,3.444,"[Mus_spicilegus, Cavia_porcellus]"
4,0.942,28.466,3.42,"[Macaca_fascicularis, Capra_hircus]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,511275.664,0.032,
ENSG00000083896,YTHDC1,272817.102,0.126,
ENSG00000010219,DYRK4,140830.256,0.292,
ENSG00000204590,GNL1,126310.221,-0.029,
ENSG00000167515,TRAPPC2L,122720.638,-0.241,
ENSG00000060762,MPC1,96165.813,-0.417,
ENSG00000148248,SURF4,79256.351,0.057,
ENSG00000066923,STAG3,74575.2,0.499,
ENSG00000136436,CALCOCO2,34624.129,0.771,
ENSG00000184983,NDUFA6,32640.165,-0.557,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,511275.664,0.032
ENSG00000083896,YTHDC1,272817.102,0.126
ENSG00000010219,DYRK4,140830.256,0.292
ENSG00000204590,GNL1,126310.221,-0.029
ENSG00000167515,TRAPPC2L,122720.638,-0.241
ENSG00000060762,MPC1,96165.813,-0.417
ENSG00000148248,SURF4,79256.351,0.057
ENSG00000066923,STAG3,74575.2,0.499
ENSG00000136436,CALCOCO2,34624.129,0.771
ENSG00000184983,NDUFA6,32640.165,-0.557

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.948,34.577,4.232,"[Vombatus_ursinus, Rhinolophus_ferrumequinum]"
1,0.95,24.585,3.714,"[Mus_musculus, Microcebus_murinus]"
2,0.367,346.908,12.091,"[Oryctolagus_cuniculus, Monodelphis_domestica]"
3,0.871,68.882,4.68,"[Pan_troglodytes, Callithrix_jacchus]"
4,0.909,44.792,4.466,"[Ictidomys_tridecemlineatus, Heterocephalus_gl..."

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,532689.119,0.031,
ENSG00000083896,YTHDC1,310106.309,0.126,
ENSG00000204590,GNL1,142136.469,-0.072,
ENSG00000167515,TRAPPC2L,131751.111,-0.101,
ENSG00000010219,DYRK4,120057.1,0.269,
ENSG00000204498,NFKBIL1,119760.068,-0.57,
ENSG00000060762,MPC1,115315.385,-0.378,
ENSG00000066923,STAG3,68787.631,0.451,
ENSG00000136436,CALCOCO2,30224.794,0.739,
ENSG00000119616,FCF1,29475.599,-0.36,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,532689.119,0.031
ENSG00000083896,YTHDC1,310106.309,0.126
ENSG00000204590,GNL1,142136.469,-0.072
ENSG00000167515,TRAPPC2L,131751.111,-0.101
ENSG00000010219,DYRK4,120057.1,0.269
ENSG00000204498,NFKBIL1,119760.068,-0.57
ENSG00000060762,MPC1,115315.385,-0.378
ENSG00000066923,STAG3,68787.631,0.451
ENSG00000136436,CALCOCO2,30224.794,0.739
ENSG00000119616,FCF1,29475.599,-0.36

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.939,39.32,3.726,"[Equus_caballus, Macaca_nemestrina]"
1,0.941,33.347,3.641,"[Ursus_americanus, Ailuropoda_melanoleuca]"
2,0.961,24.397,3.104,"[Pan_paniscus, Macaca_fascicularis]"
3,0.963,17.319,2.922,"[Capra_hircus, Ovis_aries]"
4,0.945,26.105,3.262,"[Mus_spicilegus, Aotus_nancymaae]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,522949.195,0.005,
ENSG00000083896,YTHDC1,308484.19,0.139,
ENSG00000010219,DYRK4,154247.308,0.28,
ENSG00000204590,GNL1,138195.513,-0.059,
ENSG00000167515,TRAPPC2L,117338.183,-0.191,
ENSG00000060762,MPC1,110150.556,-0.457,
ENSG00000204498,NFKBIL1,67965.662,-0.524,
ENSG00000066923,STAG3,67625.253,0.48,
ENSG00000164879,CA3,33381.961,-0.465,
ENSG00000136436,CALCOCO2,28981.571,0.785,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,522949.195,0.005
ENSG00000083896,YTHDC1,308484.19,0.139
ENSG00000010219,DYRK4,154247.308,0.28
ENSG00000204590,GNL1,138195.513,-0.059
ENSG00000167515,TRAPPC2L,117338.183,-0.191
ENSG00000060762,MPC1,110150.556,-0.457
ENSG00000204498,NFKBIL1,67965.662,-0.524
ENSG00000066923,STAG3,67625.253,0.48
ENSG00000164879,CA3,33381.961,-0.465
ENSG00000136436,CALCOCO2,28981.571,0.785

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.816,98.811,6.939,"[Mesocricetus_auratus, Otolemur_garnettii]"
1,0.721,105.233,7.624,"[Monodelphis_domestica, Rattus_norvegicus]"
2,0.813,112.67,6.742,"[Pan_troglodytes, Macaca_mulatta]"
3,0.908,50.571,4.741,"[Tupaia_belangeri, Capra_hircus]"
4,0.945,27.493,3.816,"[Mus_caroli, Macaca_nemestrina]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,469750.933,-0.002,
ENSG00000083896,YTHDC1,258014.886,0.106,
ENSG00000167515,TRAPPC2L,124314.983,-0.162,
ENSG00000204590,GNL1,117500.164,-0.003,
ENSG00000060762,MPC1,107287.05,-0.371,
ENSG00000010219,DYRK4,96229.848,0.26,
ENSG00000226979,LTA,92613.415,-0.024,
ENSG00000148248,SURF4,91383.441,0.159,
ENSG00000066923,STAG3,85055.878,0.52,
ENSG00000204498,NFKBIL1,68932.275,-0.464,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,469750.933,-0.002
ENSG00000083896,YTHDC1,258014.886,0.106
ENSG00000167515,TRAPPC2L,124314.983,-0.162
ENSG00000204590,GNL1,117500.164,-0.003
ENSG00000060762,MPC1,107287.05,-0.371
ENSG00000010219,DYRK4,96229.848,0.26
ENSG00000226979,LTA,92613.415,-0.024
ENSG00000148248,SURF4,91383.441,0.159
ENSG00000066923,STAG3,85055.878,0.52
ENSG00000204498,NFKBIL1,68932.275,-0.464

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.94,30.119,3.972,"[Mus_musculus, Rhinolophus_ferrumequinum]"
1,0.791,125.535,6.827,"[Gorilla_gorilla, Phascolarctos_cinereus]"
2,0.951,27.011,3.581,"[Suricata_suricatta, Felis_catus]"
3,0.699,160.354,6.784,"[Oryctolagus_cuniculus, Pan_troglodytes]"
4,0.776,106.749,5.814,"[Cavia_aperea, Aotus_nancymaae]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000060762,MPC1,124403.088,-0.405,
ENSG00000167515,TRAPPC2L,114621.437,-0.182,
ENSG00000066923,STAG3,76442.448,0.52,
ENSG00000136436,CALCOCO2,34632.149,0.777,
ENSG00000105672,ETV2,27325.753,0.681,
ENSG00000164879,CA3,24767.605,-0.567,
ENSG00000102265,TIMP1,17828.368,0.534,
ENSG00000188747,NOXA1,17235.599,0.701,
ENSG00000129988,LBP,16075.711,-0.559,
ENSG00000065268,WDR18,13327.01,-0.772,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000060762,MPC1,124403.088,-0.405
ENSG00000167515,TRAPPC2L,114621.437,-0.182
ENSG00000066923,STAG3,76442.448,0.52
ENSG00000136436,CALCOCO2,34632.149,0.777
ENSG00000105672,ETV2,27325.753,0.681
ENSG00000164879,CA3,24767.605,-0.567
ENSG00000102265,TIMP1,17828.368,0.534
ENSG00000188747,NOXA1,17235.599,0.701
ENSG00000129988,LBP,16075.711,-0.559
ENSG00000065268,WDR18,13327.01,-0.772

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.704,55.594,4.69,"[Ailuropoda_melanoleuca, Equus_caballus]"
1,0.865,25.682,3.425,"[Rhinopithecus_bieti, Felis_catus]"
2,-10.734,1607.197,22.104,"[Homo_sapiens, Monodelphis_domestica]"
3,0.829,29.996,3.847,"[Mus_musculus, Rhinolophus_ferrumequinum]"
4,0.434,113.973,4.71,"[Loxodonta_africana, Mus_spicilegus]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,477886.317,0.014,
ENSG00000083896,YTHDC1,273826.787,0.183,
ENSG00000010219,DYRK4,169544.86,0.3,
ENSG00000204590,GNL1,123811.017,-0.07,
ENSG00000167515,TRAPPC2L,114987.391,-0.119,
ENSG00000060762,MPC1,99343.435,-0.444,
ENSG00000066923,STAG3,86972.494,0.506,
ENSG00000226979,LTA,85623.637,0.084,
ENSG00000148248,SURF4,70749.258,0.04,
ENSG00000136436,CALCOCO2,32167.116,0.723,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,477886.317,0.014
ENSG00000083896,YTHDC1,273826.787,0.183
ENSG00000010219,DYRK4,169544.86,0.3
ENSG00000204590,GNL1,123811.017,-0.07
ENSG00000167515,TRAPPC2L,114987.391,-0.119
ENSG00000060762,MPC1,99343.435,-0.444
ENSG00000066923,STAG3,86972.494,0.506
ENSG00000226979,LTA,85623.637,0.084
ENSG00000148248,SURF4,70749.258,0.04
ENSG00000136436,CALCOCO2,32167.116,0.723

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.955,22.179,3.494,"[Bos_taurus, Cavia_porcellus]"
1,0.907,59.703,4.981,"[Vombatus_ursinus, Phascolarctos_cinereus]"
2,0.941,38.882,4.289,"[Tupaia_belangeri, Pan_paniscus]"
3,0.813,79.768,5.62,"[Mus_musculus, Oryctolagus_cuniculus]"
4,0.651,156.484,7.263,"[Gorilla_gorilla, Macaca_nemestrina]"

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000060762,MPC1,121107.049,-0.426,
ENSG00000167515,TRAPPC2L,115273.899,-0.139,
ENSG00000066923,STAG3,44782.534,0.461,
ENSG00000105672,ETV2,28348.435,0.674,
ENSG00000184983,NDUFA6,25526.28,-0.558,
ENSG00000188747,NOXA1,25030.04,0.697,
ENSG00000136436,CALCOCO2,22596.583,0.809,
ENSG00000164879,CA3,15563.706,-0.51,
ENSG00000170835,CEL,13664.749,0.684,
ENSG00000105993,DNAJB6,12437.615,0.34,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000060762,MPC1,121107.049,-0.426
ENSG00000167515,TRAPPC2L,115273.899,-0.139
ENSG00000066923,STAG3,44782.534,0.461
ENSG00000105672,ETV2,28348.435,0.674
ENSG00000184983,NDUFA6,25526.28,-0.558
ENSG00000188747,NOXA1,25030.04,0.697
ENSG00000136436,CALCOCO2,22596.583,0.809
ENSG00000164879,CA3,15563.706,-0.51
ENSG00000170835,CEL,13664.749,0.684
ENSG00000105993,DNAJB6,12437.615,0.34

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.743,34.299,4.783,"[Rattus_norvegicus, Canis_lupus_familiaris]"
1,-10.874,2058.335,23.385,"[Homo_sapiens, Otolemur_garnettii]"
2,0.305,102.973,6.026,"[Loxodonta_africana, Mus_musculus]"
3,0.849,27.868,3.833,"[Macaca_nemestrina, Felis_catus]"
4,0.871,22.353,3.465,"[Macaca_mulatta, Microcebus_murinus]"


In [75]:
# NOTE(review): this import sits mid-notebook instead of with the other yspecies imports,
# and this cell's execution count (75) is out of sequence with its neighbours —
# re-check the notebook with Restart Kernel -> Run All.
from yspecies.results import FeatureSummary
# Wrap the stage-one results in a FeatureSummary for the cells below.
summary = FeatureSummary(stage_one_lifespan)
# Last expression of the cell: display the first element of the stage-one results.
stage_one_lifespan[0]

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan,Unnamed: 4_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,validation_species
ENSG00000276293,PIP4K2B,478450.068,0.048,
ENSG00000083896,YTHDC1,263331.356,0.079,
ENSG00000010219,DYRK4,188286.492,0.246,
ENSG00000167515,TRAPPC2L,126939.943,-0.233,
ENSG00000060762,MPC1,121102.08,-0.438,
ENSG00000204590,GNL1,111555.248,-0.032,
ENSG00000066923,STAG3,91786.359,0.489,
ENSG00000204498,NFKBIL1,91750.48,-0.311,
ENSG00000148248,SURF4,84496.348,-0.037,
ENSG00000136436,CALCOCO2,29367.246,0.757,

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000276293,PIP4K2B,478450.068,0.048
ENSG00000083896,YTHDC1,263331.356,0.079
ENSG00000010219,DYRK4,188286.492,0.246
ENSG00000167515,TRAPPC2L,126939.943,-0.233
ENSG00000060762,MPC1,121102.08,-0.438
ENSG00000204590,GNL1,111555.248,-0.032
ENSG00000066923,STAG3,91786.359,0.489
ENSG00000204498,NFKBIL1,91750.48,-0.311
ENSG00000148248,SURF4,84496.348,-0.037
ENSG00000136436,CALCOCO2,29367.246,0.757

Unnamed: 0,R^2,MSE,MAE,validation_species
0,0.939,30.702,4.064,"[Mus_musculus, Ursus_americanus]"
1,0.932,35.558,3.992,"[Heterocephalus_glaber, Sus_scrofa]"
2,0.819,101.797,5.663,"[Meriones_unguiculatus, Tursiops_truncatus]"
3,0.677,178.658,7.174,"[Pan_troglodytes, Oryctolagus_cuniculus]"
4,0.914,45.934,3.744,"[Aotus_nancymaae, Equus_caballus]"


In [15]:
# Selected features sorted by the "kendall_tau_0" column in descending order;
# the cell displays their "symbol" values as an array.
ranked_by_tau = summary.selected.sort_values("kendall_tau_0", ascending=False)
ranked_by_tau["symbol"].values

array(['CALCOCO2', 'MPG', 'NOXA1', 'ETV2', 'CEL', 'ARMC12', 'HRH4',
       'DCTD', 'KCNMB3', 'ADGRF3', 'TIMP1', 'TSPAN10', 'STAG3', 'GUCY2C',
       'DNAJB6', 'LIN7B', 'TRAPPC2L', 'MPC1', 'CTBS', 'CA3', 'NDUFA6',
       'LBP', 'PUSL1', 'MPC2', 'SNRPN', 'ADPRM', 'DPP9', 'WDR18'],
      dtype=object)

In [73]:
# Evaluates to "gain" when at least one column name of the first result's `selected`
# table contains "gain", otherwise to "shap".
"gain" if any("gain" in c for c in summary.results[0].selected.columns) else "shap"

'gain'