# SHAP feature selection #
## Code to select features with a combination of LightGBM and SHAP ##

## Parameters cell ##

Parameters are overridden by papermill when the notebook is run inside DVC stages.



In [1]:
# Partitioning and repetition settings; all three are handed to PipelineFactory below.
# NOTE(review): the previous comment described n_folds as the number of bootstraps - confirm.
n_folds = 5
n_hold_out = 1
repeats = 10

# Importance type forwarded to FeatureSelection.
importance_type = "split"

# Traits to process: one selection and one pipeline is set up per entry.
life_history = [
    "lifespan",
    "mass_kg",
    "mtGC",
    "metabolic_rate",
    "temperature",
    "gestation_days",
]

# When True, a local ../yspecies checkout (if it exists) is put first on sys.path.
debug_local = True

In [2]:
from pathlib import Path
import sys
import inspect

#lgb_params["importance_type"] = importance_type

# Absolute path of the sibling ``yspecies`` directory (one level above the
# current working directory).
local = (Path("..") / "yspecies").resolve()
# Only when debugging locally AND that directory is actually present:
if debug_local and local.exists():
  # Prepend the parent folder so ``import yspecies`` picks up the local sources first.
  # NOTE(review): the inserted entry is the relative string "..", so it is
  # resolved against the working directory at import time.
  sys.path.insert(0, Path("..").as_posix())
  # Alternative that was tried: inserting the package directory itself.
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  # IPython autoreload, mode 2: re-import modules before running each cell so
  # edits to the local sources are picked up without restarting the kernel.
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '/data/sources/yspecies/notebooks', '/opt/miniconda3/envs/yspecies/lib/python38.zip', '/opt/miniconda3/envs/yspecies/lib/python3.8', '/opt/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/antonkulaga/.ipython']


In [3]:
from dataclasses import dataclass, replace
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from typing import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.workflow import TupleWith, Repeat, Collect
from yspecies.config import *
from yspecies.preprocess import FeatureSelection, DataExtractor
from yspecies.partition import DataPartitioner, PartitionParameters
from yspecies.models import Metrics
from yspecies.selection import ShapSelector
from yspecies.results import FeatureSummary

NameError: name 'Nonee' is not defined

In [None]:
import optuna
from optuna import Study, Trial
from optuna import multi_objective
from optuna.multi_objective import trial
from optuna.multi_objective.study import MultiObjectiveStudy
from yspecies.tuning import MultiObjectiveResults

In [None]:
#settings
# Pandas display: None removes the column/row limits, so frames are shown in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Render floats with three decimal places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import pprint
# Shared pretty-printer with 4-space indentation.
pp = pprint.PrettyPrinter(indent=4)

#charts settings
import matplotlib.pyplot as plt
# NOTE(review): newer IPython releases deprecate IPython.display.set_matplotlib_formats
# in favour of matplotlib_inline.backend_inline - confirm against the pinned IPython version.
from IPython.display import set_matplotlib_formats
# Let IPython pick the matplotlib backend automatically.
%matplotlib auto
# Turn interactive mode off so figures are not drawn as a side effect of each plotting call.
plt.ioff()
# Ask for SVG figure output.
set_matplotlib_formats('svg')

### Loading data ###
Let's load the data for the species/genes/expressions selected by the select_samples.py notebook.

In [5]:
from pathlib import Path
# Use the current directory as the project root when it contains ./data,
# otherwise fall back to the parent directory.
if Path("./data").exists():
    locations: Locations = Locations("./")
else:
    locations = Locations("../")

In [6]:
from loguru import logger
# Add a file sink for this stage; a new log file is started every day at noon.
logger.add(
    locations.logs / "stage_two_shap_selection.log",
    rotation="12:00",
)

1

In [7]:
# Working folders for stage two: <interim>/stage_2 and <interim>/stage_2/partitions.
stage_2_folder = locations.interim.dir / "stage_2"
partitions_folder = stage_2_folder / "partitions"

# parents=True creates missing ancestors as well, so this cell does not fail
# with FileNotFoundError when the interim directory has not been created yet;
# exist_ok=True keeps re-runs idempotent.
stage_2_folder.mkdir(parents=True, exist_ok=True)
partitions_folder.mkdir(parents=True, exist_ok=True)

In [8]:
# Read the stage-two input dataset from <interim>/stage_2/input and display its summary.
data = ExpressionDataset.from_folder(
    locations.interim.dir / "stage_2" / "input"
)
data

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(408, 142)","(142, 37)",38,408,"(142, 2)","(38, 18)"


## Setting up features to select ##

In [9]:
# Baseline feature-selection settings; the per-trait selections below are
# copies of this object with only ``to_predict`` replaced.
default_selection = FeatureSelection(
    samples=["tissue", "species"],       # samples metadata to include
    species=[],                          # species metadata other than the Y label to include
    exclude_from_training=["species"],   # fields excluded from LightGBM training
    to_predict="lifespan",               # column to predict
    categorical=["tissue"],
    select_by="shap",
    importance_type=importance_type,
    feature_perturbation="tree_path_dependent",
)
default_selection

Samples metadata,Species metadata,Genes,Predict label,not_validated species
"['tissue', 'species']",[],all,lifespan,[]


In [10]:
# One (dataset, selection) pair per life-history trait, in the order given by
# ``life_history``; each selection is the default one retargeted at that trait.
selections = OrderedDict(
    (trait, (data, replace(default_selection, to_predict=trait)))
    for trait in life_history
)
# Show the dataset half of the lifespan pair.
selections["lifespan"][0]

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(408, 142)","(142, 37)",38,408,"(142, 2)","(38, 18)"


### Setting up pipelines ###

In [19]:
from yspecies.helpers import PipelineFactory
# Single factory shared by every trait, configured from the parameters cell.
factory = PipelineFactory(locations, repeats, n_folds, n_hold_out)

In [20]:
def opt_by_trait_2(trait: str):
    """Return the path of the ``<trait>_2.sqlite`` file for ``trait``.

    The file is located under ``locations.interim.optimization``; nothing is
    created or checked on disk here.
    """
    study_file = f"{trait}_2.sqlite"
    return locations.interim.optimization / study_file

In [21]:
# Single-run SHAP pipelines, one per trait, each bound to that trait's study file.
print("setting up quick pipelines")
quick_pipelines = {
    trait: factory.make_shap_pipeline(
        study_path=opt_by_trait_2(trait),
        study_name=f"{trait}_r2_huber_kendall",
    )
    for trait in life_history
}

# Repeated variant of the same pipelines, built from the same studies.
print("setting up pipelines with repeats")
pipelines = {
    trait: factory.make_repeated_shap_pipeline(
        study_path=opt_by_trait_2(trait),
        study_name=f"{trait}_r2_huber_kendall",
    )
    for trait in life_history
}

pipelines.keys()

setting up quick pipelines
loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/lifespan_2.sqlite


create_study is experimental (supported from v1.4.0). The interface can change in the future.
NSGAIIMultiObjectiveSampler is experimental (supported from v1.5.0). The interface can change in the future.
RandomMultiObjectiveSampler is experimental (supported from v1.4.0). The interface can change in the future.
[I 2020-09-21 01:00:50,032] Using an existing study with name 'lifespan_r2_huber_kendall' instead of creating a new one.
MultiObjectiveStudy is experimental (supported from v1.4.0). The interface can change in the future.
FrozenMultiObjectiveTrial is experimental (supported from v1.4.0). The interface can change in the future.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/mass_kg_2.sqlite


[I 2020-09-21 01:00:51,190] Using an existing study with name 'mass_kg_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/mtGC_2.sqlite


[I 2020-09-21 01:00:51,211] Using an existing study with name 'mtGC_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/metabolic_rate_2.sqlite


[I 2020-09-21 01:00:51,230] Using an existing study with name 'metabolic_rate_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/temperature_2.sqlite


[I 2020-09-21 01:00:51,250] Using an existing study with name 'temperature_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/gestation_days_2.sqlite


[I 2020-09-21 01:00:51,270] Using an existing study with name 'gestation_days_r2_huber_kendall' instead of creating a new one.


setting up pipelines with repeats
loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/lifespan_2.sqlite


[I 2020-09-21 01:00:51,291] Using an existing study with name 'lifespan_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/mass_kg_2.sqlite


[I 2020-09-21 01:00:52,433] Using an existing study with name 'mass_kg_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/mtGC_2.sqlite


[I 2020-09-21 01:00:52,455] Using an existing study with name 'mtGC_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/metabolic_rate_2.sqlite


[I 2020-09-21 01:00:52,478] Using an existing study with name 'metabolic_rate_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/temperature_2.sqlite


[I 2020-09-21 01:00:52,501] Using an existing study with name 'temperature_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/gestation_days_2.sqlite


[I 2020-09-21 01:00:52,523] Using an existing study with name 'gestation_days_r2_huber_kendall' instead of creating a new one.


dict_keys(['lifespan', 'mass_kg', 'mtGC', 'metabolic_rate', 'temperature', 'gestation_days'])

# Run second stage #

In [22]:
# Stage-two results keyed by trait; filled in by run_stage_two.
stage_two = dict()

In [23]:
def run_stage_two(trait: str, quick: bool = False):
    """Run the stage-two pipeline for ``trait`` and remember its result.

    Args:
        trait: key into ``selections`` and into the pipeline dictionaries.
        quick: when True use ``quick_pipelines``, otherwise ``pipelines``.

    Returns:
        Whatever the chosen pipeline's ``fit_transform`` returns.

    Side effect:
        The result is also stored in the module-level ``stage_two``
        dictionary under ``trait``.
    """
    chosen = (quick_pipelines if quick else pipelines)[trait]
    stage_two[trait] = chosen.fit_transform(selections[trait])
    return stage_two[trait]

In [25]:
# Run the repeated stage-two pipeline for lifespan first, then persist the
# selected features. In the saved notebook the save came before the run, so
# a fresh Restart & Run All raised NameError on `lifespan_2`.
lifespan_2 = run_stage_two("lifespan", False)
lifespan_2.selected.to_csv(locations.output.stage_two / "lifespan.tsv", sep="\t", index_label="reference_gene")
lifespan_2.selected

2020-09-21 01:00:56.474 | INFO     | yspecies.selection:fit:149 - ===== fitting models with seed 0 =====
2020-09-21 01:00:56.475 | INFO     | yspecies.selection:fit:150 - PARAMETERS:
{'bagging_fraction': 0.757456996980591, 'boosting_type': 'gbdt', 'drop_rate': 0.15938150347995844, 'feature_fraction': 0.41345305892024875, 'lambda_l1': 2.9528782080363634, 'lambda_l2': 2.3536556238167003, 'learning_rate': 0.05373601286514567, 'max_depth': 3, 'max_leaves': 19, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'verbose': -1, 'early_stopping_round': 10}
2020-09-21 01:00:56.479 | INFO     | yspecies.selection:fit:153 - SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Homo_sapiens', 'Equus_caballus']
Found `early_stopping_round` in params. Will use it instead of argument


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[117]	valid_0's l1: 21.5888	valid_0's l2: 1953.78	valid_0's huber: 19.1044


2020-09-21 01:00:56.558 | INFO     | yspecies.selection:fit:153 - SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Ursus_americanus', 'Phascolarctos_cinereus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 3.26036	valid_0's l2: 38.7967	valid_0's huber: 2.62994
Did not meet early stopping. Best iteration is:
[250]	valid_0's l1: 3.26036	valid_0's l2: 38.7967	valid_0's huber: 2.62994


2020-09-21 01:00:56.670 | INFO     | yspecies.selection:fit:153 - SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Suricata_suricatta', 'Mus_spicilegus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 2.4822	valid_0's l2: 22.1101	valid_0's huber: 1.91506
Did not meet early stopping. Best iteration is:
[249]	valid_0's l1: 2.48207	valid_0's l2: 22.0934	valid_0's huber: 1.91468


2020-09-21 01:00:56.785 | INFO     | yspecies.selection:fit:153 - SEED: 0 | FOLD: 3 | VALIDATION_SPECIES: ['Aotus_nancymaae', 'Macaca_fascicularis']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[232]	valid_0's l1: 2.49162	valid_0's l2: 14.5611	valid_0's huber: 1.91483


2020-09-21 01:01:10.508 | INFO     | yspecies.explanations:transform:494 - Metrics: 
Metrics(R2=0.6752438223792208, MAE=7.455702362301174, MSE=507.3084105826874, huber=6.389815101524114)
2020-09-21 01:01:10.513 | INFO     | yspecies.selection:fit:149 - ===== fitting models with seed 1 =====
2020-09-21 01:01:10.513 | INFO     | yspecies.selection:fit:150 - PARAMETERS:
{'bagging_fraction': 0.757456996980591, 'boosting_type': 'gbdt', 'drop_rate': 0.15938150347995844, 'feature_fraction': 0.41345305892024875, 'lambda_l1': 2.9528782080363634, 'lambda_l2': 2.3536556238167003, 'learning_rate': 0.05373601286514567, 'max_depth': 3, 'max_leaves': 19, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'verbose': -1, 'early_stopping_round': 10, 'seed': 0}
2020-09-21 01:01:10.516 | INFO     | yspecies.selection:fit:153 - SEED: 1 | FOLD: 0 | VALIDATION_SPECIES: ['Mus_caroli', 'Vombatus_ursinus']
Found `early_stopping_round` in params. Will use it instead of argument


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[123]	valid_0's l1: 2.74609	valid_0's l2: 20.4348	valid_0's huber: 2.11214


2020-09-21 01:01:10.607 | INFO     | yspecies.selection:fit:153 - SEED: 1 | FOLD: 1 | VALIDATION_SPECIES: ['Ailuropoda_melanoleuca', 'Ursus_americanus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[132]	valid_0's l1: 3.4385	valid_0's l2: 39.9311	valid_0's huber: 2.76193


2020-09-21 01:01:10.697 | INFO     | yspecies.selection:fit:153 - SEED: 1 | FOLD: 2 | VALIDATION_SPECIES: ['Rhinolophus_ferrumequinum', 'Macaca_mulatta']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[173]	valid_0's l1: 2.89148	valid_0's l2: 24.6375	valid_0's huber: 2.25999


2020-09-21 01:01:10.789 | INFO     | yspecies.selection:fit:153 - SEED: 1 | FOLD: 3 | VALIDATION_SPECIES: ['Canis_lupus_familiaris', 'Homo_sapiens']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[202]	valid_0's l1: 18.8952	valid_0's l2: 1536.27	valid_0's huber: 16.6741


2020-09-21 01:01:22.795 | INFO     | yspecies.explanations:transform:494 - Metrics: 
Metrics(R2=0.7139923978383951, MAE=6.992828760716467, MSE=405.3193200606812, huber=5.950478004901392)
2020-09-21 01:01:22.801 | INFO     | yspecies.selection:fit:149 - ===== fitting models with seed 2 =====
2020-09-21 01:01:22.802 | INFO     | yspecies.selection:fit:150 - PARAMETERS:
{'bagging_fraction': 0.757456996980591, 'boosting_type': 'gbdt', 'drop_rate': 0.15938150347995844, 'feature_fraction': 0.41345305892024875, 'lambda_l1': 2.9528782080363634, 'lambda_l2': 2.3536556238167003, 'learning_rate': 0.05373601286514567, 'max_depth': 3, 'max_leaves': 19, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'verbose': -1, 'early_stopping_round': 10, 'seed': 1}
2020-09-21 01:01:22.806 | INFO     | yspecies.selection:fit:153 - SEED: 2 | FOLD: 0 | VALIDATION_SPECIES: ['Canis_lupus_familiaris', 'Ailuropoda_melanoleuca']
Found `early_stopping_round` in params. Will use it ins

Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[92]	valid_0's l1: 2.22157	valid_0's l2: 19.0285	valid_0's huber: 1.67311


2020-09-21 01:01:22.879 | INFO     | yspecies.selection:fit:153 - SEED: 2 | FOLD: 1 | VALIDATION_SPECIES: ['Bos_taurus', 'Callithrix_jacchus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 3.97443	valid_0's l2: 47.6262	valid_0's huber: 3.23459
Did not meet early stopping. Best iteration is:
[250]	valid_0's l1: 3.97443	valid_0's l2: 47.6262	valid_0's huber: 3.23459


2020-09-21 01:01:22.996 | INFO     | yspecies.selection:fit:153 - SEED: 2 | FOLD: 2 | VALIDATION_SPECIES: ['Rhinolophus_ferrumequinum', 'Macaca_nemestrina']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[174]	valid_0's l1: 3.99204	valid_0's l2: 43.1393	valid_0's huber: 3.24388


2020-09-21 01:01:23.112 | INFO     | yspecies.selection:fit:153 - SEED: 2 | FOLD: 3 | VALIDATION_SPECIES: ['Mesocricetus_auratus', 'Cavia_aperea']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 4.4965	valid_0's l2: 46.5168	valid_0's huber: 3.69979
Did not meet early stopping. Best iteration is:
[248]	valid_0's l1: 4.49453	valid_0's l2: 46.469	valid_0's huber: 3.69832


2020-09-21 01:01:34.545 | INFO     | yspecies.explanations:transform:494 - Metrics: 
Metrics(R2=0.9462962766896739, MAE=3.670643621349281, MSE=39.065735804557846, huber=2.963054656809254)
2020-09-21 01:01:34.550 | INFO     | yspecies.selection:fit:149 - ===== fitting models with seed 3 =====
2020-09-21 01:01:34.551 | INFO     | yspecies.selection:fit:150 - PARAMETERS:
{'bagging_fraction': 0.757456996980591, 'boosting_type': 'gbdt', 'drop_rate': 0.15938150347995844, 'feature_fraction': 0.41345305892024875, 'lambda_l1': 2.9528782080363634, 'lambda_l2': 2.3536556238167003, 'learning_rate': 0.05373601286514567, 'max_depth': 3, 'max_leaves': 19, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'verbose': -1, 'early_stopping_round': 10, 'seed': 2}
2020-09-21 01:01:34.554 | INFO     | yspecies.selection:fit:153 - SEED: 3 | FOLD: 0 | VALIDATION_SPECIES: ['Otolemur_garnettii', 'Equus_caballus']
Found `early_stopping_round` in params. Will use it instead of arg

Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[121]	valid_0's l1: 2.4796	valid_0's l2: 22.4618	valid_0's huber: 1.91821


2020-09-21 01:01:34.686 | INFO     | yspecies.selection:fit:153 - SEED: 3 | FOLD: 1 | VALIDATION_SPECIES: ['Ailuropoda_melanoleuca', 'Homo_sapiens']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[103]	valid_0's l1: 21.751	valid_0's l2: 1911.08	valid_0's huber: 19.2417


2020-09-21 01:01:34.783 | INFO     | yspecies.selection:fit:153 - SEED: 3 | FOLD: 2 | VALIDATION_SPECIES: ['Suricata_suricatta', 'Phascolarctos_cinereus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[181]	valid_0's l1: 2.78355	valid_0's l2: 18.3474	valid_0's huber: 2.15456


2020-09-21 01:01:34.896 | INFO     | yspecies.selection:fit:153 - SEED: 3 | FOLD: 3 | VALIDATION_SPECIES: ['Heterocephalus_glaber', 'Monodelphis_domestica']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 6.79453	valid_0's l2: 99.6633	valid_0's huber: 5.76087
Did not meet early stopping. Best iteration is:
[250]	valid_0's l1: 6.79453	valid_0's l2: 99.6633	valid_0's huber: 5.76087


2020-09-21 01:01:44.394 | INFO     | yspecies.explanations:transform:494 - Metrics: 
Metrics(R2=0.6257634368927816, MAE=8.452163222129077, MSE=512.8884833433789, huber=7.25954363463277)
2020-09-21 01:01:44.399 | INFO     | yspecies.selection:fit:149 - ===== fitting models with seed 4 =====
2020-09-21 01:01:44.400 | INFO     | yspecies.selection:fit:150 - PARAMETERS:
{'bagging_fraction': 0.757456996980591, 'boosting_type': 'gbdt', 'drop_rate': 0.15938150347995844, 'feature_fraction': 0.41345305892024875, 'lambda_l1': 2.9528782080363634, 'lambda_l2': 2.3536556238167003, 'learning_rate': 0.05373601286514567, 'max_depth': 3, 'max_leaves': 19, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'verbose': -1, 'early_stopping_round': 10, 'seed': 3}
2020-09-21 01:01:44.403 | INFO     | yspecies.selection:fit:153 - SEED: 4 | FOLD: 0 | VALIDATION_SPECIES: ['Macaca_mulatta', 'Gorilla_gorilla']
Found `early_stopping_round` in params. Will use it instead of argument

Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[221]	valid_0's l1: 5.71982	valid_0's l2: 116.928	valid_0's huber: 4.81536


2020-09-21 01:01:44.525 | INFO     | yspecies.selection:fit:153 - SEED: 4 | FOLD: 1 | VALIDATION_SPECIES: ['Aotus_nancymaae', 'Ailuropoda_melanoleuca']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[145]	valid_0's l1: 1.76314	valid_0's l2: 7.78293	valid_0's huber: 1.27083


2020-09-21 01:01:44.609 | INFO     | yspecies.selection:fit:153 - SEED: 4 | FOLD: 2 | VALIDATION_SPECIES: ['Ursus_americanus', 'Meriones_unguiculatus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[167]	valid_0's l1: 2.46817	valid_0's l2: 18.0957	valid_0's huber: 1.87437


2020-09-21 01:01:44.714 | INFO     | yspecies.selection:fit:153 - SEED: 4 | FOLD: 3 | VALIDATION_SPECIES: ['Mus_spicilegus', 'Equus_caballus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[131]	valid_0's l1: 2.79231	valid_0's l2: 55.363	valid_0's huber: 2.21219


2020-09-21 01:01:56.545 | INFO     | yspecies.explanations:transform:494 - Metrics: 
Metrics(R2=0.8357595748117022, MAE=3.1858597229242074, MSE=49.5424741007465, huber=2.540847554247527)
2020-09-21 01:01:56.551 | INFO     | yspecies.selection:fit:149 - ===== fitting models with seed 5 =====
2020-09-21 01:01:56.551 | INFO     | yspecies.selection:fit:150 - PARAMETERS:
{'bagging_fraction': 0.757456996980591, 'boosting_type': 'gbdt', 'drop_rate': 0.15938150347995844, 'feature_fraction': 0.41345305892024875, 'lambda_l1': 2.9528782080363634, 'lambda_l2': 2.3536556238167003, 'learning_rate': 0.05373601286514567, 'max_depth': 3, 'max_leaves': 19, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'verbose': -1, 'early_stopping_round': 10, 'seed': 4}
2020-09-21 01:01:56.555 | INFO     | yspecies.selection:fit:153 - SEED: 5 | FOLD: 0 | VALIDATION_SPECIES: ['Rattus_norvegicus', 'Sus_scrofa']
Found `early_stopping_round` in params. Will use it instead of argument


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[114]	valid_0's l1: 4.17931	valid_0's l2: 30.7018	valid_0's huber: 3.38356


2020-09-21 01:01:56.612 | INFO     | yspecies.selection:fit:153 - SEED: 5 | FOLD: 1 | VALIDATION_SPECIES: ['Rhinopithecus_bieti', 'Ursus_americanus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 4.45687	valid_0's l2: 62.933	valid_0's huber: 3.6671
Did not meet early stopping. Best iteration is:
[250]	valid_0's l1: 4.45687	valid_0's l2: 62.933	valid_0's huber: 3.6671


2020-09-21 01:01:56.720 | INFO     | yspecies.selection:fit:153 - SEED: 5 | FOLD: 2 | VALIDATION_SPECIES: ['Ailuropoda_melanoleuca', 'Mus_spicilegus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 3.39009	valid_0's l2: 29.5465	valid_0's huber: 2.70735
Did not meet early stopping. Best iteration is:
[250]	valid_0's l1: 3.39009	valid_0's l2: 29.5465	valid_0's huber: 2.70735


2020-09-21 01:01:56.831 | INFO     | yspecies.selection:fit:153 - SEED: 5 | FOLD: 3 | VALIDATION_SPECIES: ['Tupaia_belangeri', 'Meriones_unguiculatus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[228]	valid_0's l1: 3.19739	valid_0's l2: 42.9577	valid_0's huber: 2.54302


2020-09-21 01:02:10.358 | INFO     | yspecies.explanations:transform:494 - Metrics: 
Metrics(R2=0.9417398387694671, MAE=3.805913862121055, MSE=41.534746130764894, huber=3.076001165993597)
2020-09-21 01:02:10.363 | INFO     | yspecies.selection:fit:149 - ===== fitting models with seed 6 =====
2020-09-21 01:02:10.364 | INFO     | yspecies.selection:fit:150 - PARAMETERS:
{'bagging_fraction': 0.757456996980591, 'boosting_type': 'gbdt', 'drop_rate': 0.15938150347995844, 'feature_fraction': 0.41345305892024875, 'lambda_l1': 2.9528782080363634, 'lambda_l2': 2.3536556238167003, 'learning_rate': 0.05373601286514567, 'max_depth': 3, 'max_leaves': 19, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'verbose': -1, 'early_stopping_round': 10, 'seed': 5}
2020-09-21 01:02:10.369 | INFO     | yspecies.selection:fit:153 - SEED: 6 | FOLD: 0 | VALIDATION_SPECIES: ['Heterocephalus_glaber', 'Ursus_americanus']
Found `early_stopping_round` in params. Will use it instead o

Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[180]	valid_0's l1: 3.46638	valid_0's l2: 32.1031	valid_0's huber: 2.76778


2020-09-21 01:02:10.451 | INFO     | yspecies.selection:fit:153 - SEED: 6 | FOLD: 1 | VALIDATION_SPECIES: ['Rattus_norvegicus', 'Capra_hircus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 5.67722	valid_0's l2: 61.5234	valid_0's huber: 4.72441
Did not meet early stopping. Best iteration is:
[245]	valid_0's l1: 5.6755	valid_0's l2: 61.4916	valid_0's huber: 4.72314


2020-09-21 01:02:10.552 | INFO     | yspecies.selection:fit:153 - SEED: 6 | FOLD: 2 | VALIDATION_SPECIES: ['Canis_lupus_familiaris', 'Rhinopithecus_bieti']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 4.19513	valid_0's l2: 47.1787	valid_0's huber: 3.42774
Did not meet early stopping. Best iteration is:
[250]	valid_0's l1: 4.19513	valid_0's l2: 47.1787	valid_0's huber: 3.42774


2020-09-21 01:02:10.658 | INFO     | yspecies.selection:fit:153 - SEED: 6 | FOLD: 3 | VALIDATION_SPECIES: ['Microcebus_murinus', 'Gorilla_gorilla']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[157]	valid_0's l1: 5.41639	valid_0's l2: 87.5754	valid_0's huber: 4.51419


2020-09-21 01:02:22.761 | INFO     | yspecies.explanations:transform:494 - Metrics: 
Metrics(R2=0.9184722982041374, MAE=4.688352808490409, MSE=57.08718185034132, huber=3.858716208476385)
2020-09-21 01:02:22.766 | INFO     | yspecies.selection:fit:149 - ===== fitting models with seed 7 =====
2020-09-21 01:02:22.770 | INFO     | yspecies.selection:fit:150 - PARAMETERS:
{'bagging_fraction': 0.757456996980591, 'boosting_type': 'gbdt', 'drop_rate': 0.15938150347995844, 'feature_fraction': 0.41345305892024875, 'lambda_l1': 2.9528782080363634, 'lambda_l2': 2.3536556238167003, 'learning_rate': 0.05373601286514567, 'max_depth': 3, 'max_leaves': 19, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'verbose': -1, 'early_stopping_round': 10, 'seed': 6}
2020-09-21 01:02:22.774 | INFO     | yspecies.selection:fit:153 - SEED: 7 | FOLD: 0 | VALIDATION_SPECIES: ['Pan_paniscus', 'Microcebus_murinus']
Found `early_stopping_round` in params. Will use it instead of argume

Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 2.79416	valid_0's l2: 20.1696	valid_0's huber: 2.16502
Did not meet early stopping. Best iteration is:
[250]	valid_0's l1: 2.79416	valid_0's l2: 20.1696	valid_0's huber: 2.16502


2020-09-21 01:02:22.888 | INFO     | yspecies.selection:fit:153 - SEED: 7 | FOLD: 1 | VALIDATION_SPECIES: ['Equus_caballus', 'Phascolarctos_cinereus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 4.6999	valid_0's l2: 94.0568	valid_0's huber: 3.91667
Did not meet early stopping. Best iteration is:
[250]	valid_0's l1: 4.6999	valid_0's l2: 94.0568	valid_0's huber: 3.91667


2020-09-21 01:02:23.004 | INFO     | yspecies.selection:fit:153 - SEED: 7 | FOLD: 2 | VALIDATION_SPECIES: ['Aotus_nancymaae', 'Oryctolagus_cuniculus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[197]	valid_0's l1: 3.88987	valid_0's l2: 49.6432	valid_0's huber: 3.1665


2020-09-21 01:02:23.094 | INFO     | yspecies.selection:fit:153 - SEED: 7 | FOLD: 3 | VALIDATION_SPECIES: ['Sus_scrofa', 'Ailuropoda_melanoleuca']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[190]	valid_0's l1: 3.00906	valid_0's l2: 37.5192	valid_0's huber: 2.37852


2020-09-21 01:02:37.491 | INFO     | yspecies.explanations:transform:494 - Metrics: 
Metrics(R2=0.9356033790868941, MAE=3.598248893310192, MSE=50.3472015039728, huber=2.9066481027232984)
2020-09-21 01:02:37.498 | INFO     | yspecies.selection:fit:149 - ===== fitting models with seed 8 =====
2020-09-21 01:02:37.498 | INFO     | yspecies.selection:fit:150 - PARAMETERS:
{'bagging_fraction': 0.757456996980591, 'boosting_type': 'gbdt', 'drop_rate': 0.15938150347995844, 'feature_fraction': 0.41345305892024875, 'lambda_l1': 2.9528782080363634, 'lambda_l2': 2.3536556238167003, 'learning_rate': 0.05373601286514567, 'max_depth': 3, 'max_leaves': 19, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'verbose': -1, 'early_stopping_round': 10, 'seed': 7}
2020-09-21 01:02:37.503 | INFO     | yspecies.selection:fit:153 - SEED: 8 | FOLD: 0 | VALIDATION_SPECIES: ['Equus_caballus', 'Capra_hircus']
Found `early_stopping_round` in params. Will use it instead of argument


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[237]	valid_0's l1: 2.83226	valid_0's l2: 33.3782	valid_0's huber: 2.21693


2020-09-21 01:02:37.617 | INFO     | yspecies.selection:fit:153 - SEED: 8 | FOLD: 1 | VALIDATION_SPECIES: ['Macaca_mulatta', 'Cavia_porcellus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[144]	valid_0's l1: 3.37294	valid_0's l2: 40.4578	valid_0's huber: 2.68332


2020-09-21 01:02:37.688 | INFO     | yspecies.selection:fit:153 - SEED: 8 | FOLD: 2 | VALIDATION_SPECIES: ['Ailuropoda_melanoleuca', 'Phascolarctos_cinereus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 2.82914	valid_0's l2: 24.5583	valid_0's huber: 2.20728
Did not meet early stopping. Best iteration is:
[250]	valid_0's l1: 2.82914	valid_0's l2: 24.5583	valid_0's huber: 2.20728


2020-09-21 01:02:37.793 | INFO     | yspecies.selection:fit:153 - SEED: 8 | FOLD: 3 | VALIDATION_SPECIES: ['Suricata_suricatta', 'Heterocephalus_glaber']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[180]	valid_0's l1: 3.69998	valid_0's l2: 59.2097	valid_0's huber: 2.98637


2020-09-21 01:02:51.559 | INFO     | yspecies.explanations:transform:494 - Metrics: 
Metrics(R2=0.9447215132199963, MAE=3.1835790783324223, MSE=39.400996057941924, huber=2.5206127857746226)
2020-09-21 01:02:51.564 | INFO     | yspecies.selection:fit:149 - ===== fitting models with seed 9 =====
2020-09-21 01:02:51.565 | INFO     | yspecies.selection:fit:150 - PARAMETERS:
{'bagging_fraction': 0.757456996980591, 'boosting_type': 'gbdt', 'drop_rate': 0.15938150347995844, 'feature_fraction': 0.41345305892024875, 'lambda_l1': 2.9528782080363634, 'lambda_l2': 2.3536556238167003, 'learning_rate': 0.05373601286514567, 'max_depth': 3, 'max_leaves': 19, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'verbose': -1, 'early_stopping_round': 10, 'seed': 8}
2020-09-21 01:02:51.569 | INFO     | yspecies.selection:fit:153 - SEED: 9 | FOLD: 0 | VALIDATION_SPECIES: ['Oryctolagus_cuniculus', 'Bos_taurus']
Found `early_stopping_round` in params. Will use it instead of ar

Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[105]	valid_0's l1: 3.35317	valid_0's l2: 32.4867	valid_0's huber: 2.65184


2020-09-21 01:02:51.637 | INFO     | yspecies.selection:fit:153 - SEED: 9 | FOLD: 1 | VALIDATION_SPECIES: ['Gorilla_gorilla', 'Tupaia_belangeri']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 5.59288	valid_0's l2: 83.5413	valid_0's huber: 4.67803
Did not meet early stopping. Best iteration is:
[249]	valid_0's l1: 5.59134	valid_0's l2: 83.4606	valid_0's huber: 4.67679


2020-09-21 01:02:51.740 | INFO     | yspecies.selection:fit:153 - SEED: 9 | FOLD: 2 | VALIDATION_SPECIES: ['Mus_musculus', 'Ictidomys_tridecemlineatus']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[175]	valid_0's l1: 2.48341	valid_0's l2: 14.9513	valid_0's huber: 1.88996


2020-09-21 01:02:51.831 | INFO     | yspecies.selection:fit:153 - SEED: 9 | FOLD: 3 | VALIDATION_SPECIES: ['Homo_sapiens', 'Otolemur_garnettii']


Training until validation scores don't improve for 10 rounds
Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 21.7979	valid_0's l2: 1941.3	valid_0's huber: 19.2941
Did not meet early stopping. Best iteration is:
[249]	valid_0's l1: 21.7976	valid_0's l2: 1941.42	valid_0's huber: 19.2943


2020-09-21 01:03:04.841 | INFO     | yspecies.explanations:transform:494 - Metrics: 
Metrics(R2=0.6538523640752132, MAE=8.306377456176657, MSE=518.0797102896187, huber=7.122683660075034)


Unnamed: 0,symbol,repeats,mean_shap,mean_kendall_tau,shap_0,kendall_tau_0,shap_1,kendall_tau_1,shap_2,kendall_tau_2,...,shap_5,kendall_tau_5,shap_6,kendall_tau_6,shap_7,kendall_tau_7,shap_8,kendall_tau_8,shap_9,kendall_tau_9
ENSG00000167515,TRAPPC2L,10.0,22.250,-0.286302,28.50,-0.321438,15.75,-0.222454,19.50,-0.262519,...,20.00,-0.198476,21.00,-0.216406,24.25,-0.467742,23.00,-0.259238,20.25,-0.318612
ENSG00000010219,DYRK4,10.0,21.300,0.499266,19.25,0.463278,32.25,0.442642,21.00,0.531262,...,26.25,0.465746,24.50,0.568590,17.75,0.508938,20.50,0.359008,21.75,0.503933
ENSG00000165501,LRR1,10.0,18.725,0.714162,18.50,0.718542,21.00,0.651679,20.00,0.782146,...,17.75,0.711321,21.25,0.766833,21.25,0.704064,19.75,0.712844,16.25,0.714709
ENSG00000142002,DPP9,10.0,18.175,-0.790741,30.25,-0.755341,19.00,-0.817381,18.00,-0.808785,...,18.25,-0.857853,12.25,-0.790998,13.75,-0.782503,14.50,-0.811251,21.25,-0.784845
ENSG00000105672,ETV2,10.0,17.450,0.695006,29.50,0.707315,13.00,0.723869,19.25,0.655049,...,26.75,0.742832,19.25,0.706593,12.25,0.663985,8.25,0.669160,18.00,0.659691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000162624,LHX8,1.0,4.250,-0.661514,,,,,,,...,,,4.25,-0.661514,,,,,,
ENSG00000100908,EMC9,1.0,3.750,0.275034,,,,,,,...,,,,,3.75,0.275034,,,,
ENSG00000108556,CHRNE,1.0,3.250,-0.641954,,,,,,,...,,,,,,,3.25,-0.641954,,
ENSG00000144214,LYG1,1.0,3.000,0.016361,,,,,,,...,3.00,0.016361,,,,,,,,


In [28]:
# Display the aggregate Metrics object (R2, MAE, MSE, huber) of the stage-two lifespan result.
# The bare expression is intentionally the last line so the notebook renders it as the cell output.
# NOTE(review): presumably averaged over the repeated seeds logged above — confirm in the class of `lifespan_2`.
lifespan_2.metrics_average

Metrics(R2=0.8191444901967483, MAE=5.333966978785094, MSE=222.05742597246913, huber=4.458840087515799)

In [29]:
# Display the mean of the "R^2" entry of the validation metrics.
# NOTE(review): in the saved outputs this value is not equal to `metrics_average.R2`,
# so the two aggregate over different things — confirm which one should be reported.
lifespan_2.validation_metrics["R^2"].mean()

0.8420848215012047

In [None]:
# Render the stage-two lifespan summary plot inline (nothing is saved by this call).
# NOTE(review): presumably `max_display` caps the number of features drawn and `plot_size`
# scales the figure — confirm against the `plot` method of `lifespan_2`'s class.
lifespan_2.plot(max_display=30, plot_size=0.4)

In [26]:
# Export the selected-features table to "lifespan.tsv" inside the stage-two output folder.
# The file is tab-separated and the index column is written under the header "reference_gene".
# NOTE(review): assumes `selected` is a pandas DataFrame and that the stage-two folder already exists.
lifespan_2.selected.to_csv(locations.output.stage_two / "lifespan.tsv", sep="\t", index_label="reference_gene")

In [None]:
# Target path of the stage-two summary figure: <output dir>/plots/stage_two_summary_lifespan.svg
summary_stage_two_plot_path = locations.output.dir / "plots" / "stage_two_summary_lifespan.svg"
# Draw the summary plot again, this time handing the path to `save` (presumably writes the SVG there — confirm).
# NOTE(review): `title=40` passes an integer as the plot title, whereas the inline plot cell uses
# `max_display=30`; this looks like it was meant to be `max_display=40` — confirm before changing.
lifespan_2.plot(title=40, plot_size=0.5, save=summary_stage_two_plot_path )

In [None]:
# Persist the stage-two result through its own `write` method, passing the partitions folder
# and the name "lifespan_stage_two".
# NOTE(review): `partitions_folder` is defined outside this part of the notebook, and which files
# `write` produces is not visible here — confirm both before relying on this cell.
lifespan_2.write(partitions_folder , "lifespan_stage_two")