# SHAP feature selection #
## Code to select feature with combination of LightGBM and SHAP ##

## Parameters cell ##

Parameters are overiddent by papermill when run inside DVC stages



In [1]:
n_folds = 5 # this sets global setting of which how many bootstraps to use
n_hold_out = 1
repeats = 5
importance_type = "split"

life_history = ["lifespan", "mass_kg", "mtGC", "metabolic_rate", "temperature", "gestation_days"]
debug_local = True #to use local version

In [2]:
from pathlib import Path
import sys
import inspect

#lgb_params["importance_type"] = importance_type

local = (Path("..") / "yspecies").resolve()
if debug_local and local.exists():
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '/data/sources/yspecies/notebooks', '/opt/miniconda3/envs/yspecies/lib/python38.zip', '/opt/miniconda3/envs/yspecies/lib/python3.8', '/opt/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/antonkulaga/.ipython']


In [3]:
from dataclasses import dataclass, replace
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from typing import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.workflow import TupleWith, Repeat, Collect
from yspecies.config import *
from yspecies.preprocess import FeatureSelection, DataExtractor
from yspecies.partition import DataPartitioner, PartitionParameters
from yspecies.models import Metrics
from yspecies.selection import ShapSelector
from yspecies.results import FeatureSummary

In [5]:
import optuna
from optuna import Study, Trial
from optuna import multi_objective
from optuna.multi_objective import trial
from optuna.multi_objective.study import MultiObjectiveStudy
from yspecies.tuning import MultiObjectiveResults

In [6]:
#settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import pprint
pp = pprint.PrettyPrinter(indent=4)

#charts settings
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
%matplotlib auto
plt.ioff()
set_matplotlib_formats('svg')

Using matplotlib backend: TkAgg


### Loading data ###
Let's load data from species/genes/expressions selected by select_samples.py notebook

In [7]:
from pathlib import Path
locations: Locations = Locations("./") if Path("./data").exists() else Locations("../")

## Setting up Features to select ##

In [8]:
default_selection = FeatureSelection(
    samples = ["tissue","species"], #samples metadata to include
    species =  [], #species metadata other then Y label to include
    exclude_from_training = ["species"],  #exclude some fields from LightGBM training
    to_predict = "lifespan", #column to predict
    categorical = ["tissue"],
    select_by = "shap",
    importance_type =  importance_type,
    feature_perturbation = "tree_path_dependent"
)
default_selection

Samples metadata,Species metadata,Genes,Predict label,not_validated species
"['tissue', 'species']",[],all,lifespan,[]


In [9]:
loader = DataLoader(locations, default_selection)
selections = loader.load_life_history()
selections["lifespan"][0]

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(408, 12323)","(12323, 37)",38,408,"(12323, 2)","(38, 18)"


### Loading Hyperoptimization studies ###

In [10]:
def load_study(trait: str):
    url = f'sqlite:///' +str((locations.interim.optimization / (trait+".sqlite")).absolute())
    print('loading (if exists) study from '+url)
    storage = optuna.storages.RDBStorage(
        url=url
        #engine_kwargs={'check_same_thread': False}
    )
    return optuna.multi_objective.study.create_study(directions=['maximize','minimize','maximize'], storage = storage, study_name = f"{trait}_r2_huber_kendall", load_if_exists = True)

## Setting up SHAP selection pipeline ##

### Deciding on selection parameters (which fields to include, exclude, predict)  ###

In [11]:
partition_params = PartitionParameters(n_folds, n_hold_out, 2,   42)


In [12]:
def make_pipeline(trait: str):
    study = load_study(trait)
    if len(study.get_pareto_front_trials())>0 :
        metrics, params = MultiObjectiveResults.from_study(study).best_metrics_params_r2()
    else:
        params =  lgb_params = {"bagging_fraction": 0.9522534844058304,
                  "boosting_type": "dart",
                  "objective": "regression",
                  "feature_fraction": 0.42236910941558053,
                  "lambda_l1": 0.020847266580277746,
                  "lambda_l2": 2.8448564854773326,
                  "learning_rate": 0.11484015430016059,
                  "max_depth": 3,
                  "max_leaves": 35,
                  "min_data_in_leaf": 9,
                  "num_iterations": 250,
                  "metrics": ["l1", "l2", "huber"]
                 }
    partition_shap_pipe = Pipeline([
    ("partitioner", DataPartitioner()),
    ('prepare_for_selection', TupleWith(params)),
    ("shap_computation", ShapSelector())
    ]
    )
    repeated_cv =  Repeat(partition_shap_pipe, repeats, lambda x,i: (x[0], replace(x[1], seed = i)))
    return Pipeline(
        [
        ('extractor', DataExtractor()),
        ('prepare_for_partitioning', TupleWith(partition_params)), # to extract the data required for ML from the dataset
        ("partition_shap", repeated_cv),
        ("summarize", Collect(fold=lambda results: FeatureSummary(results)))
        ]
        )

In [13]:
pipelines = {trait:make_pipeline(trait) for trait in life_history}
pipelines.keys()

loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/lifespan.sqlite


create_study is experimental (supported from v1.4.0). The interface can change in the future.
NSGAIIMultiObjectiveSampler is experimental (supported from v1.5.0). The interface can change in the future.
RandomMultiObjectiveSampler is experimental (supported from v1.4.0). The interface can change in the future.
[I 2020-08-30 15:15:22,045] Using an existing study with name 'lifespan_r2_huber_kendall' instead of creating a new one.
MultiObjectiveStudy is experimental (supported from v1.4.0). The interface can change in the future.
FrozenMultiObjectiveTrial is experimental (supported from v1.4.0). The interface can change in the future.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/mass_kg.sqlite


[I 2020-08-30 15:15:22,889] Using an existing study with name 'mass_kg_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/mtGC.sqlite


[I 2020-08-30 15:15:23,152] Using an existing study with name 'mtGC_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/metabolic_rate.sqlite


[I 2020-08-30 15:15:24,377] Using an existing study with name 'metabolic_rate_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/temperature.sqlite


[I 2020-08-30 15:15:24,485] Using an existing study with name 'temperature_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/gestation_days.sqlite


[I 2020-08-30 15:15:24,737] Using an existing study with name 'gestation_days_r2_huber_kendall' instead of creating a new one.


dict_keys(['lifespan', 'mass_kg', 'mtGC', 'metabolic_rate', 'temperature', 'gestation_days'])

# First stage selection (shap ) #

In [16]:
stage_one = {}

### Lifespan ###

In [19]:
def run_stage_one(trait: str):
    result = pipelines[trait].fit_transform(selections[trait])
    stage_one[trait] = result #ugly mutation of stage_one dicionary
    return result

In [20]:
stage_one_lifespan = run_stage_one("lifespan")
stage_one_lifespan.selected

2020-08-30 15:20:37.106 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 0 =====
2020-08-30 15:20:37.107 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9223944053685549, 'boosting_type': 'gbdt', 'drop_rate': 0.29847846764282016, 'feature_fraction': 0.30501714460224083, 'lambda_l1': 0.21894440516534, 'lambda_l2': 1.3848191603958375, 'learning_rate': 0.08210966706622723, 'max_depth': 3, 'max_leaves': 17, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber']}
2020-08-30 15:20:37.136 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Gorilla_gorilla', 'Pan_paniscus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[158]	valid_0's l1: 8.56432	valid_0's l2: 189.457	valid_0's huber: 7.35476


2020-08-30 15:20:40.328 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Heterocephalus_glaber', 'Capra_hircus']


Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 3.93228	valid_0's l2: 62.8388	valid_0's huber: 3.21679
Did not meet early stopping. Best iteration is:
[250]	valid_0's l1: 3.93228	valid_0's l2: 62.8388	valid_0's huber: 3.21679


2020-08-30 15:20:45.353 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Bos_taurus', 'Otolemur_garnettii']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[133]	valid_0's l1: 3.6893	valid_0's l2: 41.9248	valid_0's huber: 2.9942


2020-08-30 15:20:48.259 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 3 | VALIDATION_SPECIES: ['Macaca_nemestrina', 'Microcebus_murinus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[103]	valid_0's l1: 3.36189	valid_0's l2: 52.7821	valid_0's huber: 2.68951


2020-08-30 15:21:08.726 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 1 =====
2020-08-30 15:21:08.727 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9223944053685549, 'boosting_type': 'gbdt', 'drop_rate': 0.29847846764282016, 'feature_fraction': 0.30501714460224083, 'lambda_l1': 0.21894440516534, 'lambda_l2': 1.3848191603958375, 'learning_rate': 0.08210966706622723, 'max_depth': 3, 'max_leaves': 17, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 0}
2020-08-30 15:21:08.751 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 0 | VALIDATION_SPECIES: ['Felis_catus', 'Ursus_americanus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[140]	valid_0's l1: 3.75572	valid_0's l2: 35.9971	valid_0's huber: 3.03913


2020-08-30 15:21:11.687 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 1 | VALIDATION_SPECIES: ['Heterocephalus_glaber', 'Vombatus_ursinus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[160]	valid_0's l1: 5.29124	valid_0's l2: 109.206	valid_0's huber: 4.43577


2020-08-30 15:21:15.240 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 2 | VALIDATION_SPECIES: ['Macaca_fascicularis', 'Rhinopithecus_bieti']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[149]	valid_0's l1: 3.24637	valid_0's l2: 36.8948	valid_0's huber: 2.58649


2020-08-30 15:21:18.200 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 3 | VALIDATION_SPECIES: ['Gorilla_gorilla', 'Ovis_aries']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[110]	valid_0's l1: 6.099	valid_0's l2: 157.97	valid_0's huber: 5.16087


2020-08-30 15:21:36.138 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 2 =====
2020-08-30 15:21:36.139 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9223944053685549, 'boosting_type': 'gbdt', 'drop_rate': 0.29847846764282016, 'feature_fraction': 0.30501714460224083, 'lambda_l1': 0.21894440516534, 'lambda_l2': 1.3848191603958375, 'learning_rate': 0.08210966706622723, 'max_depth': 3, 'max_leaves': 17, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 1}
2020-08-30 15:21:36.161 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 0 | VALIDATION_SPECIES: ['Rhinopithecus_bieti', 'Ursus_americanus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[91]	valid_0's l1: 3.85509	valid_0's l2: 46.2187	valid_0's huber: 3.13373


2020-08-30 15:21:38.117 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 1 | VALIDATION_SPECIES: ['Aotus_nancymaae', 'Phascolarctos_cinereus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[198]	valid_0's l1: 4.91538	valid_0's l2: 83.6016	valid_0's huber: 4.07485


2020-08-30 15:21:41.354 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 2 | VALIDATION_SPECIES: ['Vombatus_ursinus', 'Ailuropoda_melanoleuca']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[132]	valid_0's l1: 3.55846	valid_0's l2: 37.4065	valid_0's huber: 2.85189


2020-08-30 15:21:43.966 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 3 | VALIDATION_SPECIES: ['Rattus_norvegicus', 'Monodelphis_domestica']


Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 7.25384	valid_0's l2: 103.039	valid_0's huber: 6.14312
Did not meet early stopping. Best iteration is:
[250]	valid_0's l1: 7.25384	valid_0's l2: 103.039	valid_0's huber: 6.14312


2020-08-30 15:22:07.445 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 3 =====
2020-08-30 15:22:07.446 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9223944053685549, 'boosting_type': 'gbdt', 'drop_rate': 0.29847846764282016, 'feature_fraction': 0.30501714460224083, 'lambda_l1': 0.21894440516534, 'lambda_l2': 1.3848191603958375, 'learning_rate': 0.08210966706622723, 'max_depth': 3, 'max_leaves': 17, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 2}
2020-08-30 15:22:07.469 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 0 | VALIDATION_SPECIES: ['Microcebus_murinus', 'Pan_paniscus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[129]	valid_0's l1: 3.13351	valid_0's l2: 24.39	valid_0's huber: 2.47091


2020-08-30 15:22:10.547 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 1 | VALIDATION_SPECIES: ['Ursus_americanus', 'Gorilla_gorilla']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[172]	valid_0's l1: 8.42191	valid_0's l2: 191.915	valid_0's huber: 7.21505


2020-08-30 15:22:13.827 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 2 | VALIDATION_SPECIES: ['Bos_taurus', 'Capra_hircus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[103]	valid_0's l1: 4.33818	valid_0's l2: 65.1108	valid_0's huber: 3.56845


2020-08-30 15:22:17.346 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 3 | VALIDATION_SPECIES: ['Rhinolophus_ferrumequinum', 'Mus_musculus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[186]	valid_0's l1: 2.986	valid_0's l2: 38.9358	valid_0's huber: 2.32669


2020-08-30 15:22:41.291 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 4 =====
2020-08-30 15:22:41.291 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9223944053685549, 'boosting_type': 'gbdt', 'drop_rate': 0.29847846764282016, 'feature_fraction': 0.30501714460224083, 'lambda_l1': 0.21894440516534, 'lambda_l2': 1.3848191603958375, 'learning_rate': 0.08210966706622723, 'max_depth': 3, 'max_leaves': 17, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 3}
2020-08-30 15:22:41.312 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 0 | VALIDATION_SPECIES: ['Macaca_fascicularis', 'Pan_troglodytes']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[167]	valid_0's l1: 3.31748	valid_0's l2: 36.1027	valid_0's huber: 2.63876


2020-08-30 15:22:45.462 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 1 | VALIDATION_SPECIES: ['Otolemur_garnettii', 'Ursus_americanus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[188]	valid_0's l1: 3.78015	valid_0's l2: 72.4207	valid_0's huber: 3.09368


2020-08-30 15:22:53.686 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 2 | VALIDATION_SPECIES: ['Heterocephalus_glaber', 'Cavia_aperea']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[97]	valid_0's l1: 5.05256	valid_0's l2: 59.1748	valid_0's huber: 4.2051


2020-08-30 15:22:57.589 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 3 | VALIDATION_SPECIES: ['Pan_paniscus', 'Gorilla_gorilla']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[123]	valid_0's l1: 6.38498	valid_0's l2: 125.938	valid_0's huber: 5.39846


Unnamed: 0,symbol,repeats,mean_shap,mean_kendall_tau,shap_0,kendall_tau_0,shap_1,kendall_tau_1,shap_2,kendall_tau_2,shap_3,kendall_tau_3,shap_4,kendall_tau_4
ENSG00000204498,NFKBIL1,5.0,7.6,-0.106,7.0,-0.156,7.75,-0.036,8.0,-0.124,7.5,-0.072,7.75,-0.14
ENSG00000010219,DYRK4,5.0,7.55,0.277,8.0,0.306,8.75,0.283,7.5,0.262,8.5,0.272,5.0,0.261
ENSG00000185880,TRIM69,5.0,4.65,0.145,4.25,0.102,4.0,0.203,4.0,0.136,3.5,0.09,7.5,0.193
ENSG00000105672,ETV2,5.0,3.55,0.728,2.75,0.796,3.0,0.723,2.75,0.734,4.25,0.68,5.0,0.708
ENSG00000167515,TRAPPC2L,4.0,5.75,-0.21,,,4.75,-0.297,4.25,-0.18,7.25,-0.18,6.75,-0.183
ENSG00000132436,FIGNL1,4.0,3.938,0.506,4.0,0.468,,,3.25,0.521,3.75,0.525,4.75,0.512
ENSG00000165501,LRR1,4.0,2.125,0.694,2.25,0.669,,,2.0,0.742,1.5,0.652,2.75,0.712
ENSG00000188747,NOXA1,4.0,2.062,0.696,1.25,0.679,,,2.25,0.706,2.25,0.684,2.5,0.716
ENSG00000066923,STAG3,3.0,6.333,0.449,,,5.0,0.38,8.5,0.453,,,5.5,0.514
ENSG00000171121,KCNMB3,3.0,4.417,0.668,5.0,0.704,4.0,0.654,,,4.25,0.646,,


### Mass_kg ###

In [21]:
stage_one_mass = run_stage_one("mass_kg")
stage_one_mass.selected

2020-08-30 15:23:17.659 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 0 =====
2020-08-30 15:23:17.660 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.49885668399953875, 'boosting_type': 'dart', 'drop_rate': 0.2372958868935867, 'feature_fraction': 0.49141007534802034, 'lambda_l1': 0.656412473127467, 'lambda_l2': 1.9535470163293591, 'learning_rate': 0.08702343727058373, 'max_depth': 3, 'max_leaves': 21, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber']}
2020-08-30 15:23:17.676 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Monodelphis_domestica', 'Aotus_nancymaae']
Early stopping is not available in dart mode


[250]	valid_0's l1: 13.6373	valid_0's l2: 469.528	valid_0's huber: 11.8875


2020-08-30 15:23:22.375 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Sus_scrofa', 'Oryctolagus_cuniculus']


[250]	valid_0's l1: 32.1894	valid_0's l2: 2498.65	valid_0's huber: 28.5926


2020-08-30 15:23:27.002 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Mesocricetus_auratus', 'Cavia_porcellus']


[250]	valid_0's l1: 6.9055	valid_0's l2: 138.739	valid_0's huber: 5.83807


2020-08-30 15:23:33.278 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 3 | VALIDATION_SPECIES: ['Ictidomys_tridecemlineatus', 'Phascolarctos_cinereus']


[250]	valid_0's l1: 8.61981	valid_0's l2: 185.69	valid_0's huber: 7.36733


2020-08-30 15:23:54.132 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 1 =====
2020-08-30 15:23:54.132 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.49885668399953875, 'boosting_type': 'dart', 'drop_rate': 0.2372958868935867, 'feature_fraction': 0.49141007534802034, 'lambda_l1': 0.656412473127467, 'lambda_l2': 1.9535470163293591, 'learning_rate': 0.08702343727058373, 'max_depth': 3, 'max_leaves': 21, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 0}
2020-08-30 15:23:54.148 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 0 | VALIDATION_SPECIES: ['Aotus_nancymaae', 'Mesocricetus_auratus']
Early stopping is not available in dart mode


[250]	valid_0's l1: 12.1495	valid_0's l2: 560.782	valid_0's huber: 10.5631


2020-08-30 15:24:02.265 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 1 | VALIDATION_SPECIES: ['Monodelphis_domestica', 'Rattus_norvegicus']


[250]	valid_0's l1: 6.74933	valid_0's l2: 190.296	valid_0's huber: 5.70282


2020-08-30 15:24:09.518 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 2 | VALIDATION_SPECIES: ['Ictidomys_tridecemlineatus', 'Sus_scrofa']


[250]	valid_0's l1: 15.0071	valid_0's l2: 513.145	valid_0's huber: 13.1282


2020-08-30 15:24:13.576 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 3 | VALIDATION_SPECIES: ['Homo_sapiens', 'Cavia_porcellus']


[250]	valid_0's l1: 26.9604	valid_0's l2: 1360.96	valid_0's huber: 23.8706


2020-08-30 15:24:31.426 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 2 =====
2020-08-30 15:24:31.426 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.49885668399953875, 'boosting_type': 'dart', 'drop_rate': 0.2372958868935867, 'feature_fraction': 0.49141007534802034, 'lambda_l1': 0.656412473127467, 'lambda_l2': 1.9535470163293591, 'learning_rate': 0.08702343727058373, 'max_depth': 3, 'max_leaves': 21, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 1}
2020-08-30 15:24:31.440 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 0 | VALIDATION_SPECIES: ['Rattus_norvegicus', 'Meriones_unguiculatus']
Early stopping is not available in dart mode


[250]	valid_0's l1: 4.64554	valid_0's l2: 134.749	valid_0's huber: 3.81813


2020-08-30 15:24:35.500 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 1 | VALIDATION_SPECIES: ['Suricata_suricatta', 'Mesocricetus_auratus']


[250]	valid_0's l1: 7.71452	valid_0's l2: 263.764	valid_0's huber: 6.586


2020-08-30 15:24:39.293 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 2 | VALIDATION_SPECIES: ['Phascolarctos_cinereus', 'Cavia_porcellus']


[250]	valid_0's l1: 4.50041	valid_0's l2: 80.4002	valid_0's huber: 3.68972


2020-08-30 15:24:46.729 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 3 | VALIDATION_SPECIES: ['Homo_sapiens', 'Aotus_nancymaae']


[250]	valid_0's l1: 28.4758	valid_0's l2: 1563.33	valid_0's huber: 25.2571


2020-08-30 15:25:06.009 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 3 =====
2020-08-30 15:25:06.009 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.49885668399953875, 'boosting_type': 'dart', 'drop_rate': 0.2372958868935867, 'feature_fraction': 0.49141007534802034, 'lambda_l1': 0.656412473127467, 'lambda_l2': 1.9535470163293591, 'learning_rate': 0.08702343727058373, 'max_depth': 3, 'max_leaves': 21, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 2}
2020-08-30 15:25:06.023 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 0 | VALIDATION_SPECIES: ['Callithrix_jacchus', 'Rattus_norvegicus']
Early stopping is not available in dart mode


[250]	valid_0's l1: 12.3502	valid_0's l2: 437.423	valid_0's huber: 10.7513


2020-08-30 15:25:10.511 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 1 | VALIDATION_SPECIES: ['Ictidomys_tridecemlineatus', 'Oryctolagus_cuniculus']


[250]	valid_0's l1: 27.5895	valid_0's l2: 2590.34	valid_0's huber: 24.5009


2020-08-30 15:25:16.689 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 2 | VALIDATION_SPECIES: ['Sus_scrofa', 'Aotus_nancymaae']


[250]	valid_0's l1: 28.2829	valid_0's l2: 2822.6	valid_0's huber: 25.0638


2020-08-30 15:25:22.018 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 3 | VALIDATION_SPECIES: ['Meriones_unguiculatus', 'Mesocricetus_auratus']


[250]	valid_0's l1: 11.7358	valid_0's l2: 277.944	valid_0's huber: 10.1772


2020-08-30 15:25:40.997 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 4 =====
2020-08-30 15:25:40.997 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.49885668399953875, 'boosting_type': 'dart', 'drop_rate': 0.2372958868935867, 'feature_fraction': 0.49141007534802034, 'lambda_l1': 0.656412473127467, 'lambda_l2': 1.9535470163293591, 'learning_rate': 0.08702343727058373, 'max_depth': 3, 'max_leaves': 21, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 3}
2020-08-30 15:25:41.012 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 0 | VALIDATION_SPECIES: ['Callithrix_jacchus', 'Heterocephalus_glaber']
Early stopping is not available in dart mode


[250]	valid_0's l1: 17.5029	valid_0's l2: 652.079	valid_0's huber: 15.3757


2020-08-30 15:25:45.877 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 1 | VALIDATION_SPECIES: ['Cavia_porcellus', 'Aotus_nancymaae']


[250]	valid_0's l1: 20.7769	valid_0's l2: 707.388	valid_0's huber: 18.306


2020-08-30 15:25:50.473 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 2 | VALIDATION_SPECIES: ['Homo_sapiens', 'Rattus_norvegicus']


[250]	valid_0's l1: 17.6793	valid_0's l2: 921.116	valid_0's huber: 15.5183


2020-08-30 15:25:54.151 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 3 | VALIDATION_SPECIES: ['Phascolarctos_cinereus', 'Ictidomys_tridecemlineatus']


[250]	valid_0's l1: 15.7326	valid_0's l2: 461.445	valid_0's huber: 13.7654


Unnamed: 0,symbol,repeats,mean_shap,mean_kendall_tau,shap_0,kendall_tau_0,shap_1,kendall_tau_1,shap_2,kendall_tau_2,shap_3,kendall_tau_3,shap_4,kendall_tau_4
ENSG00000054277,OPN3,4.0,48.312,0.124,29.25,-0.048,,,66.0,0.395,29.5,0.025,68.5,0.123
ENSG00000144214,LYG1,4.0,11.688,0.37,9.5,0.434,17.0,0.4,12.5,0.293,,,7.75,0.351
ENSG00000164304,CAGE1,3.0,14.167,-0.272,20.0,-0.284,3.75,-0.343,,,18.75,-0.188,,
ENSG00000270647,TAF15,3.0,12.417,0.008,,,12.5,-0.032,10.25,-0.092,,,14.5,0.148
ENSG00000124172,ATP5F1E,3.0,11.833,-0.314,,,8.5,-0.255,3.0,-0.251,24.0,-0.437,,
ENSG00000126062,TMEM115,3.0,9.0,-0.032,6.0,0.053,9.5,-0.073,,,,,11.5,-0.075
ENSG00000066923,STAG3,2.0,35.0,0.173,37.0,0.219,,,,,33.0,0.127,,
ENSG00000127952,STYXL1,2.0,19.375,0.33,20.0,0.195,,,,,18.75,0.465,,
ENSG00000010219,DYRK4,2.0,16.625,0.24,18.75,0.182,,,,,14.5,0.298,,
ENSG00000107020,PLGRKT,2.0,11.5,-0.011,10.0,-0.063,13.0,0.041,,,,,,


### MtGC ###

In [22]:
stage_one_mtGC = run_stage_one("mtGC")
stage_one_mtGC.selected

2020-08-30 15:26:13.439 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 0 =====
2020-08-30 15:26:13.440 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.4882827581871194, 'boosting_type': 'gbdt', 'drop_rate': 0.16290376621758335, 'feature_fraction': 0.3059934876406196, 'lambda_l1': 0.017007142345511594, 'lambda_l2': 0.568546479848152, 'learning_rate': 0.08231879540454069, 'max_depth': 7, 'max_leaves': 24, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber']}
2020-08-30 15:26:13.459 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Callithrix_jacchus', 'Sus_scrofa']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[104]	valid_0's l1: 0.434639	valid_0's l2: 0.398746	valid_0's huber: 0.168501


2020-08-30 15:26:19.259 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Rattus_norvegicus', 'Gorilla_gorilla']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[38]	valid_0's l1: 0.840033	valid_0's l2: 1.03454	valid_0's huber: 0.440867


2020-08-30 15:26:21.446 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Macaca_mulatta', 'Equus_caballus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[105]	valid_0's l1: 0.538695	valid_0's l2: 0.859574	valid_0's huber: 0.293793


2020-08-30 15:26:26.845 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 3 | VALIDATION_SPECIES: ['Tupaia_belangeri', 'Ovis_aries']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[92]	valid_0's l1: 0.587668	valid_0's l2: 0.613843	valid_0's huber: 0.276131


2020-08-30 15:26:41.559 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 1 =====
2020-08-30 15:26:41.560 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.4882827581871194, 'boosting_type': 'gbdt', 'drop_rate': 0.16290376621758335, 'feature_fraction': 0.3059934876406196, 'lambda_l1': 0.017007142345511594, 'lambda_l2': 0.568546479848152, 'learning_rate': 0.08231879540454069, 'max_depth': 7, 'max_leaves': 24, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 0}
2020-08-30 15:26:41.582 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 0 | VALIDATION_SPECIES: ['Macaca_nemestrina', 'Felis_catus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[114]	valid_0's l1: 0.361037	valid_0's l2: 0.317055	valid_0's huber: 0.133178


2020-08-30 15:26:47.151 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 1 | VALIDATION_SPECIES: ['Callithrix_jacchus', 'Heterocephalus_glaber']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[75]	valid_0's l1: 0.528265	valid_0's l2: 0.821885	valid_0's huber: 0.293618


2020-08-30 15:26:51.192 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 2 | VALIDATION_SPECIES: ['Tupaia_belangeri', 'Sus_scrofa']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[67]	valid_0's l1: 0.454341	valid_0's l2: 0.630486	valid_0's huber: 0.223213


2020-08-30 15:26:55.017 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 3 | VALIDATION_SPECIES: ['Gorilla_gorilla', 'Bos_taurus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[98]	valid_0's l1: 0.610187	valid_0's l2: 0.947523	valid_0's huber: 0.312579


2020-08-30 15:27:13.837 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 2 =====
2020-08-30 15:27:13.838 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.4882827581871194, 'boosting_type': 'gbdt', 'drop_rate': 0.16290376621758335, 'feature_fraction': 0.3059934876406196, 'lambda_l1': 0.017007142345511594, 'lambda_l2': 0.568546479848152, 'learning_rate': 0.08231879540454069, 'max_depth': 7, 'max_leaves': 24, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 1}
2020-08-30 15:27:13.854 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 0 | VALIDATION_SPECIES: ['Heterocephalus_glaber', 'Sus_scrofa']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[93]	valid_0's l1: 0.522014	valid_0's l2: 0.541792	valid_0's huber: 0.226346


2020-08-30 15:27:17.639 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 1 | VALIDATION_SPECIES: ['Oryctolagus_cuniculus', 'Meriones_unguiculatus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[211]	valid_0's l1: 0.501367	valid_0's l2: 0.481527	valid_0's huber: 0.213256


2020-08-30 15:27:24.584 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 2 | VALIDATION_SPECIES: ['Felis_catus', 'Capra_hircus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[65]	valid_0's l1: 0.482629	valid_0's l2: 0.60641	valid_0's huber: 0.222722


2020-08-30 15:27:27.739 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 3 | VALIDATION_SPECIES: ['Equus_caballus', 'Pan_paniscus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[179]	valid_0's l1: 0.802519	valid_0's l2: 1.65936	valid_0's huber: 0.494971


2020-08-30 15:27:47.842 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 3 =====
2020-08-30 15:27:47.842 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.4882827581871194, 'boosting_type': 'gbdt', 'drop_rate': 0.16290376621758335, 'feature_fraction': 0.3059934876406196, 'lambda_l1': 0.017007142345511594, 'lambda_l2': 0.568546479848152, 'learning_rate': 0.08231879540454069, 'max_depth': 7, 'max_leaves': 24, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 2}
2020-08-30 15:27:47.861 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 0 | VALIDATION_SPECIES: ['Ursus_americanus', 'Phascolarctos_cinereus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[97]	valid_0's l1: 0.648251	valid_0's l2: 0.979086	valid_0's huber: 0.346601


2020-08-30 15:27:52.384 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 1 | VALIDATION_SPECIES: ['Ictidomys_tridecemlineatus', 'Macaca_mulatta']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[40]	valid_0's l1: 1.01205	valid_0's l2: 3.1729	valid_0's huber: 0.675189


2020-08-30 15:27:54.915 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 2 | VALIDATION_SPECIES: ['Heterocephalus_glaber', 'Gorilla_gorilla']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[141]	valid_0's l1: 0.794431	valid_0's l2: 1.50396	valid_0's huber: 0.483264


2020-08-30 15:28:00.425 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 3 | VALIDATION_SPECIES: ['Rattus_norvegicus', 'Ovis_aries']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[10]	valid_0's l1: 0.820942	valid_0's l2: 1.27202	valid_0's huber: 0.440937


2020-08-30 15:28:08.927 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 4 =====
2020-08-30 15:28:08.928 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.4882827581871194, 'boosting_type': 'gbdt', 'drop_rate': 0.16290376621758335, 'feature_fraction': 0.3059934876406196, 'lambda_l1': 0.017007142345511594, 'lambda_l2': 0.568546479848152, 'learning_rate': 0.08231879540454069, 'max_depth': 7, 'max_leaves': 24, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 3}
2020-08-30 15:28:08.946 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 0 | VALIDATION_SPECIES: ['Mesocricetus_auratus', 'Vombatus_ursinus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[67]	valid_0's l1: 0.949134	valid_0's l2: 1.68755	valid_0's huber: 0.580794


2020-08-30 15:28:12.626 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 1 | VALIDATION_SPECIES: ['Mus_musculus', 'Macaca_nemestrina']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[80]	valid_0's l1: 0.888383	valid_0's l2: 1.40014	valid_0's huber: 0.527671


2020-08-30 15:28:16.548 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 2 | VALIDATION_SPECIES: ['Pan_troglodytes', 'Heterocephalus_glaber']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[145]	valid_0's l1: 0.737914	valid_0's l2: 1.36587	valid_0's huber: 0.439515


2020-08-30 15:28:22.673 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 3 | VALIDATION_SPECIES: ['Capra_hircus', 'Tupaia_belangeri']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[127]	valid_0's l1: 0.69111	valid_0's l2: 1.32116	valid_0's huber: 0.400158


Unnamed: 0,symbol,repeats,mean_shap,mean_kendall_tau,shap_0,kendall_tau_0,shap_1,kendall_tau_1,shap_2,kendall_tau_2,shap_3,kendall_tau_3,shap_4,kendall_tau_4
ENSG00000171121,KCNMB3,5.0,10.3,0.639,12.0,0.661,7.75,0.612,14.0,0.631,7.5,0.63,10.25,0.662
ENSG00000122034,GTF3A,5.0,9.85,0.622,10.0,0.61,11.0,0.626,9.75,0.681,10.25,0.669,8.25,0.524
ENSG00000144451,SPAG16,5.0,6.05,0.348,5.75,0.322,5.75,0.31,5.75,0.295,3.5,0.353,9.5,0.46
ENSG00000196419,XRCC6,4.0,11.562,0.656,10.25,0.686,14.25,0.614,7.5,0.697,,,14.25,0.626
ENSG00000188747,NOXA1,4.0,6.25,0.648,5.25,0.715,11.75,0.561,6.5,0.656,,,1.5,0.661
ENSG00000167515,TRAPPC2L,4.0,4.562,-0.556,4.25,-0.534,5.25,-0.546,2.5,-0.585,,,6.25,-0.561
ENSG00000164304,CAGE1,4.0,3.938,-0.731,5.75,-0.773,3.75,-0.7,2.5,-0.702,,,3.75,-0.747
ENSG00000156384,SFR1,4.0,3.312,-0.662,4.0,-0.686,4.5,-0.646,,,2.25,-0.648,2.5,-0.669
ENSG00000048140,TSPAN17,4.0,2.062,-0.401,,,1.75,-0.34,1.75,-0.433,1.25,-0.525,3.5,-0.307
ENSG00000006282,SPATA20,3.0,8.667,0.677,,,6.0,0.655,9.75,0.684,,,10.25,0.693


### Metabolism ###

In [26]:
stage_one_metabolic_rate = run_stage_one("metabolic_rate")
stage_one_metabolic_rate.selected

2020-08-30 15:38:10.081 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 0 =====
2020-08-30 15:38:10.082 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.7949279232062532, 'boosting_type': 'dart', 'drop_rate': 0.19407161947050122, 'feature_fraction': 0.8486129005166576, 'lambda_l1': 2.9464767054398204, 'lambda_l2': 0.817423041659147, 'learning_rate': 0.08579903520990408, 'max_depth': 7, 'max_leaves': 15, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber']}
2020-08-30 15:38:10.098 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Monodelphis_domestica', 'Cavia_porcellus']
Early stopping is not available in dart mode


[250]	valid_0's l1: 13.689	valid_0's l2: 323.66	valid_0's huber: 11.9461


2020-08-30 15:38:16.472 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Sus_scrofa', 'Phascolarctos_cinereus']


[250]	valid_0's l1: 24.1289	valid_0's l2: 1198.32	valid_0's huber: 21.3451


2020-08-30 15:38:26.632 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Meriones_unguiculatus', 'Rattus_norvegicus']


[250]	valid_0's l1: 6.69656	valid_0's l2: 105.174	valid_0's huber: 5.64116


2020-08-30 15:38:33.696 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 3 | VALIDATION_SPECIES: ['Callithrix_jacchus', 'Oryctolagus_cuniculus']


[250]	valid_0's l1: 26.0579	valid_0's l2: 1415.06	valid_0's huber: 23.1499


2020-08-30 15:38:55.152 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 1 =====
2020-08-30 15:38:55.152 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.7949279232062532, 'boosting_type': 'dart', 'drop_rate': 0.19407161947050122, 'feature_fraction': 0.8486129005166576, 'lambda_l1': 2.9464767054398204, 'lambda_l2': 0.817423041659147, 'learning_rate': 0.08579903520990408, 'max_depth': 7, 'max_leaves': 15, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 0}
2020-08-30 15:38:55.166 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 0 | VALIDATION_SPECIES: ['Cavia_porcellus', 'Meriones_unguiculatus']
Early stopping is not available in dart mode


[250]	valid_0's l1: 16.8239	valid_0's l2: 527.07	valid_0's huber: 14.7542


2020-08-30 15:39:03.315 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 1 | VALIDATION_SPECIES: ['Monodelphis_domestica', 'Ictidomys_tridecemlineatus']


[250]	valid_0's l1: 10.9229	valid_0's l2: 246.687	valid_0's huber: 9.44605


2020-08-30 15:39:11.030 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 2 | VALIDATION_SPECIES: ['Callithrix_jacchus', 'Sus_scrofa']


[250]	valid_0's l1: 19.729	valid_0's l2: 654.16	valid_0's huber: 17.3778


2020-08-30 15:39:24.213 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 3 | VALIDATION_SPECIES: ['Homo_sapiens', 'Rattus_norvegicus']


[250]	valid_0's l1: 16.9896	valid_0's l2: 1044.88	valid_0's huber: 14.9402


2020-08-30 15:39:49.742 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 2 =====
2020-08-30 15:39:49.743 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.7949279232062532, 'boosting_type': 'dart', 'drop_rate': 0.19407161947050122, 'feature_fraction': 0.8486129005166576, 'lambda_l1': 2.9464767054398204, 'lambda_l2': 0.817423041659147, 'learning_rate': 0.08579903520990408, 'max_depth': 7, 'max_leaves': 15, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 1}
2020-08-30 15:39:49.757 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 0 | VALIDATION_SPECIES: ['Ictidomys_tridecemlineatus', 'Mus_musculus']
Early stopping is not available in dart mode


[250]	valid_0's l1: 6.75577	valid_0's l2: 311.85	valid_0's huber: 5.70121


2020-08-30 15:40:04.325 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 1 | VALIDATION_SPECIES: ['Suricata_suricatta', 'Meriones_unguiculatus']


[250]	valid_0's l1: 12.5032	valid_0's l2: 425.342	valid_0's huber: 10.8792


2020-08-30 15:40:20.551 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 2 | VALIDATION_SPECIES: ['Oryctolagus_cuniculus', 'Rattus_norvegicus']


[250]	valid_0's l1: 5.90115	valid_0's l2: 135.335	valid_0's huber: 4.94472


2020-08-30 15:40:33.369 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 3 | VALIDATION_SPECIES: ['Homo_sapiens', 'Cavia_porcellus']


[250]	valid_0's l1: 30.8045	valid_0's l2: 1822.39	valid_0's huber: 27.3415


2020-08-30 15:41:05.047 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 3 =====
2020-08-30 15:41:05.047 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.7949279232062532, 'boosting_type': 'dart', 'drop_rate': 0.19407161947050122, 'feature_fraction': 0.8486129005166576, 'lambda_l1': 2.9464767054398204, 'lambda_l2': 0.817423041659147, 'learning_rate': 0.08579903520990408, 'max_depth': 7, 'max_leaves': 15, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 2}
2020-08-30 15:41:05.061 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 0 | VALIDATION_SPECIES: ['Mesocricetus_auratus', 'Ictidomys_tridecemlineatus']
Early stopping is not available in dart mode


[250]	valid_0's l1: 11.8305	valid_0's l2: 385.304	valid_0's huber: 10.2693


2020-08-30 15:41:16.973 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 1 | VALIDATION_SPECIES: ['Callithrix_jacchus', 'Phascolarctos_cinereus']


[250]	valid_0's l1: 21.1696	valid_0's l2: 921.692	valid_0's huber: 18.6852


2020-08-30 15:41:26.528 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 2 | VALIDATION_SPECIES: ['Sus_scrofa', 'Cavia_porcellus']


[250]	valid_0's l1: 12.2521	valid_0's l2: 408.632	valid_0's huber: 10.6506


2020-08-30 15:41:36.520 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 3 | VALIDATION_SPECIES: ['Mus_musculus', 'Meriones_unguiculatus']


[250]	valid_0's l1: 17.8136	valid_0's l2: 622.005	valid_0's huber: 15.6413


2020-08-30 15:41:58.289 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 4 =====
2020-08-30 15:41:58.289 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.7949279232062532, 'boosting_type': 'dart', 'drop_rate': 0.19407161947050122, 'feature_fraction': 0.8486129005166576, 'lambda_l1': 2.9464767054398204, 'lambda_l2': 0.817423041659147, 'learning_rate': 0.08579903520990408, 'max_depth': 7, 'max_leaves': 15, 'min_data_in_leaf': 8, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 3}
2020-08-30 15:41:58.305 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 0 | VALIDATION_SPECIES: ['Mesocricetus_auratus', 'Rattus_norvegicus']
Early stopping is not available in dart mode


[250]	valid_0's l1: 5.47479	valid_0's l2: 115.961	valid_0's huber: 4.57356


2020-08-30 15:42:06.944 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 1 | VALIDATION_SPECIES: ['Cavia_porcellus', 'Homo_sapiens']


[250]	valid_0's l1: 32.4665	valid_0's l2: 2265.26	valid_0's huber: 28.818


2020-08-30 15:42:16.122 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 2 | VALIDATION_SPECIES: ['Ictidomys_tridecemlineatus', 'Oryctolagus_cuniculus']


[250]	valid_0's l1: 16.7229	valid_0's l2: 669.705	valid_0's huber: 14.6837


2020-08-30 15:42:25.870 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 3 | VALIDATION_SPECIES: ['Callithrix_jacchus', 'Monodelphis_domestica']


[250]	valid_0's l1: 18.325	valid_0's l2: 528.165	valid_0's huber: 16.0927


Unnamed: 0,symbol,repeats,mean_shap,mean_kendall_tau,shap_0,kendall_tau_0,shap_1,kendall_tau_1,shap_2,kendall_tau_2,shap_3,kendall_tau_3,shap_4,kendall_tau_4
ENSG00000054277,OPN3,3.0,51.417,0.129,2.75,0.037,,,77.0,0.204,,,74.5,0.145
ENSG00000164304,CAGE1,3.0,25.833,-0.248,34.5,-0.353,6.75,-0.267,,,36.25,-0.126,,
ENSG00000175806,MSRA,3.0,23.167,-0.076,24.25,-0.022,15.0,-0.004,,,30.25,-0.204,,
ENSG00000270647,TAF15,3.0,21.75,-0.019,,,25.0,0.006,27.0,-0.015,13.25,-0.048,,
ENSG00000124172,ATP5F1E,3.0,16.667,-0.261,23.5,-0.219,11.25,-0.235,,,15.25,-0.328,,
ENSG00000171747,LGALS4,3.0,10.083,-0.601,14.0,-0.65,,,5.25,-0.648,,,11.0,-0.504
ENSG00000156467,UQCRB,3.0,3.667,-0.263,,,4.75,-0.433,2.25,-0.525,4.0,0.168,,
ENSG00000101546,RBFA,3.0,3.25,0.473,2.75,0.215,3.25,0.719,,,,,3.75,0.484
ENSG00000103005,USB1,3.0,2.667,0.106,4.75,0.185,1.0,0.513,,,,,2.25,-0.382
ENSG00000010219,DYRK4,2.0,59.25,0.347,,,,,113.0,0.182,5.5,0.512,,


### Temperature ###

In [27]:
stage_one_temperature = run_stage_one("temperature")
stage_one_temperature.selected

2020-08-30 15:42:51.437 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 0 =====
2020-08-30 15:42:51.438 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9219040738483888, 'boosting_type': 'gbdt', 'drop_rate': 0.14333380957397163, 'feature_fraction': 0.4690000754748763, 'lambda_l1': 0.4937008772491574, 'lambda_l2': 2.2888258490505713, 'learning_rate': 0.08533038187327897, 'max_depth': 3, 'max_leaves': 19, 'min_data_in_leaf': 4, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber']}
2020-08-30 15:42:51.455 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Rattus_norvegicus', 'Suricata_suricatta']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[88]	valid_0's l1: 0.370175	valid_0's l2: 0.267517	valid_0's huber: 0.123547


2020-08-30 15:42:53.739 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Gorilla_gorilla', 'Cavia_porcellus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[44]	valid_0's l1: 1.13262	valid_0's l2: 2.19211	valid_0's huber: 0.716867


2020-08-30 15:42:55.457 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Tupaia_belangeri', 'Mesocricetus_auratus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[40]	valid_0's l1: 0.718624	valid_0's l2: 0.938771	valid_0's huber: 0.363117


2020-08-30 15:42:57.259 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 3 | VALIDATION_SPECIES: ['Bos_taurus', 'Ictidomys_tridecemlineatus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[130]	valid_0's l1: 0.398999	valid_0's l2: 0.393822	valid_0's huber: 0.149527


2020-08-30 15:43:07.325 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 1 =====
2020-08-30 15:43:07.326 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9219040738483888, 'boosting_type': 'gbdt', 'drop_rate': 0.14333380957397163, 'feature_fraction': 0.4690000754748763, 'lambda_l1': 0.4937008772491574, 'lambda_l2': 2.2888258490505713, 'learning_rate': 0.08533038187327897, 'max_depth': 3, 'max_leaves': 19, 'min_data_in_leaf': 4, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 0}
2020-08-30 15:43:07.343 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 0 | VALIDATION_SPECIES: ['Equus_caballus', 'Macaca_mulatta']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[82]	valid_0's l1: 0.369468	valid_0's l2: 0.423096	valid_0's huber: 0.156856


2020-08-30 15:43:09.694 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 1 | VALIDATION_SPECIES: ['Rattus_norvegicus', 'Felis_catus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[70]	valid_0's l1: 0.511518	valid_0's l2: 0.53483	valid_0's huber: 0.216492


2020-08-30 15:43:11.619 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 2 | VALIDATION_SPECIES: ['Bos_taurus', 'Suricata_suricatta']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[159]	valid_0's l1: 0.40811	valid_0's l2: 0.316003	valid_0's huber: 0.152098


2020-08-30 15:43:15.441 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 3 | VALIDATION_SPECIES: ['Cavia_porcellus', 'Callithrix_jacchus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[108]	valid_0's l1: 0.997066	valid_0's l2: 1.65448	valid_0's huber: 0.626087


2020-08-30 15:43:27.849 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 2 =====
2020-08-30 15:43:27.849 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9219040738483888, 'boosting_type': 'gbdt', 'drop_rate': 0.14333380957397163, 'feature_fraction': 0.4690000754748763, 'lambda_l1': 0.4937008772491574, 'lambda_l2': 2.2888258490505713, 'learning_rate': 0.08533038187327897, 'max_depth': 3, 'max_leaves': 19, 'min_data_in_leaf': 4, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 1}
2020-08-30 15:43:27.867 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 0 | VALIDATION_SPECIES: ['Felis_catus', 'Suricata_suricatta']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[80]	valid_0's l1: 0.322496	valid_0's l2: 0.255662	valid_0's huber: 0.116796


2020-08-30 15:43:30.184 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 1 | VALIDATION_SPECIES: ['Homo_sapiens', 'Monodelphis_domestica']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[96]	valid_0's l1: 1.73129	valid_0's l2: 6.98961	valid_0's huber: 1.31306


2020-08-30 15:43:32.356 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 2 | VALIDATION_SPECIES: ['Macaca_mulatta', 'Phascolarctos_cinereus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[223]	valid_0's l1: 0.387416	valid_0's l2: 0.481081	valid_0's huber: 0.171961


2020-08-30 15:43:37.542 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 3 | VALIDATION_SPECIES: ['Mesocricetus_auratus', 'Sus_scrofa']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[113]	valid_0's l1: 0.455226	valid_0's l2: 0.424315	valid_0's huber: 0.186182


2020-08-30 15:43:53.227 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 3 =====
2020-08-30 15:43:53.228 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9219040738483888, 'boosting_type': 'gbdt', 'drop_rate': 0.14333380957397163, 'feature_fraction': 0.4690000754748763, 'lambda_l1': 0.4937008772491574, 'lambda_l2': 2.2888258490505713, 'learning_rate': 0.08533038187327897, 'max_depth': 3, 'max_leaves': 19, 'min_data_in_leaf': 4, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 2}
2020-08-30 15:43:53.245 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 0 | VALIDATION_SPECIES: ['Macaca_fascicularis', 'Tupaia_belangeri']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[85]	valid_0's l1: 0.467809	valid_0's l2: 0.729141	valid_0's huber: 0.24153


2020-08-30 15:43:55.683 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 1 | VALIDATION_SPECIES: ['Felis_catus', 'Cavia_porcellus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[11]	valid_0's l1: 1.50846	valid_0's l2: 3.50347	valid_0's huber: 1.03792


2020-08-30 15:43:56.805 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 2 | VALIDATION_SPECIES: ['Gorilla_gorilla', 'Ictidomys_tridecemlineatus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[98]	valid_0's l1: 0.60481	valid_0's l2: 0.880355	valid_0's huber: 0.34126


2020-08-30 15:43:59.712 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 3 | VALIDATION_SPECIES: ['Macaca_mulatta', 'Bos_taurus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[188]	valid_0's l1: 0.388685	valid_0's l2: 0.523766	valid_0's huber: 0.16871


2020-08-30 15:44:11.831 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 4 =====
2020-08-30 15:44:11.831 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9219040738483888, 'boosting_type': 'gbdt', 'drop_rate': 0.14333380957397163, 'feature_fraction': 0.4690000754748763, 'lambda_l1': 0.4937008772491574, 'lambda_l2': 2.2888258490505713, 'learning_rate': 0.08533038187327897, 'max_depth': 3, 'max_leaves': 19, 'min_data_in_leaf': 4, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 3}
2020-08-30 15:44:11.847 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 0 | VALIDATION_SPECIES: ['Mus_musculus', 'Equus_caballus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[52]	valid_0's l1: 0.616743	valid_0's l2: 0.527652	valid_0's huber: 0.254108


2020-08-30 15:44:13.528 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 1 | VALIDATION_SPECIES: ['Oryctolagus_cuniculus', 'Felis_catus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[69]	valid_0's l1: 0.850531	valid_0's l2: 1.76595	valid_0's huber: 0.513949


2020-08-30 15:44:16.387 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 2 | VALIDATION_SPECIES: ['Phascolarctos_cinereus', 'Bos_taurus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[123]	valid_0's l1: 0.494692	valid_0's l2: 0.451215	valid_0's huber: 0.193475


2020-08-30 15:44:20.612 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 3 | VALIDATION_SPECIES: ['Mesocricetus_auratus', 'Homo_sapiens']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[89]	valid_0's l1: 0.477636	valid_0's l2: 0.429008	valid_0's huber: 0.187743


Unnamed: 0,symbol,repeats,mean_shap,mean_kendall_tau,shap_0,kendall_tau_0,shap_1,kendall_tau_1,shap_2,kendall_tau_2,shap_3,kendall_tau_3,shap_4,kendall_tau_4
ENSG00000087460,GNAS,4.0,19.062,0.333,17.25,0.373,19.25,0.441,18.0,0.218,,,21.75,0.299
ENSG00000099341,PSMD8,3.0,4.0,0.488,4.75,0.368,2.25,0.578,,,,,5.0,0.518
ENSG00000187840,EIF4EBP1,3.0,2.75,0.103,3.5,0.093,,,1.5,0.249,,,3.25,-0.033
ENSG00000176731,RBIS,2.0,4.625,-0.442,,,3.75,-0.456,,,,,5.5,-0.428
ENSG00000100220,RTCB,2.0,3.875,-0.453,,,,,4.0,-0.533,,,3.75,-0.373
ENSG00000185838,GNB1L,2.0,3.375,0.361,,,5.5,0.505,1.25,0.217,,,,
ENSG00000134717,BTF3L4,2.0,2.875,0.575,,,1.75,0.647,,,,,4.0,0.503
ENSG00000115204,MPV17,2.0,2.75,-0.611,,,,,3.25,-0.656,,,2.25,-0.566
ENSG00000132423,COQ3,2.0,2.625,0.55,,,,,3.5,0.524,,,1.75,0.576
ENSG00000152380,FAM151B,2.0,1.75,0.363,,,2.0,0.311,,,,,1.5,0.416


### Gestation ###

In [28]:
stage_one_gestation = run_stage_one("gestation_days")
stage_one_gestation.selected

2020-08-30 15:44:31.766 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 0 =====
2020-08-30 15:44:31.767 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9550809144864345, 'boosting_type': 'gbdt', 'drop_rate': 0.1520029135951902, 'feature_fraction': 0.39359679175101914, 'lambda_l1': 2.2144413347584315, 'lambda_l2': 2.012506929626787, 'learning_rate': 0.0890025107137548, 'max_depth': 3, 'max_leaves': 18, 'min_data_in_leaf': 5, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber']}
2020-08-30 15:44:31.789 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Homo_sapiens', 'Pan_paniscus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[161]	valid_0's l1: 39.8179	valid_0's l2: 4295.93	valid_0's huber: 35.4727


2020-08-30 15:44:35.903 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Capra_hircus', 'Cavia_porcellus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[145]	valid_0's l1: 15.5697	valid_0's l2: 708.603	valid_0's huber: 13.6444


2020-08-30 15:44:41.359 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Cavia_aperea', 'Microcebus_murinus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[69]	valid_0's l1: 18.2422	valid_0's l2: 695.193	valid_0's huber: 16.0303


2020-08-30 15:44:44.322 | INFO     | yspecies.selection:fit:82 - SEED: 0 | FOLD: 3 | VALIDATION_SPECIES: ['Rhinopithecus_bieti', 'Ailuropoda_melanoleuca']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[148]	valid_0's l1: 13.4738	valid_0's l2: 544.213	valid_0's huber: 11.7486


2020-08-30 15:45:07.720 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 1 =====
2020-08-30 15:45:07.721 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9550809144864345, 'boosting_type': 'gbdt', 'drop_rate': 0.1520029135951902, 'feature_fraction': 0.39359679175101914, 'lambda_l1': 2.2144413347584315, 'lambda_l2': 2.012506929626787, 'learning_rate': 0.0890025107137548, 'max_depth': 3, 'max_leaves': 18, 'min_data_in_leaf': 5, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 0}
2020-08-30 15:45:07.744 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 0 | VALIDATION_SPECIES: ['Aotus_nancymaae', 'Macaca_mulatta']


Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 13.8875	valid_0's l2: 707.725	valid_0's huber: 12.1443
Did not meet early stopping. Best iteration is:
[250]	valid_0's l1: 13.8875	valid_0's l2: 707.725	valid_0's huber: 12.1443


2020-08-30 15:45:13.592 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 1 | VALIDATION_SPECIES: ['Capra_hircus', 'Callithrix_jacchus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[162]	valid_0's l1: 11.9706	valid_0's l2: 555.74	valid_0's huber: 10.3968


2020-08-30 15:45:18.068 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 2 | VALIDATION_SPECIES: ['Macaca_nemestrina', 'Rhinolophus_ferrumequinum']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[214]	valid_0's l1: 12.5151	valid_0's l2: 385.813	valid_0's huber: 10.8986


2020-08-30 15:45:23.834 | INFO     | yspecies.selection:fit:82 - SEED: 1 | FOLD: 3 | VALIDATION_SPECIES: ['Homo_sapiens', 'Suricata_suricatta']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[143]	valid_0's l1: 33.7852	valid_0's l2: 3875.3	valid_0's huber: 30.027


2020-08-30 15:45:52.065 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 2 =====
2020-08-30 15:45:52.065 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9550809144864345, 'boosting_type': 'gbdt', 'drop_rate': 0.1520029135951902, 'feature_fraction': 0.39359679175101914, 'lambda_l1': 2.2144413347584315, 'lambda_l2': 2.012506929626787, 'learning_rate': 0.0890025107137548, 'max_depth': 3, 'max_leaves': 18, 'min_data_in_leaf': 5, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 1}
2020-08-30 15:45:52.089 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 0 | VALIDATION_SPECIES: ['Rhinolophus_ferrumequinum', 'Macaca_mulatta']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[196]	valid_0's l1: 19.1334	valid_0's l2: 1021.32	valid_0's huber: 16.8455


2020-08-30 15:45:57.588 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 1 | VALIDATION_SPECIES: ['Canis_lupus_familiaris', 'Ursus_americanus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[142]	valid_0's l1: 21.1162	valid_0's l2: 1001.42	valid_0's huber: 18.6231


2020-08-30 15:46:02.599 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 2 | VALIDATION_SPECIES: ['Callithrix_jacchus', 'Macaca_fascicularis']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[193]	valid_0's l1: 17.2206	valid_0's l2: 743.281	valid_0's huber: 15.1057


2020-08-30 15:46:07.732 | INFO     | yspecies.selection:fit:82 - SEED: 2 | FOLD: 3 | VALIDATION_SPECIES: ['Mesocricetus_auratus', 'Rattus_norvegicus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[111]	valid_0's l1: 14.2532	valid_0's l2: 449.164	valid_0's huber: 12.4341


2020-08-30 15:46:31.210 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 3 =====
2020-08-30 15:46:31.214 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9550809144864345, 'boosting_type': 'gbdt', 'drop_rate': 0.1520029135951902, 'feature_fraction': 0.39359679175101914, 'lambda_l1': 2.2144413347584315, 'lambda_l2': 2.012506929626787, 'learning_rate': 0.0890025107137548, 'max_depth': 3, 'max_leaves': 18, 'min_data_in_leaf': 5, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 2}
2020-08-30 15:46:31.235 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 0 | VALIDATION_SPECIES: ['Ailuropoda_melanoleuca', 'Pan_paniscus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[223]	valid_0's l1: 14.111	valid_0's l2: 569.271	valid_0's huber: 12.3172


2020-08-30 15:46:36.132 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 1 | VALIDATION_SPECIES: ['Macaca_mulatta', 'Homo_sapiens']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[100]	valid_0's l1: 43.0106	valid_0's l2: 5338.21	valid_0's huber: 38.3446


2020-08-30 15:46:38.854 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 2 | VALIDATION_SPECIES: ['Cavia_aperea', 'Cavia_porcellus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[123]	valid_0's l1: 19.0801	valid_0's l2: 810.265	valid_0's huber: 16.7873


2020-08-30 15:46:42.110 | INFO     | yspecies.selection:fit:82 - SEED: 3 | FOLD: 3 | VALIDATION_SPECIES: ['Ovis_aries', 'Mus_caroli']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[77]	valid_0's l1: 11.7831	valid_0's l2: 368.66	valid_0's huber: 10.2158


2020-08-30 15:46:59.989 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 4 =====
2020-08-30 15:46:59.989 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9550809144864345, 'boosting_type': 'gbdt', 'drop_rate': 0.1520029135951902, 'feature_fraction': 0.39359679175101914, 'lambda_l1': 2.2144413347584315, 'lambda_l2': 2.012506929626787, 'learning_rate': 0.0890025107137548, 'max_depth': 3, 'max_leaves': 18, 'min_data_in_leaf': 5, 'objective': 'regression', 'metrics': ['l1', 'l2', 'huber'], 'seed': 3}
2020-08-30 15:47:00.012 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 0 | VALIDATION_SPECIES: ['Macaca_nemestrina', 'Bos_taurus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[167]	valid_0's l1: 24.5222	valid_0's l2: 2686.3	valid_0's huber: 21.6867


2020-08-30 15:47:05.981 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 1 | VALIDATION_SPECIES: ['Microcebus_murinus', 'Macaca_mulatta']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[79]	valid_0's l1: 15.1239	valid_0's l2: 797.648	valid_0's huber: 13.235


2020-08-30 15:47:08.905 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 2 | VALIDATION_SPECIES: ['Capra_hircus', 'Meriones_unguiculatus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[67]	valid_0's l1: 15.3828	valid_0's l2: 550.402	valid_0's huber: 13.4608


2020-08-30 15:47:11.432 | INFO     | yspecies.selection:fit:82 - SEED: 4 | FOLD: 3 | VALIDATION_SPECIES: ['Pan_paniscus', 'Homo_sapiens']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[78]	valid_0's l1: 37.5402	valid_0's l2: 3736.82	valid_0's huber: 33.4197


Unnamed: 0,symbol,repeats,mean_shap,mean_kendall_tau,shap_0,kendall_tau_0,shap_1,kendall_tau_1,shap_2,kendall_tau_2,shap_3,kendall_tau_3,shap_4,kendall_tau_4
ENSG00000006282,SPATA20,5.0,6.5,0.658,5.75,0.668,5.0,0.618,9.75,0.66,5.5,0.67,6.5,0.672
ENSG00000171121,KCNMB3,5.0,5.8,0.629,6.0,0.659,4.25,0.63,6.75,0.609,7.75,0.604,4.25,0.643
ENSG00000172531,PPP1CA,5.0,5.75,-0.775,8.25,-0.828,7.0,-0.769,4.25,-0.751,4.75,-0.732,4.5,-0.794
ENSG00000105672,ETV2,5.0,4.65,0.692,4.75,0.714,4.5,0.697,4.75,0.663,5.0,0.71,4.25,0.676
ENSG00000154328,NEIL2,5.0,2.7,0.6,3.25,0.637,3.25,0.61,2.25,0.595,2.25,0.585,2.5,0.573
ENSG00000010219,DYRK4,4.0,12.938,0.503,12.75,0.701,13.25,0.644,13.25,0.369,,,12.5,0.297
ENSG00000066923,STAG3,4.0,6.062,0.391,6.5,0.399,5.25,0.357,6.25,0.392,,,6.25,0.415
ENSG00000188747,NOXA1,4.0,3.625,0.648,2.5,0.658,3.5,0.605,3.75,0.65,4.75,0.679,,
ENSG00000162959,MEMO1,4.0,3.5,-0.265,6.0,-0.217,2.75,-0.286,,,3.25,-0.201,2.0,-0.354
ENSG00000163584,RPL22L1,4.0,3.375,-0.553,,,3.25,-0.485,4.25,-0.623,2.0,-0.624,4.0,-0.48


## Intersect first stages ##

In [1]:
def join_genes():
    return [key for key, value in stage_one]

## Second stage selection ##

In [2]:
join_genes()

NameError: name 'stage_one' is not defined