# Hyper parameters optimization #

## Parameters cell ##

Parameters are overiddent by papermill when run inside DVC stages



In [None]:
number_of_folds = 5 # this sets global setting of which how many bootstraps to use
repeats = 10
n_trials = 50
#first round of optimization
lgb_params = {"bagging_fraction": 0.9522534844058304, 
              "boosting_type": "dart", 
              "objective": "regression",
              "feature_fraction": 0.42236910941558053, 
              "lambda_l1": 0.020847266580277746, 
              "lambda_l2": 2.8448564854773326, 
              "learning_rate": 0.11484015430016059, 
              "max_depth": 3, 
              "max_leaves": 35, 
              "min_data_in_leaf": 9,
              "num_iterations": 150
             }
debug_local = True #to use local version

In [None]:
from pathlib import Path
import sys
import inspect

local = (Path("..") / "yspecies").resolve()
if debug_local and local.exists():
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  %load_ext autoreload
  %autoreload 2

In [None]:
from dataclasses import dataclass, replace
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from typing import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.workflow import TupleWith, Repeat, Collect
from yspecies.config import *
from yspecies.preprocess import FeatureSelection, DataExtractor
from yspecies.partition import DataPartitioner, PartitionParameters
from yspecies.selection import ShapSelector
from yspecies.tuning import Tune
from yspecies.models import ResultsCV, CrossValidator
import optuna
from optuna import Study, Trial

In [5]:
#settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import pprint
pp = pprint.PrettyPrinter(indent=4)

#charts settings
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
%matplotlib auto
plt.ioff()
set_matplotlib_formats('svg')

Using matplotlib backend: TkAgg


### Loading data ###
Let's load data from species/genes/expressions selected by select_samples.py notebook

In [6]:
from pathlib import Path
locations: Locations = Locations("./") if Path("./data").exists() else Locations("../")

In [7]:
data = ExpressionDataset.from_folder(locations.interim.selected)
data

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(445, 12340)",12340,39,445,"(12340, 2)","(40, 19)"


## Checking that crossvalidation works ##

Deciding on selection parameters (which fields to include, exclude, predict)

In [8]:
partition_params = PartitionParameters(number_of_folds, 1, 2, [],  42)


In [11]:
partition_cv_pipe = Pipeline([
    ('partitioner', DataPartitioner()),
    ('prepare_for_partitioning', TupleWith(lgb_params)),
    ('crossvalidator', CrossValidator())
]
)

In [56]:
repeated_cv_pipe =  Repeat(partition_cv_pipe, repeats, lambda x,i: (x[0], replace(x[1], seed = i)))
cv_pipeline =  Pipeline([
    ('extractor', DataExtractor()),
    ('prepare_for_partitioning', TupleWith(partition_params)), # to extract the data required for ML from the dataset
    ("partition_cv", repeated_cv_pipe)
    ]
    )

## Setting up features to select ##

In [14]:
selection = select_lifespan = FeatureSelection(
    samples = ["tissue","species"], #samples metadata to include
    species =  [], #species metadata other then Y label to include
    exclude_from_training = ["species"],  #exclude some fields from LightGBM training
    to_predict = "lifespan", #column to predict
    categorical = ["tissue"])

In [15]:
select_lifespan = selection
select_mass = replace(selection, to_predict = "mass_g")
select_gestation = replace(selection, to_predict = "gestation")
select_mtgc = replace(selection, to_predict = "mtgc")

## Checking cross-validation ##

In [16]:
cv_res = cv_pipeline.fit_transform((data, select_lifespan))
ResultsCV.take_best(cv_res)

Found `num_iterations` in params. Will use it instead of argument
Early stopping is not available in dart mode


[150]	cv_agg's l1: 7.43114 + 6.21527	cv_agg's l2: 100.42 + 105.483	cv_agg's huber: 6.32338 + 5.55987
[150]	cv_agg's l1: 2.40938 + 0.220618	cv_agg's l2: 33.1796 + 2.89302	cv_agg's huber: 1.83441 + 0.192833
[150]	cv_agg's l1: 2.35449 + 0.17208	cv_agg's l2: 32.8908 + 4.3389	cv_agg's huber: 1.79183 + 0.140331


1.3668827684898497

In [17]:
[c.last("huber")for c in cv_res]

# Optimization #

In [30]:
url = f'sqlite:///' +str((locations.metrics.lifespan / "study.sqlite").absolute())
print('loading (if exists) study from '+url)
storage = optuna.storages.RDBStorage(
    url=url
    #engine_kwargs={'check_same_thread': False}
)

loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/metrics/lifespan/study.sqlite


In [21]:
def objective_parameters(trial: Trial) -> dict:
    return {
        'objective': 'regression',
        'metric': {'mae', 'mse', 'huber'},
        'verbosity': -1,
        'boosting_type': trial.suggest_categorical('boosting_type', ['dart', 'gbdt']),
        'lambda_l1': trial.suggest_uniform('lambda_l1', 0.01, 4.0),
        'lambda_l2': trial.suggest_uniform('lambda_l2', 0.01, 4.0),
        'max_leaves': trial.suggest_int("max_leaves", 15, 25),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.3, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 1.0),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 3, 8),
        'drop_rate': trial.suggest_uniform('drop_rate', 0.1, 0.3),
        "verbose": -1
    }
optimization_parameters = objective_parameters

In [41]:
from yspecies.workflow import SplitReduce

def side(i: int):
    print(i)
    return i

prepare_partition = SplitReduce(
    outputs = DataPartitioner(), 
    split = lambda x: [(x[0], replace(partition_params, seed=side(x[2])))], 
    reduce = lambda x, output: (output[0], x[1]) 
)                               
partition_and_cv = Pipeline(
    [
        ("prepare partition", prepare_partition),
        ('crossvalidator', CrossValidator())
    ]
)

partition_and_cv_repeat =  Pipeline([
    ("repeat_cv_pipe", Repeat(partition_and_cv, repeats, 
                              lambda x, i: [x[0], x[1], i] )),
    ("collect_mean", Collect(fold=lambda results: np.array([r.last("huber") for r in results]).mean()))
    ]
    )

p = Pipeline([
     ('extractor', DataExtractor()),
     ('tune', Tune(partition_and_cv_repeat, n_trials = 2, parameters_space = optimization_parameters))    
])

In [43]:
best = p.fit_transform((data, select_lifespan))
best

0
[200]	cv_agg's l1: 1.73608 + 0.616772	cv_agg's l2: 7.16199 + 5.39369	cv_agg's huber: 1.22658 + 0.540433
1
[200]	cv_agg's l1: 1.88651 + 0.26057	cv_agg's l2: 14.8303 + 3.9075	cv_agg's huber: 1.36376 + 0.214897
2
[200]	cv_agg's l1: 1.82129 + 0.139542	cv_agg's l2: 13.3656 + 1.89064	cv_agg's huber: 1.30457 + 0.118265


[I 2020-08-13 12:23:39,143] Trial 0 finished with value: 1.2983059083818629 and parameters: {'boosting_type': 'dart', 'lambda_l1': 1.5488814639964033, 'lambda_l2': 0.9436691893935235, 'max_leaves': 20, 'max_depth': 4, 'feature_fraction': 0.5361944688083496, 'bagging_fraction': 0.8645957050899689, 'learning_rate': 0.08851919639499403, 'min_data_in_leaf': 6, 'drop_rate': 0.2229770791364106}. Best is trial 0 with value: 1.2983059083818629.


0
Training until validation scores don't improve for 10 rounds
[200]	cv_agg's l1: 0.091296 + 0.00948117	cv_agg's l2: 0.0975205 + 0.0664387	cv_agg's huber: 0.03581 + 0.0124123
Did not meet early stopping. Best iteration is:
[200]	cv_agg's l1: 0.091296 + 0.00948117	cv_agg's l2: 0.0975205 + 0.0664387	cv_agg's huber: 0.03581 + 0.0124123
1
Training until validation scores don't improve for 10 rounds
[200]	cv_agg's l1: 0.08863 + 0.0149316	cv_agg's l2: 0.110497 + 0.0315277	cv_agg's huber: 0.0428087 + 0.00756716
Did not meet early stopping. Best iteration is:
[200]	cv_agg's l1: 0.08863 + 0.0149316	cv_agg's l2: 0.110497 + 0.0315277	cv_agg's huber: 0.0428087 + 0.00756716
2
Training until validation scores don't improve for 10 rounds
[200]	cv_agg's l1: 0.0975158 + 0.00530665	cv_agg's l2: 0.23236 + 0.0499444	cv_agg's huber: 0.0535257 + 0.00945229
Did not meet early stopping. Best iteration is:
[200]	cv_agg's l1: 0.0975158 + 0.00530665	cv_agg's l2: 0.23236 + 0.0499444	cv_agg's huber: 0.0535257 + 0.

[I 2020-08-13 12:25:08,464] Trial 1 finished with value: 0.04404813160918101 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 3.5389955270605298, 'lambda_l2': 1.6581000555478702, 'max_leaves': 20, 'max_depth': 8, 'feature_fraction': 0.537151250668454, 'bagging_fraction': 0.6407512594833191, 'learning_rate': 0.06627424358150694, 'min_data_in_leaf': 3, 'drop_rate': 0.16918140176888039}. Best is trial 1 with value: 0.04404813160918101.


{'boosting_type': 'gbdt',
 'lambda_l1': 3.5389955270605298,
 'lambda_l2': 1.6581000555478702,
 'max_leaves': 20,
 'max_depth': 8,
 'feature_fraction': 0.537151250668454,
 'bagging_fraction': 0.6407512594833191,
 'learning_rate': 0.06627424358150694,
 'min_data_in_leaf': 3,
 'drop_rate': 0.16918140176888039}

In [55]:
best["metric"] =  ["mae", "mse", "huber"]
best['objective'] = 'regression'
best

{'boosting_type': 'gbdt',
 'lambda_l1': 3.5389955270605298,
 'lambda_l2': 1.6581000555478702,
 'max_leaves': 20,
 'max_depth': 8,
 'feature_fraction': 0.537151250668454,
 'bagging_fraction': 0.6407512594833191,
 'learning_rate': 0.06627424358150694,
 'min_data_in_leaf': 3,
 'drop_rate': 0.16918140176888039,
 'metric': ['mae', 'mse', 'huber'],
 'objective': 'regression'}

## Getting shap results with the best parameters ##

In [51]:
def make_shap(params: dict):
    partition_shap_pipe = Pipeline([
        ("partitioner", DataPartitioner()),
        ('prepare_for_partitioning', TupleWith(lgb_params)),
        ("shap_computation", ShapSelector())
    ]
    )
    repeated_cv =  Repeat(partition_shap_pipe, repeats, lambda x,i: (x[0], replace(x[1], seed = i)))
    return Pipeline([
        ('extractor', DataExtractor()),
        ('prepare_for_partitioning', TupleWith(params)), # to extract the data required for ML from the dataset
        ("partition_shap", repeated_cv)]
        )

In [52]:
p_shap = make_shap(best)
results = p_shap.fit_transform((data, select_lifespan))

TypeError: replace() should be called on dataclass instances