# Hyper parameters optimization #

## Parameters cell ##

Parameters are overiddent by papermill when run inside DVC stages



In [1]:
n_folds = 5 # this sets global setting of which how many bootstraps to use
n_hold_out = 1
repeats = 2
n_trials = 2
#first round of optimization
lgb_params = {"bagging_fraction": 0.9522534844058304, 
              "boosting_type": "dart", 
              "objective": "regression",
              "feature_fraction": 0.42236910941558053, 
              "lambda_l1": 0.020847266580277746, 
              "lambda_l2": 2.8448564854773326, 
              "learning_rate": 0.11484015430016059, 
              "max_depth": 3, 
              "max_leaves": 35, 
              "min_data_in_leaf": 9,
              "num_iterations": 150
             }
debug_local = True #to use local version

In [2]:
from pathlib import Path
import sys
import inspect

local = (Path("..") / "yspecies").resolve()
if debug_local and local.exists():
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '/data/sources/yspecies/notebooks', '/opt/miniconda3/envs/yspecies/lib/python38.zip', '/opt/miniconda3/envs/yspecies/lib/python3.8', '/opt/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/antonkulaga/.ipython']


In [3]:
from dataclasses import dataclass, replace
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from typing import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.workflow import TupleWith, Repeat, Collect
from yspecies.config import *
from yspecies.preprocess import FeatureSelection, DataExtractor
from yspecies.partition import DataPartitioner, PartitionParameters
from yspecies.selection import ShapSelector
from yspecies.tuning import Tune
from yspecies.models import ResultsCV, CrossValidator
from yspecies.results import FeatureSummary
import optuna
from optuna import Study, Trial

In [5]:
#settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import pprint
pp = pprint.PrettyPrinter(indent=4)

#charts settings
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
%matplotlib auto
plt.ioff()
set_matplotlib_formats('svg')

Using matplotlib backend: TkAgg


### Loading data ###
Let's load data from species/genes/expressions selected by select_samples.py notebook

In [6]:
from pathlib import Path
locations: Locations = Locations("./") if Path("./data").exists() else Locations("../")

In [7]:
data = ExpressionDataset.from_folder(locations.interim.selected)
data

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(408, 12323)",12323,38,408,"(12323, 2)","(38, 19)"


## Setting up ShapSelector ##

Deciding on selection parameters (which fields to include, exclude, predict)

In [8]:
partition_params = PartitionParameters(n_folds, n_hold_out, 2, [],  42)

## Setting up features to select ##

In [9]:
selection = FeatureSelection(
    samples = ["tissue","species"], #samples metadata to include
    species =  [], #species metadata other then Y label to include
    exclude_from_training = ["species"],  #exclude some fields from LightGBM training
    to_predict = "lifespan", #column to predict
    categorical = ["tissue"],
    select_by = "shap",
    importance_type = "split",
    clean_y_na = True
)

In [10]:
select_lifespan = replace(selection, select_by = "shap")

In [11]:
url = f'sqlite:///' +str((locations.metrics.lifespan / "study.sqlite").absolute())
print('loading (if exists) study from '+url)
storage = optuna.storages.RDBStorage(
    url=url
    #engine_kwargs={'check_same_thread': False}
)

study = optuna.multi_objective.study.create_study(directions=['minimize','maximize'], storage = storage)

loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/metrics/lifespan/study.sqlite


create_study is experimental (supported from v1.4.0). The interface can change in the future.
NSGAIIMultiObjectiveSampler is experimental (supported from v1.5.0). The interface can change in the future.
RandomMultiObjectiveSampler is experimental (supported from v1.4.0). The interface can change in the future.
[I 2020-08-18 09:29:47,062] A new study created with name: no-name-1584ed93-cb15-4e2a-9bda-bbd6a1ca515e
MultiObjectiveStudy is experimental (supported from v1.4.0). The interface can change in the future.


In [12]:
def objective_parameters(trial: Trial) -> dict:
    return {
        'objective': 'regression',
        'metric': {'mae', 'mse', 'huber'},
        'verbosity': -1,
        'boosting_type': trial.suggest_categorical('boosting_type', ['dart', 'gbdt']),
        'lambda_l1': trial.suggest_uniform('lambda_l1', 0.01, 4.0),
        'lambda_l2': trial.suggest_uniform('lambda_l2', 0.01, 4.0),
        'max_leaves': trial.suggest_int("max_leaves", 15, 25),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.3, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 1.0),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 3, 8),
        'drop_rate': trial.suggest_uniform('drop_rate', 0.1, 0.3),
        "verbose": -1
    }
optimization_parameters = objective_parameters

In [13]:
from yspecies.workflow import SplitReduce

def side(i: int):
    print(i)
    return i

prepare_partition = SplitReduce(
    outputs = DataPartitioner(), 
    split = lambda x: [(x[0], replace(partition_params, seed=side(x[2])))], 
    reduce = lambda x, output: (output[0], x[1]) 
)                               
partition_and_cv = Pipeline(
    [
        ("prepare partition", prepare_partition),
        ("shap_computation", ShapSelector()) #('crossvalidator', CrossValidator())        
    ]
)

partition_and_cv_repeat =  Pipeline([
    ("repeat_cv_pipe", Repeat(partition_and_cv, repeats, lambda x, i: [x[0], x[1], i] )),
    #("collect_mean", Collect(fold=lambda results: np.array([r.last("huber") for r in results]).mean()))
    ("collect_mean", Collect(fold=lambda results: (FeatureSummary(results).metrics_average.huber, FeatureSummary(results).kendall_tau_abs_mean)))    
    ]
    )

p = Pipeline([
     ('extractor', DataExtractor()),
     ('tune', Tune(partition_and_cv_repeat, study = study, n_trials = n_trials, parameters_space = optimization_parameters))    
])

In [14]:
best = p.fit_transform((data, select_lifespan))
best

MultiObjectiveStudy is experimental (supported from v1.4.0). The interface can change in the future.
FrozenMultiObjectiveTrial is experimental (supported from v1.4.0). The interface can change in the future.
MultiObjectiveStudy is experimental (supported from v1.4.0). The interface can change in the future.
FrozenMultiObjectiveTrial is experimental (supported from v1.4.0). The interface can change in the future.
FrozenMultiObjectiveTrial is experimental (supported from v1.4.0). The interface can change in the future.
MultiObjectiveTrial is experimental (supported from v1.4.0). The interface can change in the future.
MultiObjectiveStudy is experimental (supported from v1.4.0). The interface can change in the future.
MultiObjectiveStudy is experimental (supported from v1.4.0). The interface can change in the future.
FrozenMultiObjectiveTrial is experimental (supported from v1.4.0). The interface can change in the future.


0
fitting models with seed 0
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[32]	valid_0's l2: 2291.56	valid_0's l1: 23.3237	valid_0's huber: 20.6557
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[41]	valid_0's l2: 73.6083	valid_0's l1: 3.96306	valid_0's huber: 3.24032
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[73]	valid_0's l2: 19.2496	valid_0's l1: 2.25843	valid_0's huber: 1.74065
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[54]	valid_0's l2: 14.4106	valid_0's l1: 2.36801	valid_0's huber: 1.82635
1
fitting models with seed 1
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[79]	valid_0's l2: 29.4456	valid_0's l1: 2.79624	valid_0's huber: 2.18508
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration 

MultiObjectiveTrial is experimental (supported from v1.4.0). The interface can change in the future.
MultiObjectiveStudy is experimental (supported from v1.4.0). The interface can change in the future.
[I 2020-08-18 09:30:42,860] Trial 0 finished with values: [6.762114187920778, 0.5289807294894449] with parameters: {'boosting_type': 'gbdt', 'lambda_l1': 0.2598052522656523, 'lambda_l2': 0.8013208237937728, 'max_leaves': 24, 'max_depth': 5, 'feature_fraction': 0.8946050280420148, 'bagging_fraction': 0.34949427002764666, 'learning_rate': 0.09997483102228122, 'min_data_in_leaf': 8, 'drop_rate': 0.10554524329530808}.
MultiObjectiveStudy is experimental (supported from v1.4.0). The interface can change in the future.
FrozenMultiObjectiveTrial is experimental (supported from v1.4.0). The interface can change in the future.
MultiObjectiveStudy is experimental (supported from v1.4.0). The interface can change in the future.
FrozenMultiObjectiveTrial is experimental (supported from v1.4.0). The 

0
fitting models with seed 0
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[20]	valid_0's l2: 2340.31	valid_0's l1: 25.782	valid_0's huber: 22.8052
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[76]	valid_0's l2: 76.8522	valid_0's l1: 3.89148	valid_0's huber: 3.19825
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l2: 21.7979	valid_0's l1: 2.2507	valid_0's huber: 1.73437
Did not meet early stopping. Best iteration is:
[150]	valid_0's l2: 21.7979	valid_0's l1: 2.2507	valid_0's huber: 1.73437
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[60]	valid_0's l2: 31.8605	valid_0's l1: 3.33287	valid_0's huber: 2.64409
1
fitting models with seed 1
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[112]	valid_0's l2: 37.0216	valid_0's l1: 2.85944	valid_0's huber: 2.26452
Train

MultiObjectiveTrial is experimental (supported from v1.4.0). The interface can change in the future.
MultiObjectiveStudy is experimental (supported from v1.4.0). The interface can change in the future.
[I 2020-08-18 09:32:36,632] Trial 1 finished with values: [6.968988903673328, 0.3852042116842162] with parameters: {'boosting_type': 'gbdt', 'lambda_l1': 1.88191972385573, 'lambda_l2': 1.1438753568303472, 'max_leaves': 18, 'max_depth': 6, 'feature_fraction': 0.924226353085214, 'bagging_fraction': 0.5913316363581838, 'learning_rate': 0.05971395883581899, 'min_data_in_leaf': 3, 'drop_rate': 0.2995817149289465}.
FrozenMultiObjectiveTrial is experimental (supported from v1.4.0). The interface can change in the future.


MultiObjectiveResults(trials=[<optuna.multi_objective.trial.FrozenMultiObjectiveTrial object at 0x7f7aff2affd0>])

In [15]:
print(best)

MultiObjectiveResults(trials=[<optuna.multi_objective.trial.FrozenMultiObjectiveTrial object at 0x7f7aff2affd0>])


In [17]:
print(best.params)

[{'boosting_type': 'gbdt', 'lambda_l1': 0.2598052522656523, 'lambda_l2': 0.8013208237937728, 'max_leaves': 24, 'max_depth': 5, 'feature_fraction': 0.8946050280420148, 'bagging_fraction': 0.34949427002764666, 'learning_rate': 0.09997483102228122, 'min_data_in_leaf': 8, 'drop_rate': 0.10554524329530808}]


In [16]:
print(best.results)

[(6.762114187920778, 0.5289807294894449)]
