# SHAP feature selection #
## Code to select features with a combination of LightGBM and SHAP ##

## Parameters cell ##

Parameters are overridden by papermill when the notebook is run inside DVC stages.



In [17]:
# Notebook parameters; papermill overrides these values when the notebook runs inside DVC stages.
n_folds = 5 # number of folds; passed as the first argument of PartitionParameters
n_hold_out = 1 # passed as the second argument of PartitionParameters
repeats = 5 # how many times Repeat re-runs the partition + SHAP pipeline (the seed is replaced on each repeat)
importance_type = "split" # forwarded to FeatureSelection(importance_type=...)

# traits for which a selection pipeline is built (they become the keys of `pipelines`)
life_history = ["lifespan", "mass_kg", "mtGC", "metabolic_rate", "temperature", "gestation_days"]
debug_local = True # when True and ../yspecies exists, import the local yspecies checkout

In [18]:
from pathlib import Path
import sys
import inspect

#lgb_params["importance_type"] = importance_type

local = (Path("..") / "yspecies").resolve()
if debug_local and local.exists():
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '..', '/data/sources/yspecies/notebooks', '/opt/miniconda3/envs/yspecies/lib/python38.zip', '/opt/miniconda3/envs/yspecies/lib/python3.8', '/opt/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/antonkulaga/.ipython']
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
from dataclasses import dataclass, replace
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt

In [20]:
from typing import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.workflow import TupleWith, Repeat, Collect
from yspecies.config import *
from yspecies.preprocess import FeatureSelection, DataExtractor
from yspecies.partition import DataPartitioner, PartitionParameters
from yspecies.models import Metrics
from yspecies.selection import ShapSelector
from yspecies.results import FeatureSummary

In [21]:
import optuna
from optuna import Study, Trial
from optuna import multi_objective
from optuna.multi_objective import trial
from optuna.multi_objective.study import MultiObjectiveStudy
from yspecies.tuning import MultiObjectiveResults

In [22]:
# Display settings
# pandas: show every column and row, and format floats with three decimals
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import pprint
pp = pprint.PrettyPrinter(indent=4) # shared pretty-printer with 4-space indentation

# Chart settings
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
# let IPython choose the matplotlib backend (the recorded run reports TkAgg)
%matplotlib auto
plt.ioff() # switch matplotlib interactive mode off
set_matplotlib_formats('svg') # request SVG as the figure format for inline display

Using matplotlib backend: TkAgg


### Loading data ###
Let's load the species/genes/expressions data selected by the select_samples.py notebook.

In [23]:
from pathlib import Path
# Use the current directory as the project root when it contains ./data;
# otherwise fall back to the parent directory.
locations: Locations
if Path("./data").exists():
    locations = Locations("./")
else:
    locations = Locations("../")

## Setting up Features to select ##

In [24]:
# Baseline feature selection handed to the data loader.
default_selection = FeatureSelection(
    samples=["tissue", "species"],        # sample metadata columns to include
    species=[],                            # species metadata to include besides the predicted label
    exclude_from_training=["species"],     # fields kept out of LightGBM training
    to_predict="lifespan",                 # target column
    categorical=["tissue"],
    select_by="shap",
    importance_type=importance_type,       # set in the parameters cell
    feature_perturbation="tree_path_dependent",
)
default_selection

Samples metadata,Species metadata,Genes,Predict label,not_validated species
"['tissue', 'species']",[],all,lifespan,[]


In [25]:
# Build a loader from the resolved locations and the default feature selection.
loader = DataLoader(locations, default_selection)
# The result is indexed by trait name below — presumably one entry per life-history trait; TODO confirm.
selections = loader.load_life_history()
# Display the first element of the "lifespan" entry.
selections["lifespan"][0]

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(408, 12323)","(12323, 37)",38,408,"(12323, 2)","(38, 18)"


### Loading Hyperoptimization studies ###

In [26]:
def load_study(trait: str):
    """Open the multi-objective Optuna study stored for ``trait``, creating it if it does not exist.

    The study is kept in ``<trait>.sqlite`` under ``locations.interim.optimization`` and is
    named ``<trait>_r2_huber_kendall``. Its three objectives use the directions
    maximize, minimize, maximize.

    :param trait: trait name; used for both the database file name and the study name
    :return: the study returned by ``optuna.multi_objective.study.create_study``
    """
    database_file = locations.interim.optimization / f"{trait}.sqlite"
    url = f"sqlite:///{database_file.absolute()}"
    print(f"loading (if exists) study from {url}")
    storage = optuna.storages.RDBStorage(url=url)
    return optuna.multi_objective.study.create_study(
        directions=['maximize', 'minimize', 'maximize'],
        storage=storage,
        study_name=f"{trait}_r2_huber_kendall",
        load_if_exists=True,
    )

## Setting up SHAP selection pipeline ##

### Deciding on selection parameters (which fields to include, exclude, predict)  ###

In [27]:
# Partitioning settings; n_folds and n_hold_out come from the parameters cell.
# NOTE(review): the meaning of the positional 2 and 42 is not visible here (42 is presumably the seed) — confirm against PartitionParameters.
partition_params = PartitionParameters(n_folds, n_hold_out, 2,   42)


In [28]:
def _fallback_lgb_params() -> dict:
    """Return the hard-coded LightGBM parameters used when a trait's study has no Pareto-front trials."""
    return {
        "bagging_fraction": 0.9522534844058304,
        "boosting_type": "dart",
        "objective": "regression",
        "feature_fraction": 0.42236910941558053,
        "lambda_l1": 0.020847266580277746,
        "lambda_l2": 2.8448564854773326,
        "learning_rate": 0.11484015430016059,
        "max_depth": 3,
        "max_leaves": 35,
        "min_data_in_leaf": 9,
        "num_iterations": 250,
        "metrics": ["l1", "l2", "huber"],
    }


def make_pipeline(trait: str):
    """Build the SHAP feature-selection pipeline for one trait.

    The LightGBM parameters are taken from the trait's Optuna study when it has
    Pareto-front trials (via ``MultiObjectiveResults.best_metrics_params_r2``);
    otherwise the hard-coded fallback parameters are used.

    :param trait: trait name, forwarded to ``load_study``
    :return: a ``Pipeline`` with the steps extractor -> prepare_for_partitioning ->
             partition_shap (repeated ``repeats`` times) -> summarize
    """
    study = load_study(trait)
    if len(study.get_pareto_front_trials()) > 0:
        # Only the parameters are used here; the accompanying metrics are discarded.
        _, params = MultiObjectiveResults.from_study(study).best_metrics_params_r2()
    else:
        params = _fallback_lgb_params()

    partition_shap_pipe = Pipeline([
        ("partitioner", DataPartitioner()),
        ('prepare_for_selection', TupleWith(params)),
        ("shap_computation", ShapSelector()),
    ])
    # On every repeat keep the first tuple element and replace the seed of the second
    # one with the value `i` supplied by Repeat.
    repeated_cv = Repeat(partition_shap_pipe, repeats, lambda x, i: (x[0], replace(x[1], seed=i)))
    return Pipeline([
        ('extractor', DataExtractor()),
        # pair the extracted data with the global partition parameters
        ('prepare_for_partitioning', TupleWith(partition_params)),
        ("partition_shap", repeated_cv),
        ("summarize", Collect(fold=lambda results: FeatureSummary(results))),
    ])

In [None]:
# NOTE(review): looks like a leftover debugging cell — consider deleting it.
print("?")

In [29]:
# Display the second element of the "lifespan" entry.
selections["lifespan"][1]

Samples metadata,Species metadata,Genes,Predict label,not_validated species
"['tissue', 'species']",[],all,lifespan,"['Mus_caroli', 'Homo_sapiens']"


In [30]:
# One selection pipeline per life-history trait, keyed by trait name.
pipelines = dict(
    (trait_name, make_pipeline(trait_name))
    for trait_name in life_history
)
pipelines.keys()

loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/lifespan.sqlite


create_study is experimental (supported from v1.4.0). The interface can change in the future.
[I 2020-08-26 18:11:47,970] Using an existing study with name 'lifespan_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/mass_kg.sqlite


[I 2020-08-26 18:11:48,693] Using an existing study with name 'mass_kg_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/mtGC.sqlite


[I 2020-08-26 18:11:48,763] Using an existing study with name 'mtGC_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/metabolic_rate.sqlite


[I 2020-08-26 18:11:50,295] Using an existing study with name 'metabolic_rate_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/temperature.sqlite


[I 2020-08-26 18:11:50,351] Using an existing study with name 'temperature_r2_huber_kendall' instead of creating a new one.


loading (if exists) study from sqlite:////data/sources/yspecies/notebooks/../data/interim/optimization/gestation_days.sqlite


[I 2020-08-26 18:11:50,460] Using an existing study with name 'gestation_days_r2_huber_kendall' instead of creating a new one.


dict_keys(['lifespan', 'mass_kg', 'mtGC', 'metabolic_rate', 'temperature', 'gestation_days'])

# First stage selection (SHAP) #

### Lifespan ###

In [34]:
def make_run(trait: str):
    """Run the selection pipeline of ``trait`` on that trait's entry in ``selections``.

    :param trait: key present in both ``pipelines`` and ``selections``
    :return: whatever the pipeline's ``fit_transform`` returns
    """
    trait_pipeline = pipelines[trait]
    trait_input = selections[trait]
    return trait_pipeline.fit_transform(trait_input)

In [None]:
# Run the lifespan pipeline and display the `selected` attribute of its result.
stage_one_lifespan = make_run("lifespan")
stage_one_lifespan.selected

2020-08-26 19:03:49.218 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 0 =====
2020-08-26 19:03:49.252 | INFO     | yspecies.selection:fit:81 - SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Gorilla_gorilla', 'Pan_paniscus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[158]	valid_0's l1: 8.56432	valid_0's l2: 189.457	valid_0's huber: 7.35476


2020-08-26 19:06:19.736 | INFO     | yspecies.selection:fit:81 - SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Heterocephalus_glaber', 'Capra_hircus']


Training until validation scores don't improve for 10 rounds
[250]	valid_0's l1: 3.93228	valid_0's l2: 62.8388	valid_0's huber: 3.21679
Did not meet early stopping. Best iteration is:
[250]	valid_0's l1: 3.93228	valid_0's l2: 62.8388	valid_0's huber: 3.21679


2020-08-26 19:10:12.183 | INFO     | yspecies.selection:fit:81 - SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Bos_taurus', 'Otolemur_garnettii']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[133]	valid_0's l1: 3.6893	valid_0's l2: 41.9248	valid_0's huber: 2.9942


2020-08-26 19:12:34.694 | INFO     | yspecies.selection:fit:81 - SEED: 0 | FOLD: 3 | VALIDATION_SPECIES: ['Macaca_nemestrina', 'Microcebus_murinus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[103]	valid_0's l1: 3.36189	valid_0's l2: 52.7821	valid_0's huber: 2.68951


2020-08-26 19:14:11.035 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 1 =====
2020-08-26 19:14:11.063 | INFO     | yspecies.selection:fit:81 - SEED: 1 | FOLD: 0 | VALIDATION_SPECIES: ['Felis_catus', 'Ursus_americanus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[140]	valid_0's l1: 3.75572	valid_0's l2: 35.9971	valid_0's huber: 3.03913


2020-08-26 19:16:30.381 | INFO     | yspecies.selection:fit:81 - SEED: 1 | FOLD: 1 | VALIDATION_SPECIES: ['Heterocephalus_glaber', 'Vombatus_ursinus']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[160]	valid_0's l1: 5.29124	valid_0's l2: 109.206	valid_0's huber: 4.43577


2020-08-26 19:19:15.686 | INFO     | yspecies.selection:fit:81 - SEED: 1 | FOLD: 2 | VALIDATION_SPECIES: ['Macaca_fascicularis', 'Rhinopithecus_bieti']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[149]	valid_0's l1: 3.24637	valid_0's l2: 36.8948	valid_0's huber: 2.58649


2020-08-26 19:21:35.651 | INFO     | yspecies.selection:fit:81 - SEED: 1 | FOLD: 3 | VALIDATION_SPECIES: ['Gorilla_gorilla', 'Ovis_aries']


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[110]	valid_0's l1: 6.099	valid_0's l2: 157.97	valid_0's huber: 5.16087


2020-08-26 19:23:54.216 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 2 =====
2020-08-26 19:23:54.243 | INFO     | yspecies.selection:fit:81 - SEED: 2 | FOLD: 0 | VALIDATION_SPECIES: ['Rhinopithecus_bieti', 'Ursus_americanus']


Training until validation scores don't improve for 10 rounds


In [None]:
# Display the `metrics` attribute of the lifespan result.
stage_one_lifespan.metrics

### Mass_kg ###

In [None]:
# Run the mass_kg pipeline and display the `selected` attribute of its result.
stage_one_mass = make_run("mass_kg")
stage_one_mass.selected

### MtGC ###

In [None]:
# Run the mtGC pipeline and display the `selected` attribute of its result.
stage_one_mtGC = make_run("mtGC")
stage_one_mtGC.selected

### Metabolism ###

In [None]:
# Run the metabolic_rate pipeline before reading its result: with the assignment
# commented out, the next line raises NameError on a fresh kernel.
stage_one_metabolic_rate = make_run("metabolic_rate")
stage_one_metabolic_rate.selected

### Temperature ###

In [None]:
# Run the temperature pipeline and display its result.
# NOTE(review): the other traits display `.selected`; here the whole result object is shown — confirm this is intended.
stage_one_temperature = make_run("temperature")
stage_one_temperature

### Gestation ###

In [None]:
# Run the gestation_days pipeline and display the `selected` attribute of its result.
# (A stray "]" after the argument previously made this cell a SyntaxError.)
stage_one_gestation = make_run("gestation_days")
stage_one_gestation.selected

## Intersect first stages ##

## Second stage selection ##