# SHAP feature selection #
## Code to select features with a combination of LightGBM and SHAP ##

## Parameters cell ##

Parameters are overridden by papermill when the notebook is run inside DVC stages.



In [36]:
# Parameters cell: the values below are defaults that papermill may override.
number_of_folds = 5 # number of folds; passed as the first argument to PartitionParameters below
repeats = 15 # passed to Repeat below, presumably the number of times partitioning + SHAP selection is repeated
#first round of optimization
# Parameter dictionary handed to the SHAP selection step (via TupleWith) further down.
lgb_params = {
 "objective": "regression",
 'boosting_type': 'gbdt', 
 'metric': ['mae','mse', 'huber'],
 'lambda_l1': 0.03674546022666247, 
 'lambda_l2': 2.5025758383109715, 
 'max_leaves': 22,
 'max_depth': 6, 
 'feature_fraction': 0.741996402547356, 
 'bagging_fraction': 0.8929839112500518, 
 'learning_rate': 0.0892243160116563, 
 'min_data_in_leaf': 5, 
 # NOTE(review): drop_rate is documented by LightGBM as a DART-only parameter; with
 # boosting_type 'gbdt' it presumably has no effect - confirm it is intentional.
 'drop_rate': 0.14842616274718734    
}

debug_local = True # when True (and ../yspecies exists) the parent directory is put on sys.path so the local yspecies is imported

In [37]:
from pathlib import Path
import sys
import inspect

local = (Path("..") / "yspecies").resolve()
if debug_local and local.exists():
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '..', '..', '..', '..', '/data/sources/yspecies/notebooks', '/opt/miniconda3/envs/yspecies/lib/python38.zip', '/opt/miniconda3/envs/yspecies/lib/python3.8', '/opt/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/antonkulaga/.ipython']
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
from dataclasses import dataclass, replace
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt

In [39]:
from typing import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.workflow import TupleWith, Repeat
from yspecies.config import *
from yspecies.preprocess import FeatureSelection, DataExtractor
from yspecies.partition import DataPartitioner, PartitionParameters
from yspecies.models import Metrics
from yspecies.selection import ShapSelector

In [40]:
#settings
# Show every column and every row when a DataFrame is displayed, and render
# floats with three decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import pprint
pp = pprint.PrettyPrinter(indent=4)  # pretty-printer using a 4-space indent

#charts settings
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
# NOTE(review): `auto` lets IPython choose the backend (TkAgg in the recorded
# output), while the 'svg' format set below presumably only matters for inline
# rendering - confirm this combination is intended, especially for headless
# papermill runs.
%matplotlib auto
plt.ioff()  # switch matplotlib's interactive mode off
set_matplotlib_formats('svg')

Using matplotlib backend: TkAgg


### Loading data ###
Let's load the species/genes/expressions data selected by the select_samples.py notebook.

In [41]:
from pathlib import Path
# Build Locations from "./" when a ./data folder exists here, otherwise from "../".
if Path("./data").exists():
    locations: Locations = Locations("./")
else:
    locations: Locations = Locations("../")

In [42]:
# Load the expression dataset from the folder given by locations.interim.selected.
data = ExpressionDataset.from_folder(locations.interim.selected)
# Bare expression: displays the loaded dataset as the cell output.
data

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(445, 12340)",12340,39,445,"(12340, 2)","(40, 19)"


## Setting up SHAP selection pipeline ##

Deciding on selection parameters (which fields to include, exclude, predict)

In [43]:
# Partitioning settings; the first argument is the fold count from the parameters cell.
# NOTE(review): the remaining arguments are positional, so the meaning of 0, 2, [] and 42
# depends on PartitionParameters' field order (not visible here). 42 is presumably the
# seed, since the Repeat step replaces a `seed` field per repeat - confirm, and consider
# keyword arguments.
partition_params = PartitionParameters(number_of_folds, 0, 2, [],  42)


In [44]:
# Inner pipeline: partition the data, attach lgb_params, then run the SHAP selector.
# NOTE(review): the step name 'prepare_for_partitioning' is also used in selection_pipeline,
# although here the step runs after the partitioner - the name looks copy-pasted; confirm.
partition_shap_pipe = Pipeline([
    ("partitioner", DataPartitioner()),
    ('prepare_for_partitioning', TupleWith(lgb_params)),
    ("shap_computation", ShapSelector())
]
)
# Repeat the inner pipeline `repeats` times. For each call the lambda keeps x[0] unchanged and
# swaps x[1] for a copy whose `seed` field is set to i (dataclasses.replace).
# Presumably i is the repeat index and x[1] the partition parameters - confirm against Repeat.
repeated_cv =  Repeat(partition_shap_pipe, repeats, lambda x,i: (x[0], replace(x[1], seed = i)))

In [45]:
# Full selection pipeline; below it is applied to a (dataset, FeatureSelection) tuple.
selection_pipeline =  Pipeline([
    ('extractor', DataExtractor()),
    # NOTE(review): the trailing comment on the next line reads like a description of the
    # extractor step above rather than of TupleWith(partition_params) - confirm.
    ('prepare_for_partitioning', TupleWith(partition_params)), # to extract the data required for ML from the dataset
    ("partition_shap", repeated_cv)] # the repeated partition + SHAP pipeline defined above
    )

## Setting up features to select ##

In [46]:
# Base feature selection; its prediction target is the "lifespan" column.
selection = FeatureSelection(
    samples=["tissue", "species"],       # samples metadata to include
    species=[],                          # species metadata other than the Y label to include
    exclude_from_training=["species"],   # fields kept out of LightGBM training
    to_predict="lifespan",               # column to predict
    categorical=["tissue"],
)
# The lifespan selection is the base selection itself (same object).
select_lifespan = selection

In [47]:
# One selection per target: lifespan reuses the base object, the others are
# copies of it that differ only in `to_predict`.
select_lifespan = selection
select_mass, select_gestation, select_mtgc = (
    replace(selection, to_predict=target_column)
    for target_column in ("mass_g", "gestation", "mtgc")
)

# First stage selection #

In [None]:
# Run the whole selection pipeline on the dataset with the lifespan selection.
stage_one_lifespan = selection_pipeline.fit_transform((data, select_lifespan))
# Bare expression: displays the pipeline result as the cell output.
stage_one_lifespan

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[15]	valid_0's l1: 10.7666	valid_0's l2: 242.451	valid_0's huber: 9.28714
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[90]	valid_0's l1: 2.33282	valid_0's l2: 17.7251	valid_0's huber: 1.83076
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[90]	valid_0's l1: 3.66061	valid_0's l2: 56.0589	valid_0's huber: 2.99792
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[104]	valid_0's l1: 7.33974	valid_0's l2: 144.605	valid_0's huber: 6.27978
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[84]	valid_0's l1: 3.11934	valid_0's l2: 22.2578	valid_0's huber: 2.48032


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[149]	valid_0's l1: 4.21986	valid_0's l2: 81.8091	valid_0's huber: 3.48915
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[23]	valid_0's l1: 5.82709	valid_0's l2: 58.1638	valid_0's huber: 4.8496
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[106]	valid_0's l1: 21.6611	valid_0's l2: 1885.6	valid_0's huber: 19.1666
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[64]	valid_0's l1: 8.21728	valid_0's l2: 181.643	valid_0's huber: 7.06653
Training until validation scores don't improve for 10 rounds


In [None]:
from yspecies.results import FeatureSummary
# Wrap the stage-one results in a FeatureSummary for the inspection cells below.
summary = FeatureSummary(stage_one_lifespan)
# Bare expression: displays the first element of the stage-one results.
stage_one_lifespan[0]

In [None]:
# Values of the "symbol" column of the selected features, ordered by "kendall_tau_0" descending.
summary.selected.sort_values("kendall_tau_0", ascending=False)["symbol"].values

In [73]:
"gain" if len([c for c in summary.results[0].selected.columns if "gain" in c])>0 else "shap"

'gain'