# SHAP feature selection #
## Code to select features with a combination of LightGBM and SHAP ##

## Parameters cell ##

Parameters are overridden by papermill when run inside DVC stages.



In [1]:
# Cross-validation layout (papermill may override any value in this cell).
n_folds = 5      # global setting: how many folds (bootstraps) to use
n_hold_out = 1   # passed to PartitionParameters together with n_folds
repeats = 5      # how many times the partition + SHAP pipeline is repeated

# LightGBM parameters taken from the first round of optimization.
lgb_params = {
    "objective": "regression",
    "boosting_type": "gbdt",
    "metric": ["mae", "mse", "huber"],
    "lambda_l1": 0.03674546022666247,
    "lambda_l2": 2.5025758383109715,
    "max_leaves": 22,  # LightGBM alias of num_leaves
    "max_depth": 6,
    "feature_fraction": 0.741996402547356,
    # NOTE(review): LightGBM applies bagging_fraction only when bagging_freq
    # is non-zero; confirm it is set downstream, otherwise this has no effect.
    "bagging_fraction": 0.8929839112500518,
    "learning_rate": 0.0892243160116563,
    "min_data_in_leaf": 5,
    # NOTE(review): drop_rate is a dart-only option; with boosting_type
    # "gbdt" it is presumably ignored - confirm before relying on it.
    "drop_rate": 0.14842616274718734,
}

debug_local = True  # prefer the local yspecies sources over the installed package

In [2]:
from pathlib import Path
import sys
import inspect

local = (Path("..") / "yspecies").resolve()
if debug_local and local.exists():
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '/data/sources/yspecies/notebooks', '/opt/miniconda3/envs/yspecies/lib/python38.zip', '/opt/miniconda3/envs/yspecies/lib/python3.8', '/opt/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/antonkulaga/.ipython']


In [3]:
from dataclasses import dataclass, replace
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from typing import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.workflow import TupleWith, Repeat, Collect
from yspecies.config import *
from yspecies.preprocess import FeatureSelection, DataExtractor
from yspecies.partition import DataPartitioner, PartitionParameters
from yspecies.models import Metrics
from yspecies.selection import ShapSelector
from yspecies.results import FeatureSummary

In [5]:
#settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import pprint
pp = pprint.PrettyPrinter(indent=4)

#charts settings
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
%matplotlib auto
plt.ioff()
set_matplotlib_formats('svg')

Using matplotlib backend: TkAgg


### Loading data ###
Let's load the data (species/genes/expressions) selected by the select_samples.py notebook.

In [6]:
from pathlib import Path
# Use the current directory as the root when it contains a data/ folder,
# otherwise fall back to the parent directory.
project_root = "./" if Path("./data").exists() else "../"
locations: Locations = Locations(project_root)

In [7]:
# Read the expression dataset from the "selected" interim folder and show it.
selected_folder = locations.interim.selected
data = ExpressionDataset.from_folder(selected_folder)
data

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(408, 12323)",12323,38,408,"(12323, 2)","(38, 19)"


## Setting up SHAP selection pipeline ##

Deciding on selection parameters (which fields to include, exclude, predict)

In [8]:
# Partitioning settings; the fold and hold-out counts come from the parameters cell.
# NOTE(review): the remaining positional arguments (2, [], 42) are not named
# here. 42 is presumably the seed, since a `seed` field is replaced for every
# repeat further down - confirm against the PartitionParameters definition.
partition_params = PartitionParameters(n_folds, n_hold_out, 2, [],  42)


In [9]:
def _with_repeat_seed(x, i):
    """Keep the first element of the tuple `x` and return a copy of the second with `seed` set to `i`."""
    return (x[0], replace(x[1], seed=i))


# One repeat: partition the data, pair the result with the LightGBM
# parameters, then run the SHAP selector.
partition_shap_steps = [
    ("partitioner", DataPartitioner()),
    ("prepare_for_partitioning", TupleWith(lgb_params)),
    ("shap_computation", ShapSelector()),
]
partition_shap_pipe = Pipeline(partition_shap_steps)

# Run the pipeline `repeats` times, handing the repeat index in as the seed.
repeated_cv = Repeat(partition_shap_pipe, repeats, _with_repeat_seed)

In [10]:
# Full workflow: extract the data required for ML from the dataset, pair it
# with the partition parameters, run the repeated partition + SHAP pipeline,
# then fold the collected results into a FeatureSummary.
selection_steps = [
    ("extractor", DataExtractor()),
    ("prepare_for_partitioning", TupleWith(partition_params)),
    ("partition_shap", repeated_cv),
    ("summarize", Collect(fold=lambda results: FeatureSummary(results))),
]
selection_pipeline = Pipeline(selection_steps)

## Setting up features to select ##

In [11]:
# Base feature-selection settings; later cells derive variants from it with replace().
selection = FeatureSelection(
    samples=["tissue", "species"],      # samples metadata to include
    species=[],                         # species metadata other than the Y label to include
    exclude_from_training=["species"],  # fields excluded from LightGBM training
    to_predict="lifespan",              # column to predict
    categorical=["tissue"],
    select_by="gain",
    importance_type="gain",
    clean_y_na=True,
)

# First stage selection (gain) #

#### Lifespan ####

# Lifespan uses the base selection unchanged (to_predict="lifespan", select_by="gain").
select_lifespan = selection
# Run the whole selection pipeline on the (dataset, selection) pair.
stage_one_lifespan = selection_pipeline.fit_transform((data, select_lifespan))
# Print the averaged metrics, then display the selected features as the cell output.
print(stage_one_lifespan.average_metrics)
stage_one_lifespan.selected

In [13]:
# Symbols of the features selected in the gain-based first stage.
stage_one_lifespan.selected["symbol"].values

array(['TRAPPC2L', 'ETV2', 'MAN2B2', 'RPL13A'], dtype=object)

In [14]:
# Variants of the base selection that differ only in the column to predict.
select_mass, select_gestation, select_mtgc = (
    replace(selection, to_predict=target)
    for target in ("mass_g", "gestation", "mtGC")
)

#### Mass ####

In [15]:
# Selection variant predicting "mass_g" (the previous cell builds the same object).
select_mass = replace(selection, to_predict = "mass_g")
# The stage-one run for mass is currently disabled; uncomment the lines below to execute it.
#stage_one_mass = selection_pipeline.fit_transform((data, select_mass))
#print(stage_one_mass.average_metrics)
#stage_one_mass.selected

# First stage selection (shap) #

In [16]:
# Same settings as the base selection, except that features are selected by "shap" instead of "gain".
select_lifespan_shap = replace(selection, select_by = "shap")
# Re-run the whole selection pipeline with the SHAP-based selection.
stage_one_lifespan_shap = selection_pipeline.fit_transform((data, select_lifespan_shap))
# Display the selected features as the cell output.
stage_one_lifespan_shap.selected

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[18]	valid_0's l1: 24.9011	valid_0's l2: 2316.23	valid_0's huber: 22.0143
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[57]	valid_0's l1: 4.07394	valid_0's l2: 71.875	valid_0's huber: 3.36645
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[66]	valid_0's l1: 2.40126	valid_0's l2: 22.9658	valid_0's huber: 1.85596
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[79]	valid_0's l1: 2.69824	valid_0's l2: 18.4971	valid_0's huber: 2.13464


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 2.71138	valid_0's l2: 37.5296	valid_0's huber: 2.1187
Did not meet early stopping. Best iteration is:
[145]	valid_0's l1: 2.70909	valid_0's l2: 37.5889	valid_0's huber: 2.1185
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[58]	valid_0's l1: 3.55021	valid_0's l2: 63.651	valid_0's huber: 2.90928
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[67]	valid_0's l1: 2.5182	valid_0's l2: 20.3522	valid_0's huber: 1.97831
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[91]	valid_0's l1: 21.2755	valid_0's l2: 1972.72	valid_0's huber: 18.8399


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[96]	valid_0's l1: 2.40414	valid_0's l2: 44.6316	valid_0's huber: 1.8861
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 5.77937	valid_0's l2: 188.825	valid_0's huber: 4.919
Did not meet early stopping. Best iteration is:
[149]	valid_0's l1: 5.77935	valid_0's l2: 188.821	valid_0's huber: 4.91897
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[96]	valid_0's l1: 3.8328	valid_0's l2: 65.5365	valid_0's huber: 3.15756
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[93]	valid_0's l1: 5.36552	valid_0's l2: 118.891	valid_0's huber: 4.52341


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[54]	valid_0's l1: 2.71397	valid_0's l2: 32.651	valid_0's huber: 2.14147
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[82]	valid_0's l1: 24.1687	valid_0's l2: 2438.12	valid_0's huber: 21.4688
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[44]	valid_0's l1: 3.02751	valid_0's l2: 23.022	valid_0's huber: 2.37458
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[7]	valid_0's l1: 15.1503	valid_0's l2: 318.811	valid_0's huber: 13.2325


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[110]	valid_0's l1: 6.95511	valid_0's l2: 177.136	valid_0's huber: 5.95134
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[113]	valid_0's l1: 1.67222	valid_0's l2: 9.68986	valid_0's huber: 1.24903
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[123]	valid_0's l1: 2.26108	valid_0's l2: 17.4806	valid_0's huber: 1.75173
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[82]	valid_0's l1: 3.0272	valid_0's l2: 61.3472	valid_0's huber: 2.43967


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Unnamed: 0_level_0,symbol,shap_0,kendall_tau_0,shap_1,kendall_tau_1,shap_2,kendall_tau_2,shap_3,kendall_tau_3,shap_4,kendall_tau_4
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000167515,TRAPPC2L,1473.507,-0.176,1105.753,-0.294,1395.502,-0.217,1551.837,-0.243,204.763,-0.236
ENSG00000105672,ETV2,1171.758,0.709,952.87,0.664,975.064,0.685,756.851,0.683,1066.609,0.683
ENSG00000013288,MAN2B2,267.46,-0.431,228.884,-0.399,162.715,-0.383,121.464,-0.421,259.828,-0.559
ENSG00000142541,RPL13A,85.417,-0.049,166.262,-0.233,300.272,-0.239,67.537,-0.033,252.596,-0.132
ENSG00000150779,TIMM8B,35.282,-0.474,42.51,-0.54,56.095,-0.411,21.934,-0.483,11.023,-0.5
ENSG00000140025,EFCAB11,15.458,0.543,23.979,0.589,93.537,0.663,23.757,0.548,34.074,0.576
ENSG00000111850,SMIM8,5.193,-0.44,1.93,-0.471,0.077,-0.552,4.902,-0.189,5.786,-0.547
ENSG00000189043,NDUFA4,3.088,0.084,33.968,0.01,15.419,-0.206,10.124,0.253,3.831,-0.152


In [19]:
# Symbols of the features selected in the SHAP-based first stage.
stage_one_lifespan_shap.selected["symbol"].values

array(['TRAPPC2L', 'ETV2', 'MAN2B2', 'RPL13A', 'TIMM8B', 'EFCAB11',
       'SMIM8', 'NDUFA4'], dtype=object)