# SHAP feature selection #
## Code to select features with a combination of LightGBM and SHAP ##

## Parameters cell ##

Parameters are overridden by papermill when run inside DVC stages.



In [1]:
# Run-level parameters. This is the papermill parameters cell, so every name
# assigned here can be overridden from outside the notebook.
n_folds = 5  # number of folds, passed to PartitionParameters
n_hold_out = 1  # hold-out count, passed to PartitionParameters
repeats = 5  # how many times the partition + SHAP pipeline is repeated (see Repeat)
importance_type = "split"  # copied into lgb_params and into FeatureSelection

# LightGBM parameters handed to the SHAP selection pipeline.
# NOTE(review): per the LightGBM parameter docs, `max_leaves` is an alias of
# `num_leaves`; with max_depth = 3 a tree cannot have more than 8 leaves, so 21 is
# never reached — confirm this is intended.
# NOTE(review): `drop_rate` is documented as used only by `dart` boosting, and
# `bagging_fraction` only takes effect when `bagging_freq` is non-zero; neither
# condition is set here. Verify whether ShapSelector adds them before training.
lgb_params = {"objective": "regression",
              'boosting_type': 'gbdt',
              'lambda_l1': 2.649670285109348,
              'lambda_l2': 3.651743005278647,
              'max_leaves': 21,
              'max_depth': 3,
              'feature_fraction': 0.7381836300988616,
              'bagging_fraction': 0.5287709904685758,
              'learning_rate': 0.054438364299744225,
              'min_data_in_leaf': 7,
              'drop_rate': 0.13171689004108006,
              'metric': ['mae', 'mse', 'huber'],
             }

# Life-history traits handled by the notebook. A previous duplicate assignment that
# listed "mass_g" was dead code (immediately overwritten) and did not match the
# "mass_kg" key used for the mass selection further down, so only this one is kept.
life_history = ["lifespan", "mass_kg", "mtGC", "metabolic_rate", "temperature", "gestation_days"]
debug_local = True  # prefer the local yspecies checkout over the installed package

In [2]:
from pathlib import Path
import sys
import inspect

lgb_params["importance_type"] = importance_type

local = (Path("..") / "yspecies").resolve()
if debug_local and local.exists():
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '/data/sources/yspecies/notebooks', '/opt/miniconda3/envs/yspecies/lib/python38.zip', '/opt/miniconda3/envs/yspecies/lib/python3.8', '/opt/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/antonkulaga/.ipython']


In [3]:
from dataclasses import dataclass, replace
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from typing import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.workflow import TupleWith, Repeat, Collect
from yspecies.config import *
from yspecies.preprocess import FeatureSelection, DataExtractor
from yspecies.partition import DataPartitioner, PartitionParameters
from yspecies.models import Metrics
from yspecies.selection import ShapSelector
from yspecies.results import FeatureSummary

In [5]:
#settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import pprint
pp = pprint.PrettyPrinter(indent=4)

#charts settings
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
%matplotlib auto
plt.ioff()
set_matplotlib_formats('svg')

Using matplotlib backend: TkAgg


### Loading data ###
Let's load the species/genes/expressions data selected by the select_samples.py notebook.

In [6]:
from pathlib import Path
# Use the current directory as the base when it contains a "data" folder;
# otherwise fall back to the parent directory.
if Path("./data").exists():
    locations: Locations = Locations("./")
else:
    locations: Locations = Locations("../")

## Setting up Features to select ##

In [7]:
# Default feature selection shared by every trait loaded below.
default_selection = FeatureSelection(
    # what to predict
    to_predict="lifespan",  # column to predict
    # which metadata to include
    samples=["tissue", "species"],  # samples metadata to include
    species=[],  # species metadata other than the Y label to include
    categorical=["tissue"],
    # fields excluded from LightGBM training
    exclude_from_training=["species"],
    # how features are ranked
    select_by="shap",
    importance_type=importance_type,
    # NOTE(review): presumably forwarded to the SHAP tree explainer — confirm in ShapSelector
    feature_perturbation="tree_path_dependent",
)
default_selection

Samples metadata,Species metadata,Genes,Predict label,not_validated species
"['tissue', 'species']",[],all,lifespan,[]


In [21]:
# Build a loader from the resolved locations and the default feature selection.
loader = DataLoader(locations, default_selection)
# Load the life-history data; the result is indexed by trait name in the cells
# below ("lifespan", "mass_kg", "mtGC", "metabolic_rate", "temperature", "gestation_days").
selections = loader.load_life_history()
# Display element 0 of the "lifespan" entry (rendered as a summary of the
# expressions / genes / species / samples it contains).
selections["lifespan"][0]

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(414, 12337)","(12337, 40)",41,414,"(12337, 2)","(41, 18)"


## Setting up SHAP selection pipeline ##

### Deciding on selection parameters (which fields to include, exclude, predict)  ###

In [22]:
# Partitioning settings; n_folds and n_hold_out come from the parameters cell.
# NOTE(review): the positional literals 2 and 42 are unnamed here — presumably the
# number of species per validation fold and a random seed; confirm against
# PartitionParameters and consider passing them by keyword.
# The Repeat step defined in the next cell replaces the `seed` field with the
# repeat index, so a seed given here may not be the one actually used — verify.
partition_params = PartitionParameters(n_folds, n_hold_out, 2,   42)


In [23]:
# Pipeline executed once per repeat: partition the data, attach the LightGBM
# parameters, then run the SHAP-based selector.
partition_shap_pipe = Pipeline(steps=[
    ("partitioner", DataPartitioner()),
    # NOTE(review): this step name duplicates the one used in selection_pipeline,
    # although here the step attaches lgb_params; kept unchanged because step names
    # are runtime keys that other code may look up.
    ("prepare_for_partitioning", TupleWith(lgb_params)),
    ("shap_computation", ShapSelector()),
])

# Repeat the pipeline `repeats` times. The lambda receives the input pair and an
# index and returns the pair with the second element's `seed` replaced by that
# index (presumably called by Repeat once per repetition — confirm in Repeat).
repeated_cv = Repeat(
    partition_shap_pipe,
    repeats,
    lambda x, i: (x[0], replace(x[1], seed=i)),
)

In [24]:
# End-to-end selection pipeline applied to each trait's entry in `selections`.
selection_pipeline = Pipeline(steps=[
    # extract the data required for ML from the dataset
    ("extractor", DataExtractor()),
    # attach the partitioning parameters to the extracted data
    ("prepare_for_partitioning", TupleWith(partition_params)),
    # repeated partitioning + SHAP selection
    ("partition_shap", repeated_cv),
    # wrap the collected results in a FeatureSummary
    ("summarize", Collect(fold=lambda results: FeatureSummary(results))),
])

In [25]:
# Display element 1 of the "lifespan" entry (rendered as the feature-selection
# settings, including the not-validated species).
selections["lifespan"][1]

Samples metadata,Species metadata,Genes,Predict label,not_validated species
"['tissue', 'species']",[],all,lifespan,"['Mus_caroli', 'Homo_sapiens']"


# First stage selection (SHAP) #

### Lifespan ###

In [28]:
# Run the full selection pipeline on the lifespan entry and keep the summary.
stage_one_lifespan = selection_pipeline.fit_transform(selections["lifespan"])
# Display the selected features (one row per gene, with per-repeat SHAP and Kendall tau columns).
stage_one_lifespan.selected

===== fitting models with seed 0 =====
SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Gorilla_gorilla', 'Equus_caballus']
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 6.30796	valid_0's l2: 137.579	valid_0's huber: 5.35246
Did not meet early stopping. Best iteration is:
[150]	valid_0's l1: 6.30796	valid_0's l2: 137.579	valid_0's huber: 5.35246
SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Ursus_americanus', 'Dasypus_novemcinctus']
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 5.78401	valid_0's l2: 177.442	valid_0's huber: 4.87094
Did not meet early stopping. Best iteration is:
[150]	valid_0's l1: 5.78401	valid_0's l2: 177.442	valid_0's huber: 4.87094
SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Capra_hircus', 'Mesocricetus_auratus']
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 4.72325	valid_0's l2: 65.2991	valid_0's huber: 3.89018
Did not meet early stopping. Best iteration is:
[150]	valid_0's

Unnamed: 0,symbol,repeats,mean_shap,mean_kendall_tau,shap_0,kendall_tau_0,shap_1,kendall_tau_1,shap_2,kendall_tau_2,shap_3,kendall_tau_3,shap_4,kendall_tau_4
ENSG00000010219,DYRK4,5.0,19.0,0.28,20.0,0.301,18.25,0.309,20.25,0.229,17.25,0.238,19.25,0.322
ENSG00000204498,NFKBIL1,5.0,9.9,-0.066,10.25,-0.039,8.0,-0.013,10.25,-0.041,10.25,-0.152,10.75,-0.084
ENSG00000185880,TRIM69,5.0,9.2,0.08,6.25,0.05,11.25,0.083,6.5,0.042,11.0,0.113,11.0,0.11
ENSG00000170835,CEL,5.0,7.7,0.695,5.25,0.708,7.5,0.746,6.75,0.655,8.25,0.725,10.75,0.641
ENSG00000105672,ETV2,5.0,7.15,0.706,7.75,0.692,6.0,0.707,7.75,0.713,6.25,0.711,8.0,0.705
ENSG00000108384,RAD51C,4.0,10.75,0.537,13.0,0.348,12.75,0.551,,,10.25,0.676,7.0,0.574
ENSG00000132436,FIGNL1,4.0,7.25,0.589,,,4.75,0.593,9.0,0.537,3.75,0.58,11.5,0.644
ENSG00000188747,NOXA1,4.0,3.375,0.677,3.5,0.667,3.0,0.684,3.0,0.709,4.0,0.649,,
ENSG00000167515,TRAPPC2L,3.0,11.75,-0.34,12.0,-0.48,,,10.5,-0.296,12.75,-0.244,,
ENSG00000066923,STAG3,3.0,11.667,0.524,14.75,0.469,,,,,11.25,0.573,9.0,0.53


In [None]:
# NOTE(review): `stage_one_lifespan.selected` renders as a table in the previous
# cell; if it is a plain DataFrame, `.metrics` will raise AttributeError. The
# metrics may live on the summary object itself — confirm against FeatureSummary.
# This cell has no recorded execution.
stage_one_lifespan.selected.metrics

### Mass_kg ###

In [29]:
# Run the full selection pipeline on the "mass_kg" entry and keep the summary.
stage_one_mass = selection_pipeline.fit_transform(selections["mass_kg"])
# Display the selected features for body mass.
stage_one_mass.selected

===== fitting models with seed 0 =====
SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Callithrix_jacchus', 'Meriones_unguiculatus']
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 5.67205	valid_0's l2: 71.6927	valid_0's huber: 4.70627
Did not meet early stopping. Best iteration is:
[148]	valid_0's l1: 5.59199	valid_0's l2: 69.9412	valid_0's huber: 4.6339
SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Sus_scrofa', 'Phascolarctos_cinereus']
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 17.4768	valid_0's l2: 1105.06	valid_0's huber: 15.3512
Did not meet early stopping. Best iteration is:
[150]	valid_0's l1: 17.4768	valid_0's l2: 1105.06	valid_0's huber: 15.3512
SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Aotus_nancymaae', 'Cavia_porcellus']
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[65]	valid_0's l1: 7.001	valid_0's l2: 213.782	valid_0's huber: 6.00621
SEED: 0 | FOLD: 3 | VALIDATI

Unnamed: 0,symbol,repeats,mean_shap,mean_kendall_tau,shap_0,kendall_tau_0,shap_1,kendall_tau_1,shap_2,kendall_tau_2,shap_3,kendall_tau_3,shap_4,kendall_tau_4
ENSG00000054277,OPN3,4.0,6.75,0.005,7.0,-0.033,,,8.25,0.044,4.75,-0.081,7.0,0.092
ENSG00000124172,ATP5F1E,3.0,3.833,-0.206,,,,,5.25,-0.243,3.5,-0.1,2.75,-0.275
ENSG00000010219,DYRK4,2.0,7.25,0.31,,,,,8.75,0.196,5.75,0.424,,
ENSG00000107020,PLGRKT,2.0,6.0,0.037,8.0,0.054,,,,,4.0,0.021,,
ENSG00000160796,NBEAL2,2.0,2.0,-0.076,2.5,-0.043,1.5,-0.109,,,,,,
ENSG00000175806,MSRA,1.0,10.75,-0.145,,,,,,,10.75,-0.145,,
ENSG00000127952,STYXL1,1.0,7.0,0.326,,,,,,,7.0,0.326,,
ENSG00000117151,CTBS,1.0,5.0,-0.044,,,,,,,,,5.0,-0.044
ENSG00000275111,ZNF2,1.0,4.5,-0.184,,,4.5,-0.184,,,,,,
ENSG00000198856,OSTC,1.0,4.0,-0.114,4.0,-0.114,,,,,,,,


### MtGC ###

In [30]:
# Run the full selection pipeline on the "mtGC" entry and keep the summary.
stage_one_mtGC = selection_pipeline.fit_transform(selections["mtGC"])
# Display the selected features for mtGC.
stage_one_mtGC.selected

===== fitting models with seed 0 =====
SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Felis_catus', 'Vombatus_ursinus']
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[120]	valid_0's l1: 0.666611	valid_0's l2: 0.993221	valid_0's huber: 0.371771
SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Dasypus_novemcinctus', 'Homo_sapiens']
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 1.08615	valid_0's l2: 3.17461	valid_0's huber: 0.757968
Did not meet early stopping. Best iteration is:
[150]	valid_0's l1: 1.08615	valid_0's l2: 3.17461	valid_0's huber: 0.757968
SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Pan_paniscus', 'Phascolarctos_cinereus']
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 0.590552	valid_0's l2: 0.819627	valid_0's huber: 0.30303
Did not meet early stopping. Best iteration is:
[150]	valid_0's l1: 0.590552	valid_0's l2: 0.819627	valid_0's huber: 0.30303
SEED: 0 | FOLD: 3 | VAL

Unnamed: 0,symbol,repeats,mean_shap,mean_kendall_tau,shap_0,kendall_tau_0,shap_1,kendall_tau_1,shap_2,kendall_tau_2,shap_3,kendall_tau_3,shap_4,kendall_tau_4
ENSG00000171121,KCNMB3,5.0,10.8,0.648,8.5,0.644,11.0,0.671,13.75,0.649,9.5,0.627,11.25,0.648
ENSG00000156384,SFR1,5.0,7.1,-0.655,5.75,-0.686,6.25,-0.632,7.5,-0.674,6.75,-0.647,9.25,-0.636
ENSG00000203710,CR1,5.0,7.0,-0.514,9.5,-0.525,7.75,-0.505,7.25,-0.534,6.25,-0.525,4.25,-0.481
ENSG00000177143,CETN1,4.0,13.75,-0.512,11.75,-0.537,12.75,-0.538,15.25,-0.464,15.25,-0.51,,
ENSG00000168300,PCMTD1,4.0,10.438,0.146,10.75,0.148,8.5,0.105,8.75,0.159,,,13.75,0.17
ENSG00000122034,GTF3A,4.0,7.0,0.613,3.75,0.615,8.75,0.582,,,4.5,0.65,11.0,0.604
ENSG00000163528,CHCHD4,4.0,6.062,0.473,5.25,0.55,5.25,0.473,9.5,0.434,4.25,0.435,,
ENSG00000154328,NEIL2,4.0,5.75,0.596,8.25,0.563,,,7.25,0.587,6.0,0.598,1.5,0.636
ENSG00000171747,LGALS4,4.0,5.75,-0.641,4.0,-0.63,6.25,-0.681,6.75,-0.612,6.0,-0.641,,
ENSG00000006282,SPATA20,3.0,12.833,0.678,10.5,0.646,,,,,8.25,0.696,19.75,0.693


### Metabolism ###

In [None]:
# Run the full selection pipeline on the "metabolic_rate" entry and keep the summary.
stage_one_metabolic_rate = selection_pipeline.fit_transform(selections["metabolic_rate"])
# Displays the whole summary object, unlike the lifespan/mass/mtGC cells, which show `.selected`.
stage_one_metabolic_rate

===== fitting models with seed 0 =====
SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Mesocricetus_auratus', 'Mus_musculus']
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[40]	valid_0's l1: 17.1681	valid_0's l2: 781.887	valid_0's huber: 15.0463
SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Dasypus_novemcinctus', 'Sus_scrofa']
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 17.8822	valid_0's l2: 660.614	valid_0's huber: 15.692
Did not meet early stopping. Best iteration is:
[150]	valid_0's l1: 17.8822	valid_0's l2: 660.614	valid_0's huber: 15.692
SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Oryctolagus_cuniculus', 'Callithrix_jacchus']
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[39]	valid_0's l1: 16.3192	valid_0's l2: 867.195	valid_0's huber: 14.2894
SEED: 0 | FOLD: 3 | VALIDATION_SPECIES: ['Cavia_porcellus', 'Rattus_norvegicus']
Training until validation scores do

### Temperature ###

In [None]:
# Run the full selection pipeline on the "temperature" entry and keep the summary.
stage_one_temperature = selection_pipeline.fit_transform(selections["temperature"])
# Displays the whole summary object, unlike the lifespan/mass/mtGC cells, which show `.selected`.
stage_one_temperature

### Gestation ###

In [None]:
# Run the full selection pipeline on the "gestation_days" entry and keep the summary.
stage_one_gestation = selection_pipeline.fit_transform(selections["gestation_days"])
# Display the selected features for gestation.
stage_one_gestation.selected

## Second stage selection ##