# SHAP feature selection #
## Code to select features with a combination of LightGBM and SHAP ##

## Parameters cell ##

Parameters are overridden by papermill when the notebook is run inside DVC stages.



In [1]:
# Global settings; these are passed to PartitionParameters and Repeat further down.
n_folds = 5     # number of folds handed to the partitioner
n_hold_out = 1  # hold-out setting handed to the partitioner
repeats = 5     # how many times the partition + SHAP pipeline is repeated
importance_type = "split"  # copied into lgb_params and FeatureSelection below

# LightGBM parameters shared by every trait.
lgb_params = {
    "objective": "regression",
    "boosting_type": "gbdt",
    "lambda_l1": 2.649670285109348,
    "lambda_l2": 3.651743005278647,
    "max_leaves": 21,
    "max_depth": 3,
    "feature_fraction": 0.7381836300988616,
    "bagging_fraction": 0.5287709904685758,
    "learning_rate": 0.054438364299744225,
    "min_data_in_leaf": 7,
    "drop_rate": 0.13171689004108006,
    "metric": ["mae", "mse", "huber"],
}

# Traits to run the selection for. This used to be assigned twice (first with "mass_g",
# then with "mass_kg"); only the later assignment ever took effect, so it is the one kept.
life_history = ["lifespan", "mass_kg", "mtGC", "metabolic_rate", "temperature", "gestation_days"]
debug_local = True  # to use the local version of yspecies

In [2]:
from pathlib import Path
import sys
import inspect

lgb_params["importance_type"] = importance_type

local = (Path("..") / "yspecies").resolve()
if debug_local and local.exists():
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '/data/sources/yspecies/notebooks', '/opt/miniconda3/envs/yspecies/lib/python38.zip', '/opt/miniconda3/envs/yspecies/lib/python3.8', '/opt/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/antonkulaga/.ipython']


In [3]:
from dataclasses import dataclass, replace
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from typing import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.workflow import TupleWith, Repeat, Collect
from yspecies.config import *
from yspecies.preprocess import FeatureSelection, DataExtractor
from yspecies.partition import DataPartitioner, PartitionParameters
from yspecies.models import Metrics
from yspecies.selection import ShapSelector
from yspecies.results import FeatureSummary

In [5]:
#settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import pprint
pp = pprint.PrettyPrinter(indent=4)

#charts settings
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
%matplotlib auto
plt.ioff()
set_matplotlib_formats('svg')

Using matplotlib backend: TkAgg


### Loading data ###
Let's load the species/genes/expressions data selected by the select_samples.py notebook.

In [6]:
from pathlib import Path
# Build Locations from the current folder when it contains ./data, otherwise from the parent folder.
locations: Locations
if Path("./data").exists():
    locations = Locations("./")
else:
    locations = Locations("../")

## Setting up Features to select ##

In [7]:
# Baseline selection settings; ForTraining.load derives a per-trait copy that only
# overrides `to_predict`.
default_selection = FeatureSelection(
    to_predict="lifespan",                       # column to predict
    samples=["tissue", "species"],               # samples metadata to include
    species=[],                                  # species metadata other than the Y label to include
    categorical=["tissue"],
    exclude_from_training=["species"],           # exclude some fields from LightGBM training
    select_by="shap",
    importance_type=importance_type,
    feature_perturbation="tree_path_dependent",
)

In [8]:
@dataclass(frozen=True)
class ForTraining:
    """Everything needed to run the selection pipeline for a single trait."""
    trait: str                   # name of the trait, also used as the column to predict
    data: ExpressionDataset      # dataset loaded for this trait
    selection: FeatureSelection  # selection settings with `to_predict` set to the trait

    @property
    def tuple(self):
        """Return ``(data, selection)``, the pair fed to ``selection_pipeline.fit_transform``."""
        return (self.data, self.selection)

    @staticmethod
    def load(trait: str, default_selection):
        """Build a ForTraining for ``trait``.

        The dataset is read from ``locations.interim.selected / trait`` (``locations`` is the
        notebook-level variable), and the selection is a copy of ``default_selection`` with
        ``to_predict`` replaced by ``trait``.
        """
        dataset = ExpressionDataset.from_folder(locations.interim.selected / trait)
        return ForTraining(trait, dataset, replace(default_selection, to_predict=trait))

    def _repr_html(self):
        """Return the HTML representation of the wrapped dataset."""
        # Fixed: this used to call `data.repr_html()` on a bare `data`, which is not a name
        # in this scope and raised NameError; the dataset lives on the instance.
        # NOTE(review): IPython only auto-calls `_repr_html_` (trailing underscore), and
        # `ExpressionDataset.repr_html` is not visible here - confirm both names.
        return self.data.repr_html()

In [9]:
# Load one ForTraining per trait, then index them by trait name for the stage cells below.
selections = list(map(lambda trait: ForTraining.load(trait, default_selection), life_history))
for_training = dict((loaded.trait, loaded) for loaded in selections)
selections

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(414, 12337)","(12337, 40)",41,414,"(12337, 2)","(41, 18)"

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(262, 11991)","(11991, 18)",19,262,"(11991, 2)","(19, 18)"

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(348, 12043)","(12043, 31)",32,348,"(12043, 2)","(32, 18)"

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(258, 11973)","(11973, 17)",18,258,"(11973, 2)","(18, 18)"

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(326, 12033)","(12033, 26)",27,326,"(12033, 2)","(27, 18)"

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(411, 12337)","(12337, 39)",40,411,"(12337, 2)","(40, 18)"


## Setting up SHAP selection pipeline ##

### Deciding on selection parameters (which fields to include, exclude, predict)  ###

In [10]:
partition_params = PartitionParameters(
    n_folds,     # from the parameters cell
    n_hold_out,  # from the parameters cell
    2,           # NOTE(review): presumably species per validation fold - confirm against PartitionParameters
    [],          # NOTE(review): meaning of this list is not visible here - confirm
    42,          # NOTE(review): presumably the seed (Repeat below overrides `seed` per repeat) - confirm
)


In [11]:
# One pass of the inner pipeline: DataPartitioner, then TupleWith(lgb_params), then ShapSelector.
partition_shap_pipe = Pipeline(steps=[
    ("partitioner", DataPartitioner()),
    ("prepare_for_partitioning", TupleWith(lgb_params)),
    ("shap_computation", ShapSelector()),
])

# Repeat the inner pipeline `repeats` times. The callback keeps the first element of the
# input pair and swaps the second for a copy whose `seed` is `i`
# (presumably the repeat index - confirm against Repeat).
repeated_cv = Repeat(
    partition_shap_pipe,
    repeats,
    lambda x, i: (x[0], replace(x[1], seed=i)),
)

In [12]:
# Full selection pipeline: DataExtractor, TupleWith(partition_params), the repeated
# partition + SHAP stage, and a final Collect that wraps the results in a FeatureSummary.
selection_pipeline = Pipeline(steps=[
    ("extractor", DataExtractor()),
    ("prepare_for_partitioning", TupleWith(partition_params)),
    ("partition_shap", repeated_cv),
    ("summarize", Collect(fold=lambda results: FeatureSummary(results))),
])

# First stage selection (SHAP) #

### Lifespan ###

In [13]:
# Run the selection pipeline on the lifespan (data, selection) pair and show the selected features.
stage_one_lifespan = selection_pipeline.fit_transform(
    for_training["lifespan"].tuple
)
stage_one_lifespan.selected

===== fitting models with seed 0 =====
SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Homo_sapiens', 'Pongo_pygmaeus']
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 23.5334	valid_0's l2: 2083.75	valid_0's huber: 20.8534
Did not meet early stopping. Best iteration is:
[150]	valid_0's l1: 23.5334	valid_0's l2: 2083.75	valid_0's huber: 20.8534
SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Mus_spicilegus', 'Ailuropoda_melanoleuca']
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[98]	valid_0's l1: 3.06686	valid_0's l2: 42.5971	valid_0's huber: 2.42344
SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Ovis_aries', 'Phascolarctos_cinereus']
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 3.16835	valid_0's l2: 29.2406	valid_0's huber: 2.49493
Did not meet early stopping. Best iteration is:
[150]	valid_0's l1: 3.16835	valid_0's l2: 29.2406	valid_0's huber: 2.49493
SEED: 0 | FOLD: 3 | VALIDATION_S

Unnamed: 0,symbol,repeats,mean_shap,mean_kendall_tau,shap_0,kendall_tau_0,shap_1,kendall_tau_1,shap_2,kendall_tau_2,shap_3,kendall_tau_3,shap_4,kendall_tau_4
ENSG00000170835,CEL,5.0,8.55,0.689,9.0,0.648,9.0,0.66,8.25,0.674,7.25,0.714,9.25,0.751
ENSG00000105672,ETV2,5.0,7.8,0.693,9.25,0.674,10.5,0.697,5.5,0.712,7.75,0.697,6.0,0.688
ENSG00000132436,FIGNL1,5.0,5.5,0.582,3.75,0.6,11.5,0.578,3.75,0.548,5.0,0.619,3.5,0.565
ENSG00000171121,KCNMB3,4.0,9.5,0.627,10.5,0.644,9.5,0.601,11.75,0.619,6.25,0.641,,
ENSG00000165501,LRR1,4.0,5.438,0.688,6.25,0.728,4.25,0.739,2.25,0.59,9.0,0.694,,
ENSG00000013288,MAN2B2,4.0,4.875,-0.385,6.0,-0.427,3.25,-0.337,8.0,-0.377,2.25,-0.4,,
ENSG00000164879,CA3,4.0,4.812,-0.451,6.25,-0.439,4.75,-0.382,,,5.25,-0.448,3.0,-0.537
ENSG00000129187,DCTD,4.0,3.75,0.628,4.75,0.675,3.0,0.537,3.0,0.662,4.25,0.636,,
ENSG00000167515,TRAPPC2L,3.0,11.75,-0.238,10.25,-0.257,,,10.25,-0.222,14.75,-0.236,,
ENSG00000066923,STAG3,3.0,11.667,0.466,10.75,0.485,,,11.5,0.469,,,12.75,0.443


### Mass_kg ###

In [17]:
# Run the selection pipeline on the mass_kg (data, selection) pair and show the selected features.
stage_one_mass = selection_pipeline.fit_transform(
    for_training["mass_kg"].tuple
)
stage_one_mass.selected

===== fitting models with seed 0 =====
SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Ictidomys_tridecemlineatus', 'Monodelphis_domestica']
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 3.10991	valid_0's l2: 37.7363	valid_0's huber: 2.50013
Did not meet early stopping. Best iteration is:
[145]	valid_0's l1: 3.09676	valid_0's l2: 37.5914	valid_0's huber: 2.49098
SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Bos_taurus', 'Sarcophilus_harrisii']
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[33]	valid_0's l1: 65.5812	valid_0's l2: 21646.9	valid_0's huber: 58.6457
SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Otolemur_garnettii', 'Suricata_suricatta']
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 1.0897	valid_0's l2: 9.25689	valid_0's huber: 0.825931
Did not meet early stopping. Best iteration is:
[148]	valid_0's l1: 1.08761	valid_0's l2: 9.233	valid_0's huber: 0.824029
SEED: 0 | FOLD

Unnamed: 0,symbol,repeats,mean_shap,mean_kendall_tau,shap_0,kendall_tau_0,shap_1,kendall_tau_1,shap_2,kendall_tau_2,shap_3,kendall_tau_3,shap_4,kendall_tau_4
ENSG00000010219,DYRK4,3.0,23.917,0.262,53.25,0.015,15.0,0.322,,,,,3.5,0.449
ENSG00000124172,ATP5F1E,2.0,4.125,-0.222,,,,,3.0,-0.21,,,5.25,-0.235
ENSG00000127952,STYXL1,1.0,11.0,0.141,,,,,,,11.0,0.141,,
ENSG00000066923,STAG3,1.0,7.25,0.248,7.25,0.248,,,,,,,,
ENSG00000013288,MAN2B2,1.0,7.25,-0.012,,,,,7.25,-0.012,,,,
ENSG00000204498,NFKBIL1,1.0,6.75,-0.122,,,6.75,-0.122,,,,,,
ENSG00000117151,CTBS,1.0,6.0,-0.021,,,,,6.0,-0.021,,,,
ENSG00000175806,MSRA,1.0,5.75,-0.163,,,,,,,5.75,-0.163,,
ENSG00000054277,OPN3,1.0,5.25,0.236,,,,,5.25,0.236,,,,
ENSG00000164304,CAGE1,1.0,3.5,-0.492,,,,,,,,,3.5,-0.492


### MtGC ###

In [18]:
# Run the selection pipeline on the mtGC (data, selection) pair and show the selected features.
stage_one_mtGC = selection_pipeline.fit_transform(
    for_training["mtGC"].tuple
)
stage_one_mtGC.selected

===== fitting models with seed 0 =====
SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Ursus_americanus', 'Oryctolagus_cuniculus']
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 0.340691	valid_0's l2: 0.33695	valid_0's huber: 0.136808
Did not meet early stopping. Best iteration is:
[150]	valid_0's l1: 0.340691	valid_0's l2: 0.33695	valid_0's huber: 0.136808
SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Ovis_aries', 'Pongo_pygmaeus']
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 0.816113	valid_0's l2: 1.53847	valid_0's huber: 0.499203
Did not meet early stopping. Best iteration is:
[150]	valid_0's l1: 0.816113	valid_0's l2: 1.53847	valid_0's huber: 0.499203
SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Pan_troglodytes', 'Mus_musculus']
Training until validation scores don't improve for 10 rounds
[150]	valid_0's l1: 1.11646	valid_0's l2: 2.30624	valid_0's huber: 0.71246
Did not meet early stopping. Best iteration is:
[150]	valid_0's 

Unnamed: 0,symbol,repeats,mean_shap,mean_kendall_tau,shap_0,kendall_tau_0,shap_1,kendall_tau_1,shap_2,kendall_tau_2,shap_3,kendall_tau_3,shap_4,kendall_tau_4
ENSG00000203710,CR1,4.0,12.25,-0.497,10.5,-0.452,11.75,-0.47,20.75,-0.494,,,6.0,-0.571
ENSG00000156384,SFR1,4.0,7.188,-0.657,4.75,-0.692,5.75,-0.664,12.75,-0.587,,,5.5,-0.686
ENSG00000154582,ELOC,4.0,4.438,-0.466,3.75,-0.339,3.0,-0.556,8.5,-0.507,,,2.5,-0.465
ENSG00000171121,KCNMB3,3.0,11.167,0.643,14.5,0.663,,,4.25,0.612,,,14.75,0.653
ENSG00000188747,NOXA1,3.0,10.833,0.669,7.5,0.659,11.0,0.69,14.0,0.659,,,,
ENSG00000154328,NEIL2,3.0,7.417,0.611,6.75,0.665,,,10.5,0.666,5.0,0.503,,
ENSG00000152705,CATSPER3,3.0,7.083,0.645,,,9.75,0.685,2.5,0.649,,,9.0,0.603
ENSG00000144451,SPAG16,3.0,7.083,0.528,,,3.5,0.54,11.75,0.449,,,6.0,0.594
ENSG00000163528,CHCHD4,3.0,6.0,0.438,7.0,0.437,7.0,0.504,,,,,4.0,0.375
ENSG00000165568,AKR1E2,2.0,12.0,-0.328,9.25,-0.323,,,,,,,14.75,-0.332


### Metabolism ###

In [19]:
# Run the selection pipeline on the metabolic_rate (data, selection) pair and display the result.
# NOTE(review): the recorded output of this cell ends with
# KeyError: "None of ['ensembl_id'] are in the columns" - the cause is not visible here.
stage_one_metabolic_rate = selection_pipeline.fit_transform(
    for_training["metabolic_rate"].tuple
)
stage_one_metabolic_rate

===== fitting models with seed 0 =====
SEED: 0 | FOLD: 0 | VALIDATION_SPECIES: ['Callithrix_jacchus', 'Monodelphis_domestica']
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[36]	valid_0's l1: 8.88985	valid_0's l2: 231.829	valid_0's huber: 7.59587
SEED: 0 | FOLD: 1 | VALIDATION_SPECIES: ['Bos_taurus', 'Sarcophilus_harrisii']
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[4]	valid_0's l1: 58.1922	valid_0's l2: 15405	valid_0's huber: 51.968
SEED: 0 | FOLD: 2 | VALIDATION_SPECIES: ['Otolemur_garnettii', 'Suricata_suricatta']
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[29]	valid_0's l1: 5.93123	valid_0's l2: 120.51	valid_0's huber: 4.93311
SEED: 0 | FOLD: 3 | VALIDATION_SPECIES: ['Oryctolagus_cuniculus', 'Dasypus_novemcinctus']
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[2]	valid_0's l1: 

KeyError: "None of ['ensembl_id'] are in the columns"

### Temperature ###

In [None]:
# Run the selection pipeline on the temperature (data, selection) pair and display the result.
stage_one_temperature = selection_pipeline.fit_transform(
    for_training["temperature"].tuple
)
stage_one_temperature

### Gestation ###

In [None]:
# Run the selection pipeline on the gestation_days (data, selection) pair and show the selected features.
stage_one_gestation = selection_pipeline.fit_transform(
    for_training["gestation_days"].tuple
)
stage_one_gestation.selected

## Second stage selection ##