# SHAP feature selection #
## Code to select features with a combination of LightGBM and SHAP ##

## Parameters cell ##

Parameters are overridden by papermill when run inside DVC stages.



In [2]:
number_of_bootstraps = 5  # global setting: how many bootstraps to use

# LightGBM training parameters for the regression model.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # An ordered list instead of a set: a set has no stable iteration order
    # between kernel sessions and cannot be serialised to JSON, whereas a list
    # keeps the metric order fixed and round-trips cleanly.
    'metric': ['l2', 'l1'],
    'max_leaves': 20,
    'max_depth': 3,
    'learning_rate': 0.07,
    'feature_fraction': 0.8,
    'bagging_fraction': 1,
    'min_data_in_leaf': 6,
    'lambda_l1': 0.9,
    'lambda_l2': 0.9,
    "verbose": -1
}

debug_local = True  # to use local version

In [3]:
from pathlib import Path
import sys
import inspect

# Absolute path of the sibling "yspecies" directory, one level above the
# current working directory.
local = (Path("..") / "yspecies").resolve()
# Only switch to the local checkout when debugging is requested and the
# directory actually exists.
if debug_local and local.exists():
  # Put the parent directory first on sys.path so that `import yspecies`
  # finds the local checkout before any other copy on the path.
  # NOTE(review): the inserted entry is the relative string "..", so it is
  # resolved against whatever the working directory is at import time.
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  # Enable IPython's autoreload extension; mode 2 reloads modules before
  # executing code, so edits to the local sources are picked up.
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '/data/sources/yspecies/notebooks', '/opt/miniconda3/envs/yspecies/lib/python38.zip', '/opt/miniconda3/envs/yspecies/lib/python3.8', '/opt/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/antonkulaga/.ipython']


In [6]:
from typing import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.workflow import *
from yspecies.partition import *
from yspecies.selection import *

In [7]:
from dataclasses import dataclass
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
import pandas as pd
import shap
from pprint import pprint
import random
import numpy as np
import lightgbm as lgb
from scipy.stats import kendalltau
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, recall_score, precision_score, f1_score

In [9]:
# Notebook-wide display settings.

# NOTE: this binds the name `pprint` to the module, replacing the `pprint`
# function brought in earlier by `from pprint import pprint`.
import pprint
pp = pprint.PrettyPrinter(indent=4)

# Show every column and every row, and render floats with three decimals.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.float_format = lambda value: '%.3f' % value

### Loading data ###
Let's load the species/genes/expressions data selected by the select_samples.py notebook.

In [12]:
from pathlib import Path
# Use the current directory as the base when it contains ./data,
# otherwise fall back to the parent directory.
project_root = "./" if Path("./data").exists() else "../"
locations: Locations = Locations(project_root)

In [13]:
# Build the expression dataset from the folder of selected samples,
# then display it as the cell output.
selected_folder = locations.interim.selected
data = ExpressionDataset.from_folder(selected_folder)
data

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(452, 12630)",12630,43,452,"(12630, 2)","(44, 18)"


## Setting up SHAP selection pipeline ##

In [15]:
from sklearn.pipeline import Pipeline
# Feature selection with two empty positional lists; the value to predict is "lifespan".
selection = SelectedFeatures([], [], to_predict = "lifespan")

# Build the three steps in pipeline order, then wire them together.
extractor = DataExtractor(selection)  # to extract the data required for ML from the dataset
partitioner = DataPartitioner(species_in_validation=2)  # to partition it according to sorted stratification
shap_selector = ShapSelector(ModelFactory())  # to train lightGBM and do feature selection

pipe = Pipeline([
    ('extractor', extractor),
    ("partitioner", partitioner),
    ("shap_computation", shap_selector),
])

In [16]:
# Fit the pipeline steps on the dataset and keep the output of the last
# step; the bare `d` below displays it as the cell result.
d = pipe.fit_transform(data)
d

Training until validation scores don't improve for 7 rounds
Early stopping, best iteration is:
[186]	valid_0's l2: 13.2411	valid_0's l1: 2.1018
Training until validation scores don't improve for 7 rounds
Early stopping, best iteration is:
[202]	valid_0's l2: 14.215	valid_0's l1: 2.00448
Training until validation scores don't improve for 7 rounds
Early stopping, best iteration is:
[231]	valid_0's l2: 17.9586	valid_0's l1: 2.48451
Training until validation scores don't improve for 7 rounds
Early stopping, best iteration is:
[148]	valid_0's l2: 18.698	valid_0's l1: 2.68691
Training until validation scores don't improve for 7 rounds
Early stopping, best iteration is:
[108]	valid_0's l2: 96.1841	valid_0's l1: 6.38977


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


MEAN metrics = R^2    0.928
MSE   32.059
MAE    3.133
dtype: float64


Unnamed: 0_level_0,symbol,gain_score_to_lifespan,shap,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,Unnamed: 4_level_2
ENSG00000175854,SWI5,272472.739,82.596,-0.667
ENSG00000060762,MPC1,94141.358,-52.399,-0.337
ENSG00000167515,TRAPPC2L,87799.658,-51.36,-0.193
ENSG00000010219,DYRK4,69863.470,-8.226,0.29
ENSG00000066923,STAG3,68610.154,36.085,0.511
ENSG00000148248,SURF4,12961.307,-39.933,-0.22
ENSG00000144451,SPAG16,12006.033,6.743,0.574
ENSG00000134489,HRH4,7820.301,12.407,0.648
ENSG00000129187,DCTD,6513.708,5.684,0.651
ENSG00000101928,MOSPD1,3860.975,2.97,-0.67

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,shap,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000175854,SWI5,272472.739,82.596,-0.667
ENSG00000060762,MPC1,94141.358,-52.399,-0.337
ENSG00000167515,TRAPPC2L,87799.658,-51.36,-0.193
ENSG00000010219,DYRK4,69863.47,-8.226,0.29
ENSG00000066923,STAG3,68610.154,36.085,0.511
ENSG00000148248,SURF4,12961.307,-39.933,-0.22
ENSG00000144451,SPAG16,12006.033,6.743,0.574
ENSG00000134489,HRH4,7820.301,12.407,0.648
ENSG00000129187,DCTD,6513.708,5.684,0.651
ENSG00000101928,MOSPD1,3860.975,2.97,-0.67

Unnamed: 0,R^2,MSE,MAE
0,0.984,13.241,2.102
1,0.982,14.215,2.004
2,0.972,17.959,2.485
3,0.973,18.698,2.687
4,0.727,96.184,6.39


In [12]:
# Leftover SHAP plotting experiments, currently disabled:
#shap.summary_plot('mass_g', shap_values, df)
#shap.summary_plot(shap_values, df, feature_names=shap_feature_names, sort=False, plot_type='dot', max_display=100, show=False)
# Fits the whole pipeline again on the same dataset and displays the result.
# NOTE(review): this cell's recorded execution count is lower than that of the
# surrounding cells, so its saved output presumably comes from an earlier run;
# no random seed is set anywhere in the visible notebook, which may explain
# why the metrics differ between runs — confirm.
pipe.fit_transform(data)

Training until validation scores don't improve for 7 rounds
Early stopping, best iteration is:
[329]	valid_0's l2: 13.925	valid_0's l1: 2.08957
Training until validation scores don't improve for 7 rounds
Early stopping, best iteration is:
[278]	valid_0's l2: 16.2151	valid_0's l1: 2.27646
Training until validation scores don't improve for 7 rounds
Early stopping, best iteration is:
[284]	valid_0's l2: 15.0505	valid_0's l1: 2.32935
Training until validation scores don't improve for 7 rounds
Early stopping, best iteration is:
[236]	valid_0's l2: 17.9281	valid_0's l1: 2.3675
Training until validation scores don't improve for 7 rounds
Early stopping, best iteration is:
[164]	valid_0's l2: 757.376	valid_0's l1: 12.9911


Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


MEAN metrics = R^2     0.092
MSE   164.099
MAE     4.411
dtype: float64


Unnamed: 0_level_0,symbol,gain_score_to_lifespan,shap,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,Unnamed: 4_level_2
ENSG00000175854,SWI5,295215.966,49.959,-0.686
ENSG00000060762,MPC1,98518.725,-91.705,-0.189
ENSG00000167515,TRAPPC2L,74403.568,-30.898,-0.223
ENSG00000066923,STAG3,46875.758,25.416,0.486
ENSG00000164879,CA3,11046.768,6.12,-0.561
ENSG00000173567,ADGRF3,9564.103,37.771,0.491
ENSG00000134489,HRH4,7922.326,9.717,0.589
ENSG00000129187,DCTD,4958.421,0.655,0.682
ENSG00000074621,SLC24A1,4157.085,-3.102,0.545
ENSG00000118600,RXYLT1,3944.464,26.152,-0.724

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,shap,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000175854,SWI5,295215.966,49.959,-0.686
ENSG00000060762,MPC1,98518.725,-91.705,-0.189
ENSG00000167515,TRAPPC2L,74403.568,-30.898,-0.223
ENSG00000066923,STAG3,46875.758,25.416,0.486
ENSG00000164879,CA3,11046.768,6.12,-0.561
ENSG00000173567,ADGRF3,9564.103,37.771,0.491
ENSG00000134489,HRH4,7922.326,9.717,0.589
ENSG00000129187,DCTD,4958.421,0.655,0.682
ENSG00000074621,SLC24A1,4157.085,-3.102,0.545
ENSG00000118600,RXYLT1,3944.464,26.152,-0.724

Unnamed: 0,R^2,MSE,MAE
0,0.955,13.925,2.09
1,0.946,16.215,2.276
2,0.947,15.051,2.329
3,0.938,17.928,2.368
4,-3.325,757.376,12.991


In [17]:
# Apply the pipeline to the dataset via `transform` only (no `fit` call),
# reusing whatever state the most recent fit left on `pipe`.
pipe.transform(data)

Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
Setting feature_perturbation = "tree_path_dependent" because no background data was given.


MEAN metrics = R^2    0.957
MSE   18.273
MAE    2.233
dtype: float64


Unnamed: 0_level_0,symbol,gain_score_to_lifespan,shap,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unnamed: 0_level_2,R^2,MSE,MAE,Unnamed: 4_level_2
ENSG00000175854,SWI5,272472.739,82.596,-0.667
ENSG00000060762,MPC1,94141.358,-52.399,-0.337
ENSG00000167515,TRAPPC2L,87799.658,-51.36,-0.193
ENSG00000010219,DYRK4,69863.470,-8.226,0.29
ENSG00000066923,STAG3,68610.154,36.085,0.511
ENSG00000148248,SURF4,12961.307,-39.933,-0.22
ENSG00000144451,SPAG16,12006.033,6.743,0.574
ENSG00000134489,HRH4,7820.301,12.407,0.648
ENSG00000129187,DCTD,6513.708,5.684,0.651
ENSG00000101928,MOSPD1,3860.975,2.97,-0.67

Unnamed: 0_level_0,symbol,gain_score_to_lifespan,shap,kendall_tau_to_lifespan
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000175854,SWI5,272472.739,82.596,-0.667
ENSG00000060762,MPC1,94141.358,-52.399,-0.337
ENSG00000167515,TRAPPC2L,87799.658,-51.36,-0.193
ENSG00000010219,DYRK4,69863.47,-8.226,0.29
ENSG00000066923,STAG3,68610.154,36.085,0.511
ENSG00000148248,SURF4,12961.307,-39.933,-0.22
ENSG00000144451,SPAG16,12006.033,6.743,0.574
ENSG00000134489,HRH4,7820.301,12.407,0.648
ENSG00000129187,DCTD,6513.708,5.684,0.651
ENSG00000101928,MOSPD1,3860.975,2.97,-0.67

Unnamed: 0,R^2,MSE,MAE
0,0.959,11.934,1.772
1,0.963,10.769,1.725
2,0.969,8.59,1.634
3,0.936,17.921,2.404
4,0.959,42.152,3.631
