# Debug #

## Parameters cell ##

Parameters are overridden by papermill when the notebook is run inside DVC stages.



In [1]:
# Run-level settings (papermill may replace any of these values).
n_folds = 5  # global fold count, handed to PartitionParameters
n_hold_out = 1  # hold-out setting, handed to PartitionParameters
importance_type = "split"

# Life-history traits; `trait` is the one predicted in this run.
life_history = [
    "lifespan",
    "mass_kg",
    "mtGC",
    "metabolic_rate",
    "temperature",
    "gestation_days",
]
debug_local = True  # use the local yspecies sources when they are available
trait = "mass_kg"

# Model hyper-parameters forwarded to the SHAP selection stage of the pipeline.
params = {
    "bagging_fraction": 0.9522534844058304,
    "boosting_type": "dart",
    "objective": "regression",
    "feature_fraction": 0.42236910941558053,
    "lambda_l1": 0.020847266580277746,
    "lambda_l2": 2.8448564854773326,
    "learning_rate": 0.11484015430016059,
    "max_depth": 3,
    "max_leaves": 35,
    "min_data_in_leaf": 9,
    "num_iterations": 250,
    "metrics": ["l1", "l2", "huber"],
}

In [2]:
from pathlib import Path
import sys
import inspect

#lgb_params["importance_type"] = importance_type

local = (Path("..") / "yspecies").resolve()
if debug_local and local.exists():
  sys.path.insert(0, Path("..").as_posix())
  #sys.path.insert(0, local.as_posix())
  print("extending pathes with local yspecies")
  print(sys.path)
  %load_ext autoreload
  %autoreload 2

extending pathes with local yspecies
['..', '/data/sources/yspecies/notebooks', '/opt/miniconda3/envs/yspecies/lib/python38.zip', '/opt/miniconda3/envs/yspecies/lib/python3.8', '/opt/miniconda3/envs/yspecies/lib/python3.8/lib-dynload', '', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages', '/opt/miniconda3/envs/yspecies/lib/python3.8/site-packages/IPython/extensions', '/home/antonkulaga/.ipython']


In [3]:
from dataclasses import dataclass, replace
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from typing import *
from yspecies.dataset import *
from yspecies.utils import *
from yspecies.workflow import TupleWith, Repeat, Collect
from yspecies.config import *
from yspecies.preprocess import FeatureSelection, DataExtractor
from yspecies.partition import DataPartitioner, PartitionParameters
from yspecies.models import Metrics
from yspecies.selection import ShapSelector
from yspecies.results import FeatureSummary

In [5]:
import optuna
from optuna import Study, Trial
from optuna import multi_objective
from optuna.multi_objective import trial
from optuna.multi_objective.study import MultiObjectiveStudy
from yspecies.tuning import MultiObjectiveResults

In [6]:
#settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import pprint
pp = pprint.PrettyPrinter(indent=4)

#charts settings
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
%matplotlib auto
plt.ioff()
set_matplotlib_formats('svg')

Using matplotlib backend: TkAgg


### Loading data ###
Let's load the data for the species/genes/expressions selected by the select_samples.py notebook.

In [7]:
from pathlib import Path
# Base path handed to Locations: the current directory when it contains ./data,
# otherwise the parent directory.
base_dir = "./" if Path("./data").exists() else "../"
locations: Locations = Locations(base_dir)

## Setting up Features to select ##

In [8]:
# Describe which metadata goes into training and which label is predicted.
selection = FeatureSelection(
    samples=["tissue", "species"],  # sample-level metadata to include
    species=[],  # species-level metadata (other than the Y label) to include
    exclude_from_training=["species"],  # fields kept out of LightGBM training
    to_predict=trait,  # label column to predict
    categorical=["tissue"],
    select_by="shap",
    importance_type=importance_type,
    feature_perturbation="tree_path_dependent",
)
selection

Samples metadata,Species metadata,Genes,Predict label,not_validated species
"['tissue', 'species']",[],all,mass_kg,[]


In [9]:
# Build a loader from the configured locations and feature selection, load the
# life-history datasets and keep the one keyed by the trait being predicted.
loader = DataLoader(locations, selection)
selections = loader.load_life_history()
data = selections[trait]

## Setting up SHAP selection pipeline ##

### Deciding on selection parameters (which fields to include, exclude, predict)  ###

In [10]:
# Partitioning settings; the fold and hold-out counts come from the parameters cell.
# NOTE(review): the positional 2 is presumably the number of species per validation
# fold (every fold in the fit log lists two) and 42 the seed (the fit log reports
# "seed 42") — confirm against the PartitionParameters definition.
partition_params = PartitionParameters(n_folds, n_hold_out, 2,   42)


In [11]:
# Assemble the selection pipeline; the stages execute in the listed order.
# NOTE(review): each TupleWith step presumably bundles the upstream output with the
# given parameters for the stage that follows — confirm in yspecies.workflow.
pipeline_steps = [
    ("extractor", DataExtractor()),
    ("prepare_for_partitioning", TupleWith(partition_params)),
    ("partitioner", DataPartitioner()),
    ("prepare_for_selection", TupleWith(params)),
    ("shap_computation", ShapSelector()),
]
pipe = Pipeline(pipeline_steps)

In [12]:
# Run every pipeline stage on the trait dataset with the feature selection,
# then display the result.
res = pipe.fit_transform(data, selection)
res

2020-08-28 20:28:40.831 | INFO     | yspecies.selection:fit:78 - ===== fitting models with seed 42 =====
2020-08-28 20:28:40.831 | INFO     | yspecies.selection:fit:79 - PARAMETERS:
{'bagging_fraction': 0.9522534844058304, 'boosting_type': 'dart', 'objective': 'regression', 'feature_fraction': 0.42236910941558053, 'lambda_l1': 0.020847266580277746, 'lambda_l2': 2.8448564854773326, 'learning_rate': 0.11484015430016059, 'max_depth': 3, 'max_leaves': 35, 'min_data_in_leaf': 9, 'num_iterations': 250, 'metrics': ['l1', 'l2', 'huber']}
2020-08-28 20:28:40.850 | INFO     | yspecies.selection:fit:82 - SEED: 42 | FOLD: 0 | VALIDATION_SPECIES: ['Suricata_suricatta', 'Oryctolagus_cuniculus']
Found `num_iterations` in params. Will use it instead of argument
Early stopping is not available in dart mode


[250]	valid_0's l1: 17.8169	valid_0's l2: 757.273	valid_0's huber: 15.6606


2020-08-28 20:28:45.974 | INFO     | yspecies.selection:fit:82 - SEED: 42 | FOLD: 1 | VALIDATION_SPECIES: ['Monodelphis_domestica', 'Heterocephalus_glaber']


[250]	valid_0's l1: 13.6973	valid_0's l2: 469.715	valid_0's huber: 11.9359


2020-08-28 20:28:50.244 | INFO     | yspecies.selection:fit:82 - SEED: 42 | FOLD: 2 | VALIDATION_SPECIES: ['Callithrix_jacchus', 'Cavia_porcellus']


[250]	valid_0's l1: 16.0078	valid_0's l2: 586.886	valid_0's huber: 14.0241


2020-08-28 20:28:52.958 | INFO     | yspecies.selection:fit:82 - SEED: 42 | FOLD: 3 | VALIDATION_SPECIES: ['Otolemur_garnettii', 'Ictidomys_tridecemlineatus']


[250]	valid_0's l1: 4.83788	valid_0's l2: 125.784	valid_0's huber: 3.97368


Unnamed: 0_level_0,symbol,shap_absolute_sum_to_mass_kg,kendall_tau_to_mass_kg,Unnamed: 4_level_0,Unnamed: 5_level_0
Unnamed: 0_level_1,R^2,MAE,MSE,huber,validation_species
Unnamed: 0_level_2,R^2,MAE,MSE,huber,hold_out_species
ENSG00000066923,STAG3,32.000,0.215,,
ENSG00000054277,OPN3,22.750,0.225,,
ENSG00000144214,LYG1,22.500,0.287,,
ENSG00000089063,TMEM230,20.250,-0.118,,
ENSG00000164304,CAGE1,17.500,-0.366,,
ENSG00000127952,STYXL1,8.750,0.304,,
ENSG00000175806,MSRA,8.250,-0.216,,
ENSG00000075702,WDR62,7.000,-0.103,,
ENSG00000275111,ZNF2,1.250,0.028,,
0,0.869,17.817,757.273,15.661,"[Suricata_suricatta, Oryctolagus_cuniculus]"

Unnamed: 0,symbol,shap_absolute_sum_to_mass_kg,kendall_tau_to_mass_kg
ENSG00000066923,STAG3,32.0,0.215
ENSG00000054277,OPN3,22.75,0.225
ENSG00000144214,LYG1,22.5,0.287
ENSG00000089063,TMEM230,20.25,-0.118
ENSG00000164304,CAGE1,17.5,-0.366
ENSG00000127952,STYXL1,8.75,0.304
ENSG00000175806,MSRA,8.25,-0.216
ENSG00000075702,WDR62,7.0,-0.103
ENSG00000275111,ZNF2,1.25,0.028

Unnamed: 0,R^2,MAE,MSE,huber,validation_species
0,0.869,17.817,757.273,15.661,"[Suricata_suricatta, Oryctolagus_cuniculus]"
1,0.869,13.697,469.715,11.936,"[Monodelphis_domestica, Heterocephalus_glaber]"
2,0.769,16.008,586.886,14.024,"[Callithrix_jacchus, Cavia_porcellus]"
3,0.965,4.838,125.784,3.974,"[Otolemur_garnettii, Ictidomys_tridecemlineatus]"

Unnamed: 0,R^2,MAE,MSE,huber,hold_out_species
0,0.863,12.121,487.992,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
1,0.888,12.691,397.564,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
2,0.809,17.35,680.837,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
3,0.934,8.925,235.935,,"[Phascolarctos_cinereus, Aotus_nancymaae]"


In [17]:
# Index of the selected-features table produced by the run above
res.selected.index

Index(['ENSG00000066923', 'ENSG00000054277', 'ENSG00000144214',
       'ENSG00000089063', 'ENSG00000164304', 'ENSG00000127952',
       'ENSG00000175806', 'ENSG00000075702', 'ENSG00000275111'],
      dtype='object')

In [18]:
# Empty stand-in for `res.selected`: the same three columns, but no rows.
# The column suffix follows `trait` instead of a hard-coded "mass_kg": the pipeline's
# output columns appear to be suffixed with the predicted trait, and `trait` can be
# overridden by papermill, so a literal suffix would stop matching for other traits.
empty_selected = pd.DataFrame(
    columns=["symbol", f"shap_absolute_sum_to_{trait}", f"kendall_tau_to_{trait}"]
)
empty_selected.index.name = "ensembl_id"
empty_selected

Unnamed: 0_level_0,symbol,shap_absolute_sum_to_mass_kg,kendall_tau_to_mass_kg
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [19]:
from yspecies.results import FeatureSummary


In [23]:
# Copy of `res` whose `selected` table is swapped for the empty frame;
# dataclasses.replace builds a new object and leaves `res` itself untouched.
e = replace(res,selected=empty_selected)
e

Unnamed: 0_level_0,symbol,shap_absolute_sum_to_mass_kg,kendall_tau_to_mass_kg,Unnamed: 4_level_0,Unnamed: 5_level_0
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Unnamed: 0_level_2,R^2,MAE,MSE,huber,validation_species
Unnamed: 0_level_3,R^2,MAE,MSE,huber,hold_out_species
0,0.869,17.817,757.273,15.661,"[Suricata_suricatta, Oryctolagus_cuniculus]"
1,0.869,13.697,469.715,11.936,"[Monodelphis_domestica, Heterocephalus_glaber]"
2,0.769,16.008,586.886,14.024,"[Callithrix_jacchus, Cavia_porcellus]"
3,0.965,4.838,125.784,3.974,"[Otolemur_garnettii, Ictidomys_tridecemlineatus]"
0,0.863,12.121,487.992,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
1,0.888,12.691,397.564,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
2,0.809,17.350,680.837,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
3,0.934,8.925,235.935,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
selected,metrics,hold out metrics,,,
symbol  shap_absolute_sum_to_mass_kg  kendall_tau_to_mass_kg  ensembl_id,"R^2  MAE  MSE  huber  validation_species  0  0.869  17.817  757.273  15.661  [Suricata_suricatta, Oryctolagus_cuniculus]  1  0.869  13.697  469.715  11.936  [Monodelphis_domestica, Heterocephalus_glaber]  2  0.769  16.008  586.886  14.024  [Callithrix_jacchus, Cavia_porcellus]  3  0.965  4.838  125.784  3.974  [Otolemur_garnettii, Ictidomys_tridecemlineatus]","R^2  MAE  MSE  huber  hold_out_species  0  0.863  12.121  487.992  nan  [Phascolarctos_cinereus, Aotus_nancymaae]  1  0.888  12.691  397.564  nan  [Phascolarctos_cinereus, Aotus_nancymaae]  2  0.809  17.350  680.837  nan  [Phascolarctos_cinereus, Aotus_nancymaae]  3  0.934  8.925  235.935  nan  [Phascolarctos_cinereus, Aotus_nancymaae]",,,

Unnamed: 0_level_0,symbol,shap_absolute_sum_to_mass_kg,kendall_tau_to_mass_kg
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1

Unnamed: 0,R^2,MAE,MSE,huber,validation_species
0,0.869,17.817,757.273,15.661,"[Suricata_suricatta, Oryctolagus_cuniculus]"
1,0.869,13.697,469.715,11.936,"[Monodelphis_domestica, Heterocephalus_glaber]"
2,0.769,16.008,586.886,14.024,"[Callithrix_jacchus, Cavia_porcellus]"
3,0.965,4.838,125.784,3.974,"[Otolemur_garnettii, Ictidomys_tridecemlineatus]"

Unnamed: 0,R^2,MAE,MSE,huber,hold_out_species
0,0.863,12.121,487.992,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
1,0.888,12.691,397.564,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
2,0.809,17.35,680.837,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
3,0.934,8.925,235.935,,"[Phascolarctos_cinereus, Aotus_nancymaae]"


In [28]:
# Combine the real result and its empty-selection copy into one FeatureSummary
# and display it.
f = FeatureSummary([res, e])
f

Unnamed: 0_level_0,symbol,repeats,mean_shap,mean_kendall_tau,shap_0,kendall_tau_0,shap_1,kendall_tau_1
Unnamed: 0_level_1,R^2,MAE,MSE,huber,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Unnamed: 0_level_2,R^2,MAE,MSE,huber,hold_out_species,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ENSG00000066923,STAG3,1.000,32.0,0.215,32.000,0.215,,
ENSG00000054277,OPN3,1.000,22.75,0.225,22.750,0.225,,
ENSG00000144214,LYG1,1.000,22.5,0.287,22.500,0.287,,
ENSG00000089063,TMEM230,1.000,20.25,-0.118,20.250,-0.118,,
ENSG00000164304,CAGE1,1.000,17.5,-0.366,17.500,-0.366,,
ENSG00000127952,STYXL1,1.000,8.75,0.304,8.750,0.304,,
ENSG00000175806,MSRA,1.000,8.25,-0.216,8.250,-0.216,,
ENSG00000075702,WDR62,1.000,7.0,-0.103,7.000,-0.103,,
ENSG00000275111,ZNF2,1.000,1.25,0.028,1.250,0.028,,
0,0.869,17.817,757.273,15.661,,,,

Unnamed: 0,symbol,repeats,mean_shap,mean_kendall_tau,shap_0,kendall_tau_0,shap_1,kendall_tau_1
ENSG00000066923,STAG3,1.0,32.0,0.215,32.0,0.215,,
ENSG00000054277,OPN3,1.0,22.75,0.225,22.75,0.225,,
ENSG00000144214,LYG1,1.0,22.5,0.287,22.5,0.287,,
ENSG00000089063,TMEM230,1.0,20.25,-0.118,20.25,-0.118,,
ENSG00000164304,CAGE1,1.0,17.5,-0.366,17.5,-0.366,,
ENSG00000127952,STYXL1,1.0,8.75,0.304,8.75,0.304,,
ENSG00000175806,MSRA,1.0,8.25,-0.216,8.25,-0.216,,
ENSG00000075702,WDR62,1.0,7.0,-0.103,7.0,-0.103,,
ENSG00000275111,ZNF2,1.0,1.25,0.028,1.25,0.028,,

Unnamed: 0,R^2,MAE,MSE,huber
0,0.869,17.817,757.273,15.661
1,0.869,13.697,469.715,11.936
2,0.769,16.008,586.886,14.024
3,0.965,4.838,125.784,3.974
0,0.869,17.817,757.273,15.661
1,0.869,13.697,469.715,11.936
2,0.769,16.008,586.886,14.024
3,0.965,4.838,125.784,3.974

Unnamed: 0,R^2,MAE,MSE,huber,hold_out_species
0,0.863,12.121,487.992,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
1,0.888,12.691,397.564,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
2,0.809,17.35,680.837,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
3,0.934,8.925,235.935,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
0,0.863,12.121,487.992,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
1,0.888,12.691,397.564,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
2,0.809,17.35,680.837,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
3,0.934,8.925,235.935,,"[Phascolarctos_cinereus, Aotus_nancymaae]"
