Load expressions
----------------

### imports ###

In [10]:
from typing import *
from yspecies import *
from yspecies.enums import *
from yspecies.dataset import *
from yspecies.misc import *
from yspecies.workflow import *

In [2]:
from dataclasses import dataclass
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
import pandas as pd
import shap
from pprint import pprint
import random
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import lightgbm as lgb
from scipy.stats import kendalltau
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, recall_score, precision_score, f1_score

In [4]:
# Display settings: do not truncate DataFrame columns or rows when rendering,
# and format floats with three decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# NOTE(review): this rebinds the name `pprint` to the module, shadowing the
# `pprint` function brought in earlier by `from pprint import pprint`.
import pprint
pp = pprint.PrettyPrinter(indent=4)  # shared pretty-printer using a 4-space indent

## Parameters cell ##

Parameters are overridden by papermill when the notebook is run inside DVC stages.



In [5]:
number_of_bootstraps = 5 # global setting: how many bootstraps to use

### Loading data ###

In [15]:
from pathlib import Path
# Build Locations from "./" when a ./data folder exists next to the working
# directory, otherwise from "../" (presumably so the notebook also works when
# started from a subfolder of the project -- TODO confirm).
locations: Locations = Locations("./") if Path("./data").exists() else Locations("../")

In [19]:
# Load the expression dataset from the folder referenced by locations.interim.selected.
data = ExpressionDataset.from_folder(locations.interim.selected)
data  # last expression of the cell: renders the dataset summary as the cell output

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(445, 12243)",12243,39,445,,


In [19]:

# Encoder instances created once at notebook (global) scope so that helper
# functions defined in later cells can share them.
le_tissue = LabelEncoder()
le_order = LabelEncoder()
# One-hot encoders are configured with handle_unknown='ignore'.
enc_tissue = OneHotEncoder(handle_unknown='ignore')
enc_order = OneHotEncoder(handle_unknown='ignore')
    


In [7]:
def regression_model_lightgbm(X_train, X_test, y_train, y_test, categorical):
    """Train a LightGBM gradient-boosted regression model with early stopping.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, wrapped in an ``lgb.Dataset``.
    X_test, y_test
        Validation features and targets; used as the only validation set
        (evaluation metrics and early stopping).
    categorical
        Forwarded as ``categorical_feature`` of the training dataset.

    Returns
    -------
    The booster object returned by ``lgb.train``.
    """
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical)
    # The validation set references the training set so both share the same binning.
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    evals_result = {}

    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        # A list (not a set) keeps the metric order identical between runs;
        # set iteration order of strings changes with hash randomisation,
        # which made the order of the evaluated metrics non-reproducible.
        'metric': ['l2', 'l1'],
        'max_leaves': 20,
        'max_depth': 3,
        'learning_rate': 0.07,
        'feature_fraction': 0.8,
        'bagging_fraction': 1,
        'min_data_in_leaf': 6,
        'lambda_l1': 0.9,
        'lambda_l2': 0.9,
        "verbose": -1
    }

    # NOTE(review): `evals_result`, `verbose_eval` and `early_stopping_rounds`
    # are keyword arguments of the pre-4.0 `lgb.train` API; newer releases
    # expect the equivalent callbacks instead -- confirm against the pinned
    # LightGBM version before upgrading.
    gbm = lgb.train(params,
        lgb_train,
        num_boost_round=500,
        valid_sets=lgb_eval,
        evals_result=evals_result,
        verbose_eval=1000,
        early_stopping_rounds=7)

    return gbm

In [8]:
def sorted_stratification(X, Y, k, species_validation=True):
    """Split samples into ``k`` folds stratified by the sorted target value.

    Samples are sorted by target and dealt round-robin into ``k`` folds, so
    every fold spans the whole target range. With ``species_validation`` two
    randomly chosen species (never ``'Human'``) are additionally assigned to
    each fold: all samples of those species are moved into that fold and
    removed from every other fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; must contain a ``common_name`` column. It is NOT
        modified (the function works on a copy).
    Y : array-like
        Target values, one per row of ``X``.
    k : int
        Number of folds (>= 1).
    species_validation : bool
        Whether to hold out whole species per fold (uses the ``random`` module).

    Returns
    -------
    tuple
        ``(X_sorted, Y_sorted, partition_Xs, partition_Ys, common_name_partitions)``
        where ``X_sorted`` is the target-sorted feature frame without the
        ``target``/``common_name`` columns, ``Y_sorted`` the sorted targets and
        the three lists hold, per fold, the features, targets and species names.

    Raises
    ------
    ValueError
        If ``k < 1`` or, with ``species_validation``, fewer than ``2 * k``
        non-Human species are available.
    """
    if k < 1:
        raise ValueError('k must be a positive number of folds, got %r' % (k,))

    # assign() returns a new frame, so the caller's DataFrame does not gain a
    # 'target' column as a side effect.
    X = X.assign(target=Y).sort_values(by=['target'])
    n_samples = len(X)
    common_names = X['common_name'].values

    k_sets_indexes = []           # per fold: positions of the held-out species' samples
    k_sets_of_species_names = []  # per fold: the two held-out species names
    if species_validation:
        # unique() keeps order of appearance, so the candidate list (and thus a
        # seeded draw) is reproducible, unlike list(set(...)).
        candidates = X.loc[X['common_name'] != 'Human', 'common_name'].unique().tolist()
        needed = 2 * k
        if len(candidates) < needed:
            # Drawing distinct species by rejection would never terminate here.
            raise ValueError(
                'species validation needs %d distinct non-Human species, only %d available'
                % (needed, len(candidates)))
        chosen = random.sample(candidates, needed)
        for fold in range(k):
            pair = chosen[2 * fold: 2 * fold + 2]
            k_sets_of_species_names.append(pair)
            k_sets_indexes.append(
                [position for position, name in enumerate(common_names) if name in pair])

    # Round-robin over the target-sorted positions: fold j receives positions
    # j, j + k, j + 2k, ... This also covers the remainder and the
    # n_samples < k case, where position 0 used to be skipped.
    partition_indexes = [list(range(fold, n_samples, k)) for fold in range(k)]

    if species_validation:
        print('Species for validation', k_sets_of_species_names)
        for fold in range(k):
            members = set(partition_indexes[fold])
            for other, held_out in enumerate(k_sets_indexes):
                if other == fold:
                    members |= set(held_out)
                else:
                    members -= set(held_out)
            # Sorted so every fold keeps the target ordering deterministically.
            partition_indexes[fold] = sorted(members)

    Y = X['target'].values
    X = X.drop(['target', 'common_name'], axis=1)

    # All indexes are positional (row positions in the sorted frame).
    partition_Xs = [X.iloc[pindex] for pindex in partition_indexes]
    partition_Ys = [Y[pindex] for pindex in partition_indexes]
    common_name_partitions = [common_names[pindex] for pindex in partition_indexes]

    return X, Y, partition_Xs, partition_Ys, common_name_partitions

In [22]:
def calculate_metrics(prediction, ground_truth):
    """Score predictions against the ground truth.

    Returns a dict with the keys ``'R2'``, ``'MSE'`` and ``'MAE'`` holding the
    results of ``r2_score``, ``mean_squared_error`` and ``mean_absolute_error``
    (each called as ``metric(ground_truth, prediction)``).
    """
    scorers = (
        ('R2', r2_score),
        ('MSE', mean_squared_error),
        ('MAE', mean_absolute_error),
    )
    return {name: scorer(ground_truth, prediction) for name, scorer in scorers}
    
def encode_tissues(dataframe):
    """Add an integer-coded ``tissue_encoded`` column derived from ``tissue``.

    The global ``le_tissue`` label encoder is (re)fitted on the tissue values
    of ``dataframe`` and then used to transform them. The previous code
    referenced an undefined name ``le`` and failed with ``NameError`` on a
    fresh kernel.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``tissue`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, now with the ``tissue_encoded`` column.
    """
    tissues = dataframe['tissue'].values
    dataframe['tissue_encoded'] = le_tissue.fit_transform(tissues)

    return dataframe
    
    
def split_to_X_and_Y(dataframe, label_to_predict):
    """Separate a frame into features and the target column.

    The ``label_to_predict`` column is always removed from the features; a raw
    ``tissue`` column, when present, is removed as well and the position of
    ``tissue_encoded`` among the remaining columns is reported as the
    categorical feature index (``None`` when there is no ``tissue`` column).

    Returns
    -------
    tuple
        ``(feature_frame, feature_values, target_values, categorical_index)``.
    """
    has_tissue = 'tissue' in dataframe.columns
    columns_to_drop = [label_to_predict, 'tissue'] if has_tissue else [label_to_predict]

    X = dataframe.drop(columns_to_drop, axis=1)
    Y = dataframe[label_to_predict].values

    index_of_categorical_feature = None
    if has_tissue:
        index_of_categorical_feature = list(X.columns).index('tissue_encoded')

    return X, X.values, Y, index_of_categorical_feature
    
    
def get_predictions(label_to_predict, ids=None, data_path='cross_species_df_merged.csv'):
    """Prepare the cross-species table for one target and compute stable SHAP values.

    Parameters
    ----------
    label_to_predict : str
        Name of the target column.
    ids : collection of str, optional
        When given (and non-empty), only these columns are kept as features;
        otherwise every column whose name contains ``'ENSG'`` is kept.
    data_path : str or path-like
        Location of the merged cross-species CSV. Defaults to the previously
        hard-coded relative path, so existing calls behave as before.

    Returns
    -------
    tuple
        ``(shap_values, feature_df, features_weighted)`` where the first and
        last items are the ``'stable_shap_values'`` and
        ``'list_of_weighted_features'`` entries of the object returned by
        ``calculate_stable_shap_values`` (defined outside this function).

    Raises
    ------
    FileNotFoundError
        If ``data_path`` does not exist.
    """
    species_data = pd.read_csv(data_path, low_memory=False)

    # remove other features (redundant and those that correlate with target)
    always_kept = ('tissue', label_to_predict)
    if ids:
        def is_feature(column):
            return column in ids
    else:
        def is_feature(column):
            return 'ENSG' in column
    cols_to_delete = [
        column for column in species_data.columns
        if column not in always_kept and not is_feature(column)
    ]

    species_data = species_data.drop(cols_to_delete, axis=1)

    species_data = species_data[(~pd.isnull(species_data[label_to_predict]))] # select only row where target is set
    species_data = species_data.dropna(axis=1, thresh=int(len(species_data)*0.9)) # remove all genes where percentage of NaN > 10%
    # remove underrepresented tissues; copy so that the in-place column
    # assignment done by encode_tissues does not write into a slice of the
    # previous frame (SettingWithCopyWarning).
    species_data = species_data[species_data['tissue'].isin(['Lung', 'Liver', 'Kidney', 'Brain', 'Heart'])].copy()
    species_data = encode_tissues(species_data)

    print('Number of samples', len(species_data))
    print('Number of genes', len(species_data.columns))

    feature_df, X, Y, index_of_categorical = split_to_X_and_Y(species_data, label_to_predict)

    object_from_training = calculate_stable_shap_values(feature_df, Y, index_of_categorical, label_to_predict)
    features_weighted = object_from_training['list_of_weighted_features']
    shap_values = object_from_training['stable_shap_values']

    return shap_values, feature_df, features_weighted

### Get list of selected genes for each variable

In [23]:
# Accumulators over all target variables (despite the `lifespan_` prefix they
# collect results for every label in the loop below).
lifespan_weighted_features = []  # flat list: weighted features of all labels concatenated
lifespan_shap_values = []        # one entry per label
lifespan_dataframes = []         # one feature frame per label

# Run the prediction / SHAP pipeline once per target variable.
for label in ['gestation_days', 'max_lifespan', 'mass_g', 'temperature_celsius', 'metabolic_rate', 'mtGC']:
    shap_values, feature_df, weighted_features = get_predictions(label)
    lifespan_weighted_features += weighted_features
    lifespan_shap_values.append(shap_values)
    lifespan_dataframes.append(feature_df)
    

FileNotFoundError: [Errno 2] File cross_species_df_merged.csv does not exist: 'cross_species_df_merged.csv'