Load expressions
----------------

### imports ###

In [1]:
from typing import *
from yspecies import *
from yspecies.enums import *
from yspecies.dataset import *
from yspecies.misc import *
from yspecies.workflow import *

In [2]:
from dataclasses import dataclass
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
import pandas as pd
import shap
from pprint import pprint
import random
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import lightgbm as lgb
from scipy.stats import kendalltau
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, recall_score, precision_score, f1_score

In [4]:
#settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import pprint
pp = pprint.PrettyPrinter(indent=4)

## Parameters cell ##

Parameters are overiddent by papermill when run inside DVC stages



In [5]:
number_of_bootstraps = 5 # this sets global setting of which how many bootstraps to use

lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'max_leaves': 20,
    'max_depth': 3,
    'learning_rate': 0.07,
    'feature_fraction': 0.8,
    'bagging_fraction': 1,
    'min_data_in_leaf': 6,
    'lambda_l1': 0.9,
    'lambda_l2': 0.9,
    "verbose": -1
}

### Loading data ###

In [6]:
from pathlib import Path
locations: Locations = Locations("./") if Path("./data").exists() else Locations("../")

In [7]:
data = ExpressionDataset.from_folder(locations.interim.selected)
data

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(452, 12630)",12630,43,452,"(67996, 3)","(44, 18)"


In [8]:
"species" in  data.samples.columns.to_list()

True

In [13]:
data.extended_samples(["tissue"], ["animal_class", "lifespan"])

Unnamed: 0_level_0,tissue,species,animal_class,lifespan
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SRR1287653,Blood,Ailuropoda_melanoleuca,Mammalia,36.8
SRR1287654,Blood,Ailuropoda_melanoleuca,Mammalia,36.8
SRR1287655,Blood,Ailuropoda_melanoleuca,Mammalia,36.8
SRR2308103,Liver,Ailuropoda_melanoleuca,Mammalia,36.8
SRR1981979,Brain,Aotus_nancymaae,Mammalia,20.0
SRR1981981,Liver,Aotus_nancymaae,Mammalia,20.0
SRR1981987,Heart,Aotus_nancymaae,Mammalia,20.0
SRR1981988,Kidney,Aotus_nancymaae,Mammalia,20.0
SRR636839,Liver,Bos_taurus,Mammalia,20.0
SRR636840,Liver,Bos_taurus,Mammalia,20.0


In [16]:
ls = data.samples.columns.to_list()
data.extended_samples(ls)

TypeError: argument of type 'NoneType' is not iterable

In [17]:
from yspecies import *
from yspecies.enums import *
from yspecies.dataset import *
from yspecies.misc import *
from yspecies.workflow import *

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd
import shap
from pprint import pprint
import random
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import lightgbm as lgb
from scipy.stats import kendalltau
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, recall_score, precision_score, f1_score

from typing import *
from dataclasses import *

class FeaturePartitioner:

    gene_features: List[str] = None
    samples_features: List[str] = None
    species_features: List[str] = None
    label_to_predict: str = "lifespan"

    def get(self, df: pd.DataFrame, feature: str or List = None):
        if feature is None:
            return df
        elif isinstance(feature, str):
            return df[[feature]]
        else:
            return df if len(feature) == 0 else df[feature]

    def intersection(self, lst1: List, lst2: List):
        return list(set(lst1) & set(lst2))

    def sp(self, data: ExpressionDataset):
        exp = self.get(data.expressions, self.gene_features)
        if self.samples_features is None:
            if self.species_features is None:
               X = exp
            else:
               species = self.get(data.species, self.species_features)
               X = data.samples[["species"]].merge(species, left_on="species", right_index=True)
        else:
            if self.species_features is None:
                X = self.get(data.samples, self.samples_features)
            else:
                self.samples_features.extend()
                if "species" not in self.samples_features:
                    self.species_features.append("species")
                species = self.get(data.species, self.species_features)
                samples = self.get(data.samples, ["species"] + self.samples_features)
                X = samples.merge(species, left_on="species", right_index=True)
        if self.label_to_predict in data.species.columns:
            spY = self.intersection(self.label_to_predict, data.species.columns)
            samY = self.intersection(self.label_to_predict, data.samples.columns)
            Y = data.samples[samY] if(len(spY) ==0) else data.samples[samY + ["species"]].merge(species[spY], left_on="species", right_index=True).drop("species")
            self.calculate_stable_shap_values(X, Y, self.label_to_predict)

    def sorted_stratification(self, X: pd.DataFrame, Y: pd.DataFrame, k: int):
        X['target'] = Y
        X = X.sort_values(by=['target'])
        partition_indexes = [[] for i in range(k)]
        i = 0
        index_of_sample = 0

        while i < (int(len(Y)/k)):
            for j in range(k):
                partition_indexes[j].append((i*k)+j)
                index_of_sample = (i*k)+j
            i += 1

        index_of_sample += 1
        i = 0
        while index_of_sample < len(Y):
            partition_indexes[i].append(index_of_sample)
            index_of_sample += 1
            i+=1

        X_features = X.drop(['target'], axis=1)
        Y = X['target'].values
        X = X.drop(['target'], axis=1)

        partition_Xs = []
        partition_Ys = []
        for pindex in partition_indexes:
            partition_Xs.append(X_features.iloc[pindex])
            partition_Ys.append(Y[pindex])

        return X, Y, partition_Xs, partition_Ys

In [16]:
f = FeaturePartitioner()
f.

Unnamed: 0_level_0,bioproject,series,species,tissue,sample_name,characteristics,sequencer,age,sex,tumor,source,study,study_title,salmon_version,library_layout,library_selection,library_strategy,lib_type,bootstrap,protocol,common_name,animal_class,order,family,lifespan,ensembl_url,mass_g,metabolic_rate,temperature,temperature_kelvin,gestation_days,taxon,female_maturity_days,male_maturity_days,litters_per_year,inter_birth_interval,birth_weight_g,weaning_weight_g
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
SRR1287653,PRJNA247712,PRJNA247712,Ailuropoda_melanoleuca,Blood,SL01,no;Model organism or animal;19;female;blood;SL...,Illumina_HiSeq_2000,19,female,no,blood,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,The giant panda blood Transcriptome,1.1.0,PAIRED,PCR,RNA-Seq,A,96,,Giant panda,Mammalia,Carnivora,Ursidae,36.8,https://www.ensembl.org/Ailuropoda_melanoleuca,,,,,48.0,taxon#9646,1.5,2192.0,0.7,548.0,110.0,
SRR1287654,PRJNA247712,PRJNA247712,Ailuropoda_melanoleuca,Blood,XB01,no;Model organism or animal;12;male;blood;XB01...,Illumina_HiSeq_2000,12,male,no,blood,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,The giant panda blood Transcriptome,1.1.0,PAIRED,PCR,RNA-Seq,A,96,,Giant panda,Mammalia,Carnivora,Ursidae,36.8,https://www.ensembl.org/Ailuropoda_melanoleuca,,,,,48.0,taxon#9646,1.5,2192.0,0.7,548.0,110.0,
SRR1287655,PRJNA247712,PRJNA247712,Ailuropoda_melanoleuca,Blood,XB02,no;Model organism or animal;6;female;blood;XB0...,Illumina_HiSeq_2000,6,female,no,blood,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,The giant panda blood Transcriptome,1.1.0,PAIRED,PCR,RNA-Seq,A,128,,Giant panda,Mammalia,Carnivora,Ursidae,36.8,https://www.ensembl.org/Ailuropoda_melanoleuca,,,,,48.0,taxon#9646,1.5,2192.0,0.7,548.0,110.0,
SRR2308103,PRJNA293919,PRJNA293919,Ailuropoda_melanoleuca,Liver,PandaM_liver,no;tissue sample;adult;Model organism or anima...,Illumina_Genome_Analyzer_IIx,,male,no,liver,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,Ailuropoda melanoleuca Transcriptome or Gene e...,1.1.0,PAIRED,PolyA,RNA-Seq,A,96,,Giant panda,Mammalia,Carnivora,Ursidae,36.8,https://www.ensembl.org/Ailuropoda_melanoleuca,,,,,48.0,taxon#9646,1.5,2192.0,0.7,548.0,110.0,
SRR1981979,PRJNA280454,PRJNA280454,Aotus_nancymaae,Brain,ANAN.00-37965,no;Model organism or animal;85725;adult;female...,Illumina_HiSeq_2000,adult,female,no,brain - frontal cortex,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,Owl monkey transcriptome,0.14.1,PAIRED,cDNA,RNA-Seq,A,128,,Nancy Ma's night monkey,Mammalia,Primates,Aotidae,20.0,https://www.ensembl.org/Aotus_nancymaae/,1254.0,,,,133.0,taxon#37293,,,,,,
SRR1981981,PRJNA280454,PRJNA280454,Aotus_nancymaae,Liver,ANAN.00-37969,no;Model organism or animal;85725;adult;female...,Illumina_HiSeq_2000,adult,female,no,liver,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,Owl monkey transcriptome,0.14.1,PAIRED,cDNA,RNA-Seq,A,128,,Nancy Ma's night monkey,Mammalia,Primates,Aotidae,20.0,https://www.ensembl.org/Aotus_nancymaae/,1254.0,,,,133.0,taxon#37293,,,,,,
SRR1981987,PRJNA280454,PRJNA280454,Aotus_nancymaae,Heart,ANAN.00-37972,no;Model organism or animal;85725;adult;female...,Illumina_HiSeq_2000,adult,female,no,heart,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,Owl monkey transcriptome,0.14.1,PAIRED,cDNA,RNA-Seq,A,128,,Nancy Ma's night monkey,Mammalia,Primates,Aotidae,20.0,https://www.ensembl.org/Aotus_nancymaae/,1254.0,,,,133.0,taxon#37293,,,,,,
SRR1981988,PRJNA280454,PRJNA280454,Aotus_nancymaae,Kidney,ANAN.00-37974,no;Model organism or animal;85725;adult;female...,Illumina_HiSeq_2000,adult,female,no,kidney,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,Owl monkey transcriptome,0.14.1,PAIRED,cDNA,RNA-Seq,A,128,,Nancy Ma's night monkey,Mammalia,Primates,Aotidae,20.0,https://www.ensembl.org/Aotus_nancymaae/,1254.0,,,,133.0,taxon#37293,,,,,,
SRR636839,PRJNA184055,geo/query/acc.cgi?acc=GSE43013,Bos_taurus,Liver,GSM1054989,wild type;adult;normal liver;whole liver;Bos t...,Illumina_Genome_Analyzer_IIx,adult,,no,normal liver,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,Gene Expression Defines Natural Changes in Mam...,1.2.1,PAIRED,cDNA,RNA-Seq,A,128,,Domestic cattle,Mammalia,Artiodactyla,Bovidae,20.0,https://www.ensembl.org/Bos_taurus,347000.0,306.77,38.0,311.15,277.0,taxon#9913,1.0,,1.0,,,
SRR636840,PRJNA184055,geo/query/acc.cgi?acc=GSE43013,Bos_taurus,Liver,GSM1054990,wild type;adult;normal liver;whole liver;Bos t...,Illumina_Genome_Analyzer_IIx,adult,,no,normal liver,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,Gene Expression Defines Natural Changes in Mam...,1.2.1,PAIRED,cDNA,RNA-Seq,A,128,,Domestic cattle,Mammalia,Artiodactyla,Bovidae,20.0,https://www.ensembl.org/Bos_taurus,347000.0,306.77,38.0,311.15,277.0,taxon#9913,1.0,,1.0,,,


In [67]:
data.samples[["species"]].merge(species, left_on="species", right_index=True)

Unnamed: 0_level_0,species,common_name,animal_class,order,family,lifespan,ensembl_url,mass_g,metabolic_rate,temperature,temperature_kelvin,gestation_days,taxon,female_maturity_days,male_maturity_days,litters_per_year,inter_birth_interval,birth_weight_g,weaning_weight_g
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
SRR1287653,Ailuropoda_melanoleuca,Giant panda,Mammalia,Carnivora,Ursidae,36.8,https://www.ensembl.org/Ailuropoda_melanoleuca,,,,,48.0,taxon#9646,1.5,2192.0,0.7,548.0,110.0,
SRR1287654,Ailuropoda_melanoleuca,Giant panda,Mammalia,Carnivora,Ursidae,36.8,https://www.ensembl.org/Ailuropoda_melanoleuca,,,,,48.0,taxon#9646,1.5,2192.0,0.7,548.0,110.0,
SRR1287655,Ailuropoda_melanoleuca,Giant panda,Mammalia,Carnivora,Ursidae,36.8,https://www.ensembl.org/Ailuropoda_melanoleuca,,,,,48.0,taxon#9646,1.5,2192.0,0.7,548.0,110.0,
SRR2308103,Ailuropoda_melanoleuca,Giant panda,Mammalia,Carnivora,Ursidae,36.8,https://www.ensembl.org/Ailuropoda_melanoleuca,,,,,48.0,taxon#9646,1.5,2192.0,0.7,548.0,110.0,
SRR1981979,Aotus_nancymaae,Nancy Ma's night monkey,Mammalia,Primates,Aotidae,20.0,https://www.ensembl.org/Aotus_nancymaae/,1254.0,,,,133.0,taxon#37293,,,,,,
SRR1981981,Aotus_nancymaae,Nancy Ma's night monkey,Mammalia,Primates,Aotidae,20.0,https://www.ensembl.org/Aotus_nancymaae/,1254.0,,,,133.0,taxon#37293,,,,,,
SRR1981987,Aotus_nancymaae,Nancy Ma's night monkey,Mammalia,Primates,Aotidae,20.0,https://www.ensembl.org/Aotus_nancymaae/,1254.0,,,,133.0,taxon#37293,,,,,,
SRR1981988,Aotus_nancymaae,Nancy Ma's night monkey,Mammalia,Primates,Aotidae,20.0,https://www.ensembl.org/Aotus_nancymaae/,1254.0,,,,133.0,taxon#37293,,,,,,
SRR636839,Bos_taurus,Domestic cattle,Mammalia,Artiodactyla,Bovidae,20.0,https://www.ensembl.org/Bos_taurus,347000.0,306.77,38.0,311.15,277.0,taxon#9913,1.0,,1.0,,,
SRR636840,Bos_taurus,Domestic cattle,Mammalia,Artiodactyla,Bovidae,20.0,https://www.ensembl.org/Bos_taurus,347000.0,306.77,38.0,311.15,277.0,taxon#9913,1.0,,1.0,,,


In [65]:
species = pd.read_csv(locations.interim.selected / "species.tsv", sep="\t", index_col="species")

In [56]:
show_wide(data.samples[["tissue"]].join(data.expressions),3)

Unnamed: 0_level_0,tissue,ENSG00000139990,ENSG00000073921
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SRR1287653,Blood,19.259,79.624
SRR1287654,Blood,12.205,49.298
SRR1287655,Blood,16.477,67.619


In [19]:
data.samples

Unnamed: 0_level_0,bioproject,series,species,tissue,sample_name,characteristics,sequencer,age,sex,tumor,source,study,study_title,salmon_version,library_layout,library_selection,library_strategy,lib_type,bootstrap,protocol,common_name,animal_class,order,family,lifespan,ensembl_url,mass_g,metabolic_rate,temperature,temperature_kelvin,gestation_days,taxon,female_maturity_days,male_maturity_days,litters_per_year,inter_birth_interval,birth_weight_g,weaning_weight_g
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
SRR1287653,PRJNA247712,PRJNA247712,Ailuropoda_melanoleuca,Blood,SL01,no;Model organism or animal;19;female;blood;SL...,Illumina_HiSeq_2000,19,female,no,blood,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,The giant panda blood Transcriptome,1.1.0,PAIRED,PCR,RNA-Seq,A,96,,Giant panda,Mammalia,Carnivora,Ursidae,36.8,https://www.ensembl.org/Ailuropoda_melanoleuca,,,,,48.0,taxon#9646,1.5,2192.0,0.7,548.0,110.0,
SRR1287654,PRJNA247712,PRJNA247712,Ailuropoda_melanoleuca,Blood,XB01,no;Model organism or animal;12;male;blood;XB01...,Illumina_HiSeq_2000,12,male,no,blood,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,The giant panda blood Transcriptome,1.1.0,PAIRED,PCR,RNA-Seq,A,96,,Giant panda,Mammalia,Carnivora,Ursidae,36.8,https://www.ensembl.org/Ailuropoda_melanoleuca,,,,,48.0,taxon#9646,1.5,2192.0,0.7,548.0,110.0,
SRR1287655,PRJNA247712,PRJNA247712,Ailuropoda_melanoleuca,Blood,XB02,no;Model organism or animal;6;female;blood;XB0...,Illumina_HiSeq_2000,6,female,no,blood,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,The giant panda blood Transcriptome,1.1.0,PAIRED,PCR,RNA-Seq,A,128,,Giant panda,Mammalia,Carnivora,Ursidae,36.8,https://www.ensembl.org/Ailuropoda_melanoleuca,,,,,48.0,taxon#9646,1.5,2192.0,0.7,548.0,110.0,
SRR2308103,PRJNA293919,PRJNA293919,Ailuropoda_melanoleuca,Liver,PandaM_liver,no;tissue sample;adult;Model organism or anima...,Illumina_Genome_Analyzer_IIx,,male,no,liver,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,Ailuropoda melanoleuca Transcriptome or Gene e...,1.1.0,PAIRED,PolyA,RNA-Seq,A,96,,Giant panda,Mammalia,Carnivora,Ursidae,36.8,https://www.ensembl.org/Ailuropoda_melanoleuca,,,,,48.0,taxon#9646,1.5,2192.0,0.7,548.0,110.0,
SRR1981979,PRJNA280454,PRJNA280454,Aotus_nancymaae,Brain,ANAN.00-37965,no;Model organism or animal;85725;adult;female...,Illumina_HiSeq_2000,adult,female,no,brain - frontal cortex,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,Owl monkey transcriptome,0.14.1,PAIRED,cDNA,RNA-Seq,A,128,,Nancy Ma's night monkey,Mammalia,Primates,Aotidae,20.0,https://www.ensembl.org/Aotus_nancymaae/,1254.0,,,,133.0,taxon#37293,,,,,,
SRR1981981,PRJNA280454,PRJNA280454,Aotus_nancymaae,Liver,ANAN.00-37969,no;Model organism or animal;85725;adult;female...,Illumina_HiSeq_2000,adult,female,no,liver,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,Owl monkey transcriptome,0.14.1,PAIRED,cDNA,RNA-Seq,A,128,,Nancy Ma's night monkey,Mammalia,Primates,Aotidae,20.0,https://www.ensembl.org/Aotus_nancymaae/,1254.0,,,,133.0,taxon#37293,,,,,,
SRR1981987,PRJNA280454,PRJNA280454,Aotus_nancymaae,Heart,ANAN.00-37972,no;Model organism or animal;85725;adult;female...,Illumina_HiSeq_2000,adult,female,no,heart,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,Owl monkey transcriptome,0.14.1,PAIRED,cDNA,RNA-Seq,A,128,,Nancy Ma's night monkey,Mammalia,Primates,Aotidae,20.0,https://www.ensembl.org/Aotus_nancymaae/,1254.0,,,,133.0,taxon#37293,,,,,,
SRR1981988,PRJNA280454,PRJNA280454,Aotus_nancymaae,Kidney,ANAN.00-37974,no;Model organism or animal;85725;adult;female...,Illumina_HiSeq_2000,adult,female,no,kidney,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,Owl monkey transcriptome,0.14.1,PAIRED,cDNA,RNA-Seq,A,128,,Nancy Ma's night monkey,Mammalia,Primates,Aotidae,20.0,https://www.ensembl.org/Aotus_nancymaae/,1254.0,,,,133.0,taxon#37293,,,,,,
SRR636839,PRJNA184055,geo/query/acc.cgi?acc=GSE43013,Bos_taurus,Liver,GSM1054989,wild type;adult;normal liver;whole liver;Bos t...,Illumina_Genome_Analyzer_IIx,adult,,no,normal liver,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,Gene Expression Defines Natural Changes in Mam...,1.2.1,PAIRED,cDNA,RNA-Seq,A,128,,Domestic cattle,Mammalia,Artiodactyla,Bovidae,20.0,https://www.ensembl.org/Bos_taurus,347000.0,306.77,38.0,311.15,277.0,taxon#9913,1.0,,1.0,,,
SRR636840,PRJNA184055,geo/query/acc.cgi?acc=GSE43013,Bos_taurus,Liver,GSM1054990,wild type;adult;normal liver;whole liver;Bos t...,Illumina_Genome_Analyzer_IIx,adult,,no,normal liver,https://trace.ncbi.nlm.nih.gov/Traces/sra/?stu...,Gene Expression Defines Natural Changes in Mam...,1.2.1,PAIRED,cDNA,RNA-Seq,A,128,,Domestic cattle,Mammalia,Artiodactyla,Bovidae,20.0,https://www.ensembl.org/Bos_taurus,347000.0,306.77,38.0,311.15,277.0,taxon#9913,1.0,,1.0,,,


In [9]:

# save label encoders to global scope
le_tissue = LabelEncoder()
le_order = LabelEncoder()
enc_tissue = OneHotEncoder(handle_unknown='ignore')
enc_order = OneHotEncoder(handle_unknown='ignore')
    


In [10]:
def regression_model_lightgbm(X_train, X_test, y_train, y_test, categorical):
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    evals_result = {}

    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2', 'l1'},
        'max_leaves': 20,
        'max_depth': 3,
        'learning_rate': 0.07,
        'feature_fraction': 0.8,
        'bagging_fraction': 1,
        'min_data_in_leaf': 6,
        'lambda_l1': 0.9,
        'lambda_l2': 0.9,
        "verbose": -1
    }

    gbm = lgb.train(params,
        lgb_train,
        num_boost_round=500,
        valid_sets=lgb_eval,
        evals_result=evals_result,
        verbose_eval=1000,
        early_stopping_rounds=7)
    
    return gbm

In [24]:
data

expressions,genes,species,samples,Genes Metadata,Species Metadata
"(445, 12243)",12243,39,445,,


In [11]:
def sorted_stratification(X, Y, k, species_validation=True):
    X['target'] = Y
    X = X.sort_values(by=['target'])
    
    if species_validation:
        all_species = list(set(X[X['common_name']!= 'Human']['common_name'].values))
        df_index = X.index

        k_sets_indexes = []
        k_sets_of_species_names = []
        already_selected = []
        for i in range(k):
            index_set = []
            choice1 = random.choice(all_species)
            while (choice1 in already_selected):
                choice1 = random.choice(all_species)
            already_selected.append(choice1)

            choice2 = random.choice(all_species)
            while (choice2 in already_selected):
                choice2 = random.choice(all_species)
            already_selected.append(choice2)

            k_sets_of_species_names.append([choice1, choice2])
            common_names = X['common_name'].values
            for j, c in enumerate(common_names):
                if c == choice1 or c == choice2:
                    index_set.append(j)
            k_sets_indexes.append(index_set)
        
    
    partition_indexes = [[] for i in range(k)]
    i = 0
    index_of_sample = 0
   
    while i < (int(len(Y)/k)): 
        for j in range(k):
            partition_indexes[j].append((i*k)+j)
            index_of_sample = (i*k)+j
        i+=1

    index_of_sample += 1
    i = 0
    while index_of_sample < len(Y):
        partition_indexes[i].append(index_of_sample)
        index_of_sample += 1
        i+=1
        
        
    X_features = X.drop(['target', 'common_name'], axis=1)
    Y = X['target'].values
    common_names_df = X['common_name'].values
    X = X.drop(['target', 'common_name'], axis=1) 
    
    if species_validation:
        print('Species for validation', k_sets_of_species_names)
        
    partition_Xs = []
    partition_Ys = []
    common_name_partitions = []
    
    if species_validation:
        for i, pindex in enumerate(partition_indexes):
            for j, sindex in enumerate(k_sets_indexes):
                if i == j:
                    partition_indexes[i] = list(set(partition_indexes[i]).union(set(k_sets_indexes[j])))
                else:
                    partition_indexes[i] = list(set(partition_indexes[i]).difference(set(k_sets_indexes[j])))
            
        
    for i, pindex in enumerate(partition_indexes):
        partition_Xs.append(X_features.iloc[pindex])
        common_name_partitions.append(common_names_df[pindex])
        partition_Ys.append(Y[pindex])
        
       
    return X, Y, partition_Xs, partition_Ys, common_name_partitions

In [12]:
def calculate_metrics(prediction, ground_truth):
     return {
            'R2': r2_score(ground_truth, prediction),
            'MSE': mean_squared_error(ground_truth, prediction),
            'MAE': mean_absolute_error(ground_truth, prediction),
     }
    
def encode_tissues(dataframe):
    le.fit(dataframe['tissue'].values)
    tissues_encoded = le.transform(dataframe['tissue'].values)
    dataframe['tissue_encoded'] = tissues_encoded
    
    return dataframe
    
    
def split_to_X_and_Y(dataframe, label_to_predict):
    if 'tissue' in dataframe.columns:
        X = dataframe.drop([label_to_predict, 'tissue'], axis=1)
        Y = dataframe[label_to_predict].values
        index_of_categorical_feature = list(X.columns).index('tissue_encoded')
    else:
        X = dataframe.drop([label_to_predict], axis=1)
        Y = dataframe[label_to_predict].values
        index_of_categorical_feature = None

    return X, X.values, Y, index_of_categorical_feature
    
    
def get_predictions(label_to_predict, ids=None):
    species_data = pd.read_csv('cross_species_df_merged.csv', low_memory=False)
    
    # remove other features (redundant and those that correlate with target)
    cols_to_delete = []
    for column in list(species_data.columns):
        if ids:
            if column not in ids and column not in ['tissue', label_to_predict]:
                cols_to_delete.append(column)
        else:
            if 'ENSG' not in column and column not in ['tissue', label_to_predict]:
                cols_to_delete.append(column)    
            
    species_data = species_data.drop(cols_to_delete, axis=1) 
    
    species_data = species_data[(~pd.isnull(species_data[label_to_predict]))] # select only row where target is set
    species_data = species_data.dropna(axis=1, thresh=int(len(species_data)*0.9)) # remove all genes where percentage of NaN > 10%
    species_data = species_data[species_data['tissue'].isin(['Lung', 'Liver', 'Kidney', 'Brain', 'Heart'])] # remove underrepresented tissues
    species_data = encode_tissues(species_data)
    
    print('Number of samples', len(species_data))
    print('Number of genes', len(species_data.columns))
    
    feature_df, X, Y, index_of_categorical = split_to_X_and_Y(species_data, label_to_predict)
    
    object_from_training = calculate_stable_shap_values(feature_df, Y, index_of_categorical, label_to_predict)
    features_weighted = object_from_training['list_of_weighted_features']
    shap_values = object_from_training['stable_shap_values']
    
    return shap_values, feature_df, features_weighted

### Get list of selected genes for each variable

In [13]:
lifespan_weighted_features = []
lifespan_shap_values = []
lifespan_dataframes = []

for label in ['gestation_days', 'max_lifespan', 'mass_g', 'temperature_celsius', 'metabolic_rate', 'mtGC']:
    shap_values, feature_df, weighted_features = get_predictions(label)
    lifespan_weighted_features += weighted_features
    lifespan_shap_values.append(shap_values)
    lifespan_dataframes.append(feature_df)
    

FileNotFoundError: [Errno 2] File cross_species_df_merged.csv does not exist: 'cross_species_df_merged.csv'