Load expressions
----------------

### imports ###

In [1]:
from typing import *
from yspecies import *
from yspecies.enums import *
from yspecies.dataset import *
from yspecies.misc import *

In [2]:
from dataclasses import dataclass
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
import pandas as pd
import shap
from pprint import pprint
import random
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import lightgbm as lgb
from scipy.stats import kendalltau
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, recall_score, precision_score, f1_score

In [4]:
# Notebook-wide settings.
# NOTE: `import pprint` rebinds the name `pprint` to the module, replacing the
# function bound by the earlier `from pprint import pprint`; after this cell use
# `pprint.pprint(...)` or `pp.pprint(...)`.
import pprint
pp = pprint.PrettyPrinter(indent=4)

# pandas display: never truncate columns or rows, print floats with 3 decimals.
pd.set_option('display.float_format', lambda value: '%.3f' % value)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Parameters cell ##

Parameters are overridden by papermill when run inside DVC stages.



In [20]:
# Global setting: how many bootstraps to use.
NUMBER_OF_BOOTSTRAPS = 5
# Global setting: which tissues to use.
TISSUES_LIST = [
    'Lung',
    'Liver',
    'Kidney',
    'Brain',
    'Heart',
]

In [21]:

# Keep the label encoders and one-hot encoders in global scope.
le_tissue, le_order = LabelEncoder(), LabelEncoder()
# Categories not seen during fit are ignored rather than raising an error.
enc_tissue, enc_order = (OneHotEncoder(handle_unknown='ignore') for _ in range(2))
    


In [5]:
def regression_model_lightgbm(X_train, X_test, y_train, y_test, categorical):
    """Train a LightGBM regression booster with early stopping.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, wrapped in an ``lgb.Dataset``.
    X_test, y_test
        Validation features and targets; the validation dataset is built with
        the training dataset as its ``reference``.
    categorical
        Passed as ``categorical_feature`` to the training dataset.

    Returns
    -------
    The object returned by ``lgb.train``.
    """
    training_set = lgb.Dataset(X_train, y_train, categorical_feature=categorical)
    validation_set = lgb.Dataset(X_test, y_test, reference=training_set)

    # Handed to lgb.train as `evals_result`; it is not read afterwards.
    history = {}

    booster_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2', 'l1'},
        'max_leaves': 20,
        'max_depth': 3,
        'learning_rate': 0.07,
        'feature_fraction': 0.8,
        'bagging_fraction': 1,
        'min_data_in_leaf': 6,
        'lambda_l1': 0.9,
        'lambda_l2': 0.9,
        "verbose": -1
    }

    # At most 500 boosting rounds, with early stopping after 7 rounds.
    return lgb.train(
        booster_params,
        training_set,
        num_boost_round=500,
        valid_sets=validation_set,
        evals_result=history,
        verbose_eval=1000,
        early_stopping_rounds=7,
    )

In [10]:
def sorted_stratification(X, Y, k, species_validation=True):
    """Split samples into ``k`` partitions stratified by the sorted target.

    Rows are ordered by target value and dealt round-robin into ``k``
    partitions, so each partition spans the whole target range. When
    ``species_validation`` is true, two distinct non-'Human' species are drawn
    at random (via the ``random`` module) for every partition; all rows of
    those species are moved into that partition and removed from the others.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing a 'common_name' column. The caller's frame is
        not modified.
    Y : sequence
        Target values assigned to X as a temporary 'target' column
        (assumed to have one value per row of X).
    k : int
        Number of partitions; must be at least 1.
    species_validation : bool, default True
        Whether to reserve two whole species per partition.

    Returns
    -------
    tuple
        ``(X, Y, partition_Xs, partition_Ys, common_name_partitions)`` where
        ``X`` is the target-sorted feature frame without the 'target' and
        'common_name' columns, ``Y`` the sorted target values, and the three
        lists hold, per partition, the feature rows, the target values and the
        common names. Rows inside a partition are in ascending target order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or if species validation is requested and
        fewer than ``2 * k`` distinct non-'Human' species are available.
    """
    if k < 1:
        raise ValueError('k must be a positive integer')

    # assign() builds a new frame, so the caller's X never gains a 'target' column.
    X = X.assign(target=Y).sort_values(by=['target'])
    n_samples = len(X)
    common_names_df = X['common_name'].values

    # Round-robin over the target-sorted positions: position p goes to partition p % k.
    # This also covers n_samples < k, where trailing partitions stay empty.
    partition_indexes = [list(range(fold, n_samples, k)) for fold in range(k)]

    if species_validation:
        # unique() keeps first-appearance order, so the draw is reproducible under random.seed().
        non_human = X.loc[X['common_name'] != 'Human', 'common_name'].dropna()
        all_species = list(non_human.unique())
        if len(all_species) < 2 * k:
            raise ValueError(
                'species validation needs at least %d distinct non-Human species, found %d'
                % (2 * k, len(all_species))
            )

        # Sampling without replacement cannot hang, unlike retrying random.choice.
        chosen = random.sample(all_species, 2 * k)
        k_sets_of_species_names = [chosen[2 * fold: 2 * fold + 2] for fold in range(k)]
        print('Species for validation', k_sets_of_species_names)

        # Positions (in the sorted frame) of the rows belonging to each species pair.
        k_sets_indexes = [
            {pos for pos, name in enumerate(common_names_df) if name in pair}
            for pair in k_sets_of_species_names
        ]
        reserved_anywhere = set().union(*k_sets_indexes)

        # Each partition keeps its own reserved species and gives up everyone else's;
        # sorting restores the ascending-target order that set arithmetic loses.
        partition_indexes = [
            sorted((set(positions) - reserved_anywhere) | k_sets_indexes[fold])
            for fold, positions in enumerate(partition_indexes)
        ]

    Y = X['target'].values
    X = X.drop(['target', 'common_name'], axis=1)

    partition_Xs = [X.iloc[positions] for positions in partition_indexes]
    partition_Ys = [Y[positions] for positions in partition_indexes]
    common_name_partitions = [common_names_df[positions] for positions in partition_indexes]

    return X, Y, partition_Xs, partition_Ys, common_name_partitions

In [None]:
def merge_datasets():
    """Merge the selected expression and sample tables and write them to CSV.

    Reads both TSV files from ``./../data/interim``, removes the blacklisted
    runs from the samples, merges the two tables on 'run', prints the number of
    merged rows and writes ``cross_species_df_merged.csv`` to the current
    working directory. Returns nothing.

    NOTE(review): both frames are indexed by 'run' before the merge and the CSV
    is written with ``index=False``, so the run identifiers are likely absent
    from the output file — confirm this is intended.
    """
    # Runs excluded from the analysis, with the reason where one was recorded.
    excluded_runs = [
        "SRR8750397",  # may be single-cell
        "SRR8750398",  # may be single-cell
        "SRR8750399",  # may be single-cell
        "SRR3109726",  # old kidney
        "SRR3109728",  # old kidney
        "SRR3403827",  # brainstem
        "SRR3403828",  # brainstem
        "SRR306404",  # hypoxia exposed
        "SRR306406"
    ]

    expression_table = pd.read_csv('./../data/interim/selected_expressions.tsv', sep='\t')
    sample_table = pd.read_csv('./../data/interim/selected_samples.tsv', sep="\t", index_col=None, dtype=None)

    is_excluded = sample_table['run'].isin(excluded_runs)
    sample_table = sample_table[~is_excluded].set_index('run')
    expression_table = expression_table.set_index('run')

    merged = expression_table.merge(sample_table, left_on='run', right_on='run')

    print('Number of samples', len(merged))
    merged.to_csv('cross_species_df_merged.csv', index=False)

### Load paths ###

In [10]:
from pathlib import Path

# Use the current directory as project root when it contains ./data,
# otherwise fall back to the parent directory.
base_dir = Path("./") if Path("./data").exists() else Path("../")
print(base_dir.absolute())

# Data sub-directories used by the rest of the notebook.
data_dir = base_dir / "data"
input_dir, interim_dir, output_dir = (
    data_dir / folder for folder in ("input", "interim", "output")
)


/data/sources/species/notebooks/..


In [11]:
def load_table(path: Path, index: str = None, dtype: str = None)->pd.DataFrame:
    """Read a tab-separated file into a DataFrame.

    Parameters
    ----------
    path : Path
        Location of the TSV file.
    index : str, optional
        Column to use as the index; when omitted pandas creates its default index.
    dtype : optional
        Forwarded unchanged to ``pd.read_csv``.
    """
    # index_col=None is read_csv's own default, so a single call serves both cases.
    return pd.read_csv(str(path), sep="\t", index_col=index, dtype=dtype)

In [12]:
def show(df: pd.DataFrame, cols: int, rows: int = 3) -> pd.DataFrame:
    """Return a preview of ``df``: its first ``cols`` columns and first ``rows`` rows."""
    leading_columns = df.columns[:cols]
    preview = df[leading_columns]
    return preview.head(rows)

## Load data ##

In [16]:
# Load the four interim tables; samples and expressions are sorted by their 'run' index.
genes = load_table(interim_dir / "selected_genes.tsv", index="Homo_sapiens")
samples = load_table(interim_dir / "selected_samples.tsv", index="run").sort_index()
species = load_table(interim_dir / "selected_species.tsv", index="species")
expressions = load_table(interim_dir / "selected_expressions.tsv", index="run").sort_index()

# Show each table's shape under its name.
tab([
    ["genes", "samples", "species", "expressions"],
    [table.shape for table in (genes, samples, species, expressions)],
])

FileNotFoundError: [Errno 2] File ../data/interim/selected_genes.tsv does not exist: '../data/interim/selected_genes.tsv'

In [14]:
# Element-wise comparison of the expression columns with the gene index
# (presumably a sanity check that both are aligned — TODO confirm intent).
expressions.columns == genes.index

NameError: name 'expressions' is not defined

In [10]:
# Build a yspecies Dataset from the loaded expressions, genes and samples tables,
# passing "selected_species" as the first argument (presumably the dataset name — verify).
d = Dataset("selected_species", expressions, genes, samples)
# Bare expression: displays the Dataset as the cell output.
d

<yspecies.dataset.Dataset at 0x7ff212188dd0>

In [1]:
# Display the result of Dataset.get_samples_colnames()
# (presumably the column names of the samples table — verify in yspecies).
d.get_samples_colnames()

NameError: name 'd' is not defined