In [1]:
cd ..

# Multi-omics stratification on PDAC patients

In [2]:
import os
import pandas as pd
import numpy as np
import optuna
import time
import dill
import shutil
from optuna.samplers import TPESampler

from src import settings
from explainability import FeatureImportance
from utils import transform_full_dataset

## Load dataset

In [3]:
# Read the methylation table: semicolon-separated, comma as decimal mark,
# first column used as the row index.
methylation_data = pd.read_csv(settings.methylation_data_path, sep=";", index_col=0, decimal=",")
# Swap literal dots in the column labels for dashes. regex=False is explicit
# because the default of `regex` differs between pandas versions and "." is a
# wildcard when interpreted as a regular expression.
# (presumably this makes the labels match the RNA-seq sample ids - confirm)
methylation_data.columns = methylation_data.columns.str.replace(".", "-", regex=False)
# Transpose so the former column labels become the row index, then store the
# values as 32-bit floats.
methylation_data = methylation_data.T.astype(np.float32)
print("methylation_data.shape", methylation_data.shape)
methylation_data.head()

In [4]:
# Read the RNA-seq table (semicolon-separated, comma decimals, first column as
# index), transpose it and cast every value to float32 in a single chain.
rnaseq_data = (
    pd.read_csv(settings.rnaseq_data_path, sep=";", index_col=0, decimal=",")
    .T
    .astype(np.float32)
)
print("rnaseq_data.shape", rnaseq_data.shape)
rnaseq_data.head()

In [5]:
# Restrict both tables to the index labels they have in common, in the same
# order, so that their rows line up one-to-one.
samples = methylation_data.index.intersection(rnaseq_data.index)
rnaseq_data, methylation_data = (
    view.loc[samples] for view in (rnaseq_data, methylation_data)
)
assert methylation_data.index.equals(rnaseq_data.index)

# List of views handed to the later cells: RNA-seq first, methylation second.
Xs = [rnaseq_data, methylation_data]
print("common samples:", len(samples))

In [6]:
# Timestamp suffix identifying which saved optimisation run to load.
date = "2023070315"
study_pickle_path = os.path.join(settings.optimization_path, f'optimization_optuna_{date}.pkl')
results_csv_path = os.path.join(settings.optimization_path, f"optimization_results_{date}.csv")

# NOTE(review): dill.load unpickles arbitrary objects and can execute code;
# only load study files that come from a trusted source.
with open(study_pickle_path, 'rb') as file:
    optimization_study = dill.load(file)

optimization_results = pd.read_csv(results_csv_path)
# The first row is used as the best trial.
# NOTE(review): assumes the CSV was written sorted best-first - confirm.
best_trial = optimization_results.iloc[0]
print("optimization_results.shape", optimization_results.shape)
optimization_results.head()

In [7]:
# Run every view through the project's transformation helper.
# NOTE(review): fit_pipelines=False presumably reuses pipelines already stored
# under results_folder instead of fitting new ones - confirm in utils.
transformed_Xs = transform_full_dataset(
    Xs=Xs,
    fit_pipelines=False,
    results_folder=settings.results_path,
)
# Place the transformed views side by side (column-wise) in one frame.
transformed_X = pd.concat(transformed_Xs, axis="columns")
print("transformed_X.shape", transformed_X.shape)
transformed_X.head()

In [8]:
# Toggle: True starts a fresh feature-importance study (and wipes the output
# folder); False reloads the pickled study identified by `date` below.
new_study = False
if new_study:
    date = time.strftime('%Y%m%d%H')
    feature_importance_study = optuna.create_study(direction="maximize")
    # Empty the output folder. Directories are detected explicitly rather than
    # by catching IsADirectoryError, because os.remove() on a directory raises
    # a different OSError subclass on some platforms (e.g. PermissionError).
    # Symlinks are unlinked with os.remove, never followed.
    for file in os.listdir(settings.feature_importance_path):
        entry_path = os.path.join(settings.feature_importance_path, file)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path, ignore_errors=True)
        else:
            os.remove(entry_path)
    # Queue one trial per (view, column) pair of the transformed views.
    for view_idx, X in enumerate(transformed_Xs):
        features = X.columns.to_list()
        for feature_to_drop in features:
            feature_importance_study.enqueue_trial({"view_idx": view_idx, "feature_to_drop": feature_to_drop})
    print("Features tu explain:", len(feature_importance_study.trials))

else:
    date = "2023070718"
    # NOTE(review): dill.load can execute arbitrary code; only load trusted files.
    with open(os.path.join(settings.feature_importance_path, f'feature_importance_results_{date}.pkl'), 'rb') as file:
        feature_importance_study = dill.load(file)

In [None]:
def func_objective(trial):
    """Objective evaluated for a single feature-importance trial.

    Builds a fresh ``FeatureImportance`` instance and forwards the trial
    together with the raw views, the common samples and the hyper-parameters
    of the loaded optimisation study / best result row.

    Parameters
    ----------
    trial : the trial object passed in by the study.

    Returns
    -------
    Whatever ``FeatureImportance().objective`` returns for this trial.
    """
    # NOTE(review): eval() executes the text stored in the results CSV. This
    # is acceptable only while that file is trusted; if the cells hold plain
    # Python literals, ast.literal_eval would be a safer parser - confirm.
    return FeatureImportance().objective(
        trial=trial, Xs=Xs, samples=samples, original_score=optimization_study.best_value,
        features_per_component=optimization_study.best_params["features_per_component"],
        in_channels_list=eval(best_trial["user_attrs_num_features"]),
        # The first entry of each view's unit list is dropped here.
        hidden_channels_list=[view_hidden[1:] for view_hidden in eval(best_trial["user_attrs_num_units"])],
        n_clusters=optimization_study.best_params["n_clusters"],
        n_epochs=optimization_study.best_params["n_epochs"],
        lambda_coeff=optimization_study.best_params["lambda_coeff"],
        latent_space=optimization_study.best_params["latent_space"],
        batch_size=int(best_trial["user_attrs_batch_size"]),
        optimization_folder=settings.optimization_path,
        random_state=settings.RANDOM_STATE, n_jobs=2, folder=settings.feature_importance_path)


# Run the study, retrying whenever the run aborts with FileNotFoundError or
# ValueError. n_trials is the number of trials currently held by the study.
# NOTE(review): the loop only ends after a successful run when `new_study` is
# True; for a reloaded study it keeps calling the optimiser until interrupted
# or until a different exception is raised - confirm this is intended.
keep_trying = True
while keep_trying:
    try:
        feature_importance_study = FeatureImportance.optimize_optuna_and_save(
            study=feature_importance_study, n_trials=len(feature_importance_study.trials), date=date,
            show_progress_bar=True, folder=settings.feature_importance_path, func=func_objective)
        if new_study:
            keep_trying = False
    except (FileNotFoundError, ValueError) as error:
        # Still retried as before, but the failure is reported instead of
        # being swallowed silently, so repeated errors are visible.
        print(f"Retrying after {type(error).__name__}: {error}")

In [9]:
# Keep the fixed parameters and the objective value of every trial that has
# both, ordered from the highest value to the lowest.
trials_df = feature_importance_study.trials_dataframe()
current_study = (
    trials_df[["system_attrs_fixed_params", "value"]]
    .dropna()
    .sort_values("value", ascending=False)
)
# Reduce each fixed-params dict to the name of the feature that was dropped.
current_study["system_attrs_fixed_params"] = [
    fixed_params["feature_to_drop"]
    for fixed_params in current_study["system_attrs_fixed_params"]
]
# Bar chart of the 50 highest-valued features.
top_features = current_study.head(50)
_ = top_features.plot.bar(
    x='system_attrs_fixed_params',
    y='value',
    ylabel='Feature importance',
    xlabel="RNA-seq",
    title="Feature Permutation plot",
    figsize=(20, 6),
)