In [None]:
cd ..

# Multi-omics stratification on PDAC patients

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.preprocessing import StandardScaler
import optuna
import time
import dill
import shutil
from optuna.samplers import TPESampler

from src import settings
from src.optimization import Optimization


In [None]:
# Folder names (relative to the current working directory) and the joined path
# from which the processed tables are read.
data_folder, processed_data_folder = "data", "processed"
processed_data_path = os.path.join(*(data_folder, processed_data_folder))

## Load dataset

In [None]:
# Names of the regular files (sub-directories are skipped) in the processed-data folder.
# os.listdir returns entries in arbitrary order, so they are sorted to keep the
# order of the tables -- and of everything built from `files` -- stable across
# runs and machines.
files = sorted(
    entry
    for entry in os.listdir(processed_data_path)
    if os.path.isfile(os.path.join(processed_data_path, entry))
)
# Index cursor into `files` used by the preview cells; starts before the first entry.
file = -1

In [None]:
# Read every processed table, using its first column as the row index.
table_paths = [os.path.join(processed_data_path, table_name) for table_name in files]
Xs = [pd.read_csv(table_path, index_col=0) for table_path in table_paths]
# Row labels of the first table are used as the reference sample index.
samples = Xs[0].index

In [None]:
# Preview the table at position 0 of `files`. The index is pinned (rather than
# incremented) so re-running this cell always shows the same file.
file = 0
filename = files[file]
data = pd.read_csv(os.path.join(processed_data_path, filename), index_col=0)
print(f"{filename}.shape", data.shape)
data.head()

In [None]:
# Preview the table at position 1 of `files`. The index is pinned (rather than
# incremented) so re-running this cell always shows the same file.
file = 1
filename = files[file]
data = pd.read_csv(os.path.join(processed_data_path, filename), index_col=0)
print(f"{filename}.shape", data.shape)
data.head()

In [None]:
# Preview the table at position 2 of `files`. The index is pinned (rather than
# incremented) so re-running this cell always shows the same file.
file = 2
filename = files[file]
data = pd.read_csv(os.path.join(processed_data_path, filename), index_col=0)
print(f"{filename}.shape", data.shape)
data.head()

In [None]:
# Preview the table at position 3 of `files`. The index is pinned (rather than
# incremented) so re-running this cell always shows the same file.
file = 3
filename = files[file]
data = pd.read_csv(os.path.join(processed_data_path, filename), index_col=0)
print(f"{filename}.shape", data.shape)
data.head()

In [None]:
# Toggle: True creates a fresh Optuna study and wipes earlier artefacts;
# False resumes the pickled study identified by `date`.
new_study = False
if new_study:
    shutil.rmtree("tensorboard/", ignore_errors=True)
    date = time.strftime('%Y%m%d%H')
    optimization_study = optuna.create_study(
        direction="maximize",
        sampler=TPESampler(seed=42, multivariate=True, n_startup_trials=500),
    )
    n_trials = 1
    # Empty the optimization folder. Directories are detected up front instead of
    # catching IsADirectoryError, because os.remove on a directory raises
    # PermissionError on some platforms (e.g. macOS, Windows). Symlinks are
    # handed to os.remove so only the link itself is deleted.
    for entry in os.listdir(settings.optimization_path):
        entry_path = os.path.join(settings.optimization_path, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path, ignore_errors=True)
        else:
            os.remove(entry_path)
else:
    date = "2024051615"
    study_path = os.path.join(settings.optimization_path, f'optimization_optuna_{date}.pkl')
    # WARNING: dill.load (like pickle) can execute arbitrary code while
    # deserialising -- only load study files produced by a trusted run.
    with open(study_path, 'rb') as study_file:
        optimization_study = dill.load(study_file)
    n_trials = 1000

In [None]:
# %%capture --no-display

# One independent scaler per input table, configured to return DataFrames.
pipelines = [StandardScaler().set_output(transform="pandas") for _ in range(len(Xs))]


def func_objective(trial):
    """Objective passed to the optimizer: score one trial via ``Optimization.objective``.

    The ``*_option`` lists are forwarded unchanged; presumably they encode
    [low, high, step] search ranges -- confirm against ``Optimization.objective``.
    """
    return Optimization().objective(
        trial=trial, Xs=Xs, samples=samples, pipelines=pipelines,
        num_layers_option=[1, 2, 1], num_units_option=[2, 6, 2],
        n_epochs_option=[20, 100, 20], lambda_option=[0.001, 1, 0.25],
        n_clusters_option=[2, 6, 1], latent_space_option=[32, 128, 32],
        batch_size=32, random_state=settings.RANDOM_STATE, n_jobs=1,
    )


# Upper bound on back-to-back failed batches before giving up; raise it for
# long unattended runs if sporadic failures are expected.
MAX_CONSECUTIVE_FAILURES = 25

consecutive_failures = 0
keep_trying = True
# A new study stops after its first successful batch. A resumed study keeps
# running batches of `n_trials` until the kernel is interrupted.
while keep_trying:
    try:
        optimization_study = Optimization.optimize_optuna_and_save(
            study=optimization_study, n_trials=n_trials, date=date,
            show_progress_bar=True, folder=settings.optimization_path, func=func_objective,
        )
    except (FileNotFoundError, ValueError) as error:
        # Retry as before, but make the failure visible and stop once the same
        # kind of failure keeps repeating instead of spinning silently forever.
        consecutive_failures += 1
        print(f"Optimization batch failed ({consecutive_failures}/{MAX_CONSECUTIVE_FAILURES}): {error!r}")
        if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
            raise
    else:
        consecutive_failures = 0
        if new_study:
            keep_trying = False

In [None]:
# Hyper-parameters of the best trial found so far (same value as `.best_params`).
optimization_study.best_trial.params

In [None]:
# Optimization history of the study.
fig = optuna.visualization.plot_optimization_history(study=optimization_study)
fig.show()

In [None]:
# Hyper-parameter importances of the study.
fig = optuna.visualization.plot_param_importances(study=optimization_study)
fig.show()

In [None]:
# Slice plot of the study's parameters.
fig = optuna.visualization.plot_slice(study=optimization_study)
fig.show()

In [None]:
# Read the results table stored for this study date.
results_csv = os.path.join(settings.optimization_path, f"optimization_results_{date}.csv")
optimization_results = pd.read_csv(results_csv)
# The first row is taken as the best trial -- presumably the CSV is sorted
# best-first; confirm against the code that writes it.
best_trial = optimization_results.iloc[0]
print("optimization_results.shape", optimization_results.shape)
optimization_results.head()