In [1]:
from typing import Dict, List, Tuple, Optional, Union
import silence_tensorflow.auto
import time
from utils.models_tuner import *
from utils.data_processing import to_bed

In [None]:
from ucsc_genomes_downloader import Genome

# Genome object for the "hg38" assembly; it is passed later to the tuning and
# sequence-building helpers. GENOME_CACHE_DIR is presumably provided by one of
# the star imports above -- TODO confirm which module defines it.
genome = Genome("hg38", cache_directory=GENOME_CACHE_DIR)

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit

# Seed for the holdout splitter. Without a fixed seed every call to
# `holdouts_generator.split(...)` draws new random partitions, so the cell that
# tunes hyperparameters and the cell that trains the final models would see
# different rows for the same `holdout_number`.
HOLDOUTS_RANDOM_STATE = 42

# Stratified train/test splitter shared by every cell that iterates over
# holdouts; HOLDOUTS_NUM_SPLIT and TEST_SIZE come from the project constants.
holdouts_generator = StratifiedShuffleSplit(
    n_splits=HOLDOUTS_NUM_SPLIT,
    test_size=TEST_SIZE,
    random_state=HOLDOUTS_RANDOM_STATE
)

In [4]:
from tqdm.auto import tqdm
from epigenomic_dataset import active_enhancers_vs_inactive_enhancers, active_promoters_vs_inactive_promoters
from epigenomic_dataset.utils import normalize_epigenomic_data
from utils.bio_constants import *
import warnings
warnings.filterwarnings('ignore')

# Flat list of every tuner result, appended in task -> holdout -> model order.
all_tuner_results = []
# Not used by this cell; kept so the notebook namespace stays unchanged.
all_input_layer = []
all_output_layer = []

for task, threshold in tqdm((
    (active_enhancers_vs_inactive_enhancers, 0),
    (active_promoters_vs_inactive_promoters, 1)
), desc="Tasks"):
    all_results = []
    task_name = task.__name__

    # We get the task data with binarized labels
    X, y = task(
        binarize=True,
        cell_line=CELL_LINE,
        window_size=WINDOW_SIZE,
        min_active_tpm_value=threshold,
        max_inactive_tpm_value=threshold,
        root="bio_data/epigenomic/"+str(task_name),
        verbose=1
    )
    # Frame derived from X by to_bed; it is sliced below with the same
    # positional indices as X and y.
    bed = to_bed(X)

    # Start the main loop, iterating through the holdouts
    for holdout_number, (train_indices, test_indices) in tqdm(
        enumerate(holdouts_generator.split(X, y)),
        total=HOLDOUTS_NUM_SPLIT,
        leave=False,
        desc="Computing Holdouts For {}".format(task_name)
    ):
        # Get the training and test data
        train_bed, test_bed = bed.iloc[train_indices], bed.iloc[test_indices]
        train_X, test_X = X.iloc[train_indices], X.iloc[test_indices]
        train_y, test_y = y.iloc[train_indices], y.iloc[test_indices]

        # Impute and normalize the epigenomic data
        train_X, test_X = normalize_epigenomic_data(train_X, test_X)

        # Flatten the output values
        train_y = train_y.values.flatten()
        test_y = test_y.values.flatten()

        # Results produced for THIS holdout only. The multi-modal model must be
        # assembled from the FFNN/CNN tensors tuned on the current holdout;
        # reading the head of `all_tuner_results` would always return the
        # tensors of the very first task/holdout.
        holdout_tuner_results = []
        input_layers = {}
        output_layers = {}
        for model in MODELS_TYPE:
            if model == MODEL_TYPE_MMNN:
                # Look the single-modality results up by key rather than by
                # position, so the order of MODELS_TYPE does not matter as long
                # as both were tuned before the MMNN.
                ffnn_result = next(
                    (result for result in holdout_tuner_results if "ffnn_parameters" in result),
                    None
                )
                cnn_result = next(
                    (result for result in holdout_tuner_results if "cnn_parameters" in result),
                    None
                )
                check_ffn_param = ffnn_result is not None
                check_cnn_param = cnn_result is not None
                if check_ffn_param and check_cnn_param:
                    ffnn_parameters = ffnn_result.get("ffnn_parameters")
                    cnn_parameters = cnn_result.get("cnn_parameters")
                    input_layers["input_epigenomic_data"] = ffnn_parameters.get("input_epigenomic_data")
                    input_layers["input_sequence_data"] = cnn_parameters.get("input_sequence_data")
                    output_layers["last_hidden_ffnn"] = ffnn_parameters.get("last_hidden_ffnn")
                    output_layers["last_hidden_cnn"] = cnn_parameters.get("last_hidden_cnn")
                    print(f"check_ffn_param: {check_ffn_param} check_cnn_param: {check_cnn_param} input_layers: {input_layers} output_layers: {output_layers} ")
            tuner_result = hyperparameter_tuning(train_X,
                                                test_X,
                                                train_y,
                                                test_y,
                                                train_bed,
                                                test_bed,
                                                genome,
                                                WINDOW_SIZE,
                                                holdout_number,
                                                task_name, model,
                                                input_layers,
                                                output_layers)
            holdout_tuner_results.append(tuner_result)
            all_tuner_results.append(tuner_result)

Tasks:   0%|          | 0/2 [00:00<?, ?it/s]2021-12-21 09:08:43.537 | INFO     | utils.models_tuner:tuner_evaluation:120 - Start hyperparameter tuning for ffnn


Get Layer From ffnn Models!


2021-12-21 09:08:46.124 | INFO     | utils.models_tuner:tuner_evaluation:120 - Start hyperparameter tuning for cnn


Get Layer From cnn Models!
check_ffn_param: True check_cnn_param: True input_layers: {'input_epigenomic_data': <KerasTensor: shape=(None, 58) dtype=float32 (created by layer 'input_epigenomic_data')>, 'input_sequence_data': <KerasTensor: shape=(None, 256, 4) dtype=float32 (created by layer 'input_sequence_data')>} output_layers: {'last_hidden_ffnn': <KerasTensor: shape=(None, 208) dtype=float32 (created by layer 'last_hidden_ffnn')>, 'last_hidden_cnn': <KerasTensor: shape=(None, 80) dtype=float32 (created by layer 'last_hidden_cnn')>} 
model_name: mmnn input_layers: {'input_epigenomic_data': <KerasTensor: shape=(None, 58) dtype=float32 (created by layer 'input_epigenomic_data')>, 'input_sequence_data': <KerasTensor: shape=(None, 256, 4) dtype=float32 (created by layer 'input_sequence_data')>} hidden_layers: {'last_hidden_ffnn': <KerasTensor: shape=(None, 208) dtype=float32 (created by layer 'last_hidden_ffnn')>, 'last_hidden_cnn': <KerasTensor: shape=(None, 80) dtype=float32 (created b

2021-12-21 09:09:17.678 | INFO     | utils.models_tuner:tuner_evaluation:120 - Start hyperparameter tuning for mmnn
2021-12-21 09:09:26.237 | INFO     | utils.models_tuner:tuner_evaluation:120 - Start hyperparameter tuning for ffnn


Get Layer From ffnn Models!


2021-12-21 09:09:27.274 | INFO     | utils.models_tuner:tuner_evaluation:120 - Start hyperparameter tuning for cnn


Get Layer From cnn Models!
check_ffn_param: True check_cnn_param: True input_layers: {'input_epigenomic_data': <KerasTensor: shape=(None, 58) dtype=float32 (created by layer 'input_epigenomic_data')>, 'input_sequence_data': <KerasTensor: shape=(None, 256, 4) dtype=float32 (created by layer 'input_sequence_data')>} output_layers: {'last_hidden_ffnn': <KerasTensor: shape=(None, 208) dtype=float32 (created by layer 'last_hidden_ffnn')>, 'last_hidden_cnn': <KerasTensor: shape=(None, 80) dtype=float32 (created by layer 'last_hidden_cnn')>} 
model_name: mmnn input_layers: {'input_epigenomic_data': <KerasTensor: shape=(None, 58) dtype=float32 (created by layer 'input_epigenomic_data')>, 'input_sequence_data': <KerasTensor: shape=(None, 256, 4) dtype=float32 (created by layer 'input_sequence_data')>} hidden_layers: {'last_hidden_ffnn': <KerasTensor: shape=(None, 208) dtype=float32 (created by layer 'last_hidden_ffnn')>, 'last_hidden_cnn': <KerasTensor: shape=(None, 80) dtype=float32 (created b

2021-12-21 09:09:35.503 | INFO     | utils.models_tuner:tuner_evaluation:120 - Start hyperparameter tuning for mmnn
Tasks:  50%|█████     | 1/2 [01:06<01:06, 66.83s/it]2021-12-21 09:09:50.658 | INFO     | utils.models_tuner:tuner_evaluation:120 - Start hyperparameter tuning for ffnn


Get Layer From ffnn Models!


2021-12-21 09:09:51.558 | INFO     | utils.models_tuner:tuner_evaluation:120 - Start hyperparameter tuning for cnn


Get Layer From cnn Models!
check_ffn_param: True check_cnn_param: True input_layers: {'input_epigenomic_data': <KerasTensor: shape=(None, 58) dtype=float32 (created by layer 'input_epigenomic_data')>, 'input_sequence_data': <KerasTensor: shape=(None, 256, 4) dtype=float32 (created by layer 'input_sequence_data')>} output_layers: {'last_hidden_ffnn': <KerasTensor: shape=(None, 208) dtype=float32 (created by layer 'last_hidden_ffnn')>, 'last_hidden_cnn': <KerasTensor: shape=(None, 80) dtype=float32 (created by layer 'last_hidden_cnn')>} 
model_name: mmnn input_layers: {'input_epigenomic_data': <KerasTensor: shape=(None, 58) dtype=float32 (created by layer 'input_epigenomic_data')>, 'input_sequence_data': <KerasTensor: shape=(None, 256, 4) dtype=float32 (created by layer 'input_sequence_data')>} hidden_layers: {'last_hidden_ffnn': <KerasTensor: shape=(None, 208) dtype=float32 (created by layer 'last_hidden_ffnn')>, 'last_hidden_cnn': <KerasTensor: shape=(None, 80) dtype=float32 (created b

2021-12-21 09:10:08.318 | INFO     | utils.models_tuner:tuner_evaluation:120 - Start hyperparameter tuning for mmnn
2021-12-21 09:10:25.065 | INFO     | utils.models_tuner:tuner_evaluation:120 - Start hyperparameter tuning for ffnn


Get Layer From ffnn Models!


2021-12-21 09:10:26.038 | INFO     | utils.models_tuner:tuner_evaluation:120 - Start hyperparameter tuning for cnn


Get Layer From cnn Models!
check_ffn_param: True check_cnn_param: True input_layers: {'input_epigenomic_data': <KerasTensor: shape=(None, 58) dtype=float32 (created by layer 'input_epigenomic_data')>, 'input_sequence_data': <KerasTensor: shape=(None, 256, 4) dtype=float32 (created by layer 'input_sequence_data')>} output_layers: {'last_hidden_ffnn': <KerasTensor: shape=(None, 208) dtype=float32 (created by layer 'last_hidden_ffnn')>, 'last_hidden_cnn': <KerasTensor: shape=(None, 80) dtype=float32 (created by layer 'last_hidden_cnn')>} 
model_name: mmnn input_layers: {'input_epigenomic_data': <KerasTensor: shape=(None, 58) dtype=float32 (created by layer 'input_epigenomic_data')>, 'input_sequence_data': <KerasTensor: shape=(None, 256, 4) dtype=float32 (created by layer 'input_sequence_data')>} hidden_layers: {'last_hidden_ffnn': <KerasTensor: shape=(None, 208) dtype=float32 (created by layer 'last_hidden_ffnn')>, 'last_hidden_cnn': <KerasTensor: shape=(None, 80) dtype=float32 (created b

2021-12-21 09:10:42.403 | INFO     | utils.models_tuner:tuner_evaluation:120 - Start hyperparameter tuning for mmnn
Tasks: 100%|██████████| 2/2 [02:21<00:00, 70.73s/it]


In [5]:
# Display the collected tuner results: one entry is appended per
# (task, holdout, model) iteration of the tuning loop.
all_tuner_results

[{'ffnn': (<keras.engine.functional.Functional at 0x235a27c0a88>,
   <KerasTensor: shape=(None, 58) dtype=float32 (created by layer 'input_epigenomic_data')>,
   <KerasTensor: shape=(None, 208) dtype=float32 (created by layer 'last_hidden_ffnn')>),
  'ffnn_parameters': {'task_name': 'AEvsIE',
   'holdout_number': 0,
   'learning_rate': 0.01,
   'create_date': '21/12/2021-09:08:43',
   'num_layers': 6,
   'n_neurons0': 160,
   'n_neurons1': 208,
   'input_epigenomic_data': <KerasTensor: shape=(None, 58) dtype=float32 (created by layer 'input_epigenomic_data')>,
   'last_hidden_ffnn': <KerasTensor: shape=(None, 208) dtype=float32 (created by layer 'last_hidden_ffnn')>}},
 {'cnn': (<keras.engine.functional.Functional at 0x235a4247288>,
   <KerasTensor: shape=(None, 256, 4) dtype=float32 (created by layer 'input_sequence_data')>,
   <KerasTensor: shape=(None, 80) dtype=float32 (created by layer 'last_hidden_cnn')>),
  'cnn_parameters': {'task_name': 'AEvsIE',
   'holdout_number': 0,
   'le

In [6]:
from utils.models.build_binary_classification_mmnn import build_binary_classification_mmnn
from utils.models.build_binary_classification_cnn import build_binary_classification_cnn
from utils.models.build_binary_classification_ffnn import build_binary_classification_ffnn
from tqdm.auto import tqdm
from utils.data_processing import *
from utils.evaluations import train_model
from epigenomic_dataset import active_enhancers_vs_inactive_enhancers, active_promoters_vs_inactive_promoters
from epigenomic_dataset.utils import normalize_epigenomic_data
import time
import warnings
warnings.filterwarnings('ignore')

# Train and evaluate the FFNN, CNN and both MMNN variants on every task and
# holdout, with and without Boruta feature selection, collecting the
# performance frames and the training histories.

# Create a list to store all the computed performance
all_binary_classification_performance = []

# Training histories, keyed by task name
training_histories = {}

# For each task
for task, threshold in tqdm((
    (active_enhancers_vs_inactive_enhancers, 0),
    (active_promoters_vs_inactive_promoters, 1)
), desc="Tasks"):
    start_time = time.time()
    task_name = task.__name__
    # We get the task data with binarized labels
    X, y = task(
        binarize=True,
        cell_line=CELL_LINE,
        window_size=WINDOW_SIZE,
        min_active_tpm_value=threshold,
        max_inactive_tpm_value=threshold,
        root="bio_data/epigenomic/"+str(task_name),
        verbose=1
    )
    # Build the bed frame for THIS task. Previously `train_bed`/`test_bed` were
    # never assigned in this cell, so the sequence models silently reused
    # whatever the tuning cell left in the kernel (last task, last holdout).
    bed = to_bed(X)
    training_histories[task_name] = []

    # Start the main loop, iterating through the holdouts
    # NOTE(review): these splits match the ones used during tuning only if
    # holdouts_generator was built with a fixed random_state.
    for holdout_number, (train_indices, test_indices) in tqdm(
        enumerate(holdouts_generator.split(X, y)),
        total=HOLDOUTS_NUM_SPLIT,
        leave=False,
        desc="Computing Holdouts For {}".format(task_name)
    ):
        # Rows of the bed frame for this holdout. Feature selection only drops
        # columns of X, so these are shared by both inner iterations.
        train_bed, test_bed = bed.iloc[train_indices], bed.iloc[test_indices]

        for use_feature_selection in tqdm((True, False), desc="Running Feature Selection For {}".format(task_name), leave=False):
            # Get the training and test data
            train_X, test_X = X.iloc[train_indices], X.iloc[test_indices]
            train_y, test_y = y.iloc[train_indices], y.iloc[test_indices]

            # Impute and normalize the epigenomic data
            train_X, test_X = normalize_epigenomic_data(train_X, test_X)

            # Flatten the output values
            train_y = train_y.values.flatten()
            test_y = test_y.values.flatten()

            if use_feature_selection:
                kept_features, discarded_features = execute_boruta_feature_selection(
                    X_train=pd.DataFrame(train_X),
                    y_train=train_y,
                    holdout_number=holdout_number,
                    task_name=task_name,
                    max_iter=20
                )

                # Keep the full feature set if Boruta kept nothing
                if len(kept_features) > 0:
                    train_X = train_X[:,kept_features]
                    test_X = test_X[:,kept_features]

            # Get the number of features of this specific dataset
            number_of_features = train_X.shape[1]
            # NOTE(review): the first three tuner results are always used here,
            # i.e. the hyperparameters of the first task and first holdout,
            # whatever the current task/holdout is -- confirm this is intended.
            ffnn_parameters = all_tuner_results[0].get("ffnn_parameters")
            cnn_parameters = all_tuner_results[1].get("cnn_parameters")
            mmnn_parameters = all_tuner_results[2].get("mmnn_parameters")

            ffnn, input_epigenomic_data, last_hidden_ffnn = build_binary_classification_ffnn(input_shape=number_of_features, hp_param=ffnn_parameters)
            cnn, input_sequence_data, last_hidden_cnn = build_binary_classification_cnn(window_size=WINDOW_SIZE, hp_param=cnn_parameters)
            # MMNN built from hyperparameters only
            mmnn_simple = build_binary_classification_mmnn(hp_param_ffnn=ffnn_parameters,
                                                            hp_param_cnn=cnn_parameters,
                                                            hp_param_mmnn=mmnn_parameters,
                                                            input_shape=number_of_features,
                                                            window_size=WINDOW_SIZE
            )

            # MMNN built on the input and last-hidden tensors of the `ffnn` and
            # `cnn` models created just above
            mmnn_boosted = build_binary_classification_mmnn(
                hp_param_mmnn=mmnn_parameters,
                input_sequence_data=input_sequence_data,
                input_epigenomic_data=input_epigenomic_data,
                last_hidden_ffnn=last_hidden_ffnn,
                last_hidden_cnn=last_hidden_cnn
            )

            for model, train_sequence, test_sequence in tqdm(
                (
                    (ffnn, get_ffnn_sequence(train_X, train_y), get_ffnn_sequence(test_X, test_y)),
                    (cnn, get_cnn_sequence(genome, train_bed, train_y), get_cnn_sequence(genome, test_bed, test_y)),
                    (mmnn_simple, get_mmnn_sequence(genome, train_bed, train_X, train_y), get_mmnn_sequence(genome, test_bed, test_X, test_y)),
                    (mmnn_boosted, get_mmnn_sequence(genome, train_bed, train_X, train_y), get_mmnn_sequence(genome, test_bed, test_X, test_y)),
                ),
                desc="Training models",
                leave=False
            ):

                # We compute the model performance
                history, performance = train_model(
                    model,
                    model.name+"V1",
                    task_name,
                    CELL_LINE,
                    train_sequence,
                    test_sequence,
                    holdout_number,
                    use_feature_selection,
                    start_time
                )
                training_histories[task_name].append(history)
                # We chain the computed performance to the performance list
                all_binary_classification_performance.append(performance)

                # Restart the timer that is handed to the next train_model call
                start_time = time.time()

# We convert the computed performance list into a DataFrame
all_binary_classification_performance = pd.concat(all_binary_classification_performance)

Tasks:   0%|          | 0/2 [00:00<?, ?it/s]
[A
Tasks:   0%|          | 0/2 [00:08<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Write the concatenated performance table to CSV without the index column;
# the path is relative to the notebook's working directory.
all_binary_classification_performance.to_csv(
    "results/all_binary_classification_performance.csv",
    index=False,
)

In [None]:
# Display the performance DataFrame built by concatenating the per-model
# results of the training cell.
all_binary_classification_performance

In [None]:
# Keep only the rows recorded for the 'test' run type and rank them from the
# highest to the lowest AUPRC.
is_test_run = all_binary_classification_performance['run_type'] == 'test'
all_binary_classification_performance.loc[is_test_run].sort_values(by='AUPRC', ascending=False)

In [None]:
from plot_keras_history import plot_history

# One figure per task: plot every training history collected for it.
for region, region_histories in training_histories.items():
    plot_history(region_histories, title=region, graphs_per_row=6)