In [8]:
import copy
import itertools
import json
import os
import sys
from pathlib import Path

import lightgbm
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Make the project's ../src directory importable before the local imports below.
sys.path.append(os.path.abspath('../src'))

from preprocessing import data_loader
from models.model_factory import ModelFactory
from main import load_config, run_kfold_training, run_train_test_split


In [9]:
# Initialize components
# Cohorts analysed throughout the notebook; later cells also use this list to
# name the results directory and to tag run metadata.
selected_cohorts = ["LUAD", "BRCA"]
# use_cache=True: the recorded output of this cell shows previously cached,
# aligned expression/mutation pickles being loaded instead of a fresh rebuild.
data_load = data_loader.TCGADataLoader(use_cache=True)

# Load and preprocess data for the selected cohorts
print(f"Loading and preprocessing data for: {', '.join(selected_cohorts)}")
# Downstream cells treat `expression_data` as the feature matrix and
# `mutation_data` as the multi-label target matrix (its column count is the
# model output size).
# NOTE(review): presumably both are sample-aligned, one row per sample - confirm
# against TCGADataLoader.preprocess_data.
expression_data, mutation_data = data_load.preprocess_data(cancer_types=selected_cohorts)

Loading and preprocessing data for: LUAD, BRCA
Loading cached aligned expression and mutation data from:
C:\Users\KerenYlab.MEDICINE\OneDrive - Technion\Asaf\Expression_to_Mutation\mutation_prediction\cache\expression_aligned_BRCA-LUAD.pkl C:\Users\KerenYlab.MEDICINE\OneDrive - Technion\Asaf\Expression_to_Mutation\mutation_prediction\cache\mutation_aligned_BRCA-LUAD.pkl


In [None]:

# Compare several model families on the same feature matrix and hold-out
# settings. One sub-directory per model is written under `results_root`, and the
# mean of each model's metrics is collected into a single comparison table.
# Requires `pandas as pd`, `numpy as np`, `copy`, `Path` and `StandardScaler`
# from the notebook's import cell.
config_path = Path("../config/config.yaml")
config = load_config(config_path)

# log1p compresses the dynamic range of the expression values; StandardScaler
# then z-scores every feature column.
# NOTE(review): the scaler is fitted on ALL samples before run_train_test_split
# (which presumably holds out `test_size` of them internally) sees the data, so
# test-set statistics leak into the scaling. Move the scaling inside the split
# if strictly unbiased test metrics are needed.
X_log = np.log1p(expression_data)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_log)

# Fail fast on misaligned inputs instead of part-way through training.
assert X_scaled.shape[0] == mutation_data.shape[0], (
    f"features have {X_scaled.shape[0]} rows but labels have "
    f"{mutation_data.shape[0]}"
)

cohort_suffix = "-".join(selected_cohorts)
results_root = Path("../results/notebook_multitask_test") / cohort_suffix
results_root.mkdir(parents=True, exist_ok=True)

models_to_compare = ["multitask_nn", "neural_net", "lightgbm"]
comparison_rows = []

for model_name in models_to_compare:
    # Deep-copy so that per-model edits never touch the shared base config.
    config_variant = copy.deepcopy(config)
    config_variant['model']['name'] = model_name

    factory = ModelFactory()
    model = factory.get_model(
        model_name=model_name,
        input_size=X_scaled.shape[1],
        output_size=mutation_data.shape[1],
        config=config_variant,
    )

    model_dir = results_root / model_name
    metrics = run_train_test_split(
        model=model,
        X=X_scaled,
        Y=mutation_data,
        test_size=config_variant['preprocessing']['test_size'],
        output_dir=model_dir,
        config_meta={
            'config': config_variant,
            'cohorts': selected_cohorts,
            'model_name': model_name,
        },
        random_state=config_variant['preprocessing']['random_state'],
        label=model_name,
    )

    # Collapse the returned metrics to one mean value per metric for this model.
    mean_metrics = metrics.mean()
    comparison_rows.append({'model': model_name, **mean_metrics.to_dict()})

# One row per model; the bare last expression renders the table, best F1 first.
comparison_df = pd.DataFrame(comparison_rows).set_index('model')
comparison_df.sort_values('f1', ascending=False)




✅ Train/test evaluation complete.
                    f1   roc_auc  accuracy
multitask_nn  0.304076  0.794594  0.723675
✅ Train/test evaluation complete.
                  f1  roc_auc  accuracy
neural_net  0.301886  0.78241  0.845082
[LightGBM] [Info] Number of positive: 52, number of negative: 976
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.964833 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3800537
[LightGBM] [Info] Number of data points in the train set: 1028, number of used features: 19210
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.050584 -> initscore=-2.932219
[LightGBM] [Info] Start training from score -2.932219
[LightGBM] [Info] Number of positive: 95, number of negative: 933
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.935503 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3800537
[LightG

In [None]:
# Exhaustive grid search over multitask-NN architecture and optimiser settings.
# Each combination is trained with the same hold-out settings as the model
# comparison, writes its artefacts to its own run directory under
# `results_root / "multitask_grid"`, and contributes one row to `grid_df`.
# Requires `itertools`, `copy` and `pandas as pd` from the notebook's import cell.

# Define search grid for multitask NN
hidden_layer_sets = [
    [2048, 1536, 1024, 512],
    [2048, 1024, 512],
    [1536, 1024, 512],
    [1024, 768, 512],
    [1024, 512, 256],
]
# Every entry needs its trailing comma: without it `[128]` followed by `[]`
# parses as the subscript `[128][]`, which is a SyntaxError.
head_layer_sets = [
    [512, 256],
    [256, 128],
    [512],
    [256],
    [128],
    [],  # empty list of head sizes (presumably "no extra head layers" - confirm)
]
dropout_rates = [0.2, 0.3]
learning_rates = [5e-4, 1e-4]

# Full Cartesian product of the four axes; no combination is filtered out.
search_params = [
    {
        'hidden_layers': hidden_layers,
        'head_layers': head_layers,
        'dropout_rate': dropout_rate,
        'learning_rate': lr,
    }
    for hidden_layers, head_layers, dropout_rate, lr in itertools.product(
        hidden_layer_sets, head_layer_sets, dropout_rates, learning_rates
    )
]

# NOTE(review): this cell used to read `X_small`, `Y_small` and
# `selected_features`, none of which is defined anywhere in the notebook (most
# likely left over from a deleted feature-selection cell), so it could not run
# on a fresh kernel. It now searches on the same matrices as the model
# comparison. To search on a reduced feature set, build the subset explicitly
# here and record it in `config_meta`.
X_grid = X_scaled
Y_grid = mutation_data

grid_root = results_root / "multitask_grid"
grid_root.mkdir(parents=True, exist_ok=True)

# Zero-pad run numbers to the width of the largest index so that run
# directories sort correctly even when the grid has 100+ combinations.
run_width = max(2, len(str(len(search_params))))

grid_rows = []

for run_idx, params in enumerate(search_params, start=1):
    # Deep-copy so each run starts from the untouched base config.
    cfg = copy.deepcopy(config)
    cfg['model']['name'] = 'multitask_nn'
    cfg['model']['multitask_nn'].update(params)

    factory = ModelFactory()
    model = factory.get_model(
        model_name='multitask_nn',
        input_size=X_grid.shape[1],
        output_size=Y_grid.shape[1],
        config=cfg,
    )

    run_tag = f"run_{run_idx:0{run_width}d}"
    run_dir = grid_root / run_tag
    metrics = run_train_test_split(
        model=model,
        X=X_grid,
        Y=Y_grid,
        test_size=cfg['preprocessing']['test_size'],
        output_dir=run_dir,
        config_meta={
            'config': cfg,
            'cohorts': selected_cohorts,
            'search_params': params,
        },
        random_state=cfg['preprocessing']['random_state'],
        label=f"multitask_nn_{run_tag}",
    )

    # Collapse the returned metrics to one mean value per metric for this run.
    mean_metrics = metrics.mean()
    grid_rows.append({
        'run': run_idx,
        **params,
        **mean_metrics.to_dict(),
    })

# One row per run (hyper-parameters + mean metrics); displayed best F1 first.
grid_df = pd.DataFrame(grid_rows).set_index('run')
grid_df.sort_values('f1', ascending=False)
