In [None]:
import os
os.environ['PYTHONHASHSEED'] = '0'
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = ''
import random
import numpy as np
import pandas as pd
import itertools
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, StratifiedKFold
import matplotlib.pyplot as plt
import tensorflow as tf
from pathlib import Path
from tensorflow.keras import layers, Model
from tensorflow.keras.layers import Lambda, Dense, Dropout, Input, concatenate, Flatten, BatchNormalization, Activation
from tensorflow.keras import backend as K, regularizers, initializers
from tensorflow.keras.initializers import LecunUniform, GlorotUniform
from tensorflow.keras.regularizers import l2
from joblib import Parallel, delayed
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib
from huggingface_hub import hf_hub_download
# Run TensorFlow ops on a single intra-op and a single inter-op thread.
tf.config.threading.set_intra_op_parallelism_threads(1)
tf.config.threading.set_inter_op_parallelism_threads(1)

# Seed every random number generator this notebook imports, not only
# TensorFlow's: `random` and NumPy keep their own independent global state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)


In [None]:
from huggingface_hub import hf_hub_download

# File to fetch from the Hugging Face repository below.
FNAME = "embedding_associations_cell_type_tissue_drug_pathway_openai_large.parquet"

# hf_hub_download hands back a local path to the requested file, which is then
# read into a DataFrame. The index of `emb` is matched against gene names in
# later cells.
parquet_path = hf_hub_download(repo_id="honicky/genept-composable-embeddings", filename=FNAME, repo_type="model")
emb = pd.read_parquet(parquet_path)

In [None]:
# Locate the folder that contains "Dataset". `__file__` is only defined when
# this code runs as a script; a Jupyter kernel normally does not set it, so
# fall back to the current working directory instead of failing with NameError.
try:
    BASE_DIR = Path(__file__).parent
except NameError:
    BASE_DIR = Path.cwd()

SCTRIO_DIR = BASE_DIR / "Dataset" / "scTrioseq2"

# The column names of these two tables are used as gene identifiers below.
df_gene = pd.read_csv(SCTRIO_DIR / "Gene_Expression.csv")
df_dna = pd.read_csv(SCTRIO_DIR / "DNA_Methylation.csv")

true_label = pd.read_csv(SCTRIO_DIR / "label.csv")

scaler = StandardScaler()
# Encode the labels as consecutive integers.
# NOTE(review): `true_label` is passed as a DataFrame; this presumably relies on
# label.csv holding a single column -- confirm.
le = LabelEncoder()
le.fit(true_label)
y = le.transform(true_label)

ge_genes = list(df_gene.columns)

# Interaction table with "TF" and "Target" columns, restricted to targets that
# appear among the gene-expression columns.
df_pdi = pd.read_csv(BASE_DIR / "Dataset" / "PDI" / "PDI.csv", sep=',')
df_ge_pdi = df_pdi[df_pdi.Target.isin(ge_genes)]
unique_tfs = df_ge_pdi['TF'].unique()
unique_targets = df_ge_pdi['Target'].unique()

# Same restriction for the DNA-methylation columns.
dna_genes = list(df_dna.columns)
df_dna_pdi = df_pdi[df_pdi.Target.isin(dna_genes)]

dna_unique_tfs = df_dna_pdi['TF'].unique()
dna_unique_targets = df_dna_pdi['Target'].unique()

# Interaction table with "protein1", "protein2" and "combined_score" columns.
df_ppi = pd.read_csv(BASE_DIR / "Dataset" / "PPI" / "PPI.csv", sep=',')
df_ppi = df_ppi.drop(['Unnamed: 0'], axis=1)
# Rescale the score by 1/1000 and keep only edges at or above the 90th
# percentile of the rescaled score.
df_ppi['combined_score'] = df_ppi['combined_score'] / 1000
threshold = df_ppi['combined_score'].quantile(0.9)
df_ppi = df_ppi[df_ppi['combined_score'] >= threshold]

# An edge is kept for a modality when either endpoint is one of its columns.
df_ge_ppi = df_ppi[df_ppi['protein1'].isin(ge_genes) | df_ppi['protein2'].isin(ge_genes)]
unique_protein1 = df_ge_ppi['protein1'].unique()
unique_protein2 = df_ge_ppi['protein2'].unique()

df_dna_ppi = df_ppi[df_ppi['protein1'].isin(dna_genes) | df_ppi['protein2'].isin(dna_genes)]
dna_unique_protein1 = df_dna_ppi['protein1'].unique()
dna_unique_protein2 = df_dna_ppi['protein2'].unique()

# Union of PDI targets and PPI second endpoints, per modality.
ge_pdi_ppi_genes = list(set(unique_targets) | set(unique_protein2))
dna_pdi_ppi_genes = list(set(dna_unique_targets) | set(dna_unique_protein2))

In [None]:
# Keep the embedding rows whose index label is one of the gene-expression
# PDI/PPI genes; the surviving labels (in `emb` row order) are reused later.
ge_row_selector = emb.index.isin(ge_pdi_ppi_genes)
genept_gene_emb = emb.loc[ge_row_selector]
ge_pdi_ppi_genept = genept_gene_emb.index

In [None]:
# Keep the embedding rows whose index label is one of the DNA-methylation
# PDI/PPI genes; the surviving labels (in `emb` row order) are reused later.
dna_row_selector = emb.index.isin(dna_pdi_ppi_genes)
genept_dna_emb = emb.loc[dna_row_selector]
dna_pdi_ppi_genept = genept_dna_emb.index

In [None]:
def _edge_matrix(edges, src_col, dst_col, row_of, n_rows, columns, weight_col=None):
    """Scatter an edge list into a dense ``(n_rows, len(columns))`` DataFrame.

    Args:
        edges: DataFrame holding one edge per row.
        src_col: Column of ``edges`` naming the row gene of each edge.
        dst_col: Column of ``edges`` naming the column gene of each edge.
        row_of: Mapping from gene name to row position.
        n_rows: Number of rows of the result.
        columns: Gene names labelling the result's columns, in order.
        weight_col: Column of ``edges`` holding the value to store; when
            ``None`` every edge stores 1.

    Returns:
        A float DataFrame with ``columns`` as column labels. Edges whose source
        is not in ``row_of`` or whose destination is not in ``columns`` are
        ignored.
    """
    col_of = {gene: j for j, gene in enumerate(columns)}
    hits = pd.DataFrame({
        "row": edges[src_col].map(row_of).to_numpy(),
        "col": edges[dst_col].map(col_of).to_numpy(),
    })
    hits["val"] = 1.0 if weight_col is None else edges[weight_col].to_numpy()
    # Unmapped endpoints come back as NaN from `map`; drop those edges. When a
    # (row, col) pair occurs more than once the last edge wins, which is made
    # explicit here because NumPy does not define the outcome of an indexed
    # assignment that sets the same element repeatedly.
    hits = hits.dropna(subset=["row", "col"]).drop_duplicates(subset=["row", "col"], keep="last")

    matrix = np.zeros((n_rows, len(columns)))
    matrix[hits["row"].to_numpy(dtype=int), hits["col"].to_numpy(dtype=int)] = hits["val"].to_numpy()
    return pd.DataFrame(matrix, columns=columns)


def _build_interaction_mask(input_genes, pdi_edges, ppi_edges, target_genes,
                            partner_genes, embedded_genes, extra_neurons):
    """Build the fixed PDI/PPI projection matrix for one modality.

    Rows follow ``input_genes``. PDI edges ("TF" -> "Target") contribute 1 and
    PPI edges ("protein1" -> "protein2") contribute their "combined_score".
    When a gene labels both a PDI and a PPI column, only the PPI column is kept.

    Columns are emitted in the order of ``embedded_genes``. That is the row
    order of the embedding table and of the pathway mask that
    ``AttentionPathwayLayer`` receives, so feature ``k`` of the masked layer and
    row ``k`` of the embeddings refer to the same gene.

    Args:
        input_genes: Gene names labelling the rows.
        pdi_edges: DataFrame with "TF" and "Target" columns.
        ppi_edges: DataFrame with "protein1", "protein2", "combined_score".
        target_genes: Unique PDI targets (PDI column labels).
        partner_genes: Unique PPI second endpoints (PPI column labels).
        embedded_genes: Ordered gene labels to keep; each must be one of
            ``target_genes`` or ``partner_genes`` (KeyError otherwise).
        extra_neurons: Number of all-ones columns appended on the right.
            NOTE(review): ``create_model`` feeds this matrix's output into an
            attention layer built for exactly ``len(embedded_genes)`` features,
            so a value above 0 looks incompatible there -- confirm.

    Returns:
        ``tf.float32`` constant of shape
        ``(len(input_genes), len(embedded_genes) + extra_neurons)``.
    """
    row_of = {gene: i for i, gene in enumerate(input_genes)}
    n_rows = len(input_genes)

    pdi_mask = _edge_matrix(pdi_edges, "TF", "Target", row_of, n_rows, target_genes)
    ppi_mask = _edge_matrix(ppi_edges, "protein1", "protein2", row_of, n_rows,
                            partner_genes, weight_col="combined_score")

    combined = pd.concat([pdi_mask, ppi_mask], axis=1)
    # NOTE(review): for a gene present in both tables the PDI column is dropped
    # rather than merged with the PPI one -- confirm this is intended.
    combined = combined.loc[:, ~combined.columns.duplicated(keep='last')]

    # Select by label so the column order is exactly `embedded_genes`.
    aligned = combined.loc[:, list(embedded_genes)]

    extra_columns = np.ones((n_rows, extra_neurons))
    adjusted_mask_matrix = np.hstack([aligned.to_numpy(), extra_columns])
    return tf.constant(adjusted_mask_matrix, dtype=tf.float32)


def create_ge_mask_matrix (extra_neurons):
    """Return the PDI/PPI projection matrix for the gene-expression input.

    Reads the module-level ``ge_genes``, ``df_ge_pdi``, ``df_ge_ppi``,
    ``unique_targets``, ``unique_protein2`` and ``ge_pdi_ppi_genept``.

    Args:
        extra_neurons: Number of all-ones columns to append.

    Returns:
        ``tf.float32`` constant, one row per gene-expression column and one
        column per entry of ``ge_pdi_ppi_genept`` plus ``extra_neurons``.
    """
    return _build_interaction_mask(
        ge_genes, df_ge_pdi, df_ge_ppi,
        unique_targets, unique_protein2, ge_pdi_ppi_genept,
        extra_neurons,
    )


def create_dna_mask_matrix (extra_neurons):
    """Return the PDI/PPI projection matrix for the DNA-methylation input.

    Reads the module-level ``dna_genes``, ``df_dna_pdi``, ``df_dna_ppi``,
    ``dna_unique_targets``, ``dna_unique_protein2`` and ``dna_pdi_ppi_genept``.

    Args:
        extra_neurons: Number of all-ones columns to append.

    Returns:
        ``tf.float32`` constant, one row per DNA-methylation column and one
        column per entry of ``dna_pdi_ppi_genept`` plus ``extra_neurons``.
    """
    return _build_interaction_mask(
        dna_genes, df_dna_pdi, df_dna_ppi,
        dna_unique_targets, dna_unique_protein2, dna_pdi_ppi_genept,
        extra_neurons,
    )

In [None]:
# `__file__` is normally undefined inside a Jupyter kernel; fall back to the
# working directory so this cell does not raise NameError there.
try:
    KEGG_DIR = Path(__file__).parent / "Dataset" / "scTrioseq2"
except NameError:
    KEGG_DIR = Path.cwd() / "Dataset" / "scTrioseq2"

# Gene-to-pathway tables; later cells read their "SYMBOL" and "PathwayID" columns.
ge_target_to_kegg = pd.read_csv(KEGG_DIR / "ge_target_to_KEGG_significant.csv", sep=',')
dna_target_to_kegg = pd.read_csv(KEGG_DIR / "dna_target_to_KEGG_significant.csv", sep=',')

In [None]:
def _build_pathway_mask(genes, target_to_kegg):
    """Build a 0/1 gene-by-pathway membership table.

    Args:
        genes: Ordered gene labels; they become the row index.
        target_to_kegg: DataFrame with "SYMBOL" (gene) and "PathwayID" columns,
            one membership per row.

    Returns:
        Integer DataFrame indexed by ``genes`` with one column per distinct
        "PathwayID" (in order of first appearance). A cell is 1 when the table
        lists that gene for that pathway. Rows of ``target_to_kegg`` whose gene
        is not in ``genes`` are ignored.
    """
    pathways = target_to_kegg["PathwayID"].unique().tolist()

    gene_to_idx = {gene: i for i, gene in enumerate(genes)}
    pathway_to_idx = {pw: j for j, pw in enumerate(pathways)}

    mask_matrix = np.zeros((len(genes), len(pathways)), dtype=int)

    # Only two columns are needed, so iterate over them directly instead of
    # materialising a Series per row with iterrows().
    for gene, pw in zip(target_to_kegg["SYMBOL"], target_to_kegg["PathwayID"]):
        if gene in gene_to_idx and pw in pathway_to_idx:
            mask_matrix[gene_to_idx[gene], pathway_to_idx[pw]] = 1

    return pd.DataFrame(mask_matrix, index=genes, columns=pathways)


def create_ge_pathway_mask_matrix ():
    """Return the gene-by-pathway mask for the gene-expression branch.

    Uses the module-level ``ge_pdi_ppi_genept`` as row labels and
    ``ge_target_to_kegg`` as the membership table.
    """
    return _build_pathway_mask(ge_pdi_ppi_genept, ge_target_to_kegg)


def create_dna_pathway_mask_matrix ():
    """Return the gene-by-pathway mask for the DNA-methylation branch.

    Uses the module-level ``dna_pdi_ppi_genept`` as row labels and
    ``dna_target_to_kegg`` as the membership table.
    """
    return _build_pathway_mask(dna_pdi_ppi_genept, dna_target_to_kegg)


mask_gene_pathway  = create_ge_pathway_mask_matrix ()
mask_dna_pathway = create_dna_pathway_mask_matrix()

In [None]:
class MaskedLayer(layers.Layer):
    """Layer that right-multiplies its input by a fixed matrix.

    The matrix is stored as a backend constant rather than created through
    ``add_weight``, so this layer adds no weights of its own.
    """

    def __init__(self, mask_matrix, **kwargs):
        """Store ``mask_matrix`` as a float32 constant.

        Args:
            mask_matrix: Array-like or tensor used as the right operand.
            **kwargs: Forwarded to ``layers.Layer``.
        """
        super(MaskedLayer, self).__init__(**kwargs)
        fixed_matrix = K.constant(mask_matrix, dtype='float32')
        self.mask_matrix = fixed_matrix

    def call(self, inputs):
        """Return ``inputs`` dotted with the stored matrix."""
        projected = K.dot(inputs, self.mask_matrix)
        return projected


class AttentionPathwayLayer(layers.Layer):
    """Pool per-gene features into per-pathway features with learned attention.

    Each pathway owns a trainable query vector (a row of ``W``). A gene's score
    for a pathway is the dot product of that query with the gene's fixed
    embedding. Scores are normalised with a softmax over the genes that belong
    to the pathway, and the pathway output is the attention-weighted sum of the
    gene features.

    Genes outside a pathway are excluded by adding a large negative constant to
    their score before the softmax. Multiplying scores by the 0/1 mask instead
    would leave them at score 0, i.e. weight ``exp(0)``, so they would still
    share the attention mass. The weights are multiplied by the mask afterwards
    so that a pathway with no member gene yields all-zero weights rather than a
    uniform distribution.
    """

    # Added to the score of every non-member gene before the softmax.
    _MASK_PENALTY = -1e9

    def __init__(self, genept_embs, pathway_mask, init_seed=42, **kwargs):
        """Create the layer.

        Args:
            genept_embs: ``(n_genes, D_embed)`` embeddings, as a DataFrame
                (its ``.values`` are used) or anything ``np.asarray`` accepts.
            pathway_mask: ``(n_genes, n_pathways)`` membership table, DataFrame
                or array-like; entries greater than 0 mark membership.
            init_seed: Seed of the Glorot-uniform initialiser for the queries.
            **kwargs: Forwarded to ``layers.Layer``.

        Raises:
            ValueError: If the two inputs disagree on the number of genes.
        """
        super().__init__(**kwargs)

        genept_np = genept_embs.values if hasattr(genept_embs, "values") else np.asarray(genept_embs)
        n_genes, D_embed = genept_np.shape

        pw_np = pathway_mask.values if hasattr(pathway_mask, "values") else np.asarray(pathway_mask)
        if pw_np.shape[0] != n_genes:
            raise ValueError(
                f"pathway_mask has {pw_np.shape[0]} rows but genept_embs has {n_genes}"
            )
        n_pathways = pw_np.shape[1]

        # Fixed (non-trainable) inputs of the attention computation.
        self.G = K.constant(genept_np, dtype="float32")
        self.P = K.constant(pw_np, dtype="float32")

        glorot = initializers.GlorotUniform(seed=init_seed)
        self.W = self.add_weight(
            shape=(n_pathways, D_embed),
            initializer=glorot,
            trainable=True,
            name="W_query"
        )

    def _attention_weights(self):
        """Return the ``(n_genes, n_pathways)`` attention matrix."""
        # (n_pathways, D) . (D, n_genes) -> transpose to (n_genes, n_pathways)
        scores = K.transpose(K.dot(self.W, K.transpose(self.G)))
        member = K.cast(K.greater(self.P, 0.0), "float32")
        masked_scores = scores + (1.0 - member) * self._MASK_PENALTY
        # Normalise over genes (axis 0) independently for each pathway.
        alpha = K.softmax(masked_scores, axis=0)
        return alpha * member

    def call(self, x):
        """Map ``(batch, n_genes, 1)`` features to ``(batch, n_pathways, 1)``."""
        x_sq = K.squeeze(x, axis=-1)
        out_mat = K.dot(x_sq, self._attention_weights())
        return K.expand_dims(out_mat, axis=-1)

    def compute_output_shape(self, input_shape):
        """Return ``(batch_size, n_pathways, 1)`` for a rank-3 input shape."""
        batch_size, n_genes, _ = input_shape
        n_pathways = K.int_shape(self.P)[1]
        return (batch_size, n_pathways, 1)

    def get_attention(self):
        """Return the current attention matrix, identical to the one ``call`` uses."""
        return self._attention_weights()


def _build_pathway_branch(prefix, n_inputs, mask_matrix, genept_embs, pathway_mask,
                          layer_one, activation_function_one,
                          activation_function_two, dropout_p, l2_reg):
    """Build one input branch: dense -> fixed mask -> attention -> projection.

    Args:
        prefix: Prefix for the named layers ("<prefix>_input", "<prefix>_add_dim",
            "<prefix>_path_attn", "<prefix>_proj").
        n_inputs: Width of the input vector.
        mask_matrix: Fixed matrix handed to ``MaskedLayer``; its row count sets
            the width of the first dense layer.
        genept_embs: Embeddings handed to ``AttentionPathwayLayer``.
        pathway_mask: Pathway mask handed to ``AttentionPathwayLayer``.
        layer_one: Width of the final projection.
        activation_function_one: Activation of the first dense layer.
        activation_function_two: Activation of the projection.
        dropout_p: Dropout rate applied after the masked layer.
        l2_reg: L2 factor for both dense kernels.

    Returns:
        ``(input_tensor, projected_tensor)``.
    """
    branch_input = Input(shape=(n_inputs,), name=f"{prefix}_input")
    hidden = layers.Dense(
        mask_matrix.shape[0],
        activation=activation_function_one,
        kernel_regularizer=regularizers.l2(l2_reg),
        kernel_initializer=initializers.LecunUniform(42)
    )(branch_input)
    hidden = MaskedLayer(mask_matrix)(hidden)
    hidden = Dropout(dropout_p)(hidden)
    # The attention layer squeezes a trailing axis, so add one here.
    hidden = Lambda(lambda t: K.expand_dims(t, -1), name=f"{prefix}_add_dim")(hidden)

    hidden = AttentionPathwayLayer(
        genept_embs=genept_embs,
        pathway_mask=pathway_mask,
        name=f"{prefix}_path_attn"
    )(hidden)
    hidden = Flatten()(hidden)

    projected = layers.Dense(
        layer_one,
        activation=activation_function_two,
        kernel_regularizer=regularizers.l2(l2_reg),
        kernel_initializer=initializers.LecunUniform(42),
        name=f"{prefix}_proj"
    )(hidden)
    return branch_input, projected


def create_model(
    num_ex_neuron,
    layer_one,
    activation_function_one,
    activation_function_two,
    activation_function_three,
    dropout_p,
    learning_rate,
    l2_reg,
    optimizer_choice,
    genept_gene_emb,
    mask_gene_pathway,
    genept_dna_emb,
    mask_dna_pathway,
    n_classes=4
):
    """Build and compile the two-branch pathway-attention classifier.

    A gene-expression branch and a DNA-methylation branch are built the same
    way, concatenated, passed through a 12-unit dense layer with dropout and
    finished with a softmax over ``n_classes``.

    Args:
        num_ex_neuron: Extra all-ones columns requested from the mask builders.
        layer_one: Width of each branch's projection layer.
        activation_function_one: Activation of each branch's first dense layer.
        activation_function_two: Activation of each branch's projection.
        activation_function_three: Activation of the fusion dense layer.
        dropout_p: Dropout rate used in both branches and after fusion.
        learning_rate: Optimizer learning rate.
        l2_reg: L2 factor applied to every dense kernel.
        optimizer_choice: Either 'adam' or 'sgd'.
        genept_gene_emb: Gene embeddings for the gene-expression branch.
        mask_gene_pathway: Pathway mask for the gene-expression branch.
        genept_dna_emb: Gene embeddings for the DNA-methylation branch.
        mask_dna_pathway: Pathway mask for the DNA-methylation branch.
        n_classes: Number of output classes (defaults to 4).

    Returns:
        A compiled Keras ``Model`` taking ``[gene_input, dna_input]``, with
        sparse categorical cross-entropy loss and an accuracy metric.

    Raises:
        ValueError: If ``optimizer_choice`` is neither 'adam' nor 'sgd'.
    """
    # Validate before building anything, so an unsupported name fails at once
    # with a clear message instead of an unbound local at compile time.
    optimizer_classes = {
        'adam': tf.keras.optimizers.Adam,
        'sgd': tf.keras.optimizers.SGD,
    }
    if optimizer_choice not in optimizer_classes:
        raise ValueError(
            f"Unsupported optimizer_choice {optimizer_choice!r}; expected 'adam' or 'sgd'"
        )

    new_ge_mask_matrix = create_ge_mask_matrix(num_ex_neuron)
    new_dna_mask_matrix = create_dna_mask_matrix(num_ex_neuron)

    inputA, x_proj = _build_pathway_branch(
        "gene", df_gene.shape[1], new_ge_mask_matrix,
        genept_gene_emb, mask_gene_pathway,
        layer_one, activation_function_one, activation_function_two,
        dropout_p, l2_reg,
    )
    inputB, y_proj = _build_pathway_branch(
        "dna", df_dna.shape[1], new_dna_mask_matrix,
        genept_dna_emb, mask_dna_pathway,
        layer_one, activation_function_one, activation_function_two,
        dropout_p, l2_reg,
    )

    merged = concatenate([x_proj, y_proj], name="merge_xy")
    h = layers.Dense(
        12,
        activation=activation_function_three,
        kernel_regularizer=regularizers.l2(l2_reg),
        kernel_initializer=initializers.GlorotUniform(42),
        name="fusion_dense"
    )(merged)
    h = Dropout(dropout_p, name="drop_fusion")(h)
    out = layers.Dense(
        n_classes,
        activation="softmax",
        kernel_regularizer=regularizers.l2(l2_reg),
        kernel_initializer=initializers.GlorotUniform(42),
        name="output"
    )(h)

    model = Model(inputs=[inputA, inputB], outputs=out, name="PathwayModel")

    optimizer = optimizer_classes[optimizer_choice](learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [None]:
def evaluate_model_cv(X_train, X_val, y_train, y_val, params):
    """Train one model on a fold and score it on that fold's validation part.

    Args:
        X_train: Training inputs as accepted by ``model.fit``.
        X_val: Validation inputs in the same layout as ``X_train``.
        y_train: Integer training labels.
        y_val: Integer validation labels.
        params: Hyper-parameter dict with keys 'num_ex_neuron', 'layer_one',
            'activation_function_one', 'activation_function_two',
            'activation_function_three', 'dropout_p', 'learning_rate',
            'l2_reg', 'optimizer', 'epoch' and 'batch_size'.

    Returns:
        Dict with the Keras training 'history', F1 / precision / recall under
        weighted, macro and micro averaging, and 'accuracy'.
    """
    architecture_keys = (
        'num_ex_neuron',
        'layer_one',
        'activation_function_one',
        'activation_function_two',
        'activation_function_three',
        'dropout_p',
        'learning_rate',
        'l2_reg',
    )
    model_kwargs = {name: params[name] for name in architecture_keys}
    # Embeddings and pathway masks are taken from the module-level variables.
    model = create_model(
        optimizer_choice=params['optimizer'],
        genept_gene_emb=genept_gene_emb,
        mask_gene_pathway=mask_gene_pathway,
        genept_dna_emb=genept_dna_emb,
        mask_dna_pathway=mask_dna_pathway,
        **model_kwargs
    )

    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=params['epoch'],
        batch_size=params['batch_size'],
        shuffle=False,
        verbose=0,
    )

    # Predicted class = position of the largest softmax output per sample.
    class_probabilities = model.predict(X_val)
    y_pred = class_probabilities.argmax(1)

    scorers = (
        ('f1_weighted', f1_score, 'weighted'),
        ('f1_macro', f1_score, 'macro'),
        ('f1_micro', f1_score, 'micro'),
        ('precision_weighted', precision_score, 'weighted'),
        ('precision_macro', precision_score, 'macro'),
        ('precision_micro', precision_score, 'micro'),
        ('recall_weighted', recall_score, 'weighted'),
        ('recall_macro', recall_score, 'macro'),
        ('recall_micro', recall_score, 'micro'),
    )

    report = {'history': history.history}
    for key, scorer, averaging in scorers:
        report[key] = scorer(y_val, y_pred, average=averaging)
    report['accuracy'] = accuracy_score(y_val, y_pred)
    return report

def evaluate_combination(param_combination,X_gene_train,X_dna_train,y_train,n_splits=5):
    """Cross-validate one hyper-parameter combination.

    Args:
        param_combination: Hyper-parameter dict passed to ``evaluate_model_cv``.
        X_gene_train: Gene-expression training matrix, indexed positionally
            with integer arrays.
        X_dna_train: DNA-methylation training matrix, indexed the same way.
        y_train: Integer labels, used for stratification and indexed the same way.
        n_splits: Number of stratified folds.

    Returns:
        ``(param_combination, results, all_histories)`` where ``results`` maps
        '<metric>_mean' / '<metric>_std' to the mean and (population) standard
        deviation over folds, and ``all_histories`` holds one Keras history
        dict per fold.
    """
    metric_names = (
        'f1_weighted', 'f1_macro', 'f1_micro',
        'precision_weighted', 'precision_macro', 'precision_micro',
        'recall_weighted', 'recall_macro', 'recall_micro',
        'accuracy',
    )
    per_fold = {name: [] for name in metric_names}
    all_histories = []

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    for train_idx, val_idx in skf.split(X_gene_train, y_train):
        fold_train = [X_gene_train[train_idx], X_dna_train[train_idx]]
        fold_val = [X_gene_train[val_idx], X_dna_train[val_idx]]

        scores = evaluate_model_cv(
            fold_train, fold_val, y_train[train_idx], y_train[val_idx], param_combination
        )

        # The history is kept separately from the scalar metrics.
        all_histories.append(scores.pop('history'))
        for name in metric_names:
            per_fold[name].append(scores[name])

    results = {}
    for name in metric_names:
        results[name + '_mean'] = np.mean(per_fold[name])
        results[name + '_std'] = np.std(per_fold[name])

    return param_combination, results, all_histories


# Candidate values per hyper-parameter. The key order below is also the order
# of the values inside each tuple of `param_combinations`.
params_grid = {
    'num_ex_neuron': [0],
    'layer_one': [64],
    'activation_function_one': ['relu'],
    'activation_function_two': ['sigmoid'],
    'activation_function_three': ['tanh'],
    'dropout_p': [0.3],
    'learning_rate': [0.01],
    'l2_reg': [0.01],
    'epoch': [200],
    'batch_size': [16],
    'optimizer': ['sgd'],
}

# Cartesian product of all candidate lists, one tuple per combination.
param_combinations = list(itertools.product(*params_grid.values()))

# The same combinations as dicts keyed by hyper-parameter name.
param_dicts = []
for comb in param_combinations:
    param_dicts.append(dict(zip(params_grid, comb)))


# Stratified 80/20 split applied jointly to both modalities and the labels.
X_gene_train, X_gene_test, X_dna_train, X_dna_test, y_train, y_test = train_test_split(
    df_gene, df_dna, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise each modality with statistics learned from the TRAINING split
# only and apply those same statistics to the test split. Fitting a second
# scaler on the test split would standardise the two splits differently and
# let test-set statistics shape the evaluation inputs.
# NOTE(review): the scalers are fitted on the whole training split before
# cross-validation, so validation folds still contribute to the statistics.
gene_scaler = StandardScaler()
X_gene_train = gene_scaler.fit_transform(np.asarray(X_gene_train, dtype=np.float32))
X_gene_test = gene_scaler.transform(np.asarray(X_gene_test, dtype=np.float32))

dna_scaler = StandardScaler()
X_dna_train = dna_scaler.fit_transform(np.asarray(X_dna_train, dtype=np.float32))
X_dna_test = dna_scaler.transform(np.asarray(X_dna_test, dtype=np.float32))

# Keep the historical name bound to a fitted scaler for any later cell using it.
scaler = dna_scaler

# Evaluate every hyper-parameter combination with 5-fold cross-validation,
# spread over all available workers (n_jobs=-1), reporting progress through
# a tqdm bar hooked into joblib.
grid_progress = tqdm(desc="Evaluating hyperparameter combinations", total=len(param_dicts))
with tqdm_joblib(grid_progress) as progress_bar:
    grid_jobs = (
        delayed(evaluate_combination)(param, X_gene_train, X_dna_train, y_train, n_splits=5)
        for param in param_dicts
    )
    results = Parallel(n_jobs=-1)(grid_jobs)

results_list = []
histories_dict = {}

# Per averaging scheme: [best mean F1 so far, its params, its full score dict].
best_by_average = {
    averaging: [-np.inf, None, None]
    for averaging in ('weighted', 'macro', 'micro')
}

for param_dict, scores_dict, histories in results:
    # Fold histories are stored under the string form of the parameter dict.
    key = str(param_dict)
    histories_dict[key] = histories

    # One flat row per combination: hyper-parameters followed by its scores.
    results_list.append({**param_dict, **scores_dict})

    for averaging, slot in best_by_average.items():
        candidate = scores_dict[f'f1_{averaging}_mean']
        # Strict comparison: on ties the earliest combination is kept.
        if candidate > slot[0]:
            slot[:] = [candidate, param_dict, scores_dict]

best_avg_f1_weighted, best_param_weighted, best_weighted_scores = best_by_average['weighted']
best_avg_f1_macro, best_param_macro, best_macro_scores = best_by_average['macro']
best_avg_f1_micro, best_param_micro, best_micro_scores = best_by_average['micro']

if best_param_macro is None:
    raise RuntimeError(
        "No hyper-parameter combination produced a comparable macro F1 score; "
        "check that the grid search returned results."
    )

results_df = pd.DataFrame(results_list)
best_key = str(best_param_macro)
best_histories = histories_dict[best_key]

# Take the hyper-parameter names from the grid itself rather than from
# whichever dict the loop above happened to visit last.
hyperparam_cols = list(params_grid.keys())


def _columns_for(averaging):
    """Return hyper-parameter columns plus the metric columns for one averaging scheme."""
    return hyperparam_cols + [
        f'f1_{averaging}_mean', f'f1_{averaging}_std',
        f'precision_{averaging}_mean', f'precision_{averaging}_std',
        f'recall_{averaging}_mean', f'recall_{averaging}_std',
        'accuracy_mean', 'accuracy_std',
    ]


weighted_cols = _columns_for('weighted')
results_df_weighted = results_df[weighted_cols]

macro_cols = _columns_for('macro')
results_df_macro = results_df[macro_cols]

micro_cols = _columns_for('micro')
results_df_micro = results_df[micro_cols]

In [None]:
def create_model_test(
    num_ex_neuron,
    layer_one,
    activation_function_one,
    activation_function_two,
    activation_function_three,
    dropout_p,
    learning_rate,
    l2_reg,
    optimizer_choice
):
    """Build and compile the two-branch model used for the held-out test runs.

    Embeddings and pathway masks are read from the module-level
    ``genept_gene_emb``, ``mask_gene_pathway``, ``genept_dna_emb`` and
    ``mask_dna_pathway``.

    Args:
        num_ex_neuron: Extra all-ones columns requested from the mask builders.
        layer_one: Width of each branch's projection layer.
        activation_function_one: Activation of each branch's first dense layer.
        activation_function_two: Activation of each branch's projection.
        activation_function_three: Activation of the fusion dense layer.
        dropout_p: Dropout rate used in both branches and after fusion.
        learning_rate: Optimizer learning rate.
        l2_reg: L2 factor applied to every dense kernel.
        optimizer_choice: 'adam' selects Adam; any other value selects SGD.

    Returns:
        A compiled Keras ``Model`` named "PathwayModel_Test".
    """
    ge_mask = create_ge_mask_matrix(num_ex_neuron)
    dna_mask = create_dna_mask_matrix(num_ex_neuron)

    # Both inputs are created before either branch.
    inputA = Input(shape=(df_gene.shape[1],), name="gene_input")
    inputB = Input(shape=(df_dna.shape[1],), name="dna_input")

    def _branch(source, mask, embeddings, pathway_mask, names):
        """Dense -> fixed mask -> dropout -> attention -> flatten -> projection."""
        hidden = layers.Dense(
            mask.shape[0],
            kernel_initializer=LecunUniform(seed=42),
            activation=activation_function_one,
            kernel_regularizer=l2(l2_reg)
        )(source)
        hidden = MaskedLayer(mask)(hidden)
        hidden = Dropout(dropout_p)(hidden)
        # Add the trailing axis the attention layer squeezes away.
        hidden = layers.Lambda(lambda t: K.expand_dims(t, -1), name=names["add_dim"])(hidden)
        attention = AttentionPathwayLayer(
            genept_embs=embeddings,
            pathway_mask=pathway_mask,
            name=names["attn"]
        )
        hidden = attention(hidden)
        hidden = Flatten(name=names["flat"])(hidden)
        return layers.Dense(
            layer_one,
            kernel_initializer=LecunUniform(seed=42),
            activation=activation_function_two,
            kernel_regularizer=l2(l2_reg),
            name=names["proj"]
        )(hidden)

    x1_proj = _branch(
        inputA, ge_mask, genept_gene_emb, mask_gene_pathway,
        {"add_dim": "gene_add_dim", "attn": "gene_path_attn",
         "flat": "gene_flat_path", "proj": "gene_proj"},
    )
    y1_proj = _branch(
        inputB, dna_mask, genept_dna_emb, mask_dna_pathway,
        {"add_dim": "dna_add_dim", "attn": "dna_path_attn",
         "flat": "dna_flat_path", "proj": "dna_proj"},
    )

    merged = concatenate([x1_proj, y1_proj], name="merge_xy")

    fused = layers.Dense(
        12,
        activation=activation_function_three,
        kernel_initializer=GlorotUniform(seed=42),
        kernel_regularizer=l2(l2_reg),
        name="fusion_dense"
    )(merged)
    fused = Dropout(dropout_p, name="drop_fusion")(fused)

    outputs = layers.Dense(
        4,
        activation="softmax",
        kernel_initializer=GlorotUniform(seed=42),
        kernel_regularizer=l2(l2_reg),
        name="output"
    )(fused)

    model = Model(inputs=[inputA, inputB], outputs=outputs, name="PathwayModel_Test")

    # Anything other than 'adam' falls through to SGD.
    optimizer_cls = (
        tf.keras.optimizers.Adam if optimizer_choice == 'adam' else tf.keras.optimizers.SGD
    )
    optimizer = optimizer_cls(learning_rate=learning_rate)

    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [None]:
def evaluate_test_model(best_param, avg_type, save_csv_path=None, n_runs=10):
    """Repeatedly train on the training split and score on the test split.

    Each repetition builds a fresh model with ``create_model_test``, fits it on
    the module-level ``X_gene_train`` / ``X_dna_train`` / ``y_train`` and scores
    it on ``X_gene_test`` / ``X_dna_test`` / ``y_test``.

    Args:
        best_param: Hyper-parameter dict (same keys as the grid-search dicts).
        avg_type: Averaging mode passed to the F1 / precision / recall scorers
            (e.g. "macro"); it is also embedded in the result column names.
        save_csv_path: When given, the per-run table is written there as CSV.
        n_runs: Number of train/evaluate repetitions (defaults to 10).

    Returns:
        ``(summary, df_runs)``. ``summary`` maps '<metric>_mean' and
        '<metric>_std' to the mean and sample standard deviation over runs.
        ``df_runs`` has one row per run plus a final row whose 'run_idx' is
        'mean' and which carries the means and standard deviations.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    averaged_scorers = (
        (f'f1_{avg_type}', f1_score),
        (f'precision_{avg_type}', precision_score),
        (f'recall_{avg_type}', recall_score),
    )
    metric_names = [name for name, _ in averaged_scorers] + ['accuracy']

    per_run_results = []

    for run_i in range(1, n_runs + 1):

        model = create_model_test(
            num_ex_neuron           = best_param['num_ex_neuron'],
            layer_one               = best_param['layer_one'],
            activation_function_one = best_param['activation_function_one'],
            activation_function_two = best_param['activation_function_two'],
            activation_function_three = best_param['activation_function_three'],
            dropout_p               = best_param['dropout_p'],
            learning_rate           = best_param['learning_rate'],
            l2_reg                  = best_param['l2_reg'],
            optimizer_choice        = best_param['optimizer']
        )
        model.fit(
            [X_gene_train, X_dna_train],
            y_train,
            epochs=best_param['epoch'],
            batch_size=best_param['batch_size'],
            shuffle=False,
            verbose=0
        )
        # Predicted class = position of the largest softmax output per sample.
        y_pred = model.predict([X_gene_test, X_dna_test]).argmax(axis=1)

        run_row = {'run_idx': run_i}
        for name, scorer in averaged_scorers:
            run_row[name] = scorer(y_test, y_pred, average=avg_type)
        run_row['accuracy'] = accuracy_score(y_test, y_pred)
        per_run_results.append(run_row)

    df_runs = pd.DataFrame(per_run_results)

    summary = {}
    for name in metric_names:
        summary[f'{name}_mean'] = df_runs[name].mean()
        summary[f'{name}_std'] = df_runs[name].std()

    # Per-run rows have no standard deviation; the columns exist only so the
    # summary row appended below has somewhere to put its values.
    for name in metric_names:
        df_runs[f'{name}_std'] = pd.NA

    summary_row = {'run_idx': 'mean'}
    for name in metric_names:
        summary_row[name] = summary[f'{name}_mean']
        summary_row[f'{name}_std'] = summary[f'{name}_std']
    df_summary = pd.DataFrame([summary_row])

    df_runs = pd.concat([df_runs, df_summary], ignore_index=True)

    if save_csv_path is not None:
        df_runs.to_csv(save_csv_path, index=False)

    return summary, df_runs

# Final evaluation with the combination that had the best mean macro F1.
summary_macro, df_macro_runs = evaluate_test_model(
    best_param_macro,
    avg_type="macro")