# Unified Deep-CBN Training
This notebook provides a unified training function capable of handling both classification and regression tasks using the Deep-CBN architecture.

In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, metrics
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [8]:
# Character dictionary used to encode SMILES strings
# Dictionary for converting SMILES characters to numbers
smiles_dict = {
    "#": 29, "%": 30, ")": 31, "(": 1, "+": 32, "-": 33, "/": 34, ".": 2,
    "1": 35, "0": 3, "3": 36, "2": 4, "5": 37, "4": 5, "7": 38, "6": 6,
    "9": 39, "8": 7, "=": 40, "A": 41, "@": 8, "C": 42, "B": 9, "E": 43,
    "D": 10, "G": 44, "F": 11, "I": 45, "H": 12, "K": 46, "M": 47, "L": 13,
    "O": 48, "N": 14, "P": 15, "S": 49, "R": 16, "U": 50, "T": 17, "W": 51,
    "V": 18, "Y": 52, "[": 53, "Z": 19, "]": 54, "\\": 20, "a": 55, "c": 56,
    "b": 21, "e": 57, "d": 22, "g": 58, "f": 23, "i": 59, "h": 24, "m": 60,
    "l": 25, "o": 61, "n": 26, "s": 62, "r": 27, "u": 63, "t": 28, "y": 64,
    " ": 65, ":": 66, ",": 67, "p": 68, "j": 69, "*": 70
}

MAX_SMI_LEN = 100
NUM_CHARS = 71

def label_smiles(line):
    X = np.zeros(MAX_SMI_LEN, dtype=int)
    for i, ch in enumerate(line[:MAX_SMI_LEN]):
        if ch in smiles_dict:
            X[i] = smiles_dict[ch]
    return X


In [9]:
def build_models(mode):
    inp = layers.Input(shape=(MAX_SMI_LEN, NUM_CHARS), name='XDinput')
    x = layers.Conv1D(64, 2, activation='relu')(inp)
    x = layers.Conv1D(64, 4, activation='relu')(x)
    x = layers.Conv1D(128, 4, activation='relu')(x)
    feature_output = x
    feature_model = models.Model(inp, feature_output, name='model_feature')

    pred_inp = layers.Input(shape=(feature_output.shape[1], feature_output.shape[2]))
    y = layers.GlobalAveragePooling1D()(pred_inp)
    y = layers.Dense(512, activation='relu')(y)
    y = layers.BatchNormalization()(y)
    y = layers.Dropout(0.1)(y)
    y = layers.Dense(256, activation='relu')(y)
    y = layers.BatchNormalization()(y)
    y = layers.Dropout(0.1)(y)
    y = layers.Dense(64, activation='relu')(y)
    if mode == 'classification':
        outputs = layers.Dense(2, activation='softmax')(y)
    else:
        outputs = layers.Dense(1)(y)
    pred_model = models.Model(pred_inp, outputs, name='model_pred')

    inter_out = pred_model(feature_output)
    interaction_model = models.Model(inp, inter_out, name='interactionModel')
    return feature_model, pred_model, interaction_model


In [None]:
from sklearn.metrics import roc_curve, auc

def train_deep_cbn(df, target_col, mode='classification', epochs=10, smiles_col='smiles'):
    smiles = df[smiles_col].astype(str)
    y = df[target_col]
    X = np.array([label_smiles(s) for s in smiles])
    X = to_categorical(X, num_classes=NUM_CHARS)

    if mode == 'classification':
        y_data = to_categorical(y.values, num_classes=2)
    else:
        y_data = y.values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_data, test_size=0.2, random_state=42,
        stratify=y if mode=='classification' else None)

    feat_model, pred_model, inter_model = build_models(mode)

    if mode == 'classification':
        loss = 'categorical_crossentropy'
        METRICS = [
            metrics.CategoricalAccuracy(name='accuracy'),
            metrics.Precision(name='precision'),
            metrics.Recall(name='recall'),
            metrics.AUC(name='auc'),
            metrics.F1Score(average='macro', name='f1_score')
        ]
    else:
        def r2_score(y_true, y_pred):
            ss_res = tf.reduce_sum(tf.square(y_true - y_pred))
            ss_tot = tf.reduce_sum(tf.square(y_true - tf.reduce_mean(y_true)))
            return 1 - ss_res / (ss_tot + tf.keras.backend.epsilon())
        loss = 'mse'
        METRICS = [metrics.MeanAbsoluteError(name='mae'), metrics.MeanSquaredError(name='mse'), r2_score]

    es = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=30, restore_best_weights=True)

    inter_model.compile(optimizer=optimizers.Adam(1e-3), loss=loss, metrics=METRICS)
    inter_model.fit(X_train, y_train, epochs=epochs, batch_size=256, callbacks=[es], verbose=0)
    phase1_test = inter_model.evaluate(X_test, y_test, verbose=0)

    feature_train = feat_model.predict(X_train)
    feature_test = feat_model.predict(X_test)

    model_phaz2 = models.clone_model(pred_model)
    model_phaz2.compile(optimizer=optimizers.Adam(1e-3), loss=loss, metrics=METRICS)
    model_phaz2.fit(feature_train, y_train, epochs=epochs, batch_size=256, callbacks=[es], verbose=0)

    inputs = feat_model.input
    outputs = pred_model(feat_model.output)
    model_phaz3 = models.Model(inputs, outputs, name='model_phase3')
    model_phaz3.compile(optimizer=optimizers.Adam(1e-4), loss=loss, metrics=METRICS)
    model_phaz3.fit(X_train, y_train, epochs=epochs, batch_size=256, callbacks=[es], verbose=0)
    train_eval = model_phaz3.evaluate(X_train, y_train, verbose=0)
    test_eval = model_phaz3.evaluate(X_test, y_test, verbose=0)

    results = {
        'phase1_test': phase1_test,
        'phase3_train': train_eval,
        'phase3_test': test_eval
    }
    return model_phaz3, results


In [None]:
# Example usage on multiple targets
data = pd.read_csv('../Data/tox21.csv')
targets = ['NR-AR', 'NR-ER', 'NR-PPAR-gamma']
summary = {}
for tgt in targets:
    df = data[[tgt, 'smiles']].dropna()
    model, metrics_dict = train_deep_cbn(df, tgt, mode='classification', epochs=5)
    summary[tgt] = metrics_dict


In [None]:
# Summarize performance metrics
summary_rows = []
for target, met in summary.items():
    row = {'target': target}
    row.update({
        'phase1_loss': met['phase1_test'][0],
        'phase3_test_metric': met['phase3_test'][4] if len(met['phase3_test'])>4 else met['phase3_test'][2]
    })
    summary_rows.append(row)
summary_df = pd.DataFrame(summary_rows)
summary_df

Unnamed: 0,target,phase1_loss,phase3_test_metric
0,NR-AR,0.308201,0.977601
1,NR-ER,0.539536,0.903288
2,NR-PPAR-gamma,0.344456,0.981145


In [18]:
# Summarize performance metrics and plot ROC-AUC curves
summary_rows = []

plt.figure(figsize=(6, 4))
for target, met in summary.items():
    row = {
        'target': target,
        'phase1_loss': met['phase1_test'][0],
        'phase3_test_metric': met['phase3_test'][4] if len(met['phase3_test']) > 4 else met['phase3_test'][2]
    }
    summary_rows.append(row)

    roc = met.get('roc')
    if roc:
        plt.plot(roc['fpr'], roc['tpr'], label=f"{target} (AUC = {roc['auc']:.2f})")

plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves by Target')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

summary_df = pd.DataFrame(summary_rows)
summary_df


NameError: name 'plt' is not defined