In [7]:
import polars as pl
import numpy as np
import pandas as pd

import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# import from the auxFunctions.py file
from auxFunctions import calculate_mae_and_mrrmse, mean_rowwise_rmse_loss, custom_mean_rowwise_rmse, create_model_checkpoint, plot_training_history

## Load the training data

In [8]:
# Lazily scan the training parquet file with polars, then materialise it and
# convert it to a pandas DataFrame; `de_train_df` is the frame testModel reads.
de_train = pl.scan_parquet('./kaggledata/de_train.parquet')
de_train_df = de_train.collect().to_pandas()

## Define the helper functions that compute the test score for each affinity model

In [14]:
def extractAffinities(sm_names, affinities):
    """
    Function to extract affinities from the affinities dataframe

    For every entry of ``sm_names`` the first row of ``affinities`` whose
    'sm_name' column equals that entry is looked up, and the values of all
    columns from position 2 onwards are taken as that compound's affinities.

    Parameters:
    - sm_names: List/Array of sm_names (may contain repeats; order is kept)
    - affinities: Stored affinities predicted using DeepPurpose. Must have an
      'sm_name' column; columns from position 2 onwards are the values returned.

    Returns:
    - Affinities as a numpy array of shape (len(sm_names), n_affinity_columns)

    Raises:
    - KeyError: if any requested sm_name has no row in ``affinities``
    """
    # Pull the affinity columns out once instead of re-slicing per name.
    affinity_values = affinities.iloc[:, 2:].to_numpy()

    # Map each sm_name to the position of its FIRST row, so duplicated names
    # resolve to their first occurrence.
    first_row_of = {}
    for position, known_name in enumerate(affinities['sm_name'].to_numpy()):
        first_row_of.setdefault(known_name, position)

    row_positions = []
    missing_names = []
    for name in sm_names:
        position = first_row_of.get(name)
        if position is None:
            missing_names.append(name)
        else:
            row_positions.append(position)

    if missing_names:
        # Fail with the offending names rather than an opaque
        # "index 0 is out of bounds" from indexing an empty selection.
        unique_missing = list(dict.fromkeys(missing_names))
        raise KeyError(f"No affinities found for sm_name(s): {unique_missing}")

    # Fancy indexing returns a fresh 2-D array; an empty request yields shape
    # (0, n_affinity_columns) so it can still be stacked with other features.
    np_encoded_affinities = affinity_values[np.asarray(row_positions, dtype=np.intp)]

    return np_encoded_affinities

In [17]:
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.models import Sequential

# Condensed version of the full training/evaluation code used in the other model notebooks
def testModel(affinityFileName):
    """
    Train and evaluate the dense network using one stored affinity file.

    The feature matrix is the one-hot encoded cell type of every row of the
    notebook-level ``de_train_df`` concatenated with the affinities that
    ``extractAffinities`` returns for that row's sm_name. Rows are split
    70/15/15 into train/validation/test sets in their existing order (no
    shuffling), the network is fitted for 5 epochs, and the test-set scores
    are reported through ``calculate_mae_and_mrrmse``.

    Parameters:
    - affinityFileName: Name of a CSV file inside the ./affinities folder

    Returns:
    - None; progress and scores are printed
    """
    print(f'Testing: {affinityFileName}')
    filename = f'./affinities/{affinityFileName}'
    affinities = pd.read_csv(filename, index_col=0)

    # One-hot encode the cell type of every training row.
    cell_type = de_train_df['cell_type'].to_numpy().reshape(-1, 1)
    encoder = OneHotEncoder()
    encoded_cell_type = encoder.fit_transform(cell_type)

    sm_name = de_train_df['sm_name']
    np_encoded_affinities = extractAffinities(sm_name, affinities)

    # Features: [one-hot cell type | affinities]; targets: every remaining column.
    encoded_features = np.hstack((encoded_cell_type.toarray(), np_encoded_affinities))
    genes_lfc = de_train_df.drop(columns=['cell_type', 'sm_name', 'sm_lincs_id', 'SMILES', 'control'])

    # Split the data into 70% training, 15% validation, and 15% testing
    X_train, X_temp, y_train, y_temp = train_test_split(
        encoded_features, genes_lfc.values, test_size=0.3, shuffle=False)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, shuffle=False)

    # This function is called once per affinity file; release the Keras global
    # state left behind by the previous call so memory does not keep growing.
    tf.keras.backend.clear_session()
    tf.random.set_seed(42)

    model = Sequential([
        Dense(3400, activation="tanh"),
        Dense(3000, activation="tanh"),
        Dense(2100, activation="tanh"),
        Dense(200, activation="tanh"),
        Dense(200, activation="tanh"),
        Dropout(0.25),
        Dense(18211, activation="linear")
    ])

    model.compile(loss=mean_rowwise_rmse_loss,
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=[custom_mean_rowwise_rmse])

    # train for 5 epochs only since model converges rather quickly
    # NOTE(review): every affinity file reuses the checkpoint name "model_1",
    # so presumably each run overwrites the previous run's checkpoint - confirm
    # against create_model_checkpoint. The checkpoint is not reloaded here; the
    # in-memory model after the last epoch is what gets evaluated below.
    model.fit(X_train, y_train,
              epochs=5,
              validation_data=(X_val, y_val),
              batch_size=32,
              callbacks=[create_model_checkpoint("model_1", monitor="val_custom_mean_rowwise_rmse")])

    # prints MAE & MRRMSE on the held-out test split
    calculate_mae_and_mrrmse(model=model, data=X_test, y_true=y_test)

In [15]:
# viewing what affinities are available
import os

# Specify the path to the subfolder
subfolder_path = './affinities'

# Get a list of all files in the subfolder.
# os.listdir returns entries in arbitrary order, so sort them
# (case-insensitively, with the exact name as tie-breaker) to make the order
# in which the affinity files are listed and later evaluated reproducible.
files = sorted(
    (f for f in os.listdir(subfolder_path) if os.path.isfile(os.path.join(subfolder_path, f))),
    key=lambda f: (f.lower(), f),
)

# Print the list of file names
for file in files:
    print(file)

affinities_CNN_CNN_BindingDB.csv
affinities_Morgan_CNN_BindingDB.csv
affinities_MPNN_CNN_BindingDB.csv
affinities_MPNN_CNN_DAVIS.csv
affinities_Transformer_CNN_BindingDB.csv


In [18]:
# Train and evaluate one model per affinity file listed in `files`;
# each call prints its own progress and test scores.
for file in files:
    testModel(file)

Testing: affinities_CNN_CNN_BindingDB.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Mean Absolute Error (MAE): 0.8210691997408935
Mean Rowwise Root Mean Squared Error (MRRMSE): 1.2875516966558984
Testing: affinities_Morgan_CNN_BindingDB.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Mean Absolute Error (MAE): 0.8207672227442011
Mean Rowwise Root Mean Squared Error (MRRMSE): 1.2873470674043066
Testing: affinities_MPNN_CNN_BindingDB.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Mean Absolute Error (MAE): 0.8209535103557689
Mean Rowwise Root Mean Squared Error (MRRMSE): 1.2875114412056248
Testing: affinities_MPNN_CNN_DAVIS.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Mean Absolute Error (MAE): 0.8211661095399948
Mean Rowwise Root Mean Squared Error (MRRMSE): 1.2875206825597805
Testing: affinities_Transformer_CNN_BindingDB.csv
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Mean Absolute Error (MAE): 0.820316523027801
Mean Rowwise Root Mean Squared Error (