# model 1
one-hot encoded cell_type + MPNN_CNN_BindingDB affinities

In [2]:
import polars as pl
import numpy as np
import pandas as pd

import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# import from the auxFunctions.py file
from auxFunctions import calculate_mae_and_mrrmse, mean_rowwise_rmse_loss, custom_mean_rowwise_rmse, create_model_checkpoint, plot_training_history

## Pre-process data
One-hot encode `cell_type`.
<br>Map each `sm_name` to its row of predicted affinities in `affinities_MPNN_CNN_BindingDB.csv`.

In [3]:
# Pre-computed affinities, one row per compound (first CSV column is the index)
affinities = pd.read_csv('affinities_MPNN_CNN_BindingDB.csv', index_col=0)

# Kaggle test set description: predictions for these rows are uploaded to Kaggle for scoring
id_map = pd.read_csv('./id_map.csv')

# Kaggle training data: scanned lazily with polars, then materialised as a pandas DataFrame
de_train = pl.scan_parquet('./de_train.parquet')
de_train_df = de_train.collect().to_pandas()

Training dataset provided by Kaggle.
- It is first split into train/validation/test sets for internal evaluation; the final model is then trained on the entire training dataset and used to predict the Kaggle test set described in `id_map`.

In [4]:
def extractAffinities(sm_names, affinities):
    """
    Function to extract affinities from the affinities dataframe

    Each name is looked up in the 'sm_name' column; the values of that row from
    the third column onwards are returned. When a name occurs in several rows,
    the first one is used.

    Parameters:
    - sm_names: List/Array/Series of sm_names (may contain repeats)
    - affinities: Stored affinities predicted using DeepPurpose; must have an
      'sm_name' column, with the affinity values starting at column position 2

    Returns:
    - Affinities as a numpy array with one row per entry of sm_names, in the
      same order (shape (0, n_affinity_columns) for empty input)

    Raises:
    - KeyError: if any requested name is absent from affinities['sm_name']
    """
    # Build the lookup table once instead of re-filtering the frame per name.
    first_rows = affinities.drop_duplicates(subset='sm_name', keep='first')
    affinity_values = first_rows.iloc[:, 2:].to_numpy()
    row_of_name = {name: row for row, name in enumerate(first_rows['sm_name'])}

    requested = list(sm_names)
    missing = sorted({str(name) for name in requested if name not in row_of_name})
    if missing:
        # Fail with the offending names rather than an opaque positional error.
        raise KeyError(f"sm_name(s) not found in affinities: {missing}")

    # Explicit integer dtype so that empty input still indexes cleanly.
    row_positions = np.asarray([row_of_name[name] for name in requested], dtype=np.intp)

    return affinity_values[row_positions]

In [5]:
# --- cell type: one-hot encoding ---
# OneHotEncoder expects a 2-D input, hence the single-column reshape
cell_type = de_train_df['cell_type'].to_numpy().reshape(-1, 1)
encoder = OneHotEncoder()

# fit and encode in one step; the result is a scipy.sparse CSR matrix
encoded_cell_type = encoder.fit_transform(cell_type)

# --- compound: look up the affinity vector of every sm_name ---
sm_name = de_train_df['sm_name']

# training-set affinities as a numpy.ndarray, expected shape (614, 12766)
np_encoded_affinities = extractAffinities(sm_name, affinities)

# --- full feature matrix: [one-hot cell type | affinities], expected shape (614, 12772) ---
encoded_features = np.concatenate(
    [encoded_cell_type.toarray(), np_encoded_affinities],
    axis=1,
)

print(encoded_cell_type)


# --- wanted output: every column of de_train_df that is not metadata ---
metadata_columns = ['cell_type', 'sm_name', 'sm_lincs_id', 'SMILES', 'control']
genes_lfc = de_train_df.drop(columns=metadata_columns)

  (0, 2)	1.0
  (1, 3)	1.0
  (2, 4)	1.0
  (3, 5)	1.0
  (4, 2)	1.0
  (5, 3)	1.0
  (6, 4)	1.0
  (7, 5)	1.0
  (8, 0)	1.0
  (9, 1)	1.0
  (10, 2)	1.0
  (11, 3)	1.0
  (12, 4)	1.0
  (13, 5)	1.0
  (14, 2)	1.0
  (15, 3)	1.0
  (16, 4)	1.0
  (17, 5)	1.0
  (18, 2)	1.0
  (19, 3)	1.0
  (20, 4)	1.0
  (21, 5)	1.0
  (22, 2)	1.0
  (23, 3)	1.0
  (24, 4)	1.0
  :	:
  (589, 5)	1.0
  (590, 2)	1.0
  (591, 3)	1.0
  (592, 4)	1.0
  (593, 5)	1.0
  (594, 2)	1.0
  (595, 3)	1.0
  (596, 4)	1.0
  (597, 5)	1.0
  (598, 2)	1.0
  (599, 3)	1.0
  (600, 4)	1.0
  (601, 5)	1.0
  (602, 2)	1.0
  (603, 3)	1.0
  (604, 4)	1.0
  (605, 5)	1.0
  (606, 2)	1.0
  (607, 3)	1.0
  (608, 4)	1.0
  (609, 5)	1.0
  (610, 2)	1.0
  (611, 3)	1.0
  (612, 4)	1.0
  (613, 5)	1.0


In [6]:
# Same feature pipeline for the Kaggle test set, reusing the encoder
# that was fitted on the training cell types.
kaggle_cell_type = id_map['cell_type'].to_numpy().reshape(-1, 1)
encoded_kaggle_cell_type = encoder.transform(kaggle_cell_type)

# Affinity vectors for the test-set compounds
kaggle_sm_name = id_map['sm_name']
encoded_kaggle_affinities = extractAffinities(kaggle_sm_name, affinities)

# [one-hot cell type | affinities], expected final shape (255, 12772)
encoded_kaggle_features = np.concatenate(
    [encoded_kaggle_cell_type.toarray(), encoded_kaggle_affinities],
    axis=1,
)

In [7]:
# Complete feature/label matrices, used for the final training run
# before predicting on kaggle_test
full_features = encoded_features
full_labels = genes_lfc.values

# First cut: 70% training, 30% held back (row order preserved, no shuffling)
X_train, X_temp, y_train, y_temp = train_test_split(
    encoded_features,
    genes_lfc.values,
    test_size=0.3,
    shuffle=False,
)

# Second cut: the held-back 30% is halved into 15% validation and 15% testing
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    shuffle=False,
)

print(y_train)

[[ 0.10472047 -0.07752421 -1.62559604 ...  0.03412678  0.22137655
   0.36875538]
 [ 0.91595324 -0.88438038  0.37183448 ...  0.70477983  1.09670189
  -0.86988664]
 [-0.38772076 -0.30537826  0.56777737 ...  0.41576793  0.07843919
  -0.25936541]
 ...
 [ 0.61950832 -0.03779631  0.87478376 ... -0.95027954 -0.49932213
   0.11094977]
 [-0.1314054   0.17761662 -0.11689098 ...  0.25845771 -0.29531843
  -0.3699244 ]
 [ 1.07188372 -0.35765163  0.17995645 ... -0.48290495 -0.4353545
  -0.23529439]]


## model tuning

In [8]:
from tensorflow import keras
from tensorflow.keras import layers

from keras import Sequential
from keras.layers import Activation, Dense

In [16]:
##define search space

def build_model(hp, n_outputs=18211):
    """
    Build and compile the two-hidden-layer dense network searched by KerasTuner.

    Parameters:
    - hp: keras_tuner.HyperParameters object; the hidden-layer widths are
      sampled from it under the names "units1" and "units2"
    - n_outputs: Width of the output layer. Defaults to 18211, the value
      hard-coded in this notebook (presumably the number of gene columns in
      genes_lfc - TODO confirm)

    Returns:
    - Compiled keras.Sequential model using the adam optimizer,
      mean_rowwise_rmse_loss as loss and custom_mean_rowwise_rmse as metric
    """
    model = keras.Sequential()

    # Two ReLU hidden layers whose widths are the tunable hyperparameters.
    for units_name in ("units1", "units2"):
        model.add(
            layers.Dense(
                units=hp.Int(units_name, min_value=32, max_value=18211, step=32),
                activation="relu",
            )
        )

    # Linear output layer: the targets are real-valued and include negative
    # numbers, so this is a regression. A softmax here would force every
    # prediction into (0, 1) with each row summing to 1.
    model.add(layers.Dense(n_outputs, activation="linear"))

    model.compile(
        optimizer="adam", loss=mean_rowwise_rmse_loss, metrics=[custom_mean_rowwise_rmse],
    )

    return model

In [17]:
import keras_tuner

In [18]:
# Smoke test: build the model once from a fresh HyperParameters container
default_hp = keras_tuner.HyperParameters()
build_model(default_hp)

<keras.src.engine.sequential.Sequential at 0x1c707fab550>

In [19]:
# select the tuner class to run the search

tuner = keras_tuner.RandomSearch(
    hypermodel=build_model,
    # build_model compiles with mean_rowwise_rmse_loss and the single metric
    # custom_mean_rowwise_rmse, so no "val_accuracy" is ever logged.
    # Rank trials by the validation loss instead (lower is better).
    objective=keras_tuner.Objective("val_loss", direction="min"),
    max_trials=3,
    executions_per_trial=2,
    overwrite=True,
    directory="./",
    project_name="model_1",
)

In [20]:
# Print the hyperparameters registered by build_model together with their ranges.
# NOTE(review): the recorded output lists step 64 for units2 while build_model
# uses step=32 - the output looks stale; re-run this cell to refresh it.

tuner.search_space_summary()

Search space summary
Default search space size: 2
units1 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 18211, 'step': 32, 'sampling': 'linear'}
units2 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 18211, 'step': 64, 'sampling': 'linear'}


In [14]:
# Run the hyperparameter search: every trial trains for 2 epochs on the
# training split and is evaluated on the validation split.
tuner.search(
    X_train,
    y_train,
    epochs=2,
    validation_data=(X_val, y_val),
)


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
17472             |17472             |units

Epoch 1/2

KeyboardInterrupt: 

In [None]:
# Retrieve the two best models found by the search and keep the top one.
models = tuner.get_best_models(num_models=2)
best_model = models[0]
# Build the model.
# Needed for `Sequential` without specified `input_shape`.
# The input is 2-D: (batch, n_features). The number of samples must not be
# part of the shape, so take the feature width from the training matrix.
best_model.build(input_shape=(None, X_train.shape[1]))
best_model.summary()

NameError: name 'tuner' is not defined

In [None]:
# Top five hyperparameter sets from the search; a fresh, untrained model is
# built from the best one.
best_hps = tuner.get_best_hyperparameters(5)
model_tuned = build_model(best_hps[0])

# Stack the training and validation splits row-wise and fit on both together.
x_all, y_all = (
    np.concatenate(split_pair)
    for split_pair in ((X_train, X_val), (y_train, y_val))
)
model_tuned.fit(x=x_all, y=y_all, epochs=50)

[[ 0.05505954  0.3664068   0.2408866  ... -0.07028358  0.06527169
  -0.2303034 ]
 [ 0.05505954  0.36640686  0.2408866  ... -0.07028358  0.06527174
  -0.23030336]
 [ 0.05505954  0.36640662  0.24088672 ... -0.07028371  0.06527165
  -0.23030327]
 ...
 [ 0.05505954  0.36640686  0.2408866  ... -0.07028355  0.06527174
  -0.2303034 ]
 [ 0.05505954  0.36640686  0.2408866  ... -0.07028355  0.06527173
  -0.2303034 ]
 [ 0.05505954  0.36640686  0.2408866  ... -0.07028355  0.06527174
  -0.2303034 ]]
18211
