In [1]:
! cd ~
! pwd
! mkdir -p scp/output
! ls -l ../kaggle/input/open-problems-single-cell-perturbations

/home/jovyan/workspace
total 5354032
-rw-rw-r-- 1 jovyan jovyan  100924456 Nov  9 00:05 adata_excluded_ids.csv
-rw-rw-r-- 1 jovyan jovyan   37227289 Sep 13 14:53 adata_obs_meta.csv
-rw-rw-r-- 1 jovyan jovyan 1761529901 Sep 13 14:53 adata_train.parquet
-rw-rw-r-- 1 jovyan jovyan  450105344 Oct 27 18:38 adata_train.parquet.F249d036
-rw-rw-r-- 1 jovyan jovyan   92135550 Sep 13 14:53 de_train.parquet
-rw-rw-r-- 1 jovyan jovyan       6723 Sep 13 14:53 id_map.csv
-rw-rw-r-- 1 jovyan jovyan     943757 Sep 13 14:53 multiome_obs_meta.csv
-rw-rw-r-- 1 jovyan jovyan 2555628667 Sep 13 14:53 multiome_train.parquet
-rw-rw-r-- 1 jovyan jovyan  461373440 Oct 27 18:38 multiome_train.parquet.CD29e8F0
-rw-rw-r-- 1 jovyan jovyan   13075170 Sep 13 14:53 multiome_var_meta.csv
-rw-rw-r-- 1 jovyan jovyan   18711844 Sep 13 14:53 sample_submission.csv


In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

# Imports
* * *

In [3]:
import numpy as np
import pandas as pd
import os
import datetime
import json
from collections import defaultdict, OrderedDict

import tensorflow as tf
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import tanh
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras import mixed_precision
from tensorflow.python.client import device_lib

# set the dtype policy
# mixed_precision.set_global_policy('mixed_float16')

In [4]:
# A single call to keras.utils.set_random_seed seeds all three generators:
#   - the `python` random module
#   - `numpy`
#   - `tensorflow`
RANDOM_SEED = 17
tf.keras.utils.set_random_seed(RANDOM_SEED)

# Make TensorFlow ops as deterministic as possible. This costs performance,
# which is why it is not on by default.
# `enable_op_determinism()` was introduced in TensorFlow 2.9.
tf.config.experimental.enable_op_determinism()

In [5]:
def get_available_devices():
    """Return the names of the local devices reported by TensorFlow.

    Returns
    -------
    list of str
        The `name` of every entry returned by `device_lib.list_local_devices()`.
    """
    return [device.name for device in device_lib.list_local_devices()]


# check cuda
# This tuple used to be a bare expression in the middle of the cell, so its
# value was computed and then silently discarded; print it so that the check
# actually shows up in the cell output.
print(tf.test.is_built_with_cuda(), tf.config.list_physical_devices('GPU'))
print(get_available_devices())

# Your output is probably something like ['/device:CPU:0']
# It should be ['/device:CPU:0', '/device:GPU:0']

['/device:CPU:0', '/device:GPU:0']


2023-11-28 16:58:46.346918: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-28 16:58:46.351200: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-28 16:58:46.351467: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-28 16:58:46.352276: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

# Read test data
* * *

In [6]:
# Constants
# NOTE(review): OUTPUT_SCALE is not referenced by any other visible cell;
# confirm whether the saved model needs it at load time before removing it.
OUTPUT_SCALE = 180

# model name: date in the format YYYYMMDD_HHMM
formatted_datetime = '20231128_1357'

# Input data directory, followed by the project's own working directories.
data_path = '../kaggle/input/open-problems-single-cell-perturbations'
intermediate_path, model_path, output_path = (
    './scp/intermediate',
    './scp/model',
    './scp/output',
)

# Read the mapping from submission file ids to cell type and compound names
* * *

In [7]:
# Load id_map.csv, using its first column as the index, and preview the
# `cell_type` / `sm_name` pair that each row id maps to.
id_map_df = pd.read_csv(f"{data_path}/id_map.csv", index_col=[0])
id_map_df.head()

Unnamed: 0_level_0,cell_type,sm_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,B cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...
1,B cells,ABT-199 (GDC-0199)
2,B cells,ABT737
3,B cells,AMD-070 (hydrochloride)
4,B cells,AT 7867


# Read submission file example
* * *
It contains the submission ids and the required columns, so it can be used as a template to fill in with predictions.

In [8]:
# Load the sample submission (first CSV column as the index), then report its
# shape and preview the first rows. Its columns are the gene names that a
# later cell looks up in `name_to_id_maps['gene_name']`.
submission_df = pd.read_csv(f"{data_path}/sample_submission.csv", index_col=[0])
print(submission_df.shape)
submission_df.head()

(255, 18211)


Unnamed: 0_level_0,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,AAK1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Load the JSON document holding the name -> id lookup tables used for the
# embedding inputs, then list the known cell type names.
with open(f'{intermediate_path}/name_to_id_maps_for_embeddings.txt', 'r') as file:
    name_to_id_maps = json.load(file)
list(name_to_id_maps['cell_type'])

['B cells',
 'Myeloid cells',
 'NK cells',
 'T cells CD4+',
 'T cells CD8+',
 'T regulatory cells']

In [10]:
# Preview the first ten (gene name, id) pairs of the gene lookup table.
list(name_to_id_maps['gene_name'].items())[:10]

[('A1BG', 0),
 ('A1BG-AS1', 1),
 ('A2M', 2),
 ('A2M-AS1', 3),
 ('A2MP1', 4),
 ('A4GALT', 5),
 ('AAAS', 6),
 ('AACS', 7),
 ('AAGAB', 8),
 ('AAK1', 9)]

In [11]:
# Preview the first ten (small molecule name, id) pairs of the compound lookup table.
list(name_to_id_maps['sm_name'].items())[:10]

[('5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-6-yl)pyrimidin-2-amine', 0),
 ('ABT-199 (GDC-0199)', 1),
 ('ABT737', 2),
 ('AMD-070 (hydrochloride)', 3),
 ('AT 7867', 4),
 ('AT13387', 5),
 ('AVL-292', 6),
 ('AZ628', 7),
 ('AZD-8330', 8),
 ('AZD3514', 9)]

# Read features
* * *

In [12]:
# Load the small molecule fingerprints from the intermediate directory.
fp_path = f"{intermediate_path}/fingerprints.csv"
fp_df = pd.read_csv(fp_path)
print(f'Read {fp_df.shape[0]} small molecule fingerprints from path:\n{fp_path}')
# Each fingerprint is stored as the text of a bracketed, comma-separated list
# of integers; strip the brackets and parse it into a Python list of ints.
fp_df['fp'] = fp_df['fingerprint'].apply(lambda x: list(map(int, x[1:-1].split(','))))
fp_df = fp_df.drop(columns=['fingerprint'])
# Build the lookup from the named columns rather than from column positions,
# so an extra or reordered CSV column cannot silently corrupt the mapping.
name_to_fp = dict(zip(fp_df['sm_name'], fp_df['fp']))
print(name_to_fp['Clotrimazole'])
fp_df

Read 146 small molecule fingerprints from path:
./scp/intermediate/fingerprints.csv
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0]


Unnamed: 0,sm_name,fp
0,Clotrimazole,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Mometasone Furoate,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Idelalisib,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Vandetanib,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Bosutinib,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
141,CGM-097,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
142,TGX 221,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
143,Azacitidine,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
144,Atorvastatin,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [13]:
# Load the per-molecule descriptor table (columns shown in the output:
# sm_name, MW, HBA, HBD, LogP, pass_ro5) and display it.
ro5_path = f"{intermediate_path}/ro5_descriptors.csv"
ro5_df = pd.read_csv(ro5_path)
print(f'Read {ro5_df.shape[0]} small molecule descriptors from path:\n{ro5_path}')
ro5_df

Read 146 small molecule descriptors from path:
./scp/intermediate/ro5_descriptors.csv


Unnamed: 0,sm_name,MW,HBA,HBD,LogP,pass_ro5
0,Clotrimazole,344.845,2,0,5.37670,True
1,Mometasone Furoate,521.437,6,1,4.86920,True
2,Idelalisib,415.432,8,2,3.75430,True
3,Vandetanib,475.362,6,1,5.00420,True
4,Bosutinib,530.456,8,1,5.19038,False
...,...,...,...,...,...,...
141,CGM-097,659.271,8,0,6.58350,False
142,TGX 221,364.449,6,1,3.01262,True
143,Azacitidine,244.207,9,5,-3.16800,True
144,Atorvastatin,558.650,7,4,6.31360,False


In [14]:
# Encode the boolean `pass_ro5` flag as 0/1 and round LogP to four decimals.
ro5_df['pass_ro5'] = ro5_df['pass_ro5'].astype('int')
ro5_df['LogP'] = ro5_df['LogP'].round(4)
# Select the columns by name and in a fixed order instead of relying on the
# CSV's column positions: the prediction cell slices these five descriptors
# positionally, so their order must not depend on the file layout.
# `sm_name` stays in the selection so that each row is [name, *descriptors].
ro5_feature_columns = ['MW', 'HBA', 'HBD', 'LogP', 'pass_ro5']
name_to_ro5 = {row[0]: row[1:] for row in ro5_df[['sm_name', *ro5_feature_columns]].values}
name_to_ro5['Clotrimazole']

array([344.845, 2, 0, 5.3767, 1], dtype=object)

In [15]:
# Show the id of one cell type next to the complete cell type -> id table.
name_to_id_maps['cell_type']['NK cells'], name_to_id_maps['cell_type']

(2,
 {'B cells': 0,
  'Myeloid cells': 1,
  'NK cells': 2,
  'T cells CD4+': 3,
  'T cells CD8+': 4,
  'T regulatory cells': 5})

# Load model
* * *

In [16]:
# Read the model: the file name is the timestamp in `formatted_datetime`
# plus the `.keras` extension, inside `model_path`. The layer summary is
# printed after loading.
file_path = f"{model_path}/{formatted_datetime}.keras"
print(f'Loading model from {file_path}')
regressor = tf.keras.models.load_model(file_path)
regressor.summary()

Loading model from ./scp/model/20231128_1357.keras


2023-11-28 16:58:48.655598: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-28 16:58:48.655892: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-28 16:58:48.656078: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-28 16:58:48.656315: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-28 16:58:48.656503: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from S

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_7 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_8 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 1, 32)        192         ['input_6[0][0]']                
                                                                                            

# Create the feature matrix for prediction
* * *
The ids (combinations of cell type and drug) in the submission file are disjoint from those in the training file.

In [17]:
# Example of encoding a name as a numeric id for input to an embedding layer
name_to_id_maps['cell_type']['Myeloid cells']

1

In [18]:
def id_to_names(id):
    """Return the `id_map_df` values for one submission row id as a list.

    Parameters
    ----------
    id
        Index label to look up in `id_map_df`.

    Returns
    -------
    list
        The values of the first `id_map_df` row labelled `id`; for the id map
        loaded above this is `[cell_type, sm_name]`.

    Raises
    ------
    KeyError
        If `id` is not present in the index of `id_map_df`.
    """
    # The parameter keeps the name `id` (shadowing the builtin only inside this
    # function) so that any existing keyword call keeps working.
    return id_map_df.loc[[id]].values[0].tolist()


id_to_names(2)

['B cells', 'ABT737']

In [19]:
# Build one feature row per (submission id, gene) pair, stored as the keys of
# an ordered dict. Each key is laid out as
#   (cell_type_id, sm_id, gene_id, *ro5 descriptors, *fingerprint bits)
# and the value is always None because no target exists for these rows.
feature_matrix_predict = OrderedDict()
submission_gene_ids = [name_to_id_maps['gene_name'][gene_name] for gene_name in submission_df.columns]
# The loop variable is `submission_id` rather than `id`: a notebook-level `id`
# would overwrite the builtin `id()` for every cell that runs afterwards.
for submission_id in submission_df.index.tolist():
    cell_type, sm_name = id_to_names(submission_id)
    ro5 = name_to_ro5[sm_name]
    fp = name_to_fp[sm_name]
    cell_type_id = name_to_id_maps['cell_type'][cell_type]
    sm_id = name_to_id_maps['sm_name'][sm_name]
    # Everything except the gene id is constant for one submission row, so
    # assemble the trailing part of the key once instead of once per gene.
    molecule_features = (*ro5, *fp)
    for gene_id in submission_gene_ids:
        feature_matrix_predict[(cell_type_id, sm_id, gene_id, *molecule_features)] = None

# Identical keys overwrite each other in a dict, which would silently drop
# rows and misalign the predictions with the submission template.
assert len(feature_matrix_predict) == len(submission_df.index) * len(submission_gene_ids), \
    'duplicate feature rows collapsed: check id_map.csv and the submission columns for duplicates'


# Predict the submission values
* * *

In [20]:
# Each feature row has 175 columns: cell_type_id, sm_id, gene_id, then the
# 5 ro5 descriptors (columns 3-7) and the 167 fingerprint values (columns 8+).
# NOTE(review): the int16 cast truncates the fractional part of the MW and
# LogP descriptors; confirm that training applied the same cast.
X_pred = np.array(list(feature_matrix_predict.keys()), dtype='int16').reshape(-1, 175)
model_inputs = [
    X_pred[:, 0],
    X_pred[:, 1],
    X_pred[:, 2],
    X_pred[:, 3:8].reshape((-1, 1, 5)),
    X_pred[:, 8:].reshape((-1, 1, 167)),
]
Y_preds = regressor.predict(model_inputs, batch_size=2024)
print(f'Y_preds.shape: {Y_preds.shape}')

  52/2295 [..............................] - ETA: 6s

2023-11-28 16:59:45.359688: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Y_preds.shape: (4643805, 1, 1)


In [21]:
# Fold the flat predictions back into the submission layout. This assumes the
# feature rows were generated submission id first, gene second, so that a
# row-major reshape puts every value in its own (id, gene) cell.
num_rows, num_cols = submission_df.shape
Y_preds = np.reshape(Y_preds, (num_rows, num_cols))
Y_preds.shape, Y_preds.min(), Y_preds.mean(), Y_preds.max()

((255, 18211), -19.212826, 0.3146684, 34.624146)

In [22]:
# Overwrite every cell of the submission template with the predictions. The
# assignment is positional, so Y_preds must already have the template's shape.
submission_df.iloc[:,:] = Y_preds
submission_df.head(10)

Unnamed: 0_level_0,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,AAK1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.343996,0.289857,0.207838,0.14856,0.495442,0.580691,0.148939,0.300765,0.180562,0.111279,...,0.13806,0.255974,0.199184,0.283842,0.309461,0.297193,0.170403,0.175018,0.065011,0.085825
1,0.229756,0.080588,0.177181,0.103611,0.50045,0.594643,0.003145,0.11147,0.01123,-0.007255,...,-0.006317,0.038983,0.027029,0.187187,0.247794,0.162858,0.013298,0.007195,-0.056679,-0.023969
2,0.448622,0.347054,0.451721,0.430418,0.862622,1.962351,0.17629,0.385409,0.108943,-0.013689,...,0.083158,0.316118,0.314925,0.447072,0.454645,0.43043,0.274384,0.146962,-0.007023,0.080314
3,0.010505,0.009791,-0.079869,-0.089935,-0.0105,0.021465,-0.035395,0.005571,0.012971,0.047327,...,-0.005657,-0.017086,-0.04831,0.001644,0.017421,-0.001379,-0.055417,-0.018314,-0.057371,-0.026382
4,0.193698,0.122006,0.005983,-0.051883,0.330829,0.438747,0.02267,0.129096,0.026903,0.049578,...,0.02267,0.083061,0.037999,0.145144,0.174191,0.142088,-0.000131,0.025737,-0.024345,0.004656
5,0.573608,0.351667,0.491856,0.380946,1.118487,1.827926,0.225476,0.34614,0.235383,0.380681,...,0.176955,0.306396,0.273087,0.442626,0.546624,0.405468,0.265066,0.223871,-0.096436,0.09104
6,0.114432,0.069517,0.003762,0.001229,-0.121219,-0.263051,-0.012232,0.08446,-0.040639,-0.042312,...,-0.039754,0.027682,0.027546,0.1147,0.118968,0.108083,0.013239,-0.023916,-0.036395,-0.038978
7,0.07755,0.062726,0.096122,0.105624,0.322504,0.684983,0.038805,0.065989,0.064125,0.062382,...,0.057745,0.073126,0.052628,0.047274,0.066804,0.078089,0.062088,0.046653,-0.018878,0.043738
8,0.281074,0.165994,0.177647,0.136192,0.370722,0.755174,0.081477,0.185724,0.106397,0.09322,...,0.078725,0.123802,0.106322,0.226402,0.25514,0.218224,0.07891,0.088432,0.023745,0.04332
9,0.246927,0.133227,0.058036,-0.014667,0.473235,0.676207,-0.167165,0.123157,-0.094318,-0.069721,...,-0.078374,0.063493,-0.013374,0.173212,0.22336,0.148997,-0.066631,-0.14787,-0.129964,-0.090347


In [23]:
# Build the destination once so the file that is written and the path that is
# reported cannot drift apart.
submission_path = f"{output_path}/submission_{formatted_datetime}.csv"
# Create the output directory from Python as well, so this cell does not
# depend on a shell `mkdir` having been run earlier in the session.
os.makedirs(output_path, exist_ok=True)
# Values are rounded to two decimals before writing.
submission_df.round(2).to_csv(submission_path)
print(f'Submission file saved to {submission_path}')

Submission file saved to in path ./scp/output/submission_20231128_1357.csv
