<a href="https://colab.research.google.com/github/alessandronascimento/pyLiBELa/blob/main/Colabs/TF/Preparing_PDBBind_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Connecting to Google Drive

# Mount Google Drive at /gdrive so the PDBbind files stored there can be read.
from google.colab import drive
drive.mount('/gdrive')
# The following cells use relative paths ('../index/...', '../targets/...',
# 'targets_ok.dat'), so the working directory must be this scripts folder.
%cd /gdrive/My Drive/Projects_Data/PDBbind_v2020/scripts

Mounted at /gdrive
/gdrive/My Drive/Projects_Data/PDBbind_v2020/scripts


In [None]:
#@title Reading PDBBind v2020 data
# Parallel lists: entry i of each list describes the same complex.
binding_targets = []             # pdb id
binding_years = []               # year of the structure
binding_resolution = []          # resolution (0.00 is stored for NMR entries)
binding_score = []               # -logKd/Ki

# Only the first four whitespace-separated columns of each record are used:
# pdb id, resolution (or the literal 'NMR'), year, score.
# The `with` block closes the file even if a malformed record raises.
with open('../index/INDEX_general_PL_data.2020', 'r') as binding_file:
  for line in binding_file:
    fields = line.split()
    # Skip blank lines (which would otherwise raise IndexError) and the
    # '#'-prefixed header/comment lines.
    if not fields or fields[0].startswith('#'):
      continue
    binding_targets.append(fields[0])
    # NMR structures carry no numeric resolution; keep the 0.00 placeholder.
    binding_resolution.append(0.00 if fields[1] == 'NMR' else float(fields[1]))
    binding_years.append(int(fields[2]))
    binding_score.append(float(fields[3]))

print('Binding data found for %d targets' % len(binding_targets))

Binding data found for 19443 targets


In [None]:
#@title Getting list of targets with grids
# 'targets_ok.dat' is read one record per line; only the first
# whitespace-separated token of each line is kept as the target id.
# Blank lines are skipped (they used to raise IndexError), and the `with`
# block guarantees the file is closed.
with open('targets_ok.dat', 'r') as target_ok_file:
  targets_ok = [line.split()[0] for line in target_ok_file if line.strip()]

In [None]:
#@title Auxiliary functions

import tensorflow as tf

def process_file(file_path):
  """Load a raw binary grid file into a float64 tensor.

  The file bytes are decoded as float64 values and reshaped to
  (-1, 60, 60, 3); the leading dimension is inferred from the file size.

  Args:
    file_path: path of the grid file to read.

  Returns:
    A tf.float64 tensor of shape (-1, 60, 60, 3).
  """
  raw_bytes = tf.io.read_file(file_path)
  values = tf.io.decode_raw(raw_bytes, tf.float64)
  return tf.reshape(values, (-1, 60, 60, 3))


def data_generator(ntargets=20, ndecoys=10):
  """Yield (combined_grid, score) samples for the first `ntargets` targets.

  Two passes are made over the selected targets, in this order:
    1. receptor grid + ligand grid, labelled with the target's binding score;
    2. receptor grid + each decoy grid (McGrid_dec_1 .. McGrid_dec_<ndecoys>),
       labelled 0.00.
  Receptor and ligand/decoy grids are concatenated along the last axis.

  Args:
    ntargets: number of entries of `targets_ok` to use (default 20, the
      previously hard-coded value). Pass None to use every target. Lists
      shorter than `ntargets` are used in full instead of raising IndexError.
    ndecoys: number of decoy grids read per target (default 10).

  Yields:
    (combined_grid, observable) tuples.

  Raises:
    ValueError: if a selected target has no entry in `binding_targets`.
  """
  # Slicing (rather than indexing 0..ntargets-1) tolerates short target lists.
  selected_targets = targets_ok[:ntargets]

  def grid_dir(target):
    # Directory holding the precomputed grids of one target.
    return '../targets/{}/grid_30_0.5_SF0'.format(target)

  # Pass 1: ligand samples, labelled with the binding score.
  for target in selected_targets:
    base_dir = grid_dir(target)
    rec_grid = process_file(base_dir + '/McGrid_rec.grid')
    lig_grid = process_file(base_dir + '/McGrid_lig.grid')

    # binding_targets and binding_score are parallel lists.
    observable = binding_score[binding_targets.index(target)]
    yield tf.concat([rec_grid, lig_grid], axis=-1), observable

  # Pass 2: decoy samples, all labelled 0.00.
  for target in selected_targets:
    base_dir = grid_dir(target)
    rec_grid = process_file(base_dir + '/McGrid_rec.grid')

    for j in range(1, ndecoys + 1):
      dec_grid = process_file(base_dir + '/McGrid_dec_{}.grid'.format(j))
      yield tf.concat([rec_grid, dec_grid], axis=-1), 0.00


In [None]:
#@title Reading dataset

import numpy as np

def slice_data(dataset):
  """Collapse a dataset of (grid, score) elements into two dense tensors.

  The dataset is iterated exactly once, so grids and scores stay paired.
  Elements may be single samples (scalar score) or batches (rank-1 score);
  in both cases the results are joined along the leading (sample) axis.
  Joining on the last axis, as was done before, merged the grids along the
  channel axis and could not handle scalar scores.

  Args:
    dataset: iterable of (grid, score) tensor pairs, e.g. a tf.data.Dataset.

  Returns:
    (x, y): x holds the grids with samples on axis 0; y is the matching
    1-D tensor of scores.

  Raises:
    ValueError: if the dataset yields no elements.
  """
  grids, scores = [], []
  for grid, score in dataset:
    grids.append(grid)
    scores.append(score)
  if not grids:
    raise ValueError('slice_data() received an empty dataset')

  if scores[0].shape.rank == 0:
    # Unbatched elements: stacking creates the sample axis.
    return tf.stack(grids, axis=0), tf.stack(scores, axis=0)
  # Batched elements: the sample axis already exists, so join along it.
  return tf.concat(grids, axis=0), tf.concat(scores, axis=0)



# Create the dataset from the generator function: one (grid, score) pair per element.
dataset = tf.data.Dataset.from_generator(
    data_generator,
    output_signature=(
        tf.TensorSpec(shape=(None, 60, 60, 6), dtype=tf.float64, name='combgrid'),
        tf.TensorSpec(shape=(), dtype=tf.float32, name='ligscore'),
    ),
)

# Keep the samples in memory after the first read so that counting them and
# building the three splits does not re-run the generator (and re-read every
# grid file) each time. NOTE: this holds the whole dataset in RAM.
dataset = dataset.cache()
n_samples = sum(1 for _ in dataset)

# The generator emits every ligand sample before any decoy, so the samples
# must be shuffled before splitting. A fixed seed together with
# reshuffle_each_iteration=False keeps the order identical every time the
# dataset is iterated, which take()/skip() rely on to give disjoint splits.
# (tf.keras.utils.split_dataset was used here before, but it raised
# "ValueError: ... inhomogeneous shape" on this (grid, score) dataset.)
dataset = dataset.shuffle(buffer_size=max(n_samples, 1), seed=23,
                          reshuffle_each_iteration=False)

# 80/20 train+valid/test split, then 80/20 train/valid split of the remainder.
n_test = int(0.2 * n_samples)
n_valid = int(0.2 * (n_samples - n_test))

test = dataset.take(n_test)
train_val = dataset.skip(n_test)
valid = train_val.take(n_valid)
train = train_val.skip(n_valid)

x_train, y_train = slice_data(train)
x_test, y_test = slice_data(test)
x_valid, y_valid = slice_data(valid)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (4,) + inhomogeneous part.

In [None]:
#@title Alex's model

# 3D CNN regressor: stacked Conv3D/MaxPool3D feature extractor followed by a
# dense head with a single linear output (the predicted score).
#
# data_generator() joins the receptor and ligand grids with
# tf.concat(..., axis=-1), so every sample is (60, 60, 60, 6) with the 6 grid
# channels on the LAST axis. The layers therefore use 'channels_last'; with
# 'channels_first' Keras would read input_shape=(60, 60, 60, 6) as 60 channels
# over a 60x60x6 volume and convolve across the wrong axes.
# NOTE(review): with the correct layout the flattened feature vector is much
# larger than before; confirm the model still fits in memory.
cnn_model = tf.keras.Sequential([
        tf.keras.layers.Conv3D(64, 5, activation='relu', data_format='channels_last', input_shape=(60,60,60,6), padding="same"),
        tf.keras.layers.MaxPool3D(data_format='channels_last'),
        tf.keras.layers.Conv3D(128, 3, activation='relu', data_format='channels_last', padding="same"),
        tf.keras.layers.Conv3D(128, 3, activation='relu', data_format='channels_last', padding="same"),
        tf.keras.layers.MaxPool3D(data_format='channels_last'),
        tf.keras.layers.Conv3D(256, 3, activation='relu', data_format='channels_last', padding="same"),
        tf.keras.layers.Conv3D(256, 3, activation='relu', data_format='channels_last', padding="same"),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(units=128, activation="relu"),
        tf.keras.layers.Dense(units=64, activation="relu"),
        tf.keras.layers.Dense(units=1),
    ])

# Mean squared error is optimised; RMSE is reported alongside it.
cnn_model.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), metrics=["RootMeanSquaredError"])

In [None]:
# Train for 20 epochs, monitoring the validation split after each epoch.
out_info = cnn_model.fit(
    x=x_train,
    y=y_train,
    epochs=20,
    validation_data=(x_valid, y_valid),
)

# Score the held-out test split. Because the model was compiled with a metric,
# evaluate() returns the loss followed by that metric, i.e. [mse, rmse].
mse_test = cnn_model.evaluate(x=x_test, y=y_test)

# Questions:

1. What if we used batch normalization in the model? Something like:

```
tf.keras.layers.BatchNormalization()
```

at the beginning of the model and after each hidden layer?
