<a href="https://colab.research.google.com/github/alessandronascimento/pyLiBELa/blob/main/Colabs/TF/(GVM)_Preparing_PDBBind_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# List the contents of the current working directory.
!ls

pdbbind  sample_data


In [4]:
# Google Cloud project that the gcloud/gsutil commands below run against.
project_id = 'smart-monitor-401017'
# Make that project the active one for the gcloud CLI.
!gcloud config set project {project_id}
# List the Cloud Storage buckets visible to the active project.
!gsutil ls

Updated property [core/project].
gs://pdbbind/


In [5]:
#@title Helper class to handle GCS file access

from pathlib import Path


class AccessFile():
  """
  Temporary local copy of a file stored in the gs://pdbbind/ bucket.

  Upon initialization, downloads the object identified by gcs_path (a path
  relative to gs://pdbbind/) with `gsutil cp` and stores it at
  /content/tmp/<gcs_path>. The local file is deleted once all the
  references to the object are dropped.

  Args:
    gcs_path: object path relative to gs://pdbbind/.
    log: when True, prints the captured output of the copy command.

  Raises:
    FileNotFoundError: if the local file does not exist after the copy
      attempt.
  """

  def __init__(self, gcs_path, log=False):
    # Imported locally so the class only relies on names from its own cell.
    import subprocess

    self.gcs_path = gcs_path
    self.local_path = f'/content/tmp/{self.gcs_path}'
    # Set before anything can fail so that cleanup always finds it.
    self.is_open = False

    try:
      # An argument list (no shell) keeps unusual characters in the path
      # from being interpreted by a shell.
      completed = subprocess.run(
          ['gsutil', 'cp', f'gs://pdbbind/{gcs_path}', self.local_path],
          stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
      # Kept as a list of output lines (stdout and stderr merged).
      self.copy_status = completed.stdout.splitlines()
    except OSError as err:
      # E.g. gsutil is not installed: reported through the existence
      # check below, like any other failed copy.
      self.copy_status = [str(err)]
    if log: print(self.copy_status)

    if not Path(self.local_path).exists():
      raise FileNotFoundError(f'File {self.local_path} not found. \
      \nThis Google Cloud Storage copy operation returned {self.copy_status}')

  def open(self, mode):
    """Opens the local copy with the given mode and returns self (chainable)."""
    # Release a handle left over from a previous open() instead of leaking it.
    self.close()
    self._file_handle = open(self.local_path, mode)
    # Flag is raised only once the handle really exists.
    self.is_open = True
    return self

  def close(self):
    """Closes the file handle if one is open; safe to call repeatedly."""
    handle = getattr(self, '_file_handle', None)
    if handle is not None and not handle.closed:
      handle.close()
    self.is_open = False

  def getFile(self):
    """Returns the file handle created by open()."""
    return self._file_handle

  def getPath(self):
    """Returns the path of the local copy."""
    return self.local_path

  def __del__(self):
    # Best-effort cleanup: attributes may be missing if construction failed
    # early, and the file may already be gone.
    self.close()
    local_path = getattr(self, 'local_path', None)
    if local_path is not None:
      try:
        Path(local_path).unlink()
      except OSError:
        pass

In [6]:
#@title Getting list of targets with grids
# First column of every non-blank line in scripts/targets_ok.dat.
targets_ok = []
target_ok_file = AccessFile('scripts/targets_ok.dat').open('r')
try:
  for line in target_ok_file.getFile():
    fields = line.split()
    # A blank line yields no fields; skip it instead of failing on fields[0].
    if not fields: continue
    targets_ok.append(fields[0])
finally:
  # Close the handle even if reading fails part-way through.
  target_ok_file.getFile().close()

In [7]:
#@title Reading PDBBind v2020 data
binding_targets = []             # pdb id
binding_years = []               # year of the structure
binding_resolution = []          # resolution
binding_score = []               # -logKd/Ki

# Set copy of targets_ok: constant-time membership tests instead of scanning
# the whole list once per index line.
targets_with_grids = set(targets_ok)

binding_file = AccessFile('index/INDEX_general_PL_data.2020').open('r')
try:
  for line in binding_file.getFile():
    fields = line.split()
    # Skip blank lines and lines whose first field starts with '#'.
    if not fields or fields[0].startswith('#'): continue
    if fields[0] not in targets_with_grids: continue

    binding_targets.append(fields[0])
    # Entries marked 'NMR' have no numeric resolution; 0.00 is stored instead.
    binding_resolution.append(0.00 if fields[1] == 'NMR' else float(fields[1]))
    binding_years.append(int(fields[2]))
    binding_score.append(float(fields[3]))
finally:
  # Close the handle even if a line fails to parse.
  binding_file.getFile().close()

print('Binding data found for %d valid targets' % len(binding_targets))

Binding data found for 18925 valid targets


In [8]:
#@title Auxiliary functions
import tensorflow as tf

def process_file(file_access : AccessFile):
  """
  Loads the local file behind file_access as a grid tensor.

  The file content is read as raw bytes, decoded as float64 values and
  reshaped to (60, 60, 60, 3).

  Args:
    file_access: AccessFile whose local path is read.

  Returns:
    A tf.float64 tensor of shape (60, 60, 60, 3).
  """
  raw_bytes = tf.io.read_file(file_access.getPath())
  values = tf.io.decode_raw(raw_bytes, tf.float64)
  return tf.reshape(values, (60, 60, 60, 3))

def _log_failure(message):
  with open("log_files_not_found", 'a') as f:
    f.write(f'{message}\n')

def data_generator(name_list, score_list):
  """
  Generator of (combined_grid, observable) pairs for tf.data.

  For every target in name_list, downloads McGrid_rec.grid and
  McGrid_lig.grid, concatenates both grids along the last axis and yields
  them with the matching entry of score_list. Then, for decoy files 1 to 10
  of the same target, yields the rec grid concatenated with the decoy grid
  and an observable of 0.00. Missing files are recorded with _log_failure
  and skipped.

  Args:
    name_list: sequence of byte strings (each is decoded to get the target
      name).
    score_list: sequence indexed in step with name_list.
  """
  for idx, raw_name in enumerate(name_list):

    target = raw_name.decode()
    root_path = f'targets/{target}/grid_30_0.5_SF0'

    # Both files are required; if either is missing the target is skipped.
    try:
      rec_access = AccessFile(f'{root_path}/McGrid_rec.grid')
      lig_access = AccessFile(f'{root_path}/McGrid_lig.grid')
    except FileNotFoundError as err:
      _log_failure(f'rec or lig "{target}" not found with error <<{err}>>\n')
      continue

    rec_grid = process_file(rec_access)
    lig_grid = process_file(lig_access)

    # Sample for the target itself, labelled with its score.
    yield tf.concat([rec_grid, lig_grid], axis=-1), score_list[idx]

    # Samples for the 10 decoys of the current target, labelled 0.00.
    for j in range(1, 11):

      try:
        dec_access = AccessFile(f'{root_path}/McGrid_dec_{j}.grid')
      except FileNotFoundError as err:
        _log_failure(f'Decoy "{target}" not found with error {err}')
        continue

      dec_grid = process_file(dec_access)
      yield tf.concat([rec_grid, dec_grid], axis=-1), 0.00


In [9]:
#@title Reading dataset

from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation/test partition is identical on every run.
SPLIT_SEED = 23

# 80/20 split into (train + validation) and test ...
trainval_names, test_names, trainval_scores, test_scores = train_test_split(
    binding_targets, binding_score, train_size=0.8, shuffle=True, random_state=SPLIT_SEED)
# ... then 80/20 split of the first part into train and validation.
train_names, valid_names, train_scores, valid_scores = train_test_split(
    trainval_names, trainval_scores, train_size=0.8, random_state=SPLIT_SEED)

def slice_data(dataset):
  """
  Collects every (x, y) element of dataset into two concatenated tensors.

  Args:
    dataset: iterable of (x, y) tensor pairs.

  Returns:
    Tuple (x, y) with all x tensors and all y tensors concatenated along
    their last axis.
  """
  xs, ys = [], []
  # Single pass: iterating the dataset once keeps every x paired with its y
  # (two separate passes can disagree if the dataset reshuffles) and avoids
  # producing every element twice.
  for x, y in dataset:
    xs.append(x)
    ys.append(y)
  # NOTE(review): axis=-1 stacks the x tensors along their last (channel)
  # axis; if the intent is to stack samples, axis=0 may be needed - confirm.
  x = tf.concat(xs, axis=-1)
  y = tf.concat(ys, axis=-1)
  return x,y

# Some dataset parameters

# Each element: a (60, 60, 60, 6) float64 grid and a scalar float32 score.
output_signature = (tf.TensorSpec(shape=(60, 60, 60, 6), dtype=tf.float64, name='combgrid'),
                    tf.TensorSpec(shape=(), dtype=tf.float32, name='ligscore'))
batch_size = 5
prefetch_size = 1

def _make_dataset(names, scores, name):
  """
  Builds a batched, prefetching dataset backed by data_generator.

  Args:
    names: target names, converted to a tf.string tensor.
    scores: scores aligned with names, converted to a tf.float32 tensor.
    name: name given to the generator dataset op.

  Returns:
    A tf.data.Dataset yielding batches of batch_size elements that follow
    output_signature, with prefetch_size batches prefetched.
  """
  return tf.data.Dataset.from_generator(
      data_generator,
      output_signature=output_signature,
      args=(tf.convert_to_tensor(names, dtype=tf.string),
            tf.convert_to_tensor(scores, dtype=tf.float32)),
      name=name
  ).batch(batch_size).prefetch(prefetch_size)

# Create the datasets from the generator function,
# with batch and prefetch sizes already determined
train_dataset = _make_dataset(train_names, train_scores, "train_dataset_gen")
test_dataset = _make_dataset(test_names, test_scores, "test_dataset_gen")
valid_dataset = _make_dataset(valid_names, valid_scores, "valid_dataset_gen")

# train_val, test = tf.keras.utils.split_dataset(dataset, left_size=0.8, right_size=0.2, shuffle=True, seed=23)
# train, valid = tf.keras.utils.split_dataset(train_val, left_size=0.8, right_size=0.2)

# x_train, y_train = slice_data(train)
# x_test, y_test = slice_data(test)
# x_valid, y_valid = slice_data(valid)

# Repeat the dataset for multiple epochs (optional)
#dataset = dataset.repeat()

In [10]:
#@title Alex's model

# 3D CNN regressor: the layers are appended one by one to an empty Sequential model.
cnn_model = tf.keras.Sequential()

# Stage 1: one 5x5x5 convolution on the (60, 60, 60, 6) input, then max pooling.
cnn_model.add(tf.keras.layers.Conv3D(filters=64, kernel_size=5, padding="same", activation='relu',
                                     data_format='channels_last', input_shape=(60,60,60,6)))
cnn_model.add(tf.keras.layers.MaxPool3D(data_format='channels_last'))

# Stage 2: two 3x3x3 convolutions with 128 filters, then max pooling.
cnn_model.add(tf.keras.layers.Conv3D(filters=128, kernel_size=3, padding="same", activation='relu',
                                     data_format='channels_last'))
cnn_model.add(tf.keras.layers.Conv3D(filters=128, kernel_size=3, padding="same", activation='relu',
                                     data_format='channels_last'))
cnn_model.add(tf.keras.layers.MaxPool3D(data_format='channels_last'))

# Stage 3: two 3x3x3 convolutions with 256 filters (no pooling afterwards).
cnn_model.add(tf.keras.layers.Conv3D(filters=256, kernel_size=3, padding="same", activation='relu',
                                     data_format='channels_last'))
cnn_model.add(tf.keras.layers.Conv3D(filters=256, kernel_size=3, padding="same", activation='relu',
                                     data_format='channels_last'))

# Regression head: flatten, two hidden dense layers and a single linear output.
cnn_model.add(tf.keras.layers.Flatten())
cnn_model.add(tf.keras.layers.Dense(units=128, activation="relu"))
cnn_model.add(tf.keras.layers.Dense(units=64, activation="relu"))
cnn_model.add(tf.keras.layers.Dense(units=1))

# Mean squared error loss, Adam with a 1e-4 learning rate, RMSE reported as metric.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
cnn_model.compile(optimizer=adam_optimizer, loss="mse", metrics=["RootMeanSquaredError"])

In [None]:
# Train for a single epoch, validating against the validation dataset.
out_info = cnn_model.fit(x=train_dataset, validation_data=valid_dataset, epochs=1)
# Evaluate the trained model on the held-out test dataset.
mse_test = cnn_model.evaluate(x=test_dataset)

Copying gs://pdbbind/targets/3dbs/grid_30_0.5_SF0/McGrid_dec_9.grid...
/ [0 files][    0.0 B/  4.9 MiB]                                                

# Questions:

1. What if we used batch normalization in the model? Something like:

```
tf.keras.layers.BatchNormalization()
```

at the beginning of the model and after each hidden layer?
