In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from functools import partial
import matplotlib.pyplot as plt

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    print("Device:", tpu.master())
    strategy = tf.distribute.TPUStrategy(tpu)
except:
    strategy = tf.distribute.get_strategy()
print("Number of replicas:", strategy.num_replicas_in_sync)

2024-10-23 15:30:25.242540: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-23 15:30:25.621160: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Number of replicas: 1


In [5]:
AUTOTUNE = tf.data.AUTOTUNE
DATAPATH = "./"
BATCH_SIZE = 64


In [6]:
FILENAMES = tf.io.gfile.glob(DATAPATH + "randomized*.tfrecord")
split_ind = int(0.8 * len(FILENAMES))
TRAINING_FILENAMES, VALID_FILENAMES = FILENAMES[:split_ind], FILENAMES[split_ind:]

TEST_FILENAMES = tf.io.gfile.glob(DATAPATH + "randomized*.tfrecord")
print("Train TFRecord Files:", len(TRAINING_FILENAMES))
print("Validation TFRecord Files:", len(VALID_FILENAMES))
print("Test TFRecord Files:", len(TEST_FILENAMES))

Train TFRecord Files: 24
Validation TFRecord Files: 6
Test TFRecord Files: 30


In [7]:
def read_tfrecord(example):
    tfrecord_format = (
        {
            "charge": tf.io.FixedLenFeature([1], tf.int64),
            "msms": tf.io.FixedLenFeature([174], tf.float32),
            "pep": tf.io.FixedLenFeature([30], tf.int64)
        }
    )
    example = tf.io.parse_single_example(example, tfrecord_format)
    return example

In [8]:
def load_dataset(filenames):
    ignore_order = tf.data.Options()
    ignore_order.experimental_deterministic = False  # disable order, increase speed
    dataset = tf.data.TFRecordDataset(
        filenames
    )  # automatically interleaves reads from multiple files
    dataset = dataset.with_options(
        ignore_order
    )  # uses data as soon as it streams in, rather than in its original order
    dataset = dataset.map(
        partial(read_tfrecord), num_parallel_calls=AUTOTUNE
    )
    # returns a dataset of (image, label) pairs if labeled=True or just images if labeled=False
    return dataset

In [9]:
def get_dataset(filenames):
    dataset = load_dataset(filenames)
    dataset = dataset.shuffle(2048)
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    dataset = dataset.batch(BATCH_SIZE)
    return dataset

In [10]:
train_dataset = get_dataset(TRAINING_FILENAMES)
valid_dataset = get_dataset(VALID_FILENAMES)
test_dataset = get_dataset(TEST_FILENAMES)

In [11]:



train_dataset




<_BatchDataset element_spec={'charge': TensorSpec(shape=(None, 1), dtype=tf.int64, name=None), 'msms': TensorSpec(shape=(None, 174), dtype=tf.float32, name=None), 'pep': TensorSpec(shape=(None, 30), dtype=tf.int64, name=None)}>

In [12]:
initial_learning_rate = 0.01
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=20, decay_rate=0.96, staircase=True
)

checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    "ppttransfo.h5", save_best_only=True
)

early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    patience=10, restore_best_weights=True
)

In [13]:
def make_model():
 # Input layers
    peptides_in = keras.Input(
        name="pep", dtype="int32", sparse=False, batch_input_shape=(None, 30)
    )
    precursor_charge_in = keras.Input(
        name="charge", dtype="float32", sparse=False, batch_input_shape=(None, 6)
    )
    
    peptides_mod = layers.Dense(173, name="first")(peptides_in)
    
    meta_in = layers.Concatenate(name="meta_in", trainable=True, axis=-1)(
        [peptides_mod, precursor_charge_in]
    )

    activation = layers.LeakyReLU(name="activation", alpha=0.30000001192092896, trainable=True)(
        meta_in
    )

    output_layer = layers.Flatten(name="msms", data_format="channels_last", trainable=True)(
        activation
    )

    # Compile model
    # if this doesn't work, explicitly import masked_spectral_distance from losses
    model = keras.Model(
        inputs=[peptides_in, precursor_charge_in], outputs=output_layer
    )
    model.compile(loss="MeanSquaredError", optimizer="adam", metrics=["accuracy"])

    return model

In [14]:
with strategy.scope():
    model = make_model()
    
history = model.fit(
    train_dataset,
    epochs=2,
    validation_data=valid_dataset,
    callbacks=[checkpoint_cb, early_stopping_cb],
)

Epoch 1/2


  inputs = self._flatten_to_reference_inputs(inputs)


ValueError: in user code:

    File "/home/godsnor/miniconda3/envs/ppt/lib/python3.10/site-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/home/godsnor/miniconda3/envs/ppt/lib/python3.10/site-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/godsnor/miniconda3/envs/ppt/lib/python3.10/site-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/home/godsnor/miniconda3/envs/ppt/lib/python3.10/site-packages/keras/src/engine/training.py", line 1152, in train_step
        self._validate_target_and_loss(y, loss)
    File "/home/godsnor/miniconda3/envs/ppt/lib/python3.10/site-packages/keras/src/engine/training.py", line 1106, in _validate_target_and_loss
        raise ValueError(

    ValueError: Target data is missing. Your model was compiled with loss=MeanSquaredError, and therefore expects target data to be provided in `fit()`.


In [24]:
import tensorflow as tf 
raw_dataset = tf.data.TFRecordDataset("randomized-0.tfrecord")

for raw_record in raw_dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example)

features {
  feature {
    key: "charge"
    value {
      int64_list {
        value: 3
      }
    }
  }
  feature {
    key: "id"
    value {
      int64_list {
        value: 419
      }
    }
  }
  feature {
    key: "msms"
    value {
      float_list {
        value: -1.0
        value: -1.0
        value: -1.0
        value: -1.0
        value: 537739.875
        value: -1.0
        value: 1480334.0
        value: -1.0
        value: 112657.703125
        value: -1.0
        value: -1.0
        value: -1.0
        value: 136652.90625
        value: -1.0
        value: -1.0
        value: -1.0
        value: 645898.375
        value: -1.0
        value: -1.0
        value: -1.0
        value: -1.0
        value: -1.0
        value: -1.0
        value: -1.0
        value: 3564317.0
        value: -1.0
        value: -1.0
        value: -1.0
        value: 1891722.0
        value: -1.0
        value: -1.0
        value: -1.0
        value: 1052026.0
        value: -1.0
        val