# Tensorflow Experiments Template

In [6]:
# install benatools library
!pip install benatools >> /dev/null

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import os
import time as time
import matplotlib.pyplot as plt
from tqdm import tqdm
import glob

import tensorflow as tf

from benatools.tf.tpu import (get_device_strategy, init_tpu)
from benatools.utils.tools import MultiStratifiedKFold

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# CONSTANTS
PLATFORM = 'COLAB'  # execution platform; this could be 'COLAB' or 'LOCAL'
DEVICE = 'TPU'   # accelerator passed to get_device_strategy; besides 'TPU' this could be 'GPU' or 'CPU'

# Initialization

Seeding everything for experiment replicability

In [8]:
# Seed
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Inputs:
        seed: integer seed applied to Python's hash seed environment variable,
              the `random` module, NumPy and TensorFlow.
    """
    # PYTHONHASHSEED is exported as a string because environment values must be text
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

seed_everything(42)

Setting up the distributed strategy. When training with TPUs or multiple GPUs, a distributed strategy must be created. 

In [4]:
# Build the distribution strategy for the configured DEVICE. The four returned names are used
# further down: `strategy` scopes the model creation, `AUTO` is the parallelism setting passed to
# the tf.data calls, `REPLICAS` scales the batch sizes and learning rate, and `tpu` is handed to init_tpu.
strategy, AUTO, REPLICAS, tpu = get_device_strategy(DEVICE, verbose=True)

connecting to TPU...
Running on TPU grpc://10.32.91.146:8470
initializing TPU ...
INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Initializing the TPU system: grpc://10.32.91.146:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.32.91.146:8470


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


TPU initialized
REPLICAS: 8


# Read Data

There are normally some files linked to the dataset with metadata, contextual information, calendars, etc.

In [None]:
# Read files
# training_examples = pd.read_csv('training_examples.csv')

# Dataset

### Dataset Folder
If training on TPU, the data must be stored into a GS bucket.  
When training on Kaggle platform, calling 
```python
KaggleDatasets().get_gcs_path() 
```
automatically copies the dataset into a GS bucket.  
If training on Google Colab, be aware that you might incur egress charges.  
If training on GPU, there is no such problem.

In [13]:

file_folder = ''  # the file folder or the dataset name

# Folder that holds the .tfrec files.
# NOTE(review): when DEVICE == 'TPU' this should normally be a gs:// bucket path (see the
# markdown above); the value below is a mounted Drive folder - confirm it is reachable from the TPU.
GCS_DS_PATH = '/content/drive/MyDrive/ssid'

# The same glob is valid for every DEVICE/PLATFORM combination, so it is done once here.
# (Previously `files_train` was left undefined for DEVICE == 'TPU' with PLATFORM != 'COLAB'.)
# Sorting makes the file order, and therefore the resulting DataFrame, reproducible.
files_train = np.sort(np.array(tf.io.gfile.glob(GCS_DS_PATH + '/*.tfrec')))
if len(files_train) == 0:
    raise FileNotFoundError('No .tfrec files found in ' + GCS_DS_PATH)

# Another way to do it if the files are already classify in folds is the following
#for i in range(FOLDS):
#    GCS_DS_PATH = KaggleDatasets().get_gcs_path('birdsongs-data-tf-external-fold'+str(i))
#    files_train.append(np.sort(np.array(tf.io.gfile.glob(GCS_DS_PATH + '/*.tfrec'))))

train_df = pd.DataFrame({'path': files_train})
# The fold number is read from the file name (e.g. ".../ssid_fold_3_000_25....tfrec" -> 3) instead of
# assuming exactly five files returned in fold order. A file name without a "fold_<n>" part makes
# the integer conversion fail, which surfaces the problem immediately.
train_df['fold'] = train_df['path'].str.extract(r'fold_(\d+)', expand=False).astype(int)
train_df

Unnamed: 0,path,fold
0,/content/drive/MyDrive/ssid/ssid_fold_0_000_25...,0
1,/content/drive/MyDrive/ssid/ssid_fold_1_000_25...,1
2,/content/drive/MyDrive/ssid/ssid_fold_2_000_25...,2
3,/content/drive/MyDrive/ssid/ssid_fold_3_000_25...,3
4,/content/drive/MyDrive/ssid/ssid_fold_4_000_25...,4


### CV Strategy
One of the most important things is to have a proper CV strategy, to make sure the CV result is reliable.  
Usually, and when working with preprocessed TFRecords, the dataset is already split into folds.
Usually the fold number can be found on the file name.  

When experimenting, it is a good practice to have split the dataset beforehand, for reproducibility purposes.  

If the dataset is not split yet, this is usually a good moment to do it.

### TFRecords Dataset Object

TF Records is the fastest way to train using tensorflow. This avoids opening images or files individually, since many records can be added into the same file of 100-200 MB.   
These are some basic functions and a schema to generate TFRecordDataset

In [15]:

def read_labeled_tfrecord(ex):
    """
    This is an example of decoding a tf record. You should know before hand the tf record format, and
    define it in a dictionary.
    
    Inputs:
        ex: is an tf example object, provided by the TFRecordDataset
    Outputs:
        x: the decoded data, read from the 'x' feature as raw float32 values
        y: the label of this example, read from the 'y' feature as raw float32 values
        
    More parameters, inputs or outputs, can be added to this function.
    """
    labeled_tfrec_format = {
      'x': tf.io.FixedLenFeature([], tf.string), # image or data
      'y': tf.io.FixedLenFeature([], tf.string), # label
    }
    example = tf.io.parse_single_example(ex, labeled_tfrec_format)
    # The keys read here must be the ones declared in the schema above. The previous version
    # read example['image'] for both values, a key the schema does not contain.
    x = tf.io.decode_raw(example['x'], out_type=tf.float32)
    y = tf.io.decode_raw(example['y'], out_type=tf.float32)

    #y = tf.one_hot(y, n_classes, on_value=1.0, off_value=0.0, dtype=tf.float32) # labels in one hot format
    return x, y # returns a decoded example 

def transforms(image, label, prob=0.5, dim=224):
    """
    Per-sample data augmentation. Data augmentation methods should come here.

    Inputs:
        image: a single decoded sample
        label: the label of the sample, returned untouched
        prob: probability forwarded to both augmentation helpers
        dim: side length; the sample is reshaped to (dim, dim, 3) after every step
    Outputs:
        the augmented sample and its label

    NOTE(review): transform2d and dropout are not defined in the visible cells of this
    notebook - they are presumably augmentation helpers imported elsewhere; confirm.
    """
    target_shape = (dim, dim, 3)
    rotated = tf.reshape(
        transform2d(image, dimension=dim, rotation=30.0, prob=prob),
        target_shape,
    )
    augmented = tf.reshape(dropout(rotated, prob=prob, rank=2), target_shape)
    return augmented, label

def batch_transforms(batch, labels, batch_size, prob=0.5, dim=224, n_classes=5):
    """
    Batch-level data augmentation: every sample of the output is taken either from the
    cutmix result or from the mixup result, chosen with a fair coin flip per sample.

    Inputs:
        batch: a batch of samples
        labels: the labels of the batch
        batch_size: number of samples in the batch; exactly this many samples are indexed,
                    so the batch must not be shorter than batch_size
        prob: currently unused - cutmix and mixup are called with a fixed probability of 0.66
        dim: side length; the output samples are reshaped to (batch_size, dim, dim, 3)
        n_classes: number of classes, used for the helpers and for the label reshape
                   (defaults to 5, the value that used to be hard-coded)
    Outputs:
        the augmented batch and the matching labels

    NOTE(review): cutmix and mixup are not defined in the visible cells of this notebook;
    their exact behaviour should be confirmed against their definition.
    """
    # Data augmentation methods should come here
    image2, label2 = cutmix(batch, labels, dimension=dim, prob=0.66, batch_size=batch_size, n_classes=n_classes)
    image3, label3 = mixup(batch, labels, dimension=dim, prob=0.66, batch_size=batch_size, n_classes=n_classes)
    imgs = []; labs = []
    for j in range(batch_size):
        # P is 1.0 or 0.0: it selects the cutmix sample (1.0) or the mixup sample (0.0)
        P = tf.cast( tf.random.uniform([],0,1)<=0.5, tf.float32)
        imgs.append(P*image2[j,]+(1-P)*image3[j,])
        labs.append(P*label2[j,]+(1-P)*label3[j,])
        
    image4 = tf.reshape(tf.stack(imgs),(batch_size,dim,dim,3))
    label4 = tf.reshape(tf.stack(labs),(batch_size,n_classes))
    return image4,label4


def load_dataset(filenames, batch_size=32, labeled=True, shuffle=False, repeat=False, do_transforms=False, do_batch_transforms=False, drop_remainders=False, prob=0.5, dim=224):
    """
    Build the tf.data pipeline that feeds the model from a list of TFRecord files.

    Inputs:
        filenames: the TFRecord files to read
        batch_size: number of samples per batch
        labeled: if False, the labels are dropped and only the samples are returned
        shuffle: shuffle the records. True in training and false in validation
        repeat: repeat the dataset in a loop. True in training and false in validation
        do_transforms: apply `transforms` to every sample
        do_batch_transforms: apply `batch_transforms` to every batch
        drop_remainders: drop the last batch when it is shorter than batch_size
        prob: probability forwarded to `transforms` and `batch_transforms`
        dim: sample side length forwarded to `transforms` and `batch_transforms`
    Outputs:
        dataset: the resulting tf.data dataset
    """
    # Read from TFRecords. For optimal performance, reading from multiple files at once and
    # disregarding data order. Order does not matter since we will be shuffling the data anyway.

    # Create the dataset object from the filenames
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO) # automatically interleaves reads from multiple files
    dataset = dataset.cache()

    # Repeats the dataset in a loop. Set true in training and false in validation
    if repeat:
        dataset = dataset.repeat()
    
    # Shuffle the dataset. True in training and false in validation
    if shuffle: 
        dataset = dataset.shuffle(1024*REPLICAS)
        opt = tf.data.Options()
        opt.experimental_deterministic = False # disable order, increase speed
        dataset = dataset.with_options(opt)

    # At this point the dataset opens the files and reads TF Records
    dataset = dataset.map(read_labeled_tfrecord, num_parallel_calls=AUTO) # Decode TF Records

    # At this point runs the transformations on the data, like data augmentation.
    # transforms is a function which receives a sample and a label and returns a transformed sample and label
    # this can be implemented in many ways
    if do_transforms:
        dataset = dataset.map(lambda image, label: transforms(image, label, prob=prob, dim=dim), num_parallel_calls=AUTO)
    
    # For TPU the batches must have the same length, so it is mandatory to drop the remainders
    dataset = dataset.batch(batch_size, drop_remainder=drop_remainders)
    
    # At this point runs transforms that must be performed on batches, like mixup or cutmix.
    # batch_transforms indexes exactly batch_size samples per batch, so a shorter final batch
    # must be avoided (drop_remainders=True) when this option is enabled.
    if do_batch_transforms:
        dataset = dataset.map(lambda batch, label: batch_transforms(batch, label, batch_size, prob=prob, dim=dim), num_parallel_calls=AUTO)
    
    # Whether to return the label or not
    if not labeled:
        dataset = dataset.map(lambda image, label: image, num_parallel_calls=AUTO)
    
    dataset = dataset.prefetch(AUTO)
    return dataset


def count_data_items(filenames):
    """
    Add up the number of data items announced by the file names.

    The item count is the last underscore-separated token of the file name, right before the
    extension, i.e. flowers00_230.tfrec = 230 data items.
    This is useful when calling the model.fit() method because it needs to know how many batches to run on the epoch
    """
    counts = []
    for name in filenames:
        # everything before the last '.' is the name without its extension
        stem = name[:name.rfind('.')]
        counts.append(int(stem.rsplit('_', 1)[-1]))
    return np.sum(counts)


def get_fold(fold, train_df):
    """
    This is an utility function to return the train and validation files to feed the dataset, given a fold number.
    
    Inputs:
        fold: the fold number requested
        train_df: a pandas DataFrame with a fold column and a path column
        
    Outputs:
        train_files: an array with the training files of this fold
        val_files: an array with the validation files of this fold
    """
    # Both sides are compared as strings so the split works whether the fold column holds
    # integers or strings. Comparing an integer column directly with str(fold) never matches,
    # which made the validation set empty and put every file in the training set.
    is_val = (train_df['fold'].astype(str) == str(fold)).values
    paths = train_df['path'].values
    return paths[~is_val], paths[is_val]

# Model
When experimenting, many different models or variations can be tried.  
It is useful to have a common function to route the model creations further in the training loop

In [None]:

# Example of a model based on EfficientNet with a softmax classification head
def get_model(b, n_classes, shape=(128,128,3)):
    """
    Build and compile an EfficientNet based classifier.

    Inputs:
        b: EfficientNet variant to use as backbone, an integer from 0 to 7 (EfficientNetB0 ... EfficientNetB7)
        n_classes: number of units of the softmax output layer
        shape: shape of the model input
    Outputs:
        model: the compiled Keras model
    Raises:
        ValueError: if b is not an integer from 0 to 7
    """
    if b not in range(8):
        raise ValueError(f'b must be an integer from 0 to 7, got {b!r}')

    # inputs
    inp = tf.keras.layers.Input(shape=shape)
    
    # Select the backbone requested through `b`; it used to be ignored and B0 was always built
    backbone = getattr(tf.keras.applications, f'EfficientNetB{b}')
    base = backbone(include_top=False, weights='imagenet', input_shape=shape)

    x = base(inp)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)

    # # use the same head as the baseline notebook.
    x = tf.keras.layers.Dense(512, activation='relu')(x)
    x = tf.keras.layers.Dropout(rate=0.2)(x)
    x = tf.keras.layers.Dense(512, activation='relu')(x)
    x = tf.keras.layers.Dropout(rate=0.2)(x)
    x = tf.keras.layers.Dense(n_classes, activation='softmax')(x) # softmax converts logits (raw predictions [-inf,+inf]) to probabilities [0,1]

    model = tf.keras.Model(inputs=inp, outputs=x)

    # The base learning rate is scaled by the number of replicas
    opt = tf.keras.optimizers.Adam(learning_rate=0.0001*REPLICAS)
    # NOTE(review): the output is a softmax but the loss is mean squared error; a categorical
    # crossentropy may have been intended - confirm before relying on the reported loss values.
    loss = tf.keras.losses.MeanSquaredError() 
    #loss = tf.keras.losses.BinaryCrossentropy(from_logits=True, label_smoothing=0.05) 

    # NOTE(review): the metric callables below are not defined in the visible cells of this notebook
    model.compile(optimizer=opt, 
                  #experimental_steps_per_execution=5,  #experimental
                  loss=loss,
                  metrics=[F1,true_positives,possible_positives,predicted_positives, recall, precission])  # some extra custom metrics
    
    return model

# Experiments Configuration

In [None]:
N_EXPERIMENTS = 5  # Normally not more than one run per commit
FOLDS = [0,1,2,3,4]  # Each run should cover a single fold; one entry per experiment

# DATASET PARAMS
#IMG_SIZE = [128] * N_EXPERIMENTS
# Number of target classes, read by the training loop when building the model.
# NOTE(review): 5 matches the class count used by batch_transforms - confirm for your dataset.
N_CLASSES = 5

# DATALOADER PARAMS
# Per-replica batch sizes; the training loop multiplies them by REPLICAS
BS_TRAIN = [8] * N_EXPERIMENTS
BS_VAL = [8] * N_EXPERIMENTS

# MODEL PARAMS
MODEL = [2] * N_EXPERIMENTS
B = [1] * N_EXPERIMENTS

# TRANSFORMS
# Params for the transforms functions

# GLOBAL PARAMETERS
EPOCHS=20
DISPLAY_PLOT=False
VERBOSE = 1

# Training Loop

In [None]:
seed_everything(42)

# Only the first experiment is run: N_EXPERIMENTS-4 is 1 when N_EXPERIMENTS is 5.
# Widen the range to run more experiments in the same session.
for i in range(0,N_EXPERIMENTS-4):
    # Global batch sizes: the configured per-replica size times the number of replicas
    train_batch_size = BS_TRAIN[i]*REPLICAS
    val_batch_size = BS_VAL[i]*REPLICAS

    print(f'********** EXPERIMENT {i} **********')
    print(f'***** bs train {train_batch_size} *****')
    print(f'***** bs val {val_batch_size} *****')
    print(f'***** model {MODEL[i]} *****')
    print(f'***** efficientnet B{B[i]} *****')
    print(f'**********************************\n')

    # INIT TPU
    if DEVICE=='TPU':
        init_tpu(tpu)
    
    # CREATE TRAIN AND VALIDATION DATASETS
    # Fold-specific names are used so the full `files_train` list built earlier is not overwritten
    fold_files_train, fold_files_val = get_fold(FOLDS[i], train_df)

    # DATASETS
    # No trailing comma after the call: it would wrap the dataset in a one-element tuple
    val_dataset = load_dataset(fold_files_val, batch_size=val_batch_size, labeled=True, shuffle=False, repeat=False)
    # drop_remainders=True keeps every training batch at the same length, as required on TPU
    train_dataset = load_dataset(fold_files_train, batch_size=train_batch_size, labeled=True, shuffle=True, repeat=True, do_transforms=True, drop_remainders=True)
    
    # BUILD MODEL
    print('Building model...')
    tf.keras.backend.clear_session()
    with strategy.scope():
        # get_model takes (b, n_classes, shape); its default input shape is used here.
        # NOTE(review): N_CLASSES must be defined in the configuration cell.
        model = get_model(B[i], N_CLASSES)

    # SAVE BEST MODEL EACH FOLD
    model_path = "fold"+str(i)
    
    # CALLBACKS
    sv = tf.keras.callbacks.ModelCheckpoint(model_path+'.h5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='min', save_freq='epoch')
    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
    lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, verbose=0, mode='min', min_delta=0.0001, cooldown=0, min_lr=1e-8)

    # TRAIN
    print('Training...')
    # The training dataset repeats forever, so the epoch length must be given explicitly
    # as a whole number of global batches
    steps_per_epoch = int(count_data_items(fold_files_train)) // train_batch_size
    history = model.fit(
        train_dataset,
        epochs = EPOCHS, 
        callbacks = [es,sv,lr],
        steps_per_epoch = steps_per_epoch,
        validation_data = val_dataset,
        verbose = VERBOSE
    )
    
    # PLOT TRAINING
    if DISPLAY_PLOT:
        # A separate name keeps the Keras History object returned by fit() available
        history_df = pd.DataFrame(history.history)
        epochs_axis = np.arange(len(history_df))
        plt.figure(figsize=(15,5))
        plt.plot(epochs_axis, history_df['loss'],'-o',label='Train Loss',color='#ff7f0e')
        plt.plot(epochs_axis, history_df['val_loss'],'-o',label='Val Loss',color='#1f77b4')
        x = np.argmin( history_df['val_loss'] ); y = np.min( history_df['val_loss'] )
        xdist = plt.xlim()[1] - plt.xlim()[0]; ydist = plt.ylim()[1] - plt.ylim()[0]
        plt.text(x-0.03*xdist,y-0.13*ydist,'min loss\n%.2f'%y,size=14)
        plt.ylabel('Loss',size=14); plt.xlabel('Epoch',size=14)
        plt.legend(loc=2)
        
        # Second y axis for the F1 curves
        plt2 = plt.gca().twinx()
        plt2.plot(epochs_axis,history_df['F1'],'-o',label='Train F1',color='#36de47')
        plt2.plot(epochs_axis,history_df['val_F1'],'-o',label='Val F1',color='#330066')
        #x = np.argmax( history['val_F1'] ); y = np.max( history['val_F1'] )
        #xdist = plt2.xlim()[1] - plt2.xlim()[0]; ydist = plt2.ylim()[1] - plt2.ylim()[0]
        #plt2.text(x-0.03*xdist,y-0.13*ydist,'max F1\n%.2f'%y,size=14)
        #plt2.ylabel('F1',size=14); plt2.xlabel('Epoch',size=14)
        plt2.legend()
        
        #plt2 = plt.gca().twinx()
        #plt2.plot(np.arange(len(history)),history['lr'],'-o',label='LR',color='#2ca02c')
        #plt.ylabel('LR',size=14)
        
        plt.title('Experiment %i'%i,size=18)
        plt.legend(loc=3)
        plt.show()
    
    print('\n')