# Using the gradient accumulation (GA) model wrapper with Hugging Face TensorFlow models

In [None]:
#install dependencies
! pip install transformers datasets gradient-accumulator

## How to use gradient accumulation (GA) with Hugging Face TensorFlow models

In [None]:
from gradient_accumulator import GradientAccumulateModel
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

#import your model class (TFxxx is a placeholder, e.g. TFAutoModelForSequenceClassification)
from transformers import TFxxx

#load your model checkpoint
#(`checkpoint` is a placeholder for the name or path of your pretrained model)
HF_model = TFxxx.from_pretrained(checkpoint)

#Define your model inputs and outputs
#The inputs are in most cases the output of your model's tokenizer
#e.g. test it with tokenizer("Hello, my dog is cute", return_tensors="tf")
#adapt the model_input below if needed

#use the Input imported above; `tf` itself is not imported in this cell
input_ids = Input(shape=(None,), dtype='int32', name="input_ids")
attention_mask = Input(shape=(None,), dtype='int32', name="attention_mask")

model_input={'input_ids': input_ids, 'attention_mask': attention_mask}

#create a new Model which has model.input and model.output properties
new_model = Model(inputs=model_input, outputs=HF_model(model_input))

#create the GA model
model = GradientAccumulateModel(accum_steps=1, inputs=new_model.input, outputs=new_model.output)

#delete the names of the models that are no longer needed
del HF_model, new_model

#simply use this GradientAccumulateModel instead of the HF_model


## Example for TFEsmForSequenceClassification

In [1]:
#imports
import pandas as pd
import keras
import tensorflow as tf
import numpy as np
import random

from gradient_accumulator import GradientAccumulateModel
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, set_seed
from datasets import Dataset

#reset random state
def set_seeds(s):
    """Seed every random number generator used in this notebook with `s`.

    Seeds TensorFlow, NumPy, Python's `random` module and Hugging Face
    transformers (through `set_seed`), and enables TensorFlow op determinism.
    """
    # TensorFlow: global seed plus deterministic op implementations
    tf.random.set_seed(s)
    tf.config.experimental.enable_op_determinism()

    # NumPy, Python stdlib and transformers, in that order
    for seed_fn in (np.random.seed, random.seed, set_seed):
        seed_fn(s)

#load a model for sequence regression and the corresponding tokenizer
def load_model(checkpoint, dropout):
    """Load the tokenizer and a single-label sequence model for `checkpoint`.

    Args:
        checkpoint: model name or path handed to both `from_pretrained` calls.
        dropout: value forwarded as `classifier_dropout`.

    Returns:
        A `(tokenizer, model)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # num_labels=1 gives a single output value per sequence.
    # NOTE(review): `classifier_dropout` is passed through as a config override;
    # whether the classification head actually reads it depends on the model
    # architecture - confirm for your checkpoint.
    model_kwargs = {"num_labels": 1, "classifier_dropout": dropout}

    return tokenizer, TFAutoModelForSequenceClassification.from_pretrained(checkpoint, **model_kwargs)

#get training inputs and labels
def load_lists(path="./hf_example_data.pkl", n_samples=256):
    """Read the training sequences and labels from a pickled DataFrame.

    Args:
        path: location of the pickled pandas DataFrame. Column 0 holds the
            sequences and column 1 the labels. Defaults to the example file
            shipped with this notebook.
        n_samples: number of leading rows to keep. The default of 256 is a
            reduced size so the example runs on CPU; pass `None` to use
            every row.

    Returns:
        A `(train_sequences, train_labels)` tuple of Python lists.
    """
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    train = pd.read_pickle(path)

    # slice the rows once, then pick the two columns by position
    subset = train.iloc[:n_samples]
    train_sequences = subset.iloc[:, 0].tolist()
    train_labels = subset.iloc[:, 1].tolist()

    return train_sequences, train_labels

#create a Hugging Face Dataset
def create_dataset(tokenizer, seqs, labels):
    """Tokenize `seqs` and bundle the result with `labels` in a Hugging Face Dataset.

    Args:
        tokenizer: callable tokenizer applied to the whole list of sequences.
        seqs: list of input sequences.
        labels: values stored in the added "labels" column.

    Returns:
        The dataset built from the tokenizer output plus a "labels" column.
    """
    # max_length=64 is a reduced sequence length so the example runs on CPU
    encodings = tokenizer(seqs, truncation=True, padding=True, max_length=64)

    return Dataset.from_dict(encodings).add_column("labels", labels)

#create a tf dataset
def tf_dataset(model, tokenizer, dataset, batch, seed, shuffle=True):
    """Turn a Hugging Face Dataset into a tf dataset via `model.prepare_tf_dataset`.

    Args:
        model: object providing `prepare_tf_dataset`.
        tokenizer: tokenizer forwarded to `prepare_tf_dataset`.
        dataset: the Hugging Face Dataset to convert.
        batch: batch size of the resulting dataset.
        seed: TensorFlow global seed set right before the conversion
            (presumably to make the shuffling reproducible).
        shuffle: forwarded to `prepare_tf_dataset`.

    Returns:
        Whatever `model.prepare_tf_dataset` returns.
    """
    tf.random.set_seed(seed)

    options = dict(batch_size=batch, shuffle=shuffle, tokenizer=tokenizer)
    return model.prepare_tf_dataset(dataset, **options)

#training function
def run_experiment(checkpoint, dropout=0.2, batch=8, accum=1, epoch=10, lr=2e-5, seed=42, GA=True):
    """Fine-tune `checkpoint` on the example data, optionally with gradient accumulation.

    Args:
        checkpoint: model name or path passed to `load_model`.
        dropout: forwarded to `load_model`.
        batch: batch size of the training dataset.
        accum: `accum_steps` for `GradientAccumulateModel` (only used when `GA` is True).
        epoch: number of epochs passed to `fit`.
        lr: learning rate of the Adam optimizer.
        seed: seed for `set_seeds` and for building the tf dataset.
        GA: when True the HF model is wrapped in a `GradientAccumulateModel`;
            when False the HF model is trained directly.

    Returns:
        None. Training progress is printed by `fit`.
    """
    # start from a seeded, clean TensorFlow state
    set_seeds(seed)
    tf.keras.backend.clear_session()
    tf.compat.v1.reset_default_graph()

    tokenizer, model = load_model(checkpoint, dropout)
    model.summary()

    # build the training pipeline with the original HF model
    sequences, targets = load_lists()
    train_batches = tf_dataset(model, tokenizer, create_dataset(tokenizer, sequences, targets), batch, seed)

    if GA:
        # symbolic inputs matching the tokenizer output
        model_input = {
            key: tf.keras.Input(shape=(None,), dtype='int32', name=key)
            for key in ("input_ids", "attention_mask")
        }

        # functional model that exposes .input / .output for the GA wrapper
        functional_model = Model(inputs=model_input, outputs=model(model_input))

        # from here on `model` is the gradient accumulation model
        model = GradientAccumulateModel(accum_steps=accum, inputs=functional_model.input, outputs=functional_model.output)
        del functional_model

    # Transformer models are often compiled without specifying a loss.
    # That does not work here: look up the model's internal loss function
    # and specify it explicitly.
    optimizer = tf.keras.optimizers.Adam(lr)
    model.compile(optimizer=optimizer, loss=tf.keras.losses.MeanSquaredError())

    # train model
    model.fit(train_batches, epochs=epoch)

    # save results here if needed

    # reset tf backend
    del model
    tf.keras.backend.clear_session()
    tf.compat.v1.reset_default_graph()


In [2]:
#Baseline: train the ESM2 model directly, without GradientAccumulateModel (batch size 8)
run_experiment(
    "facebook/esm2_t6_8M_UR50D",
    batch=8,
    accum=1,
    epoch=5,
    lr=2e-5,
    GA=False,
)

Some layers from the model checkpoint at facebook/esm2_t6_8M_UR50D were not used when initializing TFEsmForSequenceClassification: ['lm_head', 'esm/contact_head/regression/kernel:0', 'esm/contact_head/regression/bias:0']
- This IS expected if you are initializing TFEsmForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFEsmForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFEsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_esm_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 esm (TFEsmMainLayer)        multiple                  7409008   
                                                                 
 classifier (TFEsmClassifica  multiple                 103041    
 tionHead)                                                       
                                                                 
Total params: 7,512,049
Trainable params: 7,512,049
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [3]:
#Train the ESM2 model wrapped in GradientAccumulateModel (batch size 8, accum=1)
run_experiment(
    "facebook/esm2_t6_8M_UR50D",
    batch=8,
    accum=1,
    epoch=5,
    lr=2e-5,
    GA=True,
)

Some layers from the model checkpoint at facebook/esm2_t6_8M_UR50D were not used when initializing TFEsmForSequenceClassification: ['lm_head', 'esm/contact_head/regression/kernel:0', 'esm/contact_head/regression/bias:0']
- This IS expected if you are initializing TFEsmForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFEsmForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFEsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_esm_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 esm (TFEsmMainLayer)        multiple                  7409008   
                                                                 
 classifier (TFEsmClassifica  multiple                 103041    
 tionHead)                                                       
                                                                 
Total params: 7,512,049
Trainable params: 7,512,049
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [4]:
#Train the equivalent ESM2 model with GradientAccumulateModel: batch size 4, accum=2 (4 x 2 = 8)
run_experiment(
    "facebook/esm2_t6_8M_UR50D",
    batch=4,
    accum=2,
    epoch=5,
    lr=2e-5,
    GA=True,
)

Some layers from the model checkpoint at facebook/esm2_t6_8M_UR50D were not used when initializing TFEsmForSequenceClassification: ['lm_head', 'esm/contact_head/regression/kernel:0', 'esm/contact_head/regression/bias:0']
- This IS expected if you are initializing TFEsmForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFEsmForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFEsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_esm_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 esm (TFEsmMainLayer)        multiple                  7409008   
                                                                 
 classifier (TFEsmClassifica  multiple                 103041    
 tionHead)                                                       
                                                                 
Total params: 7,512,049
Trainable params: 7,512,049
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [5]:
#Train the equivalent ESM2 model with GradientAccumulateModel: batch size 2, accum=4 (2 x 4 = 8)
run_experiment(
    "facebook/esm2_t6_8M_UR50D",
    batch=2,
    accum=4,
    epoch=5,
    lr=2e-5,
    GA=True,
)

Some layers from the model checkpoint at facebook/esm2_t6_8M_UR50D were not used when initializing TFEsmForSequenceClassification: ['lm_head', 'esm/contact_head/regression/kernel:0', 'esm/contact_head/regression/bias:0']
- This IS expected if you are initializing TFEsmForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFEsmForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFEsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_esm_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 esm (TFEsmMainLayer)        multiple                  7409008   
                                                                 
 classifier (TFEsmClassifica  multiple                 103041    
 tionHead)                                                       
                                                                 
Total params: 7,512,049
Trainable params: 7,512,049
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
