# NOTE:
# Run this notebook in a TPU-enabled environment.
# Recommended: a Kaggle TPU-enabled kernel (TPU v3-8), which comes with most of the required packages.
# Results may differ slightly between runs, but the leaderboard position is maintained.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from category_encoders import OrdinalEncoder

import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, Dropout 
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.models import Model

# from kaggle_datasets import KaggleDatasets

import transformers
from transformers import BertTokenizer, BertConfig, TFBertModel
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer

import re
import os
import gc

In [24]:
# used to get AdamW optimizer
!pip install -q tf-models-official==2.3.0

In [3]:
from official import nlp
import official.nlp.optimization

### Input Path to Data

In [5]:
DATA_PATH = "../input/instadeep-enzyme"

In [6]:
df = pd.read_csv(os.path.join(DATA_PATH, "Train.csv"))
df_test = pd.read_csv(os.path.join(DATA_PATH, "Test.csv"))
sample_submission = pd.read_csv(os.path.join(DATA_PATH, "SampleSubmission.csv"))

In [7]:
df.shape, df_test.shape

((858777, 4), (253146, 3))

In [8]:
# Put a single space between consecutive amino-acid letters
# (joining over a string joins its individual characters).
df["SEQUENCE"] = df["SEQUENCE"].map(" ".join)
df_test["SEQUENCE"] = df_test["SEQUENCE"].map(" ".join)

In [9]:
# Switch the column names of both frames to lowercase.
# The test mapping has no "LABEL" entry (the test frame has one column fewer).
df.rename(
    columns = {"SEQUENCE_ID": "sequence_id", "SEQUENCE": "sequence", "CREATURE": "creature", "LABEL": "label"},
    inplace = True,
)

df_test.rename(
    columns = {"SEQUENCE_ID": "sequence_id", "SEQUENCE": "sequence", "CREATURE": "creature"},
    inplace = True,
)

In [10]:
# Drop duplicates
df = df.drop_duplicates(subset = ['sequence', 'label'], keep = 'first')

### Encode labels

In [11]:
# Explicit ordinal mapping "class<k>" -> k for k = 0..19, applied to the
# "label" column; the encoder returns a NumPy array (return_df = False).
le = OrdinalEncoder(
    cols = ["label"],
    return_df = False,
    mapping = [
        {
            "col": "label",
            "mapping": {f"class{class_id}": class_id for class_id in range(20)},
        }
    ],
)

In [27]:
df["label"] = le.fit_transform(df.label)[:,0]

In [13]:
sequences = df.sequence.values
test_sequences = df_test.sequence.values

In [14]:
labels = df.label.values

In [15]:
del df

In [16]:
gc.collect()

102

# Connect to TPU

In [17]:
def connect_to_tpu():
    """Return a ``tf.distribute`` strategy, preferring a TPU when one is reachable.

    Tries to connect to a TPU cluster and wrap it in a ``TPUStrategy``. If that
    raises ``ValueError``, a ``MirroredStrategy`` is created instead. The number
    of replicas in sync is printed in either case.

    Returns:
        The distribution strategy that was created.
    """
    try:
        # Resolve the TPU cluster, then build the strategy on top of it.
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
        distribution = tf.distribute.TPUStrategy(resolver)

        print('Running on TPU ', resolver.master())
    except ValueError:
        # No TPU could be set up: fall back to the mirrored strategy.
        distribution = tf.distribute.MirroredStrategy()

    print("REPLICAS: ", distribution.num_replicas_in_sync)

    return distribution

In [18]:
strategy = connect_to_tpu()

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


# Model Configurations

In [19]:
# Let tf.data pick the prefetch buffer size automatically
AUTO = tf.data.experimental.AUTOTUNE

# Data access (disabled: the datasets below are built from in-memory arrays)
# GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Number of passes over the training data
EPOCHS = 1
# Global batch size: 16 per replica, i.e. 128 on the 8-replica TPU
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# Token length every sequence is truncated to and the model inputs are built for
MAX_LEN = 384
# Identifier of the pretrained checkpoint used for tokenizer, config and model
MODEL = "Rostlab/prot_bert_bfd"

## Split data into train/validation

In [20]:
# Stratified 80/20 train/validation split with a fixed random seed.
train_sequences, valid_sequences, train_labels, valid_labels = train_test_split(
    sequences,
    labels,
    test_size = 0.2,
    shuffle = True,
    stratify = labels,
    random_state = 42,
)

In [28]:
train_sequences.shape, valid_sequences.shape

In [22]:
del sequences
del labels

In [23]:
gc.collect()

100

# Encode data

### Encoder/Tokenizer

In [24]:
# Ref: https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras
def fast_encode(texts, tokenizer, chunk_size = 1024, max_len = 384):
    """Tokenize ``texts`` chunk by chunk and return all token ids as one array.

    Args:
        texts: Sliceable sequence of strings to encode.
        tokenizer: Tokenizer exposing ``enable_truncation``, ``enable_padding``
            and ``encode_batch``. Its truncation and padding settings are
            changed in place by this call.
        chunk_size: Number of texts handed to ``encode_batch`` per call.
        max_len: Length every encoded row is truncated and padded to.

    Returns:
        ``np.ndarray`` holding one row of ``max_len`` token ids per text.
    """
    tokenizer.enable_truncation(max_length=max_len)
    # Pad to a fixed length. Without ``length`` the tokenizer pads only to the
    # longest sequence of each ``encode_batch`` call, so different chunks could
    # come out with different widths and the rows could not be stacked into a
    # rectangular array matching the (max_len,) model inputs.
    tokenizer.enable_padding(length=max_len)
    all_ids = []

    for start in tqdm(range(0, len(texts), chunk_size)):
        encs = tokenizer.encode_batch(texts[start:start + chunk_size])
        all_ids.extend(enc.ids for enc in encs)
    gc.collect()

    return np.array(all_ids)

In [26]:
# Load the pretrained tokenizer that belongs to MODEL
tokenizer = BertTokenizer.from_pretrained(MODEL)

# Save the tokenizer files into the working directory; the ./vocab.txt read
# below is expected to be written by this call
tokenizer.save_pretrained('.')

# Reload the vocabulary with the `tokenizers` WordPiece implementation;
# lowercase=False leaves the upper-case amino-acid letters untouched
fast_tokenizer = BertWordPieceTokenizer('./vocab.txt', lowercase=False)
fast_tokenizer

In [26]:
# The pretrained model expects the rare amino acids U, Z, O and B to be
# replaced by X, see https://huggingface.co/Rostlab/prot_bert_bfd

rare_amino_acids = re.compile(r"[UZOB]")
train_sequences = [rare_amino_acids.sub("X", sequence) for sequence in train_sequences]
valid_sequences = [rare_amino_acids.sub("X", sequence) for sequence in valid_sequences]
test_sequences = [rare_amino_acids.sub("X", sequence) for sequence in test_sequences]

In [27]:
gc.collect()

80

### Encode

In [29]:
# Token ids of the training sequences, truncated to MAX_LEN tokens
input_ids_train = fast_encode(train_sequences, fast_tokenizer, max_len = MAX_LEN)
# Attention mask: 1 where the token id is non-zero, 0 where it equals 0
# (0 is the pad_token_id reported by the model config)
attention_masks_train = (input_ids_train != 0) * 1

In [29]:
del train_sequences
gc.collect()

20

In [30]:
# Token ids of the validation sequences, truncated to MAX_LEN tokens
input_ids_valid = fast_encode(valid_sequences, fast_tokenizer, max_len = MAX_LEN)
# Attention mask: 1 where the token id is non-zero, 0 where it equals 0
# (0 is the pad_token_id reported by the model config)
attention_masks_valid = (input_ids_valid != 0) * 1

In [31]:
del valid_sequences
gc.collect()

20

In [31]:
# Token ids of the test sequences, truncated to MAX_LEN tokens
input_ids_test = fast_encode(test_sequences, fast_tokenizer, max_len = MAX_LEN)
# Attention mask: 1 where the token id is non-zero, 0 where it equals 0
# (0 is the pad_token_id reported by the model config)
attention_masks_test = (input_ids_test != 0) * 1

In [33]:
del test_sequences
gc.collect()

20

# Modelling

In [38]:
def build_model(transformer_layer, optimizer, max_len = 384, num_classes = 20):
    """Build and compile a sequence classifier on top of a transformer encoder.

    The hidden state at position 0 of the encoder's last layer is fed into a
    single softmax ``Dense`` layer.

    Args:
        transformer_layer: Callable encoder that accepts the token ids plus an
            ``attention_mask`` keyword and returns an object with a
            ``last_hidden_state`` attribute.
        optimizer: Optimizer handed to ``Model.compile``.
        max_len: Fixed length of the ``input_ids`` and ``attention_mask`` inputs.
        num_classes: Width of the softmax output layer. Defaults to 20, the
            value that used to be hard-coded.

    Returns:
        The compiled Keras ``Model`` with inputs named ``input_ids`` and
        ``attention_mask``.
    """
    input_word_ids = Input(shape = (max_len,), dtype = tf.int32, name = "input_ids")
    input_masks = Input(shape = (max_len,), dtype = tf.int32, name = "attention_mask")

    encoded = transformer_layer(input_word_ids, attention_mask = input_masks)

    # Hidden state of the first token of every sequence (presumably the [CLS]
    # token added by the tokenizer) serves as the sequence representation.
    cls_token = encoded.last_hidden_state[:, 0, :]

    probabilities = Dense(num_classes, activation = 'softmax')(cls_token)

    model = Model(inputs = [input_word_ids, input_masks], outputs = probabilities)

    # The output layer already applies softmax and the labels are integer
    # class ids, hence the sparse categorical cross-entropy on probabilities.
    model.compile(
        optimizer,
        loss = SparseCategoricalCrossentropy(),
        metrics = ['accuracy'],
        steps_per_execution = 1,
    )

    return model

### Config

In [32]:
config = BertConfig.from_pretrained(MODEL)

In [40]:
config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 40000,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 30,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.2.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30
}

In [41]:
# NOTE(review): `config` is never passed to TFBertModel.from_pretrained(MODEL),
# so this override does not appear to affect the model that is trained.
config.max_position_embeddings = 384

### Optimizer w/ LR and scheduling

In [42]:
# Batches needed to see every training example once (rounded up)
steps_per_epoch = int(np.ceil(len(input_ids_train) / BATCH_SIZE))
# Total number of training steps over all epochs, given to the optimizer factory
num_train_steps = steps_per_epoch * EPOCHS
# Fraction of the training steps used as warmup steps
warmup_prop = 0.1
warmup_steps = int(warmup_prop * num_train_steps)

In [43]:
# High-level wrapper that creates an AdamW optimizer together with its
# learning-rate schedule.
# NOTE(review): 5e-5 is presumably the learning rate reached after the warmup
# steps, which then decays to end_lr -- confirm against official.nlp.optimization.
optimizer = nlp.optimization.create_optimizer(
    5e-5, 
    num_train_steps = num_train_steps, 
    num_warmup_steps = warmup_steps, 
    end_lr = 0.0, 
    optimizer_type = "adamw"
)

In [44]:
type(optimizer)

official.nlp.optimization.AdamWeightDecay

In [45]:
warmup_steps

574

In [33]:
# connect_to_tpu() # For model reruns within same runtime OR restart session to clear TPU memory

### Prepare data in required format

In [None]:
# Training pipeline. A single pass is shuffled and batched first and repeated
# afterwards, so each of the EPOCHS passes yields exactly
# ceil(len(train) / BATCH_SIZE) batches -- the steps_per_epoch given to fit().
# Repeating before batching would merge the partial final batches and could
# leave fewer than steps_per_epoch * EPOCHS batches when EPOCHS > 1; it would
# also let the shuffle buffer mix examples across epoch boundaries.
train_data = (
    tf.data.Dataset
    .from_tensor_slices(
        (
            {
                "input_ids": input_ids_train,
                "attention_mask": attention_masks_train,
            },
            train_labels,
        )
    )
    .shuffle(2048, seed = 42)
    .batch(BATCH_SIZE)
    .repeat(EPOCHS)
    .prefetch(AUTO)
)

print("Done")

# Validation pipeline: fixed order, cached after the first pass because it is
# re-used unchanged at the end of every epoch.
valid_data = (
    tf.data.Dataset
    .from_tensor_slices(
        (
            {
                "input_ids": input_ids_valid,
                "attention_mask": attention_masks_valid,
            },
            valid_labels,
        )
    )
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

print("Done")

# Test pipeline: features only (one-element tuple), kept in file order so that
# predictions line up with the submission rows.
test_data = (
    tf.data.Dataset
    .from_tensor_slices(
        (
            {
                "input_ids": input_ids_test,
                "attention_mask": attention_masks_test,
            },
        )
    )
    .batch(BATCH_SIZE)
)

print("Done")

### Instantiate model within strategy scope

In [34]:
# Load the pretrained encoder and build/compile the classifier inside the
# distribution strategy's scope; `config` from above is not passed here
with strategy.scope():
    transformer_layer = TFBertModel.from_pretrained(MODEL)
    
    model = build_model(transformer_layer, optimizer, max_len = MAX_LEN)
    
model.summary()

In [35]:
# Train for EPOCHS epochs of steps_per_epoch batches each, with valid_data
# passed as the validation set; the returned history object is kept
train_history = model.fit(
    train_data,
    steps_per_epoch = steps_per_epoch,
    validation_data = valid_data,
    epochs = EPOCHS,
)

### Test predictions

In [36]:
test_predictions = model.predict(test_data, verbose = 1)

In [49]:
submission = sample_submission.copy()

In [50]:
# Turn the predicted class probabilities into "class<k>" label strings.
# Item assignment is used on purpose: attribute-style assignment
# (submission.LABEL = ...) only updates a column that already exists and
# otherwise just sets a plain Python attribute, leaving the frame unchanged.
submission["LABEL"] = np.argmax(test_predictions, axis = 1)
submission["LABEL"] = 'class' + submission["LABEL"].astype(str)

In [37]:
submission

In [52]:
submission.to_csv("final_submission.csv", index = False)