<a href="https://colab.research.google.com/github/and-rgr/contradiction_and_entailment/blob/main/dce_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [2]:
# for RoBERTa
# !pip install sentencepiece

In [3]:
import tensorflow as tf
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
pd.set_option('display.expand_frame_repr', False)

In [5]:
# set up the TPU
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

print('Number of replicas:', strategy.num_replicas_in_sync)

Number of replicas: 1


In [6]:
# load data
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

na_train = len(train) - len(train.dropna())
na_test = len(test) - len(test.dropna())

print("check for missing rows:", "\n\ttrain data:", na_train, ", test data:", na_test, "\n")

# reduce data - optional
train = train.iloc[:1000]
test = test.iloc[:200]

print("train data shape: ", train.shape)
print("test data shape: ", test.shape)

check for missing rows: 
	train data: 0 , test data: 0 

train data shape:  (1000, 6)
test data shape:  (200, 5)


In [7]:
# model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# model_name = 'albert-base-v2'

#model_name = 'distilroberta-base'
# loss: 1.1003 - accuracy: 0.3487 - val_loss: 1.0984 - val_accuracy: 0.3650

#model_name = 'distilgpt2'
# Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token`

model_name = 'bert-base-uncased'

# model_name = "joeddav/xlm-roberta-large-xnli"
# roberta is more likely to run out of RAM

In [8]:
# from transformers import TFAutoModel,AutoTokenizer
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

# learning_rate = 1e-5
# batch_size = 32
# warmup = 600
# max_seq_length = 128
# num_train_epochs = 3.0

tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [9]:
# is .batch_encode_plus needed here?
# tf_batch = tokenizer()

max_len = 120

tf_batch = tokenizer.batch_encode_plus(
    train[['premise','hypothesis']].values.tolist(),
    padding='max_length',
    truncation=True,
    max_length=max_len,
    return_attention_mask=True,
    # return_tensors="tf"
)

In [10]:
# other ways to make a dictionary of tensors?

train_tf1=tf.convert_to_tensor(tf_batch['input_ids'],dtype=tf.int32)
train_tf2=tf.convert_to_tensor(tf_batch['attention_mask'],dtype=tf.int32)
train_input={'input_word_ids':train_tf1,'input_mask':train_tf2}

In [11]:
### TOKENIZATION DONE ###

In [12]:
# # what does this do?
# # why training = True?
# tf_outputs = tf_model(tf_batch, training = True)

In [13]:
# tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)

In [14]:
# If you have labels, you can provide them to the model, it will return a tuple with the loss and the final activations.
# tf_outputs = tf_model(tf_batch, labels = tf.constant([1, 0]))

In [15]:
### DistilBERT ###

In [16]:
with strategy.scope():
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")

    tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
    # tf_model = TFAutoModel.from_pretrained(model_name)

    embedding = tf_model([input_word_ids, input_mask])[0]
    output = tf.keras.layers.Dense(3, activation = 'softmax')(embedding)

    model = tf.keras.Model(inputs = [input_word_ids,input_mask], outputs = output)

    model.compile(optimizer = tf.keras.optimizers.Adam(lr = 1e-5),
                  loss = 'sparse_categorical_crossentropy',
                  metrics = ['accuracy'])
    
    model.summary()

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super(Adam, self).__init__(name, **kwargs)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 120)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 120)]        0           []                               
                                                                                                  
 tf_bert_for_sequence_classific  TFSequenceClassifie  109483778  ['input_word_ids[0][0]',         
 ation (TFBertForSequenceClassi  rOutput(loss=None,               'input_mask[0][0]']             
 fication)                      logits=(None, 2),                                                 
                                 hidden_states=None                                           

In [17]:
type(tf_model([input_word_ids, input_mask]))

transformers.modeling_tf_outputs.TFSequenceClassifierOutput

In [18]:
len(tf_model([input_word_ids, input_mask]))

1

In [19]:
tf_model([input_word_ids, input_mask])[0]

<KerasTensor: shape=(None, 2) dtype=float32 (created by layer 'tf_bert_for_sequence_classification')>

In [20]:
early_stop = tf.keras.callbacks.EarlyStopping(patience = 4, restore_best_weights = True, verbose = 1, monitor = 'val_accuracy')

In [21]:
# model.fit(train_input, train.label.values, epochs = 3, verbose = 1, batch_size = None, validation_split = 0.2, callbacks=[early_stop])

#*# is it necessary for train_input to be a dictionary of tensors?

model.fit(train_input, train.label.values, epochs = 10, verbose = 1, batch_size = 16*strategy.num_replicas_in_sync, validation_split = 0.2, callbacks=[early_stop])

Epoch 1/10
Epoch 2/10
11/50 [=====>........................] - ETA: 13:51 - loss: 1.1002 - accuracy: 0.3750

KeyboardInterrupt: ignored