<a href="https://colab.research.google.com/github/and-rgr/contradiction_and_entailment/blob/main/contradiction_and_entailment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Detecting Contradiction and Entailment
# Learning project for NLP and TensorFlow
# Based on the following notebook: tinyurl.com/hu66vtby

In [2]:
### LIBRARIES AND FUNCTIONS ###

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import datetime
import pandas as pd
import tensorflow as tf
from tensorflow.keras import regularizers
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
import sentencepiece

In [6]:
## Set up the TPU
# Try to attach to a TPU. If resolving it raises ValueError (presumably because
# this runtime has no TPU), fall back to the default distribution strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.experimental.TPUStrategy is deprecated; the stable
    # tf.distribute.TPUStrategy takes the same cluster resolver.
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy()  # for CPU and single GPU

print('Number of replicas:', strategy.num_replicas_in_sync)

Number of replicas: 1


In [7]:
def history_log(filename, training_history=None):
    """Write per-epoch training metrics to ``<filename>.csv``.

    The file starts with the header ``loss, accuracy, val_loss, val_accuracy``
    followed by one line per epoch, each value rounded to 4 decimals.

    Args:
        filename: Output path without extension; ".csv" is appended.
        training_history: Object exposing a ``history`` dict with the keys
            'accuracy', 'loss', 'val_accuracy' and 'val_loss' (each a
            per-epoch sequence). When omitted, the notebook-level ``history``
            variable is used, which keeps existing one-argument calls working.

    Raises:
        NameError: ``training_history`` is omitted and no global ``history`` exists.
        KeyError: one of the four metrics is missing from the history dict.
    """
    if training_history is None:
        # Backward-compatible fallback to the notebook-level variable.
        training_history = history

    metrics = training_history.history
    accuracy = metrics['accuracy']
    loss = metrics['loss']
    val_accuracy = metrics['val_accuracy']
    val_loss = metrics['val_loss']

    # Build every row before opening the file so that a malformed history
    # does not leave a half-written log behind.
    rows = ['loss, accuracy, val_loss, val_accuracy']
    for epoch in range(len(accuracy)):
        values = (loss[epoch], accuracy[epoch], val_loss[epoch], val_accuracy[epoch])
        rows.append(", ".join(str(round(value, 4)) for value in values))

    with open(filename + ".csv", 'w') as f:
        f.writelines(row + "\n" for row in rows)

In [8]:
def hyperparameter_log(filename):
    """Dump the notebook-level hyperparameters to ``<filename>.txt``.

    One ``name = value`` line is written per hyperparameter, in a fixed order.
    The values are read from the notebook's global variables at call time.

    Args:
        filename: Output path without extension; ".txt" is appended.
    """
    with open(f"{filename}.txt", 'w') as log_file:
        log_file.write(f"model_name = {model_name!s}\n")
        log_file.write(f"tokenizer_length = {tokenizer_length!s}\n")
        log_file.write(f"learning_rate = {learning_rate!s}\n")
        log_file.write(f"batch_factor = {batch_factor!s}\n")
        log_file.write(f"validation_split = {validation_split!s}\n")
        log_file.write(f"kernel_initializer = {kernel_initializer!s}\n")
        log_file.write(f"epochs = {epochs!s}\n")
        log_file.write(f"patience = {patience!s}\n")
        log_file.write(f"dropout = {dropout!s}\n")
        log_file.write(f"optimizer = {optimizer!s}\n")

In [9]:
def gpu_info():
    gpu_info = !nvidia-smi
    gpu_info = '\n'.join(gpu_info)
    if gpu_info.find('failed') >= 0:
        print('Not connected to a GPU')
    else:
        print(gpu_info)

In [10]:
# Show the nvidia-smi report for this runtime (or a notice if no GPU is attached).
gpu_info()

Fri Jun  3 20:42:41 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    31W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [11]:
### LOAD DATA ###

In [12]:
# Read the full table, then separate the feature columns from the 'label' column.
dataset = pd.read_csv('dataset.csv')
train = dataset.drop('label', axis=1)
train_labels = dataset.loc[:, 'label']

In [13]:
# Sanity check: features and labels should have the same number of rows.
for caption, table in (("training data shape: \t", train),
                       ("training labels shape: \t", train_labels)):
    print(caption, table.shape)

training data shape: 	 (12120, 5)
training labels shape: 	 (12120,)


In [14]:
### SET HYPERPARAMETERS ###

In [15]:
# These hyperparameters were tuned on a high-RAM GPU Colab runtime.

In [16]:
# Checkpoint name passed to both AutoTokenizer and TFAutoModelForSequenceClassification.
model_name = "joeddav/xlm-roberta-large-xnli"
# Fixed sequence length: used as the tokenizer's max_length (pad/truncate)
# and as the shape of the two Keras inputs.
tokenizer_length = 100
# Step size handed to the Adamax optimizer below.
learning_rate =  1e-5
# Per-replica batch size; model.fit multiplies it by strategy.num_replicas_in_sync.
batch_factor = 24
# Fraction of the training data that model.fit holds out for validation.
validation_split = 0.2
# Initializer for the final Dense classification layer.
kernel_initializer = "lecun_normal"
# Upper bound on training epochs (early stopping may end training sooner).
epochs = 20
# Number of epochs without val_accuracy improvement tolerated by EarlyStopping.
patience = 7
# Rate of the Dropout layer in the model.
dropout = 0.015
# Optimizer instance passed to model.compile.
optimizer = tf.keras.optimizers.Adamax(learning_rate = learning_rate)

In [17]:
### TOKENIZATION ###

In [18]:
# Length statistics per text column, measured with len() on each cell
# (character counts, assuming the cells are strings).
# premise average length: 107
# hypothesis average length: 54
for column in ('premise', 'hypothesis'):
    cell_lengths = train[column].apply(len)
    print(f'min length of "{column}" column:', cell_lengths.min())
    print(f'average length of "{column}" column:', round(cell_lengths.mean()))
    print(f'max length of "{column}" column:', cell_lengths.max())

min length of "premise" column: 4
average length of "premise" column: 107
max length of "premise" column: 967
min length of "hypothesis" column: 4
average length of "hypothesis" column: 54
max length of "hypothesis" column: 276


In [19]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Each row becomes a [premise, hypothesis] pair that is encoded together.
sentence_pairs = train[['premise', 'hypothesis']].values.tolist()

# Pad or truncate every pair to exactly `tokenizer_length` tokens.
encoding_options = dict(
    padding='max_length',
    truncation=True,
    max_length=tokenizer_length,
    return_attention_mask=True,
)
train_batch = tokenizer.batch_encode_plus(sentence_pairs, **encoding_options)

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [20]:
# Convert the token ids and the attention mask to int32 tensors, then key them
# by the names of the Keras Input layers they feed.
train_tf1, train_tf2 = (
    tf.convert_to_tensor(train_batch[field], dtype=tf.int32)
    for field in ('input_ids', 'attention_mask')
)
train_input = dict(input_word_ids=train_tf1, input_mask=train_tf2)

In [21]:
### BUILD & TRAIN MODEL ###

In [22]:
# Build and compile the classifier inside the distribution strategy's scope.
with strategy.scope():
    # Two fixed-length int32 inputs, named to match the keys of `train_input`.
    input_word_ids = tf.keras.Input(shape=(tokenizer_length,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(tokenizer_length,), dtype=tf.int32, name="input_mask")

    tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
    # Element 0 of the model output is the logits tensor, shape (None, 3)
    # according to model.summary().
    embedding = tf_model([input_word_ids, input_mask])[0]

    # Dropout has to come BEFORE the softmax head. Applied to the softmax
    # output it zeroes and rescales class probabilities during training, so
    # the tensor fed to the cross-entropy loss is no longer a distribution.
    dropped = tf.keras.layers.Dropout(dropout)(embedding)

    # 3-way softmax head; its output is the model's final prediction.
    output = tf.keras.layers.Dense(units = 3,
                                   kernel_initializer = kernel_initializer,
                                   activation = 'softmax')(dropped)

    model = tf.keras.Model(inputs = [input_word_ids, input_mask], outputs = output)

    # Integer labels + softmax probabilities -> sparse categorical cross-entropy.
    model.compile(optimizer = optimizer,
                  loss = 'sparse_categorical_crossentropy',
                  metrics = ['accuracy'])

Downloading:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFXLMRobertaForSequenceClassification.

All the layers of TFXLMRobertaForSequenceClassification were initialized from the model checkpoint at joeddav/xlm-roberta-large-xnli.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForSequenceClassification for predictions without further training.


In [23]:
# Print the layer table (layer type, output shape, parameter count, connections).
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 100)]        0           []                               
                                                                                                  
 tfxlm_roberta_for_sequence_cla  TFSequenceClassifie  559893507  ['input_word_ids[0][0]',         
 ssification (TFXLMRobertaForSe  rOutput(loss=None,               'input_mask[0][0]']             
 quenceClassification)          logits=(None, 3),                                                 
                                 hidden_states=None                                           

In [24]:
# Run identifier: names the checkpoint file here and the log files written
# at the end of the notebook.
timestamp = datetime.datetime.now().strftime('%Y-%m-%d__%H-%M-%S')

# Stop once val_accuracy has not improved for `patience` epochs and restore
# the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=patience,
    restore_best_weights=True,
    verbose=1,
)

# Save to "<timestamp>.hdf5" only when val_accuracy improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=timestamp + ".hdf5",
    monitor='val_accuracy',
    save_best_only=True,
    verbose=0,
)

In [25]:
# Global batch size = per-replica batch size x number of replicas in the strategy.
global_batch_size = batch_factor * strategy.num_replicas_in_sync

# Train with a held-out validation fraction; early stopping and checkpointing
# are both driven by the callbacks defined above.
history = model.fit(
    x=train_input,
    y=train_labels,
    batch_size=global_batch_size,
    epochs=epochs,
    validation_split=validation_split,
    shuffle=True,
    callbacks=[early_stop, checkpoint],
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 15: early stopping


In [26]:
### SAVE RESULTS ###

In [27]:
# Write the metric history and the hyperparameters under the same run timestamp.
for write_log, prefix in ((history_log, "training_log_"),
                          (hyperparameter_log, "hyperparameter_log_")):
    write_log(prefix + timestamp)