<a href="https://colab.research.google.com/github/and-rgr/contradiction_and_entailment/blob/main/contradiction_and_entailment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# based on the following notebook:
# https://github.com/sukanyabag/Detecting-Contradictions-and-Entailment-in-Multilingual-Text/tree/main/Detecting%20Contradictions%20in%20Multilingual%20Text

In [2]:
!pip install transformers



In [3]:
import tensorflow as tf
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
pd.set_option('display.expand_frame_repr', False)

In [5]:
# set up the TPU
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

print('Number of replicas:', strategy.num_replicas_in_sync)

Number of replicas: 1


In [6]:
### LOAD DATA ###

In [7]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

na_train = len(train) - len(train.dropna())
na_test = len(test) - len(test.dropna())

print("check for missing rows:", "\n\ttrain data:", na_train, ", test data:", na_test, "\n")

# reduce data - optional
# train = train.iloc[:5000]
# test = test.iloc[:2000]

print("train data shape: ", train.shape)
print("test data shape: ", test.shape)

check for missing rows: 
	train data: 0 , test data: 0 

train data shape:  (12120, 6)
test data shape:  (5195, 5)


In [8]:
### SELECT MODEL ###

In [9]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

model_name = "distilbert-base-uncased-finetuned-sst-2-english"

In [10]:
### TOKENIZATION ###

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

max_len = 150

tf_batch = tokenizer.batch_encode_plus(
    train[['premise','hypothesis']].values.tolist(),
    padding='max_length',
    truncation=True,
    max_length=max_len,
    return_attention_mask=True,
    # return_tensors="tf"
)

In [12]:
train_tf1=tf.convert_to_tensor(tf_batch['input_ids'],dtype=tf.int32)
train_tf2=tf.convert_to_tensor(tf_batch['attention_mask'],dtype=tf.int32)
train_input={'input_word_ids':train_tf1,'input_mask':train_tf2}

In [13]:
### TRAIN MODEL ###

In [14]:
with strategy.scope():
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")

    tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

    # can also use .shape or .flatten
    embedding = tf_model([input_word_ids, input_mask])[0]
    output = tf.keras.layers.Dense(3, activation = 'softmax')(embedding)

    model = tf.keras.Model(inputs = [input_word_ids,input_mask], outputs = output)

    model.compile(optimizer = tf.keras.optimizers.Adam(lr = 1e-5),
                  loss = 'sparse_categorical_crossentropy',
                  metrics = ['accuracy'])
    
    model.summary()

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 150)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 150)]        0           []                               
                                                                                                  
 tf_distil_bert_for_sequence_cl  TFSequenceClassifie  66955010   ['input_word_ids[0][0]',         
 assification (TFDistilBertForS  rOutput(loss=None,               'input_mask[0][0]']             
 equenceClassification)         logits=(None, 2),                                                 
                                 hidden_states=None                                           

  super(Adam, self).__init__(name, **kwargs)


In [15]:
early_stop = tf.keras.callbacks.EarlyStopping(patience = 5, restore_best_weights = True, verbose = 1, monitor = 'val_accuracy')

In [None]:
# BERT is trained to receive data as tensors, pairs of sentences distinguished by lists of 1's and 0's, and separated by the CLS and SEP tokens

model.fit(train_input, train.label.values, epochs = 20, verbose = 1, batch_size = 32*strategy.num_replicas_in_sync, validation_split = 0.2, callbacks=[early_stop])

Epoch 1/20
Epoch 2/20

KeyboardInterrupt: ignored