# **Sequence Classification**
Sequence classification is a natural language processing task of assigning a label or class to a given text.

We shall fine-tune a BERT model using TensorFlow to identify whether two given sentences are paraphrases of each other (i.e., whether both sentences mean the same thing), using the MRPC (Microsoft Research Paraphrase Corpus) dataset.

### **1. Install and Import Required Libraries**

In [None]:
!pip install datasets transformers evaluate

In [None]:
import tensorflow as tf
import numpy as np

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset
from evaluate import load

### **2. Load Data**

In [None]:
# Fetch the MRPC configuration of the GLUE benchmark (all available splits).
raw_dataset = load_dataset(path="glue", name="mrpc")

In [None]:
# Display the loaded DatasetDict: its splits, their columns, and row counts.
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

### **3. Preprocess Data**

In [None]:
# Single source of truth for the pretrained checkpoint: the tokenizer below and
# the classification model loaded later are both built from this name.
model_checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def tokenizer_function(example):
    """Tokenize MRPC sentence pairs.

    Args:
        example: Mapping with 'sentence1' and 'sentence2' entries. Because the
            function is applied with ``batched=True`` below, each entry holds a
            batch of sentences rather than a single string.

    Returns:
        The tokenizer output for the sentence pairs, with truncation enabled.
        No padding is requested here.
    """
    first_sentences = example['sentence1']
    second_sentences = example['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)


# Apply the tokenizer to every split of the dataset, one batch at a time.
tokenized_dataset = raw_dataset.map(tokenizer_function, batched=True)

In [None]:
# Display the tokenized DatasetDict to check which columns the tokenizer added.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [None]:
SEED = 42
batch_size = 8
feature_columns = ['input_ids', 'token_type_ids', 'attention_mask']

# Seed Python, NumPy and TensorFlow before the shuffled pipeline is built, so the
# shuffle order (and the later random initialisation of the classification head)
# is repeatable across "Restart & Run All".
# NOTE(review): GPU kernels may still introduce small run-to-run differences.
tf.keras.utils.set_random_seed(SEED)

# Collates examples into TensorFlow tensors, padding them so that every
# sequence in a batch has the same length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')


def build_tf_dataset(split, shuffle):
    """Convert one split of ``tokenized_dataset`` into a batched ``tf.data.Dataset``.

    Args:
        split: Name of the split to convert ('train', 'validation' or 'test').
        shuffle: Whether to shuffle the examples; enable for training only so
            that evaluation predictions stay aligned with the stored labels.

    Returns:
        A dataset yielding (features, labels) batches of ``batch_size`` examples,
        padded per batch by ``data_collator``.
    """
    return tokenized_dataset[split].to_tf_dataset(
        columns=feature_columns,
        label_cols=['label'],
        collate_fn=data_collator,
        shuffle=shuffle,
        batch_size=batch_size,
    )


tf_train_dataset = build_tf_dataset('train', shuffle=True)
tf_validation_dataset = build_tf_dataset('validation', shuffle=False)
tf_test_dataset = build_tf_dataset('test', shuffle=False)

### **4. Define and Fine-tune the Model**

In [None]:
# Load the pretrained checkpoint with a sequence-classification head sized for
# two labels.
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)

In [None]:
num_epochs = 5
# One optimizer step per batch, so the schedule must span batches-per-epoch * epochs.
num_train_steps = len(tf_train_dataset) * num_epochs

# Decay the learning rate from 5e-5 down to 0 over the whole training run.
lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.0,
    decay_steps=num_train_steps,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)

# The loss is configured for raw logits and integer class labels.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy'],
)
history = model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=num_epochs,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### **5. Predict using the Fine-tuned Model**



In [None]:
# Run the fine-tuned model over the (unshuffled) test set and keep the logits.
test_outputs = model.predict(tf_test_dataset)
predictions = test_outputs['logits']

# The predicted class is the index of the largest logit along the last axis.
class_preds = np.argmax(predictions, axis=-1)
class_preds



array([1, 1, 1, ..., 1, 1, 1])

### **6. Compute Metrics**

In [None]:
# Load the evaluation metric registered for the MRPC configuration of GLUE.
metric = load(path="glue", config_name="mrpc")

In [None]:
# Reference labels come straight from the test split; the test pipeline was
# built with shuffle=False, so they line up with class_preds.
test_labels = tokenized_dataset['test']['label']
metric.compute(predictions=class_preds, references=test_labels)

{'accuracy': 0.8394202898550724, 'f1': 0.8834665544804374}