In [1]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

  from .autonotebook import tqdm as notebook_tqdm


### In the cell below, the model can be changed, as well as the learning rate and optimiser.

Any model from https://huggingface.co/models?sort=downloads can be passed to the <i>AutoTokenizer.from_pretrained</i> and <i>AutoModelForSequenceClassification.from_pretrained</i> functions

In [2]:
# Use CPU or GPU if its detected and Pytorch was installed with Cuda-capabilities
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set seeds for reproducibility
random.seed(26)
np.random.seed(26)
torch.manual_seed(26)

# Here, models from  can be downloaded to be retrained
# On a specialised dataset. Roberta models work with no modifications, Bert based models need minor modifications
# And other models will need more modifications to be retrained using this notebook.
tokenizer = AutoTokenizer.from_pretrained("shahrukhx01/roberta-base-boolq") 

model = AutoModelForSequenceClassification.from_pretrained("shahrukhx01/roberta-base-boolq")
model.to(device) # Send the model to the GPU if we have one

learning_rate = 1e-7
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)



In [3]:
def t_split(df, frac=0.2):
    """
        Splits dataset into test and train sets.
        df: Pandas dataframe containing the data
        frac (optional): the percentage of data that is taken for the test set, defaults to 0.2 if not set.
        returns: tuple of (train_set, test_set)
    """
    
    n = int(len(df)*frac)
    
    # get random sample 
    test = df.sample(n=n, axis=0)
    
    # get everything but the test sample
    train = df.drop(index=test.index)

    return train, test

In [6]:
def encode_data(tokenizer, questions, passages, max_length):
    """
        Encode the question/passage pairs into features than can be fed to the model.
        tokenizer: AutoTokenizer class from the Transformers package
        questions: Questions to be vectorised
        passages: Text with context that answers the questions
        max_length: maximum length of text sequences, including passages and questions.
        returns: Input_ids and attention masks to be passed to the model.
    """
    input_ids = []
    attention_masks = []
    
    for question, passage in zip(questions, passages):
        encoded_data = tokenizer.encode_plus(question, passage, max_length=max_length, pad_to_max_length=True, truncation_strategy="longest_first")
        encoded_pair = encoded_data["input_ids"]
        attention_mask = encoded_data["attention_mask"]

        input_ids.append(encoded_pair)
        attention_masks.append(attention_mask)

    return np.array(input_ids), np.array(attention_masks)

# Loading data
train_df = pd.read_csv('../data/generatedQuestions4k.csv', header=0)
train_data_df, dev_data_df = t_split(train_df, frac=0.2)
# print(test_df)

passages_train = train_data_df.Recipe.values
questions_train = train_data_df.Question.values
answers_train = train_data_df.label.values.astype(int)

passages_dev = dev_data_df.Recipe.values
questions_dev = dev_data_df.Question.values
answers_dev = dev_data_df.label.values.astype(int)

# Encoding data
max_seq_length = 256
input_ids_train, attention_masks_train = encode_data(tokenizer, questions_train, passages_train, max_seq_length)
input_ids_dev, attention_masks_dev = encode_data(tokenizer, questions_dev, passages_dev, max_seq_length)

train_features = (input_ids_train, attention_masks_train, answers_train)
dev_features = (input_ids_dev, attention_masks_dev, answers_dev)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
# Building Dataloaders
batch_size = 8

train_features_tensors = [torch.tensor(feature, dtype=torch.long) for feature in train_features]
dev_features_tensors = [torch.tensor(feature, dtype=torch.long) for feature in dev_features]

train_dataset = TensorDataset(*train_features_tensors)
dev_dataset = TensorDataset(*dev_features_tensors)

train_sampler = RandomSampler(train_dataset)
dev_sampler = SequentialSampler(dev_dataset)

train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=batch_size)

<h5> In the cell above, the dataloader settings can be changed, including batch size and type of sampler (Random, Sequential), and in the cell below the number epochs can be changed and the training variables can be accessed </h5>

In [8]:
from tqdm import tqdm
epochs = 1
grad_acc_steps = 1
train_loss_values = []
dev_acc_values = []

for _ in tqdm(range(epochs), desc="Epoch"):

    # Training
    epoch_train_loss = 0 # Cumulative loss
    model.train()
    model.zero_grad()

    for step, batch in enumerate(train_dataloader):

        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        labels = batch[2].to(device)     

        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels)
#         outputs = model(input_ids, attention_mask=attention_masks, labels=labels)

        loss = outputs[0]
        loss = loss / grad_acc_steps
        epoch_train_loss += loss.item()

        loss.backward()

        if (step+1) % grad_acc_steps == 0: # Gradient accumulation is over
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Clipping gradients
            optimizer.step()
            model.zero_grad()

    epoch_train_loss = epoch_train_loss / len(train_dataloader)          
    train_loss_values.append(epoch_train_loss)

    # Evaluation
    epoch_dev_accuracy = 0 # Cumulative accuracy
    model.eval()

    for batch in dev_dataloader:

        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        labels = batch[2]

        with torch.no_grad():        
            outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)
#             outputs = model(input_ids, attention_mask=attention_masks)

        logits = outputs[0]
        logits = logits.detach().cpu().numpy()

        predictions = np.argmax(logits, axis=1).flatten()
        labels = labels.numpy().flatten()

        epoch_dev_accuracy += np.sum(predictions == labels) / len(labels)
        print(predictions)
        print(labels)

    epoch_dev_accuracy = epoch_dev_accuracy / len(dev_dataloader)
    dev_acc_values.append(epoch_dev_accuracy)
    print(epoch_dev_accuracy)


Epoch:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]

[1 0 0 1 0 1 0 0]
[1 1 1 0 1 1 0 0]
[1 0 0 0 1 0 1 1]
[1 0 0 0 0 0 0 0]
[0 0 1 1 1 0 0 0]
[0 0 1 1 1 0 0 1]
[1 0 1 0 1 1 0 1]
[1 0 0 1 0 0 0 0]
[0 0 0 0 1 0 0 0]
[0 0 0 1 1 0 1 1]
[0 0 1 0 1 0 0 0]
[0 1 1 1 1 0 0 0]
[1 1 0 1 1 1 1 1]
[0 0 1 1 0 1 1 0]
[1 0 0 0 1 1 0 0]
[0 0 1 1 1 1 1 0]
[0 0 1 0 0 0 1 0]
[0 0 0 1 1 0 0 1]
[1 1 0 0 1 1 0 1]
[0 0 1 1 1 1 1 1]
[1 1 0 1 1 0 0 0]
[0 1 0 0 1 0 1 0]
[1 0 0 0 0 0 1 1]
[1 0 1 1 0 1 0 1]
[0 0 0 0 1 0 0 0]
[0 0 1 0 0 0 1 1]
[0 1 1 1 1 0 1 0]
[1 1 1 1 1 0 0 1]
[1 0 1 0 0 0 0 0]
[0 0 0 1 1 1 1 1]
[0 1 0 1 1 0 1 1]
[0 0 0 1 0 0 1 0]
[0 1 0 1 1 1 1 1]
[0 0 0 0 0 1 0 1]
[0 1 0 1 0 0 1 1]
[0 0 1 1 1 0 1 1]
[0 1 0 1 1 0 0 0]
[0 0 0 1 0 1 1 1]
[1 1 0 1 1 0 0 1]
[0 1 1 0 0 1 0 0]
[0 0 0 0 0 0 0 1]
[0 0 0 1 1 1 0 1]
[0 1 1 0 1 0 1 1]
[0 0 1 1 1 0 0 0]
[1 1 1 1 0 1 1 0]
[1 0 0 1 1 0 1 0]
[0 0 0 1 0 0 0 0]
[1 1 1 1 1 0 0 1]
[0 1 0 1 1 0 0 1]
[0 1 1 1 0 1 1 1]
[1 0 1 0 0 0 0 1]
[1 0 0 0 0 0 1 0]
[0 1 0 0 1 0 1 1]
[0 0 1 0 1 1 0 0]
[0 0 0 0 0 0 0 0]
[0 1 0 0 0

Epoch: 100%|████████████████████████████████████████████████████████████████████████████| 1/1 [01:52<00:00, 112.89s/it]

[1 1 0 0 1 0 0 0]
[1 0 0 1 1 0 1 0]
[1 0 0 1 0 0 1 0]
[0 0 1 0 0 0 1 0]
0.557





In [9]:
epoch_dev_accuracy

0.557

In [None]:
# In case of GPU OOM errors, this may fix the problem, otherwise decrease batch size.
torch.cuda.empty_cache()

In [None]:
# Save model
torch.save(model.state_dict(), "../models/4kRobertaBool576.h5")
