In [1]:
!pip install transformers
!pip install sentencepiece
!pip install datasets transformers 
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch

# Confirm that the GPU is detected

assert torch.cuda.is_available()

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
device = torch.device("cuda")

Found device: Tesla T4, n_gpu: 1


In [6]:
!pip install transformers
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
print('success!')

import os
import zipfile

# Download dataset functions file
dataset_file = drive.CreateFile({'id': '1EUWxxoqKlNmQNc82c3NpCsYByO3NRe-4'})
dataset_file.GetContentFile('mydataset.py')
print('dataset file downloaded! (mydataset.py)')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
success!
dataset file downloaded! (mydataset.py)


In [7]:
from __future__ import print_function
from typing import List, Tuple
from tqdm import tqdm
import torch

from datasets import load_dataset
from transformers import PreTrainedTokenizer, T5ForConditionalGeneration, T5Tokenizer, AdamW, set_seed
from torch.utils.data import DataLoader
import argparse

from mydataset import Datasetid
import mydataset

In [9]:
def train(model: T5ForConditionalGeneration, tokenizer: PreTrainedTokenizer, optimizer: AdamW, train_set: Datasetid, validation_set: Datasetid, num_train_epochs: int, device: str, batch_size: int, max_input_length: int = 512):
    """_summary_

    Args:
        model (T5ForConditionalGeneration): _description_
        tokenizer (PreTrainedTokenizer): _description_
        optimizer (AdamW): _description_
        train_set (Dataset): _description_
        validation_set (Dataset): _description_
        num_train_epochs (int): _description_
        device (str): _description_
        batch_size (int): _description_
    """
    my_trainset_dataloader = DataLoader(train_set, batch_size=batch_size,
                                        num_workers=10, collate_fn=lambda data: train_set.pack_minibatch(data))
    my_validation_dataloader = DataLoader(validation_set, batch_size=batch_size,
                                          num_workers=10, collate_fn=lambda data: validation_set.pack_minibatch(data))

    # set training mode on the model
    model.train()

    # model to device
    model.to(device)

    f1_old: int = 0
    for epoch in range(num_train_epochs):      
        epoch_train_loss = 0.
        for contexts,questions,answers in tqdm(my_trainset_dataloader):
            optimizer.zero_grad()

            inputs = list(map(lambda tuple: f"question:{tuple[0]}  context:{tuple[1]}", zip(questions,contexts)))
            encoded_inputs = tokenizer(
                                    inputs,
                                    padding="longest",
                                    max_length=max_input_length,
                                    truncation=True,
                                    return_tensors="pt",
                                )
            encoded_targets = tokenizer(
                                    answers,
                                    padding="longest",
                                    max_length=max_input_length,
                                    truncation=True,
                                    return_tensors="pt",
                                )

            input_ids, attention_mask = encoded_inputs.input_ids, encoded_inputs.attention_mask
            encoded_targets = encoded_targets.input_ids

            # replace padding target token id's of the labels by -100, crossEntropy skip target label == -100
            encoded_targets[encoded_targets == tokenizer.pad_token_id] = -100

            input_ids = input_ids.to(device)
            encoded_targets = encoded_targets.to(device)
            attention_mask = attention_mask.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=encoded_targets)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item() * batch_size
        print(f"epoch={epoch + 1}/{num_train_epochs}")
        print(f"\t Train loss = {epoch_train_loss/len(train_set):.4f}")

        model.eval()
        with torch.no_grad():
            model_predictions_encoded = []
            target_encoded = []
            for contexts, questions, answers in tqdm(my_validation_dataloader):
                inputs = list(map(lambda tuple: f"question: {tuple[0]}  context:{tuple[1]}", zip(
                    questions, contexts)))
                encoded_inputs = tokenizer(
                    inputs,
                    padding="longest",
                    max_length=max_input_length,
                    truncation=True,
                    return_tensors="pt",
                )
                encoded_targets = tokenizer(
                    answers,
                    padding="longest",
                    max_length=max_input_length,
                    truncation=True,
                    return_tensors="pt",
                )
                encoded_inputs, attention_mask = encoded_inputs.input_ids, encoded_inputs.attention_mask
                encoded_targets = encoded_targets.input_ids

                encoded_inputs = encoded_inputs.to(device)
                encoded_targets = encoded_targets.to(device)
                attention_mask = attention_mask.to(device)
                model_predictions = model.generate(
                    input_ids=encoded_inputs, attention_mask=attention_mask)

                model_predictions_encoded += model_predictions.tolist()
                target_encoded += encoded_targets.tolist()
        f1, exact_match = validation_set.evaluate(model_predictions_encoded, target_encoded)

        print(f"\t Validation F1 = {f1:.2f}, EM = {exact_match:.2f}")
        if f1 > f1_old :
            f1_old = f1
        model.train()

In [14]:
# Set seed
set_seed(42)

_data = load_dataset('squad')

model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
# creating the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)





  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
from datasets import Dataset as dst
ind=int(0.2*(_data["train"].num_rows))
_data_train = dst.from_dict(_data["train"][:ind])
ind=int(0.2*(_data["validation"].num_rows))
_data_valid = dst.from_dict(_data["validation"][:ind])

In [12]:
train_set = Datasetid(_data_train, tokenizer,parser=mydataset.DatasetMap.squad)
validation_set = Datasetid(_data_valid, tokenizer, parser=mydataset.DatasetMap.squad)


100%|██████████| 17519/17519 [00:01<00:00, 14859.72it/s]
100%|██████████| 2114/2114 [00:00<00:00, 14098.88it/s]


In [15]:
train(model=model,
      tokenizer=tokenizer,
      optimizer=optimizer,
      train_set=train_set,
      validation_set=validation_set,
      num_train_epochs=1, device='cuda', batch_size=16)

100%|██████████| 1095/1095 [04:34<00:00,  3.99it/s]


epoch=1/20
	 Train loss = 0.4392


100%|██████████| 397/397 [00:55<00:00,  7.18it/s]
6342it [00:00, 44666.33it/s]


	 Validation F1 = 76.74, EM = 58.01


100%|██████████| 1095/1095 [04:34<00:00,  3.99it/s]


epoch=2/20
	 Train loss = 0.3932


100%|██████████| 397/397 [00:55<00:00,  7.15it/s]
6342it [00:00, 86958.85it/s]


	 Validation F1 = 76.38, EM = 57.43


100%|██████████| 1095/1095 [04:35<00:00,  3.98it/s]


epoch=3/20
	 Train loss = 0.3675


100%|██████████| 397/397 [00:56<00:00,  7.01it/s]
6342it [00:00, 49792.83it/s]


	 Validation F1 = 76.78, EM = 57.82


100%|██████████| 1095/1095 [04:34<00:00,  3.99it/s]


epoch=4/20
	 Train loss = 0.3504


100%|██████████| 397/397 [00:55<00:00,  7.19it/s]
6342it [00:00, 89157.66it/s]


	 Validation F1 = 76.74, EM = 57.47


100%|██████████| 1095/1095 [04:34<00:00,  3.98it/s]


epoch=5/20
	 Train loss = 0.3294


100%|██████████| 397/397 [00:56<00:00,  7.03it/s]
6342it [00:00, 47588.87it/s]


	 Validation F1 = 77.13, EM = 58.04


100%|██████████| 1095/1095 [04:34<00:00,  3.99it/s]


epoch=6/20
	 Train loss = 0.3136


100%|██████████| 397/397 [00:55<00:00,  7.15it/s]
6342it [00:00, 86643.03it/s]


	 Validation F1 = 77.26, EM = 57.81


100%|██████████| 1095/1095 [04:34<00:00,  3.98it/s]


epoch=7/20
	 Train loss = 0.3027


100%|██████████| 397/397 [00:56<00:00,  7.06it/s]
6342it [00:00, 48450.38it/s]


	 Validation F1 = 77.09, EM = 57.96


100%|██████████| 1095/1095 [04:34<00:00,  3.99it/s]


epoch=8/20
	 Train loss = 0.2924


100%|██████████| 397/397 [00:55<00:00,  7.10it/s]
6342it [00:00, 81626.99it/s]


	 Validation F1 = 77.19, EM = 57.92


100%|██████████| 1095/1095 [04:34<00:00,  3.98it/s]


epoch=9/20
	 Train loss = 0.2775


100%|██████████| 397/397 [00:56<00:00,  7.08it/s]
6342it [00:00, 83254.65it/s]


	 Validation F1 = 77.44, EM = 58.10


100%|██████████| 1095/1095 [04:35<00:00,  3.98it/s]


epoch=10/20
	 Train loss = 0.2686


100%|██████████| 397/397 [00:55<00:00,  7.14it/s]
6342it [00:00, 85735.43it/s]


	 Validation F1 = 77.25, EM = 57.87


100%|██████████| 1095/1095 [04:35<00:00,  3.98it/s]


epoch=11/20
	 Train loss = 0.2583


100%|██████████| 397/397 [00:56<00:00,  6.97it/s]
6342it [00:00, 43745.63it/s]


	 Validation F1 = 77.58, EM = 58.39


100%|██████████| 1095/1095 [04:34<00:00,  3.98it/s]


epoch=12/20
	 Train loss = 0.2480


100%|██████████| 397/397 [00:56<00:00,  7.06it/s]
6342it [00:00, 84914.10it/s]


	 Validation F1 = 77.54, EM = 58.40


100%|██████████| 1095/1095 [04:34<00:00,  3.99it/s]


epoch=13/20
	 Train loss = 0.2358


100%|██████████| 397/397 [00:56<00:00,  7.07it/s]
6342it [00:00, 43168.04it/s]


	 Validation F1 = 77.64, EM = 58.53


100%|██████████| 1095/1095 [04:34<00:00,  3.98it/s]


epoch=14/20
	 Train loss = 0.2321


100%|██████████| 397/397 [00:56<00:00,  7.04it/s]
6342it [00:00, 85379.89it/s]


	 Validation F1 = 77.43, EM = 58.07


100%|██████████| 1095/1095 [04:34<00:00,  3.98it/s]


epoch=15/20
	 Train loss = 0.2208


100%|██████████| 397/397 [00:56<00:00,  7.06it/s]
6342it [00:00, 81945.84it/s]


	 Validation F1 = 77.47, EM = 58.06


100%|██████████| 1095/1095 [04:35<00:00,  3.98it/s]


epoch=16/20
	 Train loss = 0.2119


100%|██████████| 397/397 [00:56<00:00,  6.99it/s]
6342it [00:00, 82751.93it/s]


	 Validation F1 = 77.24, EM = 58.15


100%|██████████| 1095/1095 [04:35<00:00,  3.98it/s]


epoch=17/20
	 Train loss = 0.2083


100%|██████████| 397/397 [00:56<00:00,  6.97it/s]
6342it [00:00, 81096.18it/s]


	 Validation F1 = 77.54, EM = 57.82


100%|██████████| 1095/1095 [04:36<00:00,  3.96it/s]


epoch=18/20
	 Train loss = 0.1983


100%|██████████| 397/397 [00:58<00:00,  6.75it/s]
6342it [00:00, 86205.56it/s]


	 Validation F1 = 77.76, EM = 58.26


100%|██████████| 1095/1095 [04:35<00:00,  3.98it/s]


epoch=19/20
	 Train loss = 0.1940


100%|██████████| 397/397 [00:56<00:00,  6.98it/s]
6342it [00:00, 73872.83it/s]


	 Validation F1 = 77.34, EM = 57.62


100%|██████████| 1095/1095 [04:35<00:00,  3.98it/s]


epoch=20/20
	 Train loss = 0.1869


100%|██████████| 397/397 [00:56<00:00,  7.00it/s]
6342it [00:00, 43469.05it/s]

	 Validation F1 = 77.83, EM = 58.39



