In [1]:
!pip install transformers
!pip install sentencepiece
!pip install datasets transformers 
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch

# Confirm that the GPU is detected

assert torch.cuda.is_available()

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
device = torch.device("cuda")

Found device: NVIDIA A100-SXM4-40GB, n_gpu: 1


In [3]:
!pip install transformers
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
print('success!')

import os
import zipfile

# Download dataset functions file
dataset_file = drive.CreateFile({'id': '1wKVeLjctNhG0E8Yf2s9a6n5FTN2mEVXy'})
dataset_file.GetContentFile('mydataset.py')
print('dataset file downloaded! (mydataset.py)')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
success!
dataset file downloaded! (mydataset.py)


In [4]:
from __future__ import print_function
from typing import List, Tuple
from tqdm import tqdm
import torch

from datasets import load_dataset
from transformers import PreTrainedTokenizer, T5ForConditionalGeneration, T5Tokenizer, AdamW, set_seed
from torch.utils.data import DataLoader
import argparse

from mydataset import Datasetid
import mydataset

In [5]:
!pip install nlpaug
# Import required libraries
import torch
import pandas as pd
import numpy as np
import re
import sys
from sklearn.model_selection import train_test_split
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import random

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
# Noise funcs 

# FOR QA ONLY
char_action = ['insert',
        'substitute',
        'delete',
        'swap',
]

def get_action(type):
    return random.choice(char_action)

def add_noise(question, p=0.7):
    """
    Augment a tweet with character-level and word-level noise.

    Args:
        tweet (str): The original tweet.
        p (float): The probability of applying the char level augmentation.

    Returns:
        str: The augmented question.
    """
    # Define a list of character-level augmentation techniques
    char_augmenters = [
        nac.KeyboardAug(aug_char_p=0.2, aug_word_p=0.2, include_special_char=False, include_numeric=False),
        nac.RandomCharAug(action=get_action("char"), aug_char_p=0.1, aug_word_p=0.1),
    ]

    # Define a list of word-level augmentation techniques
    word_augmenters = [
        naw.SpellingAug(),
        naw.SynonymAug(),
    ]

    # Randomly apply a character-level or word-level augmentation with probability p
    if random.random() < p:
        aug = random.choice(char_augmenters)
        noisy_text = aug.augment(question)
    else:
        aug = random.choice(word_augmenters)
        noisy_text = aug.augment(question)
      
    return noisy_text[0]


In [7]:
def train(model: T5ForConditionalGeneration, tokenizer: PreTrainedTokenizer, optimizer: AdamW, train_set: Datasetid, validation_set: Datasetid, num_train_epochs: int, device: str, batch_size: int, max_input_length: int = 512):
    """_summary_

    Args:
        model (T5ForConditionalGeneration): _description_
        tokenizer (PreTrainedTokenizer): _description_
        optimizer (AdamW): _description_
        train_set (Dataset): _description_
        validation_set (Dataset): _description_
        num_train_epochs (int): _description_
        device (str): _description_
        batch_size (int): _description_
    """
    my_trainset_dataloader = DataLoader(train_set, batch_size=batch_size,
                                        num_workers=10, collate_fn=lambda data: train_set.pack_minibatch(data))
    my_validation_dataloader = DataLoader(validation_set, batch_size=batch_size,
                                          num_workers=10, collate_fn=lambda data: validation_set.pack_minibatch(data))

    # set training mode on the model
    model.train()

    # model to device
    model.to(device)

    f1_old: int = 0
    for epoch in range(num_train_epochs):      
        epoch_train_loss = 0.
        for contexts,questions,answers in tqdm(my_trainset_dataloader):
            optimizer.zero_grad()

            inputs = list(map(lambda tuple: f"question:{tuple[0]}  context:{tuple[1]}", zip(questions,contexts)))
            encoded_inputs = tokenizer(
                                    inputs,
                                    padding="longest",
                                    max_length=max_input_length,
                                    truncation=True,
                                    return_tensors="pt",
                                )
            encoded_targets = tokenizer(
                                    answers,
                                    padding="longest",
                                    max_length=max_input_length,
                                    truncation=True,
                                    return_tensors="pt",
                                )

            input_ids, attention_mask = encoded_inputs.input_ids, encoded_inputs.attention_mask
            encoded_targets = encoded_targets.input_ids

            # replace padding target token id's of the labels by -100, crossEntropy skip target label == -100
            encoded_targets[encoded_targets == tokenizer.pad_token_id] = -100

            input_ids = input_ids.to(device)
            encoded_targets = encoded_targets.to(device)
            attention_mask = attention_mask.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=encoded_targets)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item() * batch_size
        print(f"epoch={epoch + 1}/{num_train_epochs}")
        print(f"\t Train loss = {epoch_train_loss/len(train_set):.4f}")

        model.eval()
        with torch.no_grad():
            model_predictions_encoded = []
            target_encoded = []
            for contexts, questions, answers in tqdm(my_validation_dataloader):
                inputs = list(map(lambda tuple: f"question: {tuple[0]}  context:{tuple[1]}", zip(
                    questions, contexts)))
                encoded_inputs = tokenizer(
                    inputs,
                    padding="longest",
                    max_length=max_input_length,
                    truncation=True,
                    return_tensors="pt",
                )
                encoded_targets = tokenizer(
                    answers,
                    padding="longest",
                    max_length=max_input_length,
                    truncation=True,
                    return_tensors="pt",
                )
                encoded_inputs, attention_mask = encoded_inputs.input_ids, encoded_inputs.attention_mask
                encoded_targets = encoded_targets.input_ids

                encoded_inputs = encoded_inputs.to(device)
                encoded_targets = encoded_targets.to(device)
                attention_mask = attention_mask.to(device)
                model_predictions = model.generate(
                    input_ids=encoded_inputs, attention_mask=attention_mask)

                model_predictions_encoded += model_predictions.tolist()
                target_encoded += encoded_targets.tolist()
        f1, exact_match = validation_set.evaluate(model_predictions_encoded, target_encoded)

        print(f"\t Validation F1 = {f1:.2f}, EM = {exact_match:.2f}")
        if f1 > f1_old :
            f1_old = f1
        model.train()

In [8]:
# Set seed
set_seed(42)

_data = load_dataset('squad')

model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
# creating the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)





  0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
data_split=['train','validation']
data_set={}
for i in data_split:
  ind=int(0.2*(_data[i].num_rows))
  data_set[i]=_data[i][:ind]
  for num in range(len(data_set[i]['question'])):
    if(num<0.15*len(data_set[i]['question'])):
      data_set[i]['question'][num]=add_noise(data_set[i]['question'][num])

In [21]:
from datasets import DatasetDict, Dataset as dst
dataset_train = dst.from_dict(data_set["train"])
dataset_validation = dst.from_dict(data_set["validation"])
data_sets=DatasetDict({'train': dataset_train,'validation': dataset_validation})

In [22]:
train_set = Datasetid(data_sets['train'], tokenizer,parser=mydataset.DatasetMap.squad)
validation_set = Datasetid(data_sets['validation'], tokenizer, parser=mydataset.DatasetMap.squad)


100%|██████████| 17519/17519 [00:01<00:00, 13777.69it/s]
100%|██████████| 2114/2114 [00:00<00:00, 12732.42it/s]


In [23]:
train(model=model,
      tokenizer=tokenizer,
      optimizer=optimizer,
      train_set=train_set,
      validation_set=validation_set,
      num_train_epochs=17, device='cuda', batch_size=16)

100%|██████████| 1095/1095 [01:40<00:00, 10.86it/s]


epoch=1/17
	 Train loss = 0.3139


100%|██████████| 397/397 [00:44<00:00,  8.84it/s]
6342it [00:00, 78096.91it/s]


	 Validation F1 = 75.27, EM = 56.43


100%|██████████| 1095/1095 [01:41<00:00, 10.83it/s]


epoch=2/17
	 Train loss = 0.3024


100%|██████████| 397/397 [00:44<00:00,  8.95it/s]
6342it [00:00, 80023.69it/s]


	 Validation F1 = 75.53, EM = 56.18


100%|██████████| 1095/1095 [01:41<00:00, 10.81it/s]


epoch=3/17
	 Train loss = 0.2849


100%|██████████| 397/397 [00:44<00:00,  9.00it/s]
6342it [00:00, 80618.62it/s]


	 Validation F1 = 75.24, EM = 56.20


100%|██████████| 1095/1095 [01:40<00:00, 10.89it/s]


epoch=4/17
	 Train loss = 0.2745


100%|██████████| 397/397 [00:44<00:00,  8.96it/s]
6342it [00:00, 78155.42it/s]


	 Validation F1 = 75.26, EM = 56.32


100%|██████████| 1095/1095 [01:41<00:00, 10.78it/s]


epoch=5/17
	 Train loss = 0.2632


100%|██████████| 397/397 [00:44<00:00,  8.99it/s]
6342it [00:00, 72784.34it/s]


	 Validation F1 = 75.74, EM = 56.80


100%|██████████| 1095/1095 [01:41<00:00, 10.79it/s]


epoch=6/17
	 Train loss = 0.2526


100%|██████████| 397/397 [00:43<00:00,  9.14it/s]
6342it [00:00, 79872.07it/s]


	 Validation F1 = 75.65, EM = 56.62


100%|██████████| 1095/1095 [01:41<00:00, 10.80it/s]


epoch=7/17
	 Train loss = 0.2420


100%|██████████| 397/397 [00:44<00:00,  8.96it/s]
6342it [00:00, 78722.10it/s]


	 Validation F1 = 75.19, EM = 55.83


100%|██████████| 1095/1095 [01:41<00:00, 10.81it/s]


epoch=8/17
	 Train loss = 0.2308


100%|██████████| 397/397 [00:44<00:00,  8.98it/s]
6342it [00:00, 77374.97it/s]


	 Validation F1 = 75.37, EM = 56.12


100%|██████████| 1095/1095 [01:41<00:00, 10.79it/s]


epoch=9/17
	 Train loss = 0.2234


100%|██████████| 397/397 [00:44<00:00,  8.90it/s]
6342it [00:00, 79017.21it/s]


	 Validation F1 = 75.68, EM = 56.50


100%|██████████| 1095/1095 [01:41<00:00, 10.79it/s]


epoch=10/17
	 Train loss = 0.2194


100%|██████████| 397/397 [00:45<00:00,  8.79it/s]
6342it [00:00, 79507.29it/s]


	 Validation F1 = 75.64, EM = 56.48


100%|██████████| 1095/1095 [01:41<00:00, 10.77it/s]


epoch=11/17
	 Train loss = 0.2076


100%|██████████| 397/397 [00:44<00:00,  8.85it/s]
6342it [00:00, 81020.10it/s]


	 Validation F1 = 75.82, EM = 56.42


100%|██████████| 1095/1095 [01:41<00:00, 10.77it/s]


epoch=12/17
	 Train loss = 0.2015


100%|██████████| 397/397 [00:43<00:00,  9.05it/s]
6342it [00:00, 79360.22it/s]


	 Validation F1 = 75.56, EM = 56.24


100%|██████████| 1095/1095 [01:41<00:00, 10.81it/s]


epoch=13/17
	 Train loss = 0.1970


100%|██████████| 397/397 [00:44<00:00,  8.95it/s]
6342it [00:00, 79006.18it/s]


	 Validation F1 = 75.68, EM = 56.32


100%|██████████| 1095/1095 [01:41<00:00, 10.77it/s]


epoch=14/17
	 Train loss = 0.1884


100%|██████████| 397/397 [00:44<00:00,  8.86it/s]
6342it [00:00, 75741.75it/s]


	 Validation F1 = 75.51, EM = 55.99


100%|██████████| 1095/1095 [01:40<00:00, 10.89it/s]


epoch=15/17
	 Train loss = 0.1819


100%|██████████| 397/397 [00:44<00:00,  8.92it/s]
6342it [00:00, 79347.67it/s]


	 Validation F1 = 75.12, EM = 55.87


100%|██████████| 1095/1095 [01:40<00:00, 10.88it/s]


epoch=16/17
	 Train loss = 0.1776


100%|██████████| 397/397 [00:44<00:00,  8.90it/s]
6342it [00:00, 78037.57it/s]


	 Validation F1 = 75.22, EM = 56.23


100%|██████████| 1095/1095 [01:40<00:00, 10.89it/s]


epoch=17/17
	 Train loss = 0.1685


100%|██████████| 397/397 [00:44<00:00,  8.97it/s]
6342it [00:00, 77554.31it/s]

	 Validation F1 = 74.97, EM = 55.85





In [24]:
from google.colab import drive

drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [25]:
torch.save(model.state_dict(), '/content/gdrive/MyDrive/T5_QA_Noisy15')