# Import libraries

In [1]:
!pip install --quiet pytorch_lightning

[0m

In [2]:
from pprint import pprint
import pandas as pd
from tqdm.notebook import tqdm
from datasets import load_dataset, DatasetDict
from sklearn.utils import shuffle
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch.utils.data import Dataset, DataLoader
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup
import copy
import torch
from torch.optim import AdamW
from torchmetrics.text import MatchErrorRate, BLEUScore

pl.seed_everything(42)

Seed set to 42


42

# Prepare the datasetT5ForConditionalGeneration

### Creating directories

In [3]:
DIR="t5"

!mkdir -p "{DIR}/dataset"
!mkdir -p "{DIR}/model"
!mkdir -p "{DIR}/tokenizer"

!ls -r "{DIR}"

tokenizer  model  dataset


### Getting SQUAD dataset

In [4]:
ds_name = "squad"
ds = load_dataset(ds_name)

train_testvalid = ds['train'].train_test_split(test_size=0.2, seed=42)

# Split the 20% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

# gather everyone if you want to have a single DatasetDict
ds_ = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

Downloading builder script:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
pprint(ds_)

{'test': Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 8760
}),
 'train': Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 70079
}),
 'valid': Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 8760
})}


### Visualizing a sample

In [6]:
sample_valid_dataset = next(iter(ds_["valid"]))
pprint(sample_valid_dataset)

{'answers': {'answer_start': [38], 'text': ['Britain and France']},
 'context': 'News of this arrived in Europe, where Britain and France '
            'unsuccessfully attempted to negotiate a solution. The two nations '
            'eventually dispatched regular troops to North America to enforce '
            'their claims. The first British action was the assault on Acadia '
            'on 16 June 1755 in the Battle of Fort Beauséjour, which was '
            'immediately followed by their expulsion of the Acadians. In July '
            'British Major General Edward Braddock led about 2,000 army troops '
            'and provincial militia on an expedition to retake Fort Duquesne, '
            'but the expedition ended in disastrous defeat. In further action, '
            'Admiral Edward Boscawen fired on the French ship Alcide on 8 June '
            '1755, capturing it and two troop ships. In September 1755, French '
            'and British troops met in the inconclusive Batt

### Parsing the datasets into a Pandas Dataframe

#### Creating empty DataFrames

In [7]:
pd.set_option("display.max_colwidth", None)

columns=["context", "answer", "question"]

df_train = pd.DataFrame(columns=columns)
df_validation = pd.DataFrame(columns=columns)
df_test = pd.DataFrame(columns=columns)

#### Defining populate_dataframe function 

In [8]:
def populate_dataframe(dataset: any, dataframe: pd.DataFrame, type_dataset:str) -> tuple:
    
    print(f"Populating {type_dataset} dataset...")

    count_long = 0
    count_short = 0

    for index, val in enumerate(tqdm(dataset)):
        context = val["context"]
        question = val["question"]
        answer = val["answers"]["text"][0]
        num_of_words = len(answer.split())
        if num_of_words >= 7:
            count_long += 1
            continue
        else:
            dataframe.loc[count_short] = [context] + [answer] + [question]
            count_short += 1

    print(f"Count long answers on {type_dataset} dataset: {count_long}")
    print(f"Count short answers on {type_dataset} dataset: {count_short}")

#### Populating train dataset

In [9]:
populate_dataframe(dataset=ds_["train"], dataframe=df_train, type_dataset="train")

Populating train dataset...


  0%|          | 0/70079 [00:00<?, ?it/s]

Count long answers on train dataset: 7176
Count short answers on train dataset: 62903


In [10]:
print(f"number of instances: {df_train.shape}")

number of instances: (62903, 3)


#### Populating validation dataset

In [11]:
populate_dataframe(dataset=ds_["valid"], dataframe=df_validation, type_dataset="validation")

Populating validation dataset...


  0%|          | 0/8760 [00:00<?, ?it/s]

Count long answers on validation dataset: 843
Count short answers on validation dataset: 7917


In [12]:
print(f"number of instances: {df_validation.shape}")

number of instances: (7917, 3)


#### Populating test dataset

In [13]:
populate_dataframe(dataset=ds_["test"], dataframe=df_test, type_dataset="test")

Populating test dataset...


  0%|          | 0/8760 [00:00<?, ?it/s]

Count long answers on test dataset: 916
Count short answers on test dataset: 7844


In [14]:
print(f"number of instances: {df_test.shape}")

number of instances: (7844, 3)


In [15]:
df_train = shuffle(df_train)
df_validation = shuffle(df_validation)
df_test = shuffle(df_test)

print(f"Train dataframe shape: {df_train.shape}")
print(f"Validation dataframe shape: {df_validation.shape}")
print(f"Test dataframe shape: {df_test.shape}")

Train dataframe shape: (62903, 3)
Validation dataframe shape: (7917, 3)
Test dataframe shape: (7844, 3)


In [16]:
df_train.head()

Unnamed: 0,context,answer,question
12678,"By 1640, the town's theocratic government and nine-square grid plan were in place, and the town was renamed Newhaven from Quinnipiac. However, the area north of New Haven remained Quinnipiac until 1678, when it was renamed Hamden. The settlement became the headquarters of the New Haven Colony. At the time, the New Haven Colony was separate from the Connecticut Colony, which had been established to the north centering on Hartford. One of the principal differences between the two colonies was that the New Haven colony was an intolerant theocracy that did not permit other churches to be established, while the Connecticut colony permitted the establishment of other churches.",Hamden,In 1678 what was the new name of the Northern part of New Haven?
48986,"By 1885, a new summer retreat was contemplated. That summer, the Bells had a vacation on Cape Breton Island in Nova Scotia, spending time at the small village of Baddeck. Returning in 1886, Bell started building an estate on a point across from Baddeck, overlooking Bras d'Or Lake. By 1889, a large house, christened The Lodge was completed and two years later, a larger complex of buildings, including a new laboratory, were begun that the Bells would name Beinn Bhreagh (Gaelic: beautiful mountain) after Bell's ancestral Scottish highlands.[N 21] Bell also built the Bell Boatyard on the estate, employing up to 40 people building experimental craft as well as wartime lifeboats and workboats for the Royal Canadian Navy and pleasure craft for the Bell family. An enthusiastic boater, Bell and his family sailed or rowed a long series of vessels on Bras d'Or Lake, ordering additional vessels from the H.W. Embree and Sons boatyard in Port Hawkesbury, Nova Scotia. In his final, and some of his most productive years, Bell split his residency between Washington, D.C., where he and his family initially resided for most of the year, and at Beinn Bhreagh where they spent increasing amounts of time.",40,How many people worked for the Bell Boatyard?
8406,"Both Locke and Rousseau developed social contract theories in Two Treatises of Government and Discourse on Inequality, respectively. While quite different works, Locke, Hobbes, and Rousseau agreed that a social contract, in which the government's authority lies in the consent of the governed, is necessary for man to live in civil society. Locke defines the state of nature as a condition in which humans are rational and follow natural law; in which all men are born equal and with the right to life, liberty and property. However, when one citizen breaks the Law of Nature, both the transgressor and the victim enter into a state of war, from which it is virtually impossible to break free. Therefore, Locke said that individuals enter into civil society to protect their natural rights via an ""unbiased judge"" or common authority, such as courts, to appeal to. Contrastingly, Rousseau's conception relies on the supposition that ""civil man"" is corrupted, while ""natural man"" has no want he cannot fulfill himself. Natural man is only taken out of the state of nature when the inequality associated with private property is established. Rousseau said that people join into civil society via the social contract to achieve unity while preserving individual freedom. This is embodied in the sovereignty of the general will, the moral and collective legislative body constituted by citizens.",Rousseau,Who wrote Discourse on Inequality?
2592,"During World War II, Imperial Japan invaded most of the former western colonies. The Shōwa occupation regime committed violent actions against civilians such as the Manila massacre and the implementation of a system of forced labour, such as the one involving 4 to 10 million romusha in Indonesia. A later UN report stated that four million people died in Indonesia as a result of famine and forced labour during the Japanese occupation. The Allied powers who defeated Japan in the South-East Asian theatre of World War II then contended with nationalists to whom the occupation authorities had granted independence.",4 to 10 million,"According to the UN report, what was the count of people who perished due to famine?"
55669,"Among Christians, the Pew Research survey found that 74% were Protestant, 25% were Catholic, and 1% belonged to other Christian denominations, including a small Orthodox Christian community. In terms of Nigeria's major ethnic groups, the Hausa ethnic group (predominant in the north) was found to be 95% Muslim and 5% Christian, the Yoruba tribe (predominant in the west) was 55% Muslim, 35% Christian and 10% adherents of other religions, while the Igbos (predominant in the east) and the Ijaw (south) were 98% Christian, with 2% practising traditional religions. The middle belt of Nigeria contains the largest number of minority ethnic groups in Nigeria, who were found to be mostly Christians and members of traditional religions, with a small proportion of Muslims.",1%,How many Nigerian Christians are Orthodox and other sects?


In [17]:
df_validation.head()

Unnamed: 0,context,answer,question
3968,"Dog behavior is the internally coordinated responses (actions or inactions) of the domestic dog (individuals or groups) to internal and/or external stimuli. As the oldest domesticated species, with estimates ranging from 9,000–30,000 years BCE, the minds of dogs inevitably have been shaped by millennia of contact with humans. As a result of this physical and social evolution, dogs, more than any other species, have acquired the ability to understand and communicate with humans and they are uniquely attuned to our behaviors. Behavioral scientists have uncovered a surprising set of social-cognitive abilities in the otherwise humble domestic dog. These abilities are not possessed by the dog's closest canine relatives nor by other highly intelligent mammals such as great apes. Rather, these skills parallel some of the social-cognitive skills of human children.",humans.,Dogs are very well attuned to what other species' behaviors?
2013,"Midway through the 19th century, the focus of geology shifted from description and classification to attempts to understand how the surface of the Earth had changed. The first comprehensive theories of mountain building were proposed during this period, as were the first modern theories of earthquakes and volcanoes. Louis Agassiz and others established the reality of continent-covering ice ages, and ""fluvialists"" like Andrew Crombie Ramsay argued that river valleys were formed, over millions of years by the rivers that flow through them. After the discovery of radioactivity, radiometric dating methods were developed, starting in the 20th century. Alfred Wegener's theory of ""continental drift"" was widely dismissed when he proposed it in the 1910s, but new data gathered in the 1950s and 1960s led to the theory of plate tectonics, which provided a plausible mechanism for it. Plate tectonics also provided a unified explanation for a wide range of seemingly unrelated geological phenomena. Since 1970 it has served as the unifying principle in geology.",fluvialists,What group did Andrew Crombie Ramsay belong to?
7367,"High speed Internet connectivity has become more widely available at a reasonable cost and the cost of video capture and display technology has decreased. Consequently, personal videoconferencing systems based on a webcam, personal computer system, software compression and broadband Internet connectivity have become affordable to the general public. Also, the hardware used for this technology has continued to improve in quality, and prices have dropped dramatically. The availability of freeware (often as part of chat programs) has made software based videoconferencing accessible to many.",chat programs,Videoconferencing freeware is widely available in what programs?
3084,"Victoria was pleased when Gladstone resigned in 1885 after his budget was defeated. She thought his government was ""the worst I have ever had"", and blamed him for the death of General Gordon at Khartoum. Gladstone was replaced by Lord Salisbury. Salisbury's government only lasted a few months, however, and Victoria was forced to recall Gladstone, whom she referred to as a ""half crazy & really in many ways ridiculous old man"". Gladstone attempted to pass a bill granting Ireland home rule, but to Victoria's glee it was defeated. In the ensuing election, Gladstone's party lost to Salisbury's and the government switched hands again.",1885,When did Gladstone resign?
2434,"Within a population, it is common for different ages and/or sexes to have different patterns of timing and distance. Female chaffinches Fringilla coelebs in Eastern Fennoscandia migrate earlier in the autumn than males do.",autumn,When do the chaffinches Fringilla coelebs migrate?


In [18]:
df_test.head()

Unnamed: 0,context,answer,question
1824,"Incandescent light bulbs come in a range of shapes and sizes. The names of the shapes may be slightly different in some regions. Many of these shapes have a designation consisting of one or more letters followed by one or more numbers, e.g. A55 or PAR38. The letters represent the shape of the bulb. The numbers represent the maximum diameter, either in 1⁄8 of an inch, or in millimeters, depending on the shape and the region. For example, 63 mm reflectors are designated R63, but in the US, they are known as R20 (2.5 in). However, in both regions, a PAR38 reflector is known as PAR38.",the maximum diameter,What do the numbers identify in a bulb shape designation?
3160,"In 885 AD the Armenians reestablished themselves as a sovereign kingdom under the leadership of Ashot I of the Bagratid Dynasty. A considerable portion of the Armenian nobility and peasantry fled the Byzantine occupation of Bagratid Armenia in 1045, and the subsequent invasion of the region by Seljuk Turks in 1064. They settled in large numbers in Cilicia, an Anatolian region where Armenians were already established as a minority since Roman times. In 1080, they founded an independent Armenian Principality then Kingdom of Cilicia, which became the focus of Armenian nationalism. The Armenians developed close social, cultural, military, and religious ties with nearby Crusader States, but eventually succumbed to Mamluk invasions. In the next few centuries, Djenghis Khan, Timurids, and the tribal Turkic federations of the Ak Koyunlu and the Kara Koyunlu ruled over the Armenians.",the Bagratid Dynasty,What dynasty was Ashot I part of?
394,"Timber was the chief building material during the Han dynasty; it was used to build palace halls, multi-story residential towers and halls and single-story houses. Because wood decays rapidly, the only remaining evidence of Han wooden architecture is a collection of scattered ceramic roof tiles. The oldest surviving wooden halls in China date to the Tang dynasty (618–907 AD). Architectural historian Robert L. Thorp points out the scarcity of Han-era archaeological remains, and claims that often unreliable Han-era literary and artistic sources are used by historians for clues about lost Han architecture.",907 AD,What is considered to be the last year of the Tang dynasty?
6115,"Federalism also finds expression in ecclesiology (the doctrine of the church). For example, presbyterian church governance resembles parliamentary republicanism (a form of political federalism) to a large extent. In Presbyterian denominations, the local church is ruled by elected elders, some of which are ministerial. Each church then sends representatives or commissioners to presbyteries and further to a general assembly. Each greater level of assembly has ruling authority over its constituent members. In this governmental structure, each component has some level of sovereignty over itself. As in political federalism, in presbyterian ecclesiology there is shared sovereignty.",authority over its constituent members,What does each greater level of assembly have?
7453,"The first legislation providing federal authority for regulating pesticides was enacted in 1910; however, decades later during the 1940s manufacturers began to produce large amounts of synthetic pesticides and their use became widespread. Some sources consider the 1940s and 1950s to have been the start of the ""pesticide era."" Although the U.S. Environmental Protection Agency was established in 1970 and amendments to the pesticide law in 1972, pesticide use has increased 50-fold since 1950 and 2.3 million tonnes (2.5 million short tons) of industrial pesticides are now[when?] used each year. Seventy-five percent of all pesticides in the world are used in developed countries, but use in developing countries is increasing. A study of USA pesticide use trends through 1997 was published in 2003 by the National Science Foundation's Center for Integrated Pest Management.",USA,Trends about use of pesticides have been published from which country?


#### Saving datasets into csv files

In [19]:
ds_path = "t5/dataset"
train_ds_file = "squad_train.csv"
validation_ds_file = "squad_validation.csv"
test_ds_file = "squad_test.csv"

df_train.to_csv(f"{ds_path}/{train_ds_file}", index=False)
df_validation.to_csv(f"{ds_path}/{validation_ds_file}", index=False)
df_test.to_csv(f"{ds_path}/{test_ds_file}", index=False)

# Define the model

Checking GPU availability

In [20]:
!nvidia-smi

Sun Oct 15 21:44:10 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A4000    Off  | 00000000:00:05.0 Off |                  Off |
| 41%   52C    P8    17W / 140W |      1MiB / 16376MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Defining class to handle the dataset

In [6]:
class QuestionGenerationDataset(Dataset):
        def __init__(self, tokenizer, file_path, nrows=1000, max_len_input=512, max_len_output=96):
            self.file_path = file_path
            self.context = "context"
            self.answer = "answer"
            self.question = "question"
            self.data = pd.read_csv(self.file_path, nrows=nrows)
            self.max_len_input = max_len_input
            self.max_len_output = max_len_output
            self.tokenizer = tokenizer
            self.inputs = []
            self.plain_text_inputs = []
            self.targets = []
            self.plain_text_target = []
            self.skippedcount = 0
            self._build()
            
        def __len__(self):
            return len(self.inputs)
        
        def __getitem__(self, idx):
            source_ids = self.inputs[idx]["input_ids"].squeeze()
            target_ids = self.targets[idx]["input_ids"].squeeze()
            
            src_mask = self.inputs[idx]["attention_mask"].squeeze()
            target_mask = self.targets[idx]["attention_mask"].squeeze()
    
            labels = copy.deepcopy(target_ids)
            labels[labels == 0] = -100
            
            return {
                "source_ids": source_ids,
                "source_mask": src_mask,
                "target_ids": target_ids,
                "target_mask": target_mask,
                "labels": labels,
                "plain_text_inputs": self.plain_text_inputs[idx],
                "plain_text_target": self.plain_text_target[idx]
            }
        
        def _build(self):
            for idx in tqdm(range(len(self.data))):
                context = self.data.loc[idx, self.context]
                answer = self.data.loc[idx, self.answer]
                target = self.data.loc[idx, self.question]
                
                input_ = f"context: {context} answer: {answer}"
                target_ = f"question: {(str(target))}"
            
                test_input_encoding = self.tokenizer.encode_plus(input_,
                                                                 truncation=False,
                                                                 return_tensors="pt")
                
                length_of_input_encoding = len(test_input_encoding["input_ids"][0])
                
                if length_of_input_encoding > self.max_len_input:
                    self.skippedcount += 1
                    continue
                
                tokenized_inputs = self.tokenizer.batch_encode_plus([input_], 
                                                                    max_length=self.max_len_input, 
                                                                    padding="max_length",
                                                                    return_tensors="pt")
                
                tokenized_targets = self.tokenizer.batch_encode_plus([target_],
                                                                    max_length=self.max_len_output,
                                                                    padding="max_length",
                                                                    return_tensors="pt")
                
                self.inputs.append(tokenized_inputs)
                self.plain_text_inputs.append(input_)
                self.targets.append(tokenized_targets)
                self.plain_text_target.append(target_)

### Defining the model

In [7]:
class T5Model(pl.LightningModule):
    
    def __init__(self, 
                 hyper_params, 
                 model, 
                 tokenizer,
                 train_dataset,
                 validation_dataset, 
                 test_dataset):
        
        super(T5Model, self).__init__()
        self.hyper_params = hyper_params
        self.model = model
        self.tokenizer = tokenizer
        self.train_dataset = train_dataset
        self.validation_dataset = validation_dataset
        self.test_dataset = test_dataset
        self.targets = []
        self.predictions = []
        self.mer_scores_per_epoch = []
        
    def forward(self,
                input_ids, 
                attention_mask=None, 
                decoder_input_ids=None, 
                decoder_attention_mask=None, 
                labels=None):
        
        outputs = self.model(input_ids=input_ids,
                             attention_mask=attention_mask,
                             decoder_attention_mask=decoder_attention_mask,
                             labels=labels)
        
        return outputs
    
    def training_step(self, batch, batch_idx):
        outputs = self.forward(input_ids=batch["source_ids"],
                              attention_mask=batch["source_mask"],
                              decoder_input_ids=batch["target_ids"],
                              decoder_attention_mask=batch["target_mask"],
                              labels=batch["labels"])
        
        loss = outputs[0]
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True, batch_size=self.hyper_params["batch_size"])
        return loss
    
    def validation_step(self, batch, batch_idx):
        outputs = self.forward(input_ids=batch["source_ids"],
                              attention_mask=batch["source_mask"],
                              decoder_input_ids=batch["target_ids"],
                              decoder_attention_mask=batch["target_mask"],
                              labels=batch["labels"])
        
        loss = outputs[0]
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True, batch_size=self.hyper_params["batch_size"])
        
        # Get prediction to be calculated
        beam_outputs = self.model.generate(input_ids=batch["source_ids"],
                                           attention_mask=batch["source_mask"],
                                           max_length=72,
                                           early_stopping=True,
                                           num_beams=5,
                                           num_return_sequences=1)
        
        for beam_output in beam_outputs:
            prediction_sentence = self.tokenizer.decode(beam_output,
                                    skip_special_tokens=True,
                                    clean_up_tokenization_spaces=True)
        
            self.predictions.append(prediction_sentence)
        
        self.targets = batch["plain_text_target"] 
        return loss
    
    def test_step(self, batch, batch_idx):
        outputs = self.forward(input_ids=batch["source_ids"],
                              attention_mask=batch["source_mask"],
                              decoder_input_ids=batch["target_ids"],
                              decoder_attention_mask=batch["target_mask"],
                              labels=batch["labels"])
        
        loss = outputs[0]
        self.log("test_loss", loss, on_step=False, on_epoch=True, prog_bar=True, batch_size=1)
        
        # Get prediction to be calculated
        beam_outputs = self.model.generate(input_ids=batch["source_ids"],
                                           attention_mask=batch["source_mask"],
                                           max_length=72,
                                           early_stopping=True,
                                           num_beams=5,
                                           num_return_sequences=1)
        prediction_sentences = []
        
        for beam_output in beam_outputs:
        
            prediction_sentence = self.tokenizer.decode(beam_output,
                                    skip_special_tokens=True,
                                    clean_up_tokenization_spaces=True)
            
            prediction_sentences.append(prediction_sentence)
  
        mer = MatchErrorRate()
        mer_score = mer(prediction_sentences, batch["plain_text_target"])
        self.log("test_mer_score", mer_score, on_step=False, on_epoch=True, prog_bar=True, batch_size=1)
        
        bleu = BLEUScore()
        bleu_score = bleu(prediction_sentences, [[i] for i in batch["plain_text_target"]])
        self.log("test_bleu_score", bleu_score, on_step=False, on_epoch=True, prog_bar=True, batch_size=1)
        
        return loss
    
    def on_validation_epoch_end(self):
        mer = MatchErrorRate()
        mer_score = mer(self.predictions, self.targets)
        self.log("val_mer_score", mer_score, on_step=False, on_epoch=True, prog_bar=True)
        
        bleu = BLEUScore()
        bleu_score = bleu(self.predictions, [[i] for i in self.targets])
        self.log("val_bleu_score", bleu_score, on_step=False, on_epoch=True, prog_bar=True)
        
        self.targets = []
        self.predictions = []
    
    def train_dataloader(self):
        return DataLoader(self.train_dataset, 
                          batch_size=self.hyper_params["batch_size"], 
                          num_workers=self.hyper_params["num_workers"])
    
    def val_dataloader(self):
        return DataLoader(self.validation_dataset, 
                          batch_size=self.hyper_params["batch_size"], 
                          num_workers=self.hyper_params["num_workers"])
    
    def test_dataloader(self):
        return DataLoader(self.test_dataset, 
                          batch_size=1, 
                          num_workers=self.hyper_params["num_workers"])
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), 
                          lr=self.hyper_params["learning_rate"], eps=self.hyper_params["epsilon"])
        return optimizer
    
        

### Training the model

In [12]:
config = {
    "batch_size": 4,
    "num_workers": 4,
    "learning_rate": 2e-5,
    "epsilon": 1e-8,
    "max_epochs": 10,
    "max_len_input": 512, 
    "max_len_output": 96,
    "num_rows_train": 12000,
    "num_rows_validation": 1200,
    "num_rows_test": 1200,
    "train_file_path": "t5/dataset/squad_train.csv",
    "validation_file_path": "t5/dataset/squad_validation.csv",
    "test_file_path": "t5/dataset/squad_test.csv",
    "model_file_path": "t5/model",
    "tokenizer_file_path": "t5/tokenizer"
}

t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")
model_config = T5Config(decoder_start_token_id=t5_tokenizer.convert_tokens_to_ids(['<pad>'])[0])
t5_model = T5ForConditionalGeneration(model_config)

train_dataset = QuestionGenerationDataset(tokenizer=t5_tokenizer,
                                          file_path=config["train_file_path"],
                                          max_len_input=config["max_len_input"],
                                          max_len_output=config["max_len_output"],
                                          nrows=config["num_rows_train"])

validation_dataset = QuestionGenerationDataset(tokenizer=t5_tokenizer, 
                                               file_path=config["validation_file_path"],
                                               max_len_input=config["max_len_input"],
                                               max_len_output=config["max_len_output"],     
                                               nrows=config["num_rows_validation"])

test_dataset = QuestionGenerationDataset(tokenizer=t5_tokenizer, 
                                               file_path=config["test_file_path"],
                                               max_len_input=config["max_len_input"],
                                               max_len_output=config["max_len_output"],     
                                               nrows=config["num_rows_test"])

model = T5Model(hyper_params=config,
               model=t5_model,
               tokenizer=t5_tokenizer,
               train_dataset=train_dataset,
               validation_dataset=validation_dataset,
               test_dataset=test_dataset)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


  0%|          | 0/12000 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (648 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

In [90]:
trainer = pl.Trainer(max_epochs=config["max_epochs"],
                     accelerator="auto",
                     callbacks=[EarlyStopping(monitor="val_loss", mode="min")])

trainer.fit(model)

print("Saving model")
model.model.save_pretrained(config["model_file_path"])
t5_tokenizer.save_pretrained(config["tokenizer_file_path"])

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


Saving model


('t5/tokenizer/tokenizer_config.json',
 't5/tokenizer/special_tokens_map.json',
 't5/tokenizer/spiece.model',
 't5/tokenizer/added_tokens.json')

### Testing the model

In [91]:
trainer.test(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 4.341996192932129,
  'test_mer_score': 0.8047617077827454,
  'test_bleu_score': 0.014201955869793892}]

In [93]:
experiments = {
    "Experiment": ["exp_1", "exp_2", "exp_3", "exp_4", "exp_5"],
    "Num of epochs": [10, 10, 10, 10, 10],
    "Data points": ["Train: 1000, Val: 100, Test: 100", 
                    "Train: 1000, Val: 100, Test: 100", 
                    "Train: 1000, Val: 100, Test: 100", 
                    "Train: 1000, Val: 100, Test: 100",
                    "Train: 20000, Val: 2000, Test: 2000"],
    "batch_size": [4, 16, 4, 4, 4],
    "learning_rate": [3e-4, 3e-4, 2e-5, 1e-4, 2e-5],
    "max_len_input": [512, 512, 512, 256, 512],
    "test_loss": [5.391, 5.697, 6.223, 5.638, 4.341],
    "test_mer_score": [0.787, 0.819, 0.792, 0.814, 0.804],
    "test_bleu_score": [0.0091, 0.0080, 0.0164, 0.0081, 0.01420],
}

pd.DataFrame(experiments)

Unnamed: 0,Experiment,Num of epochs,Data points,batch_size,learning_rate,max_len_input,test_loss,test_mer_score,test_bleu_score
0,exp_1,10,"Train: 1000, Val: 100, Test: 100",4,0.0003,512,5.391,0.787,0.0091
1,exp_2,10,"Train: 1000, Val: 100, Test: 100",16,0.0003,512,5.697,0.819,0.008
2,exp_3,10,"Train: 1000, Val: 100, Test: 100",4,2e-05,512,6.223,0.792,0.0164
3,exp_4,10,"Train: 1000, Val: 100, Test: 100",4,0.0001,256,5.638,0.814,0.0081
4,exp_5,10,"Train: 20000, Val: 2000, Test: 2000",4,2e-05,512,4.341,0.804,0.0142


In [13]:
test_config = {
    "trained_model_path": "t5/model",
    "trained_tokenizer": "t5/tokenizer"
}

saved_model = T5ForConditionalGeneration.from_pretrained(test_config["trained_model_path"])
saved_tokenizer = T5Tokenizer.from_pretrained(test_config["trained_tokenizer"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device: ", device)
test_model = saved_model.to(device)

Device:  cuda


In [14]:
def set_input_structure(context, answer):
    return f"context: {context} answer: {answer} </s>"


def generate_questions(model, tokenizer, context, answer):

    text = set_input_structure(context, answer)
    encoding = saved_tokenizer.encode_plus(text, max_length=512, padding="max_length", return_tensors="pt")

    input_ids, attention_mask = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    test_model.eval()
    beam_outputs = model.generate(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  max_length=72,
                                  early_stopping=True,
                                  num_beams=5,
                                  num_return_sequences=3)

    print("Generating questions from context:", context)
    
    for beam_output in beam_outputs:
        sent = tokenizer.decode(beam_output,
                                skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)
        print(sent)
        print("answer:", answer)
        
    print("-" * 30)
        
        
test_data = [
    { 
        "context": "The Social Dilemma, released in 2020, talks about the rise of social media and how it affects society’s minds as a whole. It is a docudrama film directed by Jeff Orlowski and covered various social media aspects talking about how it manipulates individuals’ minds if not used correctly.", 
        "answer": "The Social Dilemma"
    },
    { 
        "context": "It is more than likely that you will not see references to training, validation, and test datasets in modern applied machine learning.", 
        "answer": "training, validation, and test datasets"
    }
]

for i in test_data:
    generate_questions(test_model, saved_tokenizer, i["context"], i["answer"])



Generating questions from context: The Social Dilemma, released in 2020, talks about the rise of social media and how it affects society’s minds as a whole. It is a docudrama film directed by Jeff Orlowski and covered various social media aspects talking about how it manipulates individuals’ minds if not used correctly.
question: What is the name of the a name of the film's name?
answer: The Social Dilemma
question: What is the name of the a name of the film's film?
answer: The Social Dilemma
question: What is the name of the a name of the film's film'?
answer: The Social Dilemma
------------------------------
Generating questions from context: It is more than likely that you will not see references to training, validation, and test datasets in modern applied machine learning.
question: What is the name of a term for s?
answer: training, validation, and test datasets
question: What is the name of the a term for s?
answer: training, validation, and test datasets
question: What is the na

In [108]:
#!tar -czvf custom_model__10_16_2023.tar.gz lightning_logs t5 requirements.txt T5-Model-QG-Custom.ipynb

lightning_logs/
lightning_logs/version_8/
lightning_logs/version_8/events.out.tfevents.1697501095.nbz72pdid3.59.13
lightning_logs/version_8/hparams.yaml
lightning_logs/version_8/checkpoints/
lightning_logs/version_8/checkpoints/epoch=9-step=49840.ckpt
lightning_logs/version_8/events.out.tfevents.1697511609.nbz72pdid3.59.14
t5/
t5/tokenizer/
t5/tokenizer/tokenizer_config.json
t5/tokenizer/special_tokens_map.json
t5/tokenizer/spiece.model
t5/model/
t5/model/pytorch_model.bin
t5/model/config.json
t5/dataset/
t5/dataset/squad_validation.csv
t5/dataset/squad_train.csv
t5/dataset/squad_test.csv
requirements.txt
T5-Model-QG-Custom.ipynb


In [15]:
def generate_questions_from_text(model, tokenizer, text):
    
    encoding = saved_tokenizer.encode_plus(text, max_length=512, padding="max_length", return_tensors="pt")

    input_ids, attention_mask = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    test_model.eval()
    beam_outputs = model.generate(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  max_length=72,
                                  early_stopping=True,
                                  num_beams=5,
                                  num_return_sequences=3)
    
    preds = []
    
    for beam_output in beam_outputs:
        pred = tokenizer.decode(beam_output,
                                skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)
        preds.append(pred)
    
    return preds


evaluation_data = []

for data_point in range(0, 10):
    print(f"{data_point}.- Context and Answer", "-"*100)
    text = test_dataset[data_point]["plain_text_inputs"]
    print(text)
    print("Target", "-"*111)
    print("\t", test_dataset[data_point]["plain_text_target"])
    print("Text Generated", "-"*103)
    preds = generate_questions_from_text(model=test_model, tokenizer=saved_tokenizer, text=text)
    for idx, val in enumerate(preds):
        
        print("\t", f"{idx}.- {val}")
        
    print("\n")

0.- Context and Answer ----------------------------------------------------------------------------------------------------
context: The third war of the Diadochi broke out because of the growing power and ambition of Antigonus. He began removing and appointing satraps as if he were king and also raided the royal treasuries in Ectabana, Persepolis and Susa, making off with 25,000 talents. Seleucus was forced to flee to Egypt and Antigonus was soon at war with Ptolemy, Lysimachus, and Cassander. He then invaded Phoenicia, laid siege to Tyre, stormed Gaza and began building a fleet. Ptolemy invaded Syria and defeated Antigonus' son, Demetrius Poliorcetes, in the Battle of Gaza of 312 BC which allowed Seleucus to secure control of Babylonia, and the eastern satrapies. In 310, Cassander had young King Alexander IV and his mother Roxane murdered, ending the Argead Dynasty which had ruled Macedon for several centuries. answer: 25,000
Target ---------------------------------------------------