In [None]:
!pip install datasets evaluate

In [None]:
from transformers import AutoTokenizer, BartModel, BartForConditionalGeneration, BartConfig, DataCollatorForSeq2Seq, T5ForConditionalGeneration
from datasets import load_dataset, Dataset
import evaluate
from accelerate import Accelerator
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch

In [None]:
# Alternative backbone kept for reference (BART fine-tuned on CNN/DailyMail):
# model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
# tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# Active setup: the T5-base checkpoint together with its matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-base")


**Freeze all layers except the last one, which is responsible for sequence generation**

In [None]:
# Print the module tree. `model.parameters` is a bound method, so printing it
# only showed the architecture as part of the method's repr.
print(model)

In [None]:
# Step 1: turn off gradients for every parameter of the network.
for frozen_param in model.parameters():
    frozen_param.requires_grad_(False)

# Step 2: re-enable gradients for the LM head only (the layer that produces the output tokens).
# NOTE(review): if this checkpoint ties lm_head.weight to the input embedding, that shared
# tensor becomes trainable as well — confirm with the per-parameter listing in the next cell.
for head_param in model.lm_head.parameters():
    head_param.requires_grad_(True)


In [None]:
# Show the trainable flag of every named parameter; remove_duplicate=False keeps
# entries that refer to the same tensor under more than one name.
for param_name, tensor in model.named_parameters(remove_duplicate=False):
    print(param_name, tensor.requires_grad)


**Download the dataset and start preprocessing it**


In [None]:
!wget https://huggingface.co/datasets/liweili/c4_200m/resolve/main/data.zip

In [None]:
!unzip data.zip

In [None]:
# Count the lines in the first shard (each line is later split into one input/target pair)

!wc -l C4_200M.tsv-00000-of-00010

In [None]:
# Read tab-separated (source, target) pairs from the first shard.
MAX_EXAMPLES = 500000  # upper bound on the number of lines taken from the shard

X = []  # model inputs: task prefix + source sentence
Y = []  # target sentences

# Explicit encoding so decoding does not depend on the platform default.
with open("C4_200M.tsv-00000-of-00010", "r", encoding="utf-8") as f:

    for i, line in enumerate(f):
        if i == MAX_EXAMPLES:
            break

        # Strip the line terminator first; otherwise every target ends with "\n",
        # which skews the length statistics and the displayed samples.
        x, y = line.rstrip("\r\n").split("\t")
        X.append("fix grammar: " + x)
        Y.append(y)


In [None]:
# Two-column frame: prefixed inputs in "X", target sentences in "Y"
dataset  = pd.DataFrame({"X": X, "Y": Y})

The vast majority of the sequences are between 0 and 500 characters long.

This matters when choosing the truncation threshold and the maximum sequence length.

In [None]:
# Peek at the first ten input/target pairs
dataset.head(10)

In [None]:
%matplotlib inline

X_lengths = dataset["X"].apply(len)

Y_lengths = dataset["Y"].apply(len)

# Get the frequency distribution of the lengths
plt.hist(X_lengths.values)
plt.hist(Y_lengths.values)

In [None]:
# Wrap the frame in a Hugging Face Dataset so the tokenizer can be mapped over it in batches
hugging_dataset = Dataset.from_pandas(dataset)


def tokenize_function(data):
    """Tokenize one batch of examples.

    Column "X" is passed as the model input text and column "Y" as the target text;
    the tokenizer is called with truncation enabled and max_length=1024.
    """
    sources = data["X"]
    targets = data["Y"]
    return tokenizer(sources, text_target=targets, truncation=True, max_length=1024)


tokenized_dataset = hugging_dataset.map(tokenize_function, batched=True)


In [None]:
# Show the columns and row count after tokenization
print(tokenized_dataset)

In [None]:
# Drop the raw text columns so only the tokenizer output remains, then have the
# dataset hand back numpy arrays when indexed.
# NOTE: rebinding the same name makes this cell fail on a second run (columns already removed).
tokenized_dataset = tokenized_dataset.remove_columns(["X", "Y"])
tokenized_dataset.set_format("numpy")


In [None]:
# 90/10 split, shuffled with a fixed seed so the split is reproducible
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)

# The "test" part of the split is used as the validation set
train_dataset = tokenized_dataset['train']
val_dataset = tokenized_dataset['test']

In [None]:
# Sanity check: features and sizes of both splits
print(train_dataset)
print(val_dataset)

In [None]:
# Collator used as collate_fn by the DataLoaders below to assemble variable-length
# examples into a batch (padded label positions show up as -100, see evaluate_gleu).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [None]:
# Both loaders use batches of 8 and the seq2seq collator. No `shuffle` argument is passed,
# so batches follow dataset order (the data was shuffled once by train_test_split).
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size = 8, collate_fn = data_collator
)

val_dataloader = torch.utils.data.DataLoader(
    val_dataset, batch_size = 8, collate_fn = data_collator
)



In [None]:
# for batch in train_dataloader:
#   break

# {k: v.shape for k, v in batch.items()}

In [None]:
# outputs = model(**batch)
# print(outputs.loss)

In [None]:
# Distribute training across multiple GPUs; from here on the wrapped model is
# reached through `model.module` (used for generate() and for saving state dicts).
model = torch.nn.DataParallel(model)

In [None]:
# Number of epochs to train in this session
epochs_to_run = 1
# Epoch index training starts from. Defaults to 0 for a fresh run; overwrite it after
# loading a checkpoint to resume. Without this default the training cell raises a
# NameError, because the only other assignment lives in commented-out resume code.
starting_epoch = 0
# number of batches between training-loss reports
batch_report_every = 300
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
# Google-BLEU (GLEU) metric, used for validation scoring and LR scheduling
metric = evaluate.load("google_bleu")

In [None]:
# checkpoint = torch.load("/kaggle/working/checkpoint_epoch_0.pth",  map_location=torch.device('cpu'))

# model.module.load_state_dict(checkpoint["model_state_dict"])
# optim.load_state_dict(checkpoint["optimizer_state_dict"])

# starting_epoch = checkpoint["epoch"] + 1

In [None]:
# Move every tensor held in the optimizer state onto the GPU (needed when the state was
# restored from a checkpoint mapped to CPU). When optim.state is empty this is a no-op.
for param_state in optim.state.values():
    for key in list(param_state):
        value = param_state[key]
        if isinstance(value, torch.Tensor):
            param_state[key] = value.cuda()
            

In [None]:
# new learning rate
# for param_group in optim.param_groups:
#     param_group["lr"] = 1e-7
    # print(param_group["lr"])
    
    # param_group["lr"] = lr=1e-6

In [None]:
def evaluate_gleu(model, dataloader, metric, tokenizer, sample_size=300):
    """
    Score sentences generated by 'model' with 'metric' on a random subset of the
    dataset underlying 'dataloader'.

    Args:
        model: seq2seq model exposing ``generate``; may be wrapped in
            ``torch.nn.DataParallel`` (the wrapped module is then used for generation).
        dataloader: DataLoader whose ``dataset``, ``batch_size`` and ``collate_fn`` are reused.
        metric: object whose ``compute(predictions=..., references=...)`` returns a dict
            with a ``"google_bleu"`` entry.
        tokenizer: tokenizer providing ``pad_token_id`` and ``batch_decode``.
        sample_size: number of examples to sample without replacement; capped at the
            dataset size.

    Returns:
        The unweighted mean of the per-batch ``"google_bleu"`` scores.

    Raises:
        ValueError: if there is nothing to evaluate (empty dataset or sample_size <= 0).
    """
    model.eval()

    # Works for a bare model as well as one wrapped in DataParallel.
    generator = getattr(model, "module", model)
    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(generator.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    dataset = dataloader.dataset
    # Sampling without replacement fails if more items are requested than exist.
    sample_size = min(sample_size, len(dataset))
    if sample_size <= 0:
        raise ValueError("evaluate_gleu needs at least one example to score")

    # sample indices without replacement (no duplicates); plain Python ints for the sampler
    indices = np.random.choice(len(dataset), size=sample_size, replace=False).tolist()
    random_dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=dataloader.batch_size,
        sampler=torch.utils.data.SubsetRandomSampler(indices),
        # reuse the caller's collate function rather than a notebook-level global
        collate_fn=dataloader.collate_fn,
    )

    scores = []
    pad_index = tokenizer.pad_token_id
    with torch.no_grad():
        for batch in random_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            input_ids = batch["input_ids"]

            # Longest un-padded input in the batch; the generated sentence should not
            # differ vastly in length from its input, so allow 10 extra tokens.
            longest_input = int((input_ids != pad_index).sum(dim=-1).max())

            # Pass only the encoder inputs: the reference labels are not a generation input.
            outputs = generator.generate(
                input_ids=input_ids,
                attention_mask=batch.get("attention_mask"),
                max_length=longest_input + 10,
                num_beams=4,
                length_penalty=1.0,
            )
            preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            # -100 marks padded label positions; map it to the pad id so it can be decoded
            labels = batch["labels"]
            labels = labels.masked_fill(labels == -100, pad_index)
            references = tokenizer.batch_decode(labels, skip_special_tokens=True)

            # score of this batch
            score = metric.compute(predictions=preds, references=references)
            scores.append(score["google_bleu"])

    # avg score across batches
    return sum(scores) / len(scores)

    

In [None]:
def evaluate_loss(model, val_dataloader):
  """Return the mean loss of `model` over all batches of `val_dataloader`.

  The model is put in eval mode, each batch is moved to the GPU, and the
  (replica-averaged) loss of every batch is accumulated without tracking gradients.
  """
  model.eval()
  batch_losses = []
  with torch.no_grad():
    for raw_batch in val_dataloader:
      gpu_batch = {name: tensor.cuda() for name, tensor in raw_batch.items()}
      batch_losses.append(model(**gpu_batch).loss.mean().item())

  # average over the number of batches
  return sum(batch_losses) / len(val_dataloader)

In [None]:
# used on gleu
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optim, mode='max', factor=0.3, patience=2)

In [None]:
# Train for `epochs_to_run` epochs starting at `starting_epoch`.
# Every `batch_report_every` batches the running training loss is printed; every 5,000
# batches a checkpoint is written, GLEU and validation loss are computed, and the LR
# scheduler is stepped on the GLEU score. A checkpoint is also written after each epoch.
Epochs = starting_epoch + epochs_to_run
model = model.cuda()

for epoch in range(starting_epoch, Epochs):
    print(f"Epoch: {epoch}")
    model.train()
    avg_batch_loss = 0
    epoch_loss = 0

    for i, batch in enumerate(train_dataloader):
        batch = {k: v.cuda() for k, v in batch.items()}

        outputs = model(**batch)

        # loss.mean() because of distributed training (one loss per replica)
        loss = outputs.loss.mean()

        optim.zero_grad()
        loss.backward()
        optim.step()

        avg_batch_loss += loss.item()
        epoch_loss += loss.item()

        # print avg batch loss
        if not (i + 1) % batch_report_every:
            avg_batch_loss /= batch_report_every
            print(f"-----Batches {i + 1 - batch_report_every} -- {i+1} | Avg Batch Training Loss: {avg_batch_loss}", flush=True)
            avg_batch_loss = 0

        # save after 5,000 batches
        if not (i + 1) % 5000:
            print(f"Saved Model at Batch {i}", flush=True)
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.module.state_dict(),
                'optimizer_state_dict': optim.state_dict(),
                'loss': loss.item(),
            }, f'checkpoint_epoch_{epoch}_step_{i}.pth')

            gleu_score = evaluate_gleu(model, val_dataloader, metric, tokenizer)
            val_loss = evaluate_loss(model, val_dataloader)
            print(f"-----Gleu Score On Validation Data: {gleu_score}", flush=True)
            print(f"-----Val Loss: {val_loss}", flush=True)
            with open("gleu_scores.txt", "a") as file:
                file.write(f"Epoch: {epoch} | Batch: {i} | Gleu Score: {gleu_score} | Val loss: {val_loss}\n")

            scheduler.step(gleu_score)

            # Both evaluation helpers leave the model in eval mode. Switch back, otherwise
            # the rest of the epoch trains with dropout disabled.
            model.train()

    epoch_loss /= len(train_dataloader)
    print(f"-----Training Loss: {epoch_loss}", flush=True)

    # save after each epoch
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.module.state_dict(),
        'optimizer_state_dict': optim.state_dict(),
        'loss': loss.item(),
    }, f'checkpoint_epoch_{epoch}.pth')

    val_loss = evaluate_loss(model, val_dataloader)

    print(f"-----Validation Loss: {val_loss}", flush=True)
    print("=============================================", flush=True)

In [None]:
def correct_grammar(model, metric, ungrammatical_sen, target =None):
    """
    Generate a corrected version of 'ungrammatical_sen' with 'model'.

    Args:
        model: seq2seq model exposing ``generate``; may be wrapped in
            ``torch.nn.DataParallel`` (the wrapped module is then used).
        metric: object whose ``compute(predictions=..., references=...)`` is called
            when a target is given.
        ungrammatical_sen: input text, including any task prefix the model expects.
        target: optional reference sentence to score the output against.

    Returns:
        (sentence, google_bleu): the decoded output and the result of ``metric.compute``,
        or ``None`` in place of the latter when no (non-empty) target was given.
    """
    model.eval()
    google_bleu = None

    # Works for a bare model as well as one wrapped in DataParallel.
    generator = getattr(model, "module", model)

    inputs = tokenizer(ungrammatical_sen, truncation=True, max_length=1024, return_tensors = "pt")

    inputs = {k : v.cuda() for k, v in inputs.items()}

    # max_length is measured in tokens, so derive it from the tokenized input
    # rather than from the number of characters in the raw string.
    max_length = inputs["input_ids"].shape[-1] + 20

    outputs = generator.generate(**inputs,
                                 max_length=max_length,
                                 num_beams=5,
                                 do_sample=True,
                                 repetition_penalty=2.6,
                                 temperature=0.01)

    sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # compute gleu score between target and pred
    if target:
        google_bleu = metric.compute(predictions=[sentence], references=[target])

    return sentence, google_bleu
    
    
    

In [None]:

# correct_grammar(model, metric,
#                 "Bitcoin is for $7,094 this morning, which CoinDesk says.",
#                target = "Bitcoin goes for $7,094 this morning, according to CoinDesk.")

# Use exactly the prefix the training inputs were built with ("fix grammar: ", including
# the space after the colon) so the prompt matches what the model was trained on.
inp = "fix grammar: My names is ali and i went at school yesterday"

sentence, score = correct_grammar(model,
                                  metric,
                                  inp,
                                  target = "My name is ali and i went to school yesterday")

print(f"Input: {inp}")
print(f"Output: {sentence}")
# score is None when no target was supplied
if score:
    print(f"Google Bleu: {score['google_bleu'] }")

# correct_grammar(model, metric,
#                 "The effect of widespread dud targets two face up attack position monsters on the field",
#                target = 'The effect of "widespread dud" targets two face up attack position monsters on the field.')
               

# correct_grammar(model, metric,
#                 "tax on sales of stores for non residents are set at 21% for 2014 and 20% in 2015 payable on sales tentatively earned from the difference of the property value some time of purchase (price differences according to working time) and theyear to which sale couples (sales costs), based on the approved annual on the base approved by law).",
#                target = "Capital Gains tax on the sale of properties for non-residents is set at 21% for 2014 and 20% in 2015 payable on profits earned on the difference of the property value between the year of purchase (purchase price plus costs) and the year of sale (sales price minus costs), based on the approved annual percentage increase on the base value approved by law.")