<a href="https://colab.research.google.com/github/Arjavjain100/TOS-Summarization/blob/wandb/Wandb_init.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages used by this notebook and fetch the project
# repository. `-U` upgrades packages that are already installed and `-q`
# reduces pip's output.
!pip install wandb onnx -Uq
!pip install transformers
!pip install sentencepiece
!pip install rouge
# The dataset path used later points inside this repository
# (presumably cloned into /content when run on Colab -- confirm).
!git clone https://github.com/Arjavjain100/TOS-Summarization.git

In [None]:
import os
import random
import pandas as pd

import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from tqdm.auto import tqdm
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments, pipeline, PretrainedConfig
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.model_selection import train_test_split

# Ensure deterministic behavior.
# The seed is a fixed integer on purpose: Python salts the hash of a string
# differently in every process (see PYTHONHASHSEED), so seeds derived from
# hash("...") change from one session to the next and nothing is repeatable.
SEED = 42
torch.backends.cudnn.deterministic = True
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Device configuration: first GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Dataset location
filename = "/content/TOS-Summarization/Dataset/all_v1_transpose.csv"

In [None]:
import wandb

In [None]:
# Log this session in to Weights & Biases; the pipeline below starts a wandb run.
wandb.login()

In [None]:
# Hyperparameters for one run -- edit these values before every new run.
# NOTE(review): 0.005 looks high for fine-tuning a pretrained model with
# Adam -- confirm this is intended.
config = {
    "epochs": 10,
    "batch_size": 32,
    "learning_rate": 0.005,
    "model_name": "nsi319/legal-pegasus",
}

In [None]:
def model_pipeline(hyperparameters):
    """Run one tracked experiment end to end and return the model.

    A wandb run is opened in the "pytorch-demo" project with
    ``hyperparameters`` as its config. Inside that run the model, data
    loaders, loss and optimizer are built by ``make``, the model is printed,
    trained with ``train`` and evaluated with ``test``.

    Args:
        hyperparameters: Config passed to ``wandb.init``.

    Returns:
        The model produced by ``make`` after training and testing.
    """
    with wandb.init(project="pytorch-demo", config=hyperparameters):
        # Read the hyperparameters back from wandb so that what gets logged
        # is exactly what gets executed.
        run_config = wandb.config

        # Build every component of the experiment.
        components = make(run_config)
        model, train_loader, test_loader, criterion, optimizer = components
        print(model)

        # Fit the model, then measure its final performance.
        train(model, train_loader, criterion, optimizer, run_config)
        test(model, test_loader)

    return model

In [None]:
def make(config):
    """Build the model, data loaders, loss and optimizer for one run.

    Args:
        config: Object exposing ``model_name``, ``batch_size`` and
            ``learning_rate`` attributes.

    Returns:
        tuple: ``(model, train_loader, test_loader, criterion, optimizer)``.
    """
    # Data: get_data() returns (train_texts, train_labels, test_texts,
    # test_labels), which is exactly the tail of make_loader's signature.
    train_loader, test_loader = make_loader(
        config.model_name, config.batch_size, *get_data())

    # Model: pretrained Pegasus weights, moved to the selected device.
    pretrained = PegasusForConditionalGeneration.from_pretrained(config.model_name)
    model = pretrained.to(device)

    # Loss and optimizer.
    # TODO: the original author left this choice open for confirmation.
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    criterion = nn.CrossEntropyLoss()

    return model, train_loader, test_loader, criterion, optimizer

In [None]:
def get_data():
    """Load the dataset and split it into training and test portions.

    Reads the CSV at the module-level ``filename``, keeps the
    ``original_text`` and ``reference_summary`` columns (renamed to
    ``source`` and ``target``) and performs an 80/20 split with a fixed
    ``random_state`` so the split is the same on every call.

    Returns:
        tuple: ``(train_texts, train_labels, test_texts, test_labels)``,
        each a list of values taken from the ``source`` (texts) or
        ``target`` (labels) column.
    """
    df = pd.read_csv(filename)
    # Rename on the result of the column selection rather than in place:
    # an in-place rename on a selection triggers pandas' chained-assignment
    # warning.
    df = df[['original_text', 'reference_summary']].rename(
        columns={'original_text': 'source', 'reference_summary': 'target'})

    # The split previously received the undefined names X and y, which raised
    # NameError; the inputs are the renamed source and target columns.
    X_train, X_test, y_train, y_test = train_test_split(
        df['source'], df['target'], test_size=0.20, random_state=0)

    train_texts, train_labels = list(X_train), list(y_train)
    test_texts, test_labels = list(X_test), list(y_test)

    return train_texts, train_labels, test_texts, test_labels


def make_loader(model_name, batch_size, train_texts, train_labels, test_texts, test_labels):
    """Tokenize the text pairs and wrap them in PyTorch data loaders.

    Args:
        model_name: Name or path handed to ``AutoTokenizer.from_pretrained``.
        batch_size: Batch size used for both loaders.
        train_texts: Source documents for training.
        train_labels: Reference summaries for training.
        test_texts: Source documents for evaluation, or ``None``.
        test_labels: Reference summaries for evaluation, or ``None``.

    Returns:
        tuple: ``(train_loader, test_loader)``. ``test_loader`` is ``None``
        when either ``test_texts`` or ``test_labels`` is ``None``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_data(texts, labels):
        # Sources are truncated to 600 tokens and summaries to 256; each
        # call pads to the longest sequence it was given.
        encodings = tokenizer(texts, truncation=True, padding=True, max_length=600)
        decodings = tokenizer(labels, truncation=True, padding=True, max_length=256)
        return PegasusDataset(encodings, decodings)

    train_loader = torch.utils.data.DataLoader(
        dataset=tokenize_data(train_texts, train_labels),
        batch_size=batch_size,
        shuffle=True)

    # Without test data there is nothing to wrap: a DataLoader over ``None``
    # would fail as soon as its sampler asks for the dataset length.
    if test_texts is None or test_labels is None:
        return train_loader, None

    # ``torch.utils`` (the previous ``torch.util`` does not exist). Shuffling
    # is off because evaluation does not depend on batch order.
    test_loader = torch.utils.data.DataLoader(
        dataset=tokenize_data(test_texts, test_labels),
        batch_size=batch_size,
        shuffle=False)

    return train_loader, test_loader

In [None]:
# Pegasus Transformer Dataset

class PegasusDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenized source texts with tokenized target texts.

    ``encodings`` is a mapping from field name to a per-example sequence of
    values. ``labels`` is a mapping whose ``'input_ids'`` entry holds the
    target token ids per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One example per row of target token ids.
        target_ids = self.labels['input_ids']
        return len(target_ids)

    def __getitem__(self, idx):
        # Every encoding field becomes a tensor for example ``idx``; the
        # target token ids are stored under 'labels'.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return sample


In [None]:
def train(model, loader, criterion, optimizer, config):
    """Fine-tune ``model`` for ``config.epochs`` epochs and log to wandb.

    Args:
        model: Sequence-to-sequence model to optimize.
        loader: Iterable of batches. Each batch is a dict with ``input_ids``
            and ``labels`` tensors and, optionally, ``attention_mask`` (the
            items ``PegasusDataset`` yields once collated by a DataLoader).
        criterion: Loss applied to the flattened logits and labels.
        optimizer: Optimizer stepping the model parameters.
        config: Object exposing an ``epochs`` attribute.
    """
    # Tell wandb to watch what the model gets up to: gradients, weights, and more!
    wandb.watch(model, criterion, log="all", log_freq=10)

    # from_pretrained() hands the model back in evaluation mode, so dropout
    # has to be switched on explicitly before training.
    model.train()

    example_ct = 0  # number of examples seen
    batch_ct = 0    # number of batches processed
    for epoch in tqdm(range(config.epochs)):
        # Batches are dicts, not (inputs, labels) tuples, so fields are read
        # by key instead of being unpacked positionally.
        for batch in loader:
            loss = train_batch(batch["input_ids"], batch["labels"],
                               model, optimizer, criterion,
                               attention_mask=batch.get("attention_mask"))
            example_ct += len(batch["input_ids"])
            batch_ct += 1

            # Report metrics every 25th batch
            if batch_ct % 25 == 0:
                train_log(loss, example_ct, epoch)


def train_batch(images, labels, model, optimizer, criterion, attention_mask=None):
    """Run a single optimization step and return the batch loss.

    Args:
        images: Source token ids for the batch (the parameter name is kept
            for backward compatibility with existing callers).
        labels: Target token ids for the batch.
        model: Sequence-to-sequence model being trained.
        optimizer: Optimizer stepping the model parameters.
        criterion: Loss called with logits flattened to
            ``(batch * target_len, vocab)`` and labels flattened to
            ``(batch * target_len,)``.
        attention_mask: Optional mask marking the non-padding source tokens.

    Returns:
        The loss tensor computed for this batch.
    """
    input_ids, labels = images.to(device), labels.to(device)
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    # Mark padding positions with -100, the index nn.CrossEntropyLoss ignores
    # by default, so padding does not contribute to the loss.
    pad_id = model.config.pad_token_id
    if pad_id is not None:
        labels = labels.masked_fill(labels == pad_id, -100)

    # Forward pass: passing ``labels`` lets the model derive its decoder
    # inputs by shifting them right; an encoder-decoder model cannot run
    # from the source ids alone.
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    logits = outputs.logits
    loss = criterion(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))

    # Backward pass
    optimizer.zero_grad()
    loss.backward()

    # Step with optimizer
    optimizer.step()

    return loss

In [None]:
def train_log(loss, example_ct, epoch):
    """Record the current training loss in wandb and echo it to stdout.

    Args:
        loss: Loss value for the most recent batch.
        example_ct: Number of examples seen so far; used as the wandb step.
        epoch: Index of the current epoch.
    """
    metrics = {"epoch": epoch, "loss": loss}
    wandb.log(metrics, step=example_ct)

    # Zero-pad the counter to five characters so successive lines align.
    padded_count = str(example_ct).zfill(5)
    print(f"Loss after {padded_count} examples: {loss:.3f}")

In [None]:
def test(model, test_loader):
    """Evaluate ``model`` on the test set and save the fine-tuned weights.

    The reported metric is teacher-forced token accuracy: the share of
    non-padding target tokens whose most likely prediction matches the
    reference. It is logged to wandb under ``test_accuracy``.

    The earlier ONNX export fed only the source ids to the model, which an
    encoder-decoder model cannot run from. The model is instead written with
    ``save_pretrained`` and the resulting files are handed to ``wandb.save``.

    Args:
        model: Sequence-to-sequence model to evaluate.
        test_loader: Iterable of dict batches with ``input_ids`` and
            ``labels`` tensors and, optionally, ``attention_mask``. ``None``
            is treated as an empty test set.
    """
    model.eval()

    pad_id = model.config.pad_token_id
    batches = test_loader if test_loader is not None else ()
    correct, total = 0, 0

    # Run the model on the test examples
    with torch.no_grad():
        for batch in batches:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            attention_mask = batch.get("attention_mask")
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)

            # ``labels`` lets the model build its decoder inputs by shifting
            # them right.
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            predicted = outputs.logits.argmax(dim=-1)

            # Score only real target tokens: skip padding and ignore markers.
            scored = labels != -100
            if pad_id is not None:
                scored = scored & (labels != pad_id)
            total += scored.sum().item()
            correct += ((predicted == labels) & scored).sum().item()

    # An empty test set reports 0 instead of dividing by zero.
    accuracy = correct / total if total else 0.0
    print(f"Accuracy of the model on the {total} " +
          f"test tokens: {accuracy:%}")

    wandb.log({"test_accuracy": accuracy})

    # Save the fine-tuned model and hand its files to wandb
    output_dir = "model"
    model.save_pretrained(output_dir)
    wandb.save(os.path.join(output_dir, "*"))

In [None]:
# Run the whole pipeline with the hyperparameters in `config` and keep the
# model it returns.
model = model_pipeline(config)