In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file found under the Kaggle input mount,
# in os.walk order, then print one path per line.
input_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch

# Ask PyTorch to release any cached CUDA memory it is holding.
# NOTE(review): this cell runs before the model is loaded in the visible
# notebook, so there is presumably nothing to free yet; the same call is
# repeated right before trainer.train().
torch.cuda.empty_cache()


In [None]:
# !pip install -q pytorch-lightning
!pip install -q --upgrade transformers datasets rouge_score
!pip install -q wandb

In [None]:
# Shell out to nvidia-smi to show the GPU(s) available to this session.
!nvidia-smi

In [None]:
import transformers
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
import pandas as pd
import numpy as np

import torch.nn.functional as F
# import pytorch_lightning as pl
import torch
# from pytorch_lightning.callbacks import ModelCheckpoint

import math
import random
import re
import argparse
import datetime

import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
from IPython.display import display, HTML
import random
import nltk  # Here to have a nice missing dependency error message early on
import numpy as np
from datasets import load_dataset, load_metric

import transformers
from filelock import FileLock
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)
from transformers.file_utils import is_offline_mode
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version

# Sentence-tokenizer data needed by nltk.sent_tokenize (used in compute_metrics).
nltk.download('punkt')
# Newer NLTK releases look for the "punkt_tab" resource instead of "punkt";
# fetching both keeps sent_tokenize working across versions.
nltk.download('punkt_tab')

import wandb
# Never commit API keys to a notebook. The key is read from the
# WANDB_API_KEY environment variable (e.g. populated from a Kaggle secret);
# when it is unset, key=None lets wandb fall back to its own credential lookup.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
# Hub checkpoint id; the tokenizer is loaded from the same id as the model.
MODEL_PATH = TOKENIZER_PATH = "google/flan-t5-large"

# Where the final model is written by trainer.save_model.
SAVE_MODEL_PATH = "/kaggle/working/flanT5-model"
# Trainer output directory (passed as output_dir).
SAVE_PATH = "/kaggle/working/Model/"
# Trainer logging directory (passed as logging_dir).
LOGGING_PATH = "/kaggle/working/FlanT5-models/"

In [None]:
import pandas as pd

# Single definition of the training CSV location, reused for both the pandas
# preview and the Hugging Face `data_files` mapping.
TRAIN_FILE_PATH = "/kaggle/input/claim-decomp-formatted/output5.csv"

# Split name -> CSV path; the validation cell adds the "val" entry and
# load_dataset("csv", data_files=...) consumes the mapping.
data_files = {"train": TRAIN_FILE_PATH}

# Preview only the first rows; as the last expression of the cell it is
# rendered by Jupyter without dumping the whole frame.
train = pd.read_csv(TRAIN_FILE_PATH)
train.head()

In [None]:
# Single definition of the validation CSV location, reused for both the
# pandas preview and the `data_files` mapping created in the training cell.
VAL_FILE_PATH = "/kaggle/input/claim-decomp-restruct-val/output_val.csv"
data_files["val"] = VAL_FILE_PATH

# Preview only the first rows; as the last expression of the cell it is
# rendered by Jupyter without dumping the whole frame.
val = pd.read_csv(VAL_FILE_PATH)
val.head()

In [None]:
# NOTE(review): `xmetric` is not referenced anywhere else in the visible
# notebook; compute_metrics uses `metric`, which is loaded in the next cell.
xmetric = load_metric("rouge",trust_remote_code=True)
# NOTE(review): a bare expression that is not the last line of a cell is not
# displayed by Jupyter, so this line has no visible effect.
data_files
# Load the CSV files listed in `data_files`; the result is indexed by the
# same split names ("train", "val") further down.
raw_datasets = load_dataset("csv", data_files=data_files)

In [None]:
# ROUGE metric object used by compute_metrics.
# NOTE(review): `load_metric` is no longer available in datasets >= 3.0 —
# confirm the installed datasets version still provides it.
metric = load_metric("rouge",trust_remote_code=True)

In [None]:
# Show the first training record to sanity-check the loaded columns.
raw_datasets["train"][0]

In [None]:
# Load the tokenizer identified by TOKENIZER_PATH.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

In [None]:
# Display the tokenizer's repr for inspection.
tokenizer

In [None]:
# Truncation limits (in tokens) handed to the tokenizer by preprocess_function:
# the first applies to the prefixed question, the second to the target text.
max_input_length, max_target_length = 64, 128

In [None]:

def preprocess_function(examples, prefix="decompose the compositional question:"):
    """Tokenize a batch of questions and their target sub-questions.

    Args:
        examples: Batch mapping with a "question" column (iterable of strings)
            and a "subquestions" column (the target texts).
        prefix: Instruction string prepended to every question. It is
            concatenated directly, with no separator inserted.

    Returns:
        The tokenizer output for the prefixed questions, with an extra
        "labels" entry holding the input ids of the tokenized targets.
        Inputs are truncated to `max_input_length` tokens and targets to
        `max_target_length` tokens.
    """
    prompts = []
    for question in examples["question"]:
        prompts.append(prefix + question)
    encoded = tokenizer(prompts, max_length=max_input_length, truncation=True)

    # Targets go through `text_target` so they are tokenized as labels.
    targets = tokenizer(
        text_target=examples["subquestions"],
        max_length=max_target_length,
        truncation=True,
    )
    encoded["labels"] = targets["input_ids"]
    return encoded

In [None]:
# Apply preprocess_function to every split in batches. The original columns
# (taken from the train split) are removed, leaving only what the function returns.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True, remove_columns=raw_datasets['train'].column_names)

In [None]:
# Display a summary of the tokenized splits.
tokenized_datasets

## Building the model

### Metrics

In [None]:
def compute_metrics(eval_pred):
    """Score generated sequences against the references with ROUGE.

    Args:
        eval_pred: Pair `(predictions, labels)` of token-id arrays. Positions
            equal to -100 are treated as padding. `predictions` may also be a
            tuple, in which case its first element is used.

    Returns:
        dict: One entry per score returned by the ROUGE metric (mid F-measure
        scaled to 0-100) plus "gen_len", the mean number of non-pad tokens
        per prediction. All values are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs alongside the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 cannot be decoded; swap it for the pad token id first.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each score, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Mean generated length, counted in non-pad tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## Loading the model

In [None]:
# Load the pretrained seq2seq checkpoint identified by MODEL_PATH.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

In [None]:
# Display the loaded model's repr for inspection.
model

## Training args

In [None]:
# Number of passes over the training set (passed as num_train_epochs).
epochs: int = 6
# Only recorded in the wandb run config; the training arguments set the
# per-device batch sizes themselves.
batch_size: int = 8
# Learning rate handed to the training arguments.
lr: float = 2e-5

In [None]:
# NOTE(review): accelerate is installed/upgraded here, after transformers has
# already been imported; a kernel restart may be required for the new version
# to be picked up — confirm.
!pip install accelerate -U
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device =torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print("Device", device)

In [None]:
# Keyword arguments for Seq2SeqTrainingArguments, grouped by concern.
training_kwargs = {
    # --- optimisation ---
    "learning_rate": lr,
    "weight_decay": 0.01,
    "num_train_epochs": epochs,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    # Gradients are accumulated over 4 steps before each optimizer update.
    "gradient_accumulation_steps": 4,
    # --- train / eval switches ---
    "do_train": True,
    "do_eval": True,
    "eval_strategy": "steps",
    # Evaluate on generated sequences so compute_metrics receives token ids.
    "predict_with_generate": True,
    "generation_max_length": 512,
    # --- checkpointing ---
    "output_dir": SAVE_PATH,
    "save_steps": 300,
    "save_total_limit": 1,
    "load_best_model_at_end": True,
    # --- logging ---
    "logging_dir": LOGGING_PATH,
    "logging_steps": 300,
    "report_to": "wandb",
}
args = Seq2SeqTrainingArguments(**training_kwargs)

In [None]:
# Start the W&B run. The logged hyper-parameters are read from `args` so the
# run config reflects what the trainer actually uses (the standalone
# `batch_size` constant is not what the training arguments were built with).
wandb_run = wandb.init(
    project="flant5_subq",
    config={
        "per_device_train_batch_size": args.per_device_train_batch_size,
        "gradient_accumulation_steps": args.gradient_accumulation_steps,
        "learning_rate": args.learning_rate,
    },
)

# Name the run after its start time so runs are easy to tell apart.
now = datetime.datetime.now()
current_time = now.strftime("%m/%d/%Y, %H:%M:%S")
wandb_run.name = "run_" + current_time

In [None]:
# Batch collator handed to the trainer; built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Wire the model, training arguments, collator, tokenized splits and metric
# function into a single trainer object.
# NOTE(review): recent transformers releases deprecate the `tokenizer=`
# keyword in favour of `processing_class=` — confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics)

## Training

In [None]:
# %%wandb
# Release cached CUDA memory before training starts.
torch.cuda.empty_cache()
# NOTE(review): the model was already moved to `device` in the device-setup
# cell, so this repeats that step.
model.to(device)
# Run fine-tuning with the configuration held by `trainer`.
trainer.train()

In [None]:
# Write the trained model to SAVE_MODEL_PATH.
trainer.save_model(SAVE_MODEL_PATH)

In [None]:
# Close the W&B run that was opened with wandb.init.
wandb_run.finish()