In [None]:
#!pip install accelerate -U

In [None]:
#!pip install datasets --upgrade

In [None]:
# NOTE: zipfile is part of the Python standard library; no pip install is needed.
#!pip install zipfile

In [None]:
#!pip install gdown

In [None]:
# !gdown --id <File ID>
#!gdown --id 1mnp_3qv41lbG6VKanGI4Woj1S6FdehuS

In [None]:
import os
import pandas as pd
import wandb
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import pipeline
from transformers import DataCollatorForSeq2Seq
from datasets import Dataset

In [None]:
# Set the WANDB_SILENT flag in the process environment before wandb is used.
os.environ.update({'WANDB_SILENT': 'true'})

In [None]:
# Device Selection: use CUDA when torch reports it as available, else the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [None]:
# Load the tokenizer and the seq2seq model from the same checkpoint,
# then place the model on the selected device.
model_checkpoint = 'google/flan-t5-base'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

In [None]:
# Load the training and validation spreadsheets into DataFrames.
train_df, val_df = (
    pd.read_excel(path)
    for path in (
        r'/kaggle/input/summaries-dataset/TrainData.xlsx',
        r'/kaggle/input/summaries-dataset/ValidationData.xlsx',
    )
)

In [None]:
# Data Preprocessing
def preprocess_text(text):
    """Clean one summary cell that was stored as a stringified list.

    Leading/trailing square brackets are stripped and every single or double
    quote character is removed.

    Args:
        text: The raw cell value. Normally a ``str``; empty spreadsheet cells
            arrive as ``None``/NaN and other scalars (e.g. numbers) may appear.

    Returns:
        The cleaned string. Missing values yield ``""`` instead of raising
        ``AttributeError``; other non-string scalars are converted with
        ``str()`` before cleaning.
    """
    if not isinstance(text, str):
        # Empty Excel cells are read as NaN (a float), which has no .strip().
        if text is None or pd.isna(text):
            return ""
        text = str(text)
    return text.strip("[]").replace("'", "").replace("\"", "")

# Clean every entry of the 'summary' column in both splits.
train_df['summary'] = train_df['summary'].map(preprocess_text)
val_df['summary'] = val_df['summary'].map(preprocess_text)

In [None]:
# Wrap both DataFrames as `datasets.Dataset` objects.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

In [None]:
# Data Preprocessing
def get_feature(batch):
    """Tokenize a batch of rows into model inputs and labels.

    ``batch['text']`` is tokenized as the source and ``batch['summary']`` is
    passed as ``text_target``; the tokenizer is called with
    ``max_length=1024`` and ``truncation=True``.

    Returns:
        A plain dict holding only ``input_ids``, ``attention_mask`` and
        ``labels`` taken from the tokenizer output.
    """
    tokenized = tokenizer(
        batch['text'],
        text_target=batch['summary'],
        truncation=True,
        max_length=1024,
    )
    wanted = ('input_ids', 'attention_mask', 'labels')
    return {name: tokenized[name] for name in wanted}

# Tokenize both splits in batched mode.
train_dataset = train_dataset.map(function=get_feature, batched=True)
val_dataset = val_dataset.map(function=get_feature, batched=True)

In [None]:
# Have both datasets return the listed columns formatted as torch tensors.
columns = ['input_ids', 'labels', 'attention_mask']
train_dataset.set_format('torch', columns=columns)
val_dataset.set_format('torch', columns=columns)

In [None]:
# Seq2seq collator built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
training_args = TrainingArguments(
    # --- Output location ---
    output_dir='/kaggle/working/T5-model',
    overwrite_output_dir=False,

    # --- Optimisation ---
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,  # 2 per device x 8 accumulation steps per update
    warmup_steps=500,
    weight_decay=0.01,

    # --- Logging / evaluation / checkpoint cadence (all in steps) ---
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=3000,

    # --- Best-model selection: lowest eval_loss wins ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [None]:
# Make sure the model sits on the selected device before training.
model = model.to(device)

# Assemble the Trainer from the model, its arguments, both splits,
# the tokenizer and the seq2seq collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Authenticate with Weights & Biases without embedding the API key in the notebook.
# The key is read from the WANDB_API_KEY environment variable; when it is unset,
# key=None lets wandb fall back to its own credential lookup.
# NOTE(review): any API key that was ever hardcoded in this notebook should be
# treated as leaked and rotated.
wandb.login(key=os.environ.get('WANDB_API_KEY'))

In [None]:
# Run fine-tuning with the Trainer configured above in this notebook's flow.
# As the last expression in the cell, the value returned by train() is shown
# as the cell output.
trainer.train()

In [None]:
# Write the trained model out to its own directory.
trainer.save_model(output_dir='/kaggle/working/T5_finetuned')

In [None]:
import shutil

# Directory holding the finetuned model files.
model_dir = '/kaggle/working/T5_finetuned'

# Destination of the resulting zip archive.
zip_file = '/kaggle/working/T5_finetuned.zip'

# shutil.make_archive appends the '.zip' extension itself, so it must be given
# the path without it. os.path.splitext removes only the trailing extension,
# whereas str.replace would strip every '.zip' substring anywhere in the path.
archive_base, _ = os.path.splitext(zip_file)

# Create the archive; the returned archive path is displayed as the cell output.
shutil.make_archive(archive_base, 'zip', model_dir)
