In [1]:
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    DataCollatorForSeq2Seq, TrainingArguments, Trainer
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from warnings import filterwarnings
filterwarnings('ignore')


Load and clean dataset

In [3]:
df = pd.read_csv('/content/ak_lectures_summarized 2.csv')
df = df.dropna(subset=['Transcript', 'Summary'])
df['input_text'] = 'summarize: ' + df['Transcript']
df['target_text'] = df['Summary']


In [None]:
df.head()

Unnamed: 0,Summary,Transcript,input_text,target_text
0,Instead of being localized at a specific organ...,"Unlike most other systems of our body, the im...",summarize: Unlike most other systems of our b...,Instead of being localized at a specific organ...
1,"Leukocytes, or white blood cells, are the cell...","Lucocytes, also known as white blood cells, a...","summarize: Lucocytes, also known as white blo...","Leukocytes, or white blood cells, are the cell..."
2,Our innate immune system consists of non-speci...,Our immune system consists of two divisions. ...,summarize: Our immune system consists of two ...,Our innate immune system consists of non-speci...
3,One way in which our innate immune system deal...,One important aspect of the innate immune sys...,summarize: One important aspect of the innate...,One way in which our innate immune system deal...
4,The innate immune system begins to act immedia...,When one of the many different types of barri...,summarize: When one of the many different typ...,The innate immune system begins to act immedia...


In [4]:
df.shape

(1842, 4)

In [5]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.shape, val_df.shape

((1473, 4), (369, 4))

In [8]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)




Load tokenizer and model

In [9]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [10]:
def preprocess(example):
    inputs = tokenizer(
        example['input_text'],
        max_length=1024,
        truncation=True,
        padding='max_length'
    )
    targets = tokenizer(
        example['target_text'],
        max_length=128,
        truncation=True,
        padding='max_length'
    )
    inputs['labels'] = targets['input_ids']
    return inputs

In [11]:
tokenized_train = train_dataset.map(
    preprocess,
    batched=True,
    remove_columns=train_dataset.column_names
)

tokenized_val = val_dataset.map(
    preprocess,
    batched=True,
    remove_columns=val_dataset.column_names
)

Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]

In [13]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='steps',
    logging_steps=500,
    learning_rate=3e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.01,
    logging_dir='./logs',
    fp16=True,
    save_total_limit=2,
    report_to='none',
    load_best_model_at_end=True,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.999881
2,No log,1.937469
3,No log,1.917307
4,No log,1.899316
5,No log,1.888577
6,2.009100,1.881313
7,2.009100,1.884131
8,2.009100,1.879801


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=744, training_loss=1.9208783795756679, metrics={'train_runtime': 768.7177, 'train_samples_per_second': 15.329, 'train_steps_per_second': 0.968, 'total_flos': 3189735577092096.0, 'train_loss': 1.9208783795756679, 'epoch': 8.0})

In [16]:
results = trainer.evaluate()
print("Final Evaluation Results:", results)

Final Evaluation Results: {'eval_loss': 1.87980055809021, 'eval_runtime': 7.8601, 'eval_samples_per_second': 46.946, 'eval_steps_per_second': 3.053, 'epoch': 8.0}


In [17]:
model.save_pretrained('./t5_finetuned')
tokenizer.save_pretrained('./t5_finetuned')

('./t5_finetuned/tokenizer_config.json',
 './t5_finetuned/special_tokens_map.json',
 './t5_finetuned/spiece.model',
 './t5_finetuned/added_tokens.json')

In [18]:
!zip -r saved_model.zip t5_finetuned

  adding: t5_finetuned/ (stored 0%)
  adding: t5_finetuned/spiece.model (deflated 48%)
  adding: t5_finetuned/added_tokens.json (deflated 83%)
  adding: t5_finetuned/model.safetensors (deflated 8%)
  adding: t5_finetuned/config.json (deflated 63%)
  adding: t5_finetuned/generation_config.json (deflated 27%)
  adding: t5_finetuned/special_tokens_map.json (deflated 85%)
  adding: t5_finetuned/tokenizer_config.json (deflated 94%)


In [19]:
from google.colab import files
files.download('saved_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>