load the dataset 

In [1]:
from datasets import load_dataset

dataset_file = 'test_t5.csv' #the dataset will be stored in the format (n rows :2 columns{source,target})

dataset = load_dataset('csv', data_files=dataset_file, split='train') #this line of command does not do any actual splitting, we want HF to think our custom dataset is actually a built-in dataset on their server   

dataset = dataset.train_test_split(test_size=0.1) #90% training data, this is where it is actually split 
train_dataset = dataset['train']
val_dataset = dataset['test']

Using custom data configuration default-731b845314f578c7
Reusing dataset csv (C:\Users\Mir Info\.cache\huggingface\datasets\csv\default-731b845314f578c7\0.0.0\2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)


tokenization 

In [4]:
from transformers import AutoTokenizer
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained('t5-base')

df = pd.read_csv('test_t5.csv')

source_text = df['source']
target_text = df['target']

tokenized_source_text = tokenizer(list(source_text), truncation=False, padding=False)
tokenized_target_text = tokenizer(list(target_text), truncation=False, padding=False)

max_source = 0
for item in tokenized_source_text['input_ids']:
    if len(item) > max_source:
        max_source = len(item)

max_target = 0
for item in tokenized_target_text['input_ids']:
    if len(item) > max_target:
        max_target = len(item)

In [5]:
def tokenize(batch):
    tokenized_input = tokenizer(batch['source'], padding='max_length', truncation=True, max_length=max_source)
    tokenized_label = tokenizer(batch['target'], padding='max_length', truncation=True, max_length=max_target)

    tokenized_input['labels'] = tokenized_label['input_ids']

    return tokenized_input


In [6]:
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=512)
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))

train_dataset.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




save the dataset 

In [7]:
train_dataset.save_to_disk('t5_train')
val_dataset.save_to_disk('t5_test')

training 

In [8]:
import torch 

In [3]:
getcwd()

NameError: name 'getcwd' is not defined

In [None]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

model = T5ForConditionalGeneration.from_pretrained('t5-base')

output_dir = 'output/dir'

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_accumulation_steps=1, # Number of eval steps to keep in GPU (the higher, the mor vRAM used)
    prediction_loss_only=True, # If I need co compute only loss and not other metrics, setting this to true will use less RAM
    learning_rate=0.001,
    evaluation_strategy='steps', # Run evaluation every eval_steps
    save_steps=1000, # How often to save a checkpoint
    save_total_limit=1, # Number of maximum checkpoints to save
    remove_unused_columns=True, # Removes useless columns from the dataset
    run_name='run_name', # Wandb run name
    logging_steps=1000, # How often to log loss to wandb
    eval_steps=1000, # How often to run evaluation on the val_set
    logging_first_step=False, # Whether to log also the very first training step to wandb
    load_best_model_at_end=True, # Whether to load the best model found at each evaluation.
    metric_for_best_model="loss", # Use loss to evaluate best model.
    greater_is_better=False # Best model is the one with the lowest loss, not highest.
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()
trainer.save_model(output_dir + '/model')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691430.0, style=ProgressStyle(descri…

evaluation 

In [None]:
model_dir = 'your/model/dir'
output_dir = 'your/output/dir'

model = T5ForConditionalGeneration.from_pretrained(model_dir)

pred_args = TrainingArguments(
    output_dir=output_dir,
    per_device_eval_batch_size=8,
    remove_unused_columns=True,
    eval_accumulation_steps=1
)

trainer = Trainer(model=model, args=pred_args)

preds, labels, *_ = trainer.predict(val_dataset)
preds_tokens = preds.argmax(axis=2)

decoded_sources = []
for row in val_dataset:
    decoded_sources.append(tokenizer.decode(row['input_ids']))

decoded_preds = [tokenizer.decode(pred) for pred in preds_tokens]
decoded_labels = [tokenizer.decode(label) for label in labels]

output = pd.DataFrame({'Source Text': decoded_sources, 'Target Text': decoded_labels, 'Generated Text': decoded_preds})
output.to_excel(output_dir + "/predictions.xlsx")

model testing with new input 

In [None]:
import torch

print("input:")
input_text = input()

with torch.no_grad():
    tokenized_text = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt')

    source_ids = tokenized_text['input_ids'].to(device, dtype = torch.long)
    source_mask = tokenized_text['attention_mask'].to(device, dtype = torch.long)

    generated_ids = model.generate(
        input_ids = source_ids,
        attention_mask = source_mask, 
        max_length=512,
        num_beams=5,
        repetition_penalty=1, 
        length_penalty=1, 
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

print("\noutput:\n" + pred)