In [1]:
import os

from transformers import (
    T5ForConditionalGeneration,
    T5Config,
    T5Tokenizer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer
)

from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader

import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
config = T5Config()
model = T5ForConditionalGeneration(config=config)

In [3]:
tokenizer = T5Tokenizer.from_pretrained('google/mt5-base')

In [4]:

training_args = Seq2SeqTrainingArguments(
    output_dir='./output/',
    evaluation_strategy="steps",
    num_train_epochs=10,
    eval_steps=200,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="blue",
    report_to="wandb"
)

In [5]:

def preprocess_data(data):

    input_text = data['input']
    target_text = data['target']

    input_ids = tokenizer(input_text, padding=True, truncation=True, max_length=512).input_ids
    target_ids = tokenizer(target_text, padding=True, truncation=True, max_length=512).input_ids

    return {'input_ids': input_ids, 'labels': target_ids}


dataset = load_dataset('csv', data_files='g2p_data.csv', split='train')
dataset = dataset.map(preprocess_data, batched=True)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

Downloading and preparing dataset csv/default to /opt/ml/.cache/huggingface/datasets/csv/default-5545ef1f95046fe4/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1837.99it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 737.01it/s]
                                                                    

Dataset csv downloaded and prepared to /opt/ml/.cache/huggingface/datasets/csv/default-5545ef1f95046fe4/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


                                                                   

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'target', 'input_ids', 'labels'],
        num_rows: 32282
    })
    test: Dataset({
        features: ['input', 'target', 'input_ids', 'labels'],
        num_rows: 8071
    })
})

In [7]:
os.environ['WANDB_DISABLED'] = 'false'

In [8]:
wandb.init(project='TC_competition',
           name='g2p_cleansing',
           tags=['g2p', 't5'],
           group='t5')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfortunetiger[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [9]:
trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    train_dataset = dataset['train'],
    eval_dataset = dataset['test'],
    tokenizer = tokenizer   
)

In [10]:
trainer.train()



ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).