In [1]:
"""
This Training script is based on the Training script of the huggingface user dehio.
It is adjusted to fit the needs of this bachelors thesis´.
Please see the original training script here: https://github.com/d-e-h-i-o/german-qg/blob/main/run_qg.py
"""

import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import numpy as np
from datasets import load_dataset, load_metric

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
check_min_version("4.13.0.dev0")
require_version(
    "datasets>=1.8.0",
    "To fix: pip install -r examples/pytorch/translation/requirements.txt",
)
logger = logging.getLogger(__name__)


In [3]:
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models"
        }
    )
    config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained config name or path if not the same as model_name"
        },
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained tokenizer name or path if not the same as model_name"
        },
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": "Where to store the pretrained models downloaded from huggingface.co"
        },
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={
            "help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."
        },
    )
    model_revision: str = field(
        default="main",
        metadata={
            "help": "The specific model version to use (can be a branch name, tag name or commit id)."
        },
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )

In [4]:
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    source_lang: str = field(
        default=None, metadata={"help": "Source language id for translation."}
    )
    target_lang: str = field(
        default=None, metadata={"help": "Target language id for translation."}
    )

    dataset_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the dataset to use (via the datasets library)."},
    )
    e2e: Optional[bool] = (
        field(default=False, metadata={"help": "Prepare data for e2e"}),
    )
    dataset_config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The configuration name of the dataset to use (via the datasets library)."
        },
    )
    dataset_field_name: Optional[str] = field(
        default=None,
        metadata={"help": "The field name, useful when the data is stored as json."},
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a jsonlines)."}
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "An optional input evaluation data file to evaluate the metrics (sacreblue) on "
            "a jsonlines file."
        },
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "An optional input test data file to evaluate the metrics (sacreblue) on "
            "a jsonlines file."
        },
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets"},
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_source_length: Optional[int] = field(
        default=1024,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    max_target_length: Optional[int] = field(
        default=128,
        metadata={
            "help": "The maximum total sequence length for target text after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    val_max_target_length: Optional[int] = field(
        default=None,
        metadata={
            "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
            "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
            "during ``evaluate`` and ``predict``."
        },
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": "Whether to pad all samples to model maximum sentence length. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
            "efficient on GPU but very bad for TPU."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
            "value if set."
        },
    )
    num_beams: Optional[int] = field(
        default=None,
        metadata={
            "help": "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
            "which is used during ``evaluate`` and ``predict``."
        },
    )
    ignore_pad_token_for_loss: bool = field(
        default=True,
        metadata={
            "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
        },
    )
    source_prefix: Optional[str] = field(
        default=None,
        metadata={
            "help": "A prefix to add before every source text (useful for T5 models)."
        },
    )
    forced_bos_token: Optional[str] = field(
        default=None,
        metadata={
            "help": "The token to force as the first generated token after the :obj:`decoder_start_token_id`."
            "Useful for multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token "
            "needs to be the target language token.(Usually it is the target language token)"
        },
    )

    def __post_init__(self):

        if self.train_file is not None:
            extension = self.train_file.split(".")[-1]
            assert extension == "json", "`train_file` should be a json file."
        if self.validation_file is not None:
            extension = self.validation_file.split(".")[-1]
            assert extension == "json", "`validation_file` should be a json file."
        if self.val_max_target_length is None:
            self.val_max_target_length = self.max_target_length


In [5]:
parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)
    )
model_args, data_args, training_args = parser.parse_json_file(
    json_file='data/config/t5_training.json'
)

In [6]:
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.logging.set_verbosity(log_level)
transformers.logging.enable_default_handler()
transformers.logging.enable_explicit_format()
logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
logger.info(f"Training/evaluation parameters {training_args}")
set_seed(training_args.seed)


10/01/2023 22:17:21 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=True,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
generation_max_length=None,
generation_num_beams=None,
gradient_accumulation_steps=8,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_s

In [7]:
raw_datasets = load_dataset("csv", split='train', data_files="data/t5_dataset.csv")
raw_datasets = raw_datasets.train_test_split(test_size=0.1)
raw_datasets.save_to_disk('data/t5_bobby_dataset')

10/01/2023 22:17:22 - INFO - datasets.info - Loading Dataset Infos from C:\Users\Adam\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.7_qbz5n2kfra8p0\LocalCache\local-packages\Python37\site-packages\datasets\packaged_modules\csv
10/01/2023 22:17:22 - INFO - datasets.builder - Overwrite dataset info from restored data version.
10/01/2023 22:17:22 - INFO - datasets.info - Loading Dataset info from C:\Users\Adam\.cache\huggingface\datasets\csv\default-a69777fe21ba2ca4\0.0.0\6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317
10/01/2023 22:17:22 - INFO - datasets.info - Loading Dataset info from C:/Users/Adam/.cache/huggingface/datasets/csv/default-a69777fe21ba2ca4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317


                                                                                              



                                                                                            

In [8]:
test_dataset = datasets.load_from_disk(dataset_path='data/t5_bobby_dataset')
test_dataset



DatasetDict({
    train: Dataset({
        features: ['name', 'sourceName', 'sourceUrl', 'faqLabels', 'answer', 'question'],
        num_rows: 6309
    })
    test: Dataset({
        features: ['name', 'sourceName', 'sourceUrl', 'faqLabels', 'answer', 'question'],
        num_rows: 702
    })
})

In [9]:
config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name
        else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name
    if model_args.tokenizer_name
    else model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
    use_fast=model_args.use_fast_tokenizer,
    revision=model_args.model_revision,
    use_auth_token=True if model_args.use_auth_token else None,
)
tokenizer.add_tokens(["<hl>"])
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_args.model_name_or_path,
    from_tf=bool(".ckpt" in model_args.model_name_or_path),
    config=config,
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    use_auth_token=True if model_args.use_auth_token else None,
)

model.resize_token_embeddings(len(tokenizer))

[INFO|configuration_utils.py:654] 2023-10-01 22:17:23,620 >> loading configuration file config.json from cache at cache\models--dehio--german-qg-t5-quad\snapshots\e5eeeeaef49576b5679469f2d186971e4f647ea7\config.json
[INFO|configuration_utils.py:706] 2023-10-01 22:17:23,632 >> Model config T5Config {
  "_name_or_path": "dehio/german-qg-t5-quad",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "earl

Embedding(32101, 768)

In [10]:

last_checkpoint = None
if model.config.decoder_start_token_id is None:
    raise ValueError(
        "Make sure that `config.decoder_start_token_id` is correctly defined"
    )

prefix = data_args.source_prefix if data_args.source_prefix is not None else ""

In [11]:
if training_args.do_train:
        column_names = raw_datasets["train"].column_names
elif training_args.do_eval:
    column_names = raw_datasets["validation"].column_names
elif training_args.do_predict:
    column_names = raw_datasets["test"].column_names
else:
    logger.info(
        "There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`."
    )

In [12]:
max_target_length = data_args.max_target_length
padding = "max_length" if data_args.pad_to_max_length else False
if training_args.label_smoothing_factor > 0 and not hasattr(
    model, "prepare_decoder_input_ids_from_labels"
):
    logger.warning(
        "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for"
        f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory"
    )

In [13]:
def preprocess_function(samples):
    inputs = []
    targets = []

    for i in range(len(samples["answer"])):
        inputs.append(samples["answer"][i])
        targets.append(samples["question"][i])

    inputs = [prefix + inp for inp in inputs]
    model_inputs = tokenizer(
        inputs,
        max_length=data_args.max_source_length,
        padding=padding,
        truncation=True,
    )

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, max_length=max_target_length, padding=padding, truncation=True
        )

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length" and data_args.ignore_pad_token_for_loss:
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [14]:
if training_args.do_train:
    if "train" not in raw_datasets:
        raise ValueError("--do_train requires a train dataset")
    train_dataset = raw_datasets["train"]
    if data_args.max_train_samples is not None:
        train_dataset = train_dataset.select(range(data_args.max_train_samples))
    with training_args.main_process_first(desc="train dataset map pre-processing"):
        train_dataset = train_dataset.map(
            preprocess_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on train dataset",
        )

Running tokenizer on train dataset:   0%|          | 0/7 [00:00<?, ?ba/s]

10/01/2023 22:17:27 - INFO - datasets.arrow_dataset - Caching processed dataset at C:\Users\Adam\.cache\huggingface\datasets\csv\default-a69777fe21ba2ca4\0.0.0\6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-b5ff44a89fd06d54.arrow


  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
Running tokenizer on train dataset: 100%|██████████| 7/7 [00:01<00:00,  3.64ba/s]


In [15]:
if training_args.do_eval:
    max_target_length = data_args.val_max_target_length
    if "validation" not in raw_datasets:
        raise ValueError("--do_eval requires a validation dataset")
    eval_dataset = raw_datasets["validation"]
    if data_args.max_eval_samples is not None:
        eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
    with training_args.main_process_first(
        desc="validation dataset map pre-processing"
    ):
        eval_dataset = eval_dataset.map(
            preprocess_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on validation dataset",
        )

In [16]:
if training_args.do_predict:
    max_target_length = data_args.val_max_target_length
    if "test" not in raw_datasets:
        raise ValueError("--do_predict requires a test dataset")
    predict_dataset = raw_datasets["test"]
    if data_args.max_predict_samples is not None:
        predict_dataset = predict_dataset.select(
            range(data_args.max_predict_samples)
        )
    with training_args.main_process_first(
        desc="prediction dataset map pre-processing"
    ):
        predict_dataset = predict_dataset.map(
            preprocess_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on prediction dataset",
        )

Running tokenizer on prediction dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

10/01/2023 22:17:29 - INFO - datasets.arrow_dataset - Caching processed dataset at C:\Users\Adam\.cache\huggingface\datasets\csv\default-a69777fe21ba2ca4\0.0.0\6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-34be345d700e6cdc.arrow


Running tokenizer on prediction dataset: 100%|██████████| 1/1 [00:00<00:00,  4.88ba/s]


In [17]:
label_pad_token_id = (
        -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
    )
if data_args.pad_to_max_length:
    data_collator = default_data_collator
else:
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8 if training_args.fp16 else None,
    )

# Metric
metric = load_metric("sacrebleu")

  from ipykernel import kernelapp as app


In [18]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

In [19]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    if data_args.ignore_pad_token_for_loss:
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [20]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
    if training_args.predict_with_generate
    else None,
)

In [21]:
# Training
if training_args.do_train:
    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    trainer.save_model()  # Saves the tokenizer too for easy upload

    metrics = train_result.metrics
    max_train_samples = (
        data_args.max_train_samples
        if data_args.max_train_samples is not None
        else len(train_dataset)
    )
    metrics["train_samples"] = min(max_train_samples, len(train_dataset))

    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

# Evaluation
results = {}
max_length = (
    training_args.generation_max_length
    if training_args.generation_max_length is not None
    else data_args.val_max_target_length
)
num_beams = (
    data_args.num_beams
    if data_args.num_beams is not None
    else training_args.generation_num_beams
)

[INFO|trainer.py:1634] 2023-10-01 22:17:31,721 >> ***** Running training *****
[INFO|trainer.py:1635] 2023-10-01 22:17:31,722 >>   Num examples = 6309
[INFO|trainer.py:1636] 2023-10-01 22:17:31,723 >>   Num Epochs = 10
[INFO|trainer.py:1637] 2023-10-01 22:17:31,724 >>   Instantaneous batch size per device = 2
[INFO|trainer.py:1638] 2023-10-01 22:17:31,724 >>   Total train batch size (w. parallel, distributed & accumulation) = 16
[INFO|trainer.py:1639] 2023-10-01 22:17:31,725 >>   Gradient Accumulation steps = 8
[INFO|trainer.py:1640] 2023-10-01 22:17:31,725 >>   Total optimization steps = 3940
[INFO|trainer.py:1642] 2023-10-01 22:17:31,728 >>   Number of trainable parameters = 222882816
  3%|▎         | 100/3940 [02:27<1:29:16,  1.39s/it]

{'loss': 2.4208, 'learning_rate': 9.746192893401017e-05, 'epoch': 0.25}


  5%|▌         | 200/3940 [04:48<1:25:34,  1.37s/it]

{'loss': 2.1888, 'learning_rate': 9.49238578680203e-05, 'epoch': 0.51}


  8%|▊         | 300/3940 [07:14<1:32:58,  1.53s/it]

{'loss': 2.0801, 'learning_rate': 9.238578680203046e-05, 'epoch': 0.76}


 10%|█         | 400/3940 [09:39<1:22:52,  1.40s/it]

{'loss': 1.9914, 'learning_rate': 8.984771573604062e-05, 'epoch': 1.02}


 13%|█▎        | 500/3940 [12:12<1:29:40,  1.56s/it]

{'loss': 1.7216, 'learning_rate': 8.730964467005075e-05, 'epoch': 1.27}


 15%|█▌        | 600/3940 [14:45<1:27:07,  1.57s/it]

{'loss': 1.7431, 'learning_rate': 8.477157360406092e-05, 'epoch': 1.52}


 18%|█▊        | 700/3940 [17:13<1:14:04,  1.37s/it]

{'loss': 1.6866, 'learning_rate': 8.223350253807108e-05, 'epoch': 1.78}


 20%|██        | 800/3940 [19:42<1:24:08,  1.61s/it]

{'loss': 1.5969, 'learning_rate': 7.969543147208121e-05, 'epoch': 2.03}


 23%|██▎       | 900/3940 [22:10<1:09:00,  1.36s/it]

{'loss': 1.4498, 'learning_rate': 7.715736040609137e-05, 'epoch': 2.28}


 25%|██▌       | 1000/3940 [24:33<1:09:50,  1.43s/it]

{'loss': 1.4432, 'learning_rate': 7.461928934010153e-05, 'epoch': 2.54}


 28%|██▊       | 1100/3940 [26:51<1:05:14,  1.38s/it]

{'loss': 1.4433, 'learning_rate': 7.208121827411168e-05, 'epoch': 2.79}


 30%|███       | 1200/3940 [29:13<1:12:57,  1.60s/it]

{'loss': 1.3827, 'learning_rate': 6.954314720812183e-05, 'epoch': 3.05}


 33%|███▎      | 1300/3940 [31:44<1:16:12,  1.73s/it]

{'loss': 1.2969, 'learning_rate': 6.700507614213199e-05, 'epoch': 3.3}


 36%|███▌      | 1400/3940 [34:29<1:13:50,  1.74s/it]

{'loss': 1.2737, 'learning_rate': 6.446700507614213e-05, 'epoch': 3.55}


 38%|███▊      | 1500/3940 [37:13<1:01:44,  1.52s/it]

{'loss': 1.2449, 'learning_rate': 6.192893401015228e-05, 'epoch': 3.81}


 41%|████      | 1600/3940 [39:59<59:34,  1.53s/it]  

{'loss': 1.223, 'learning_rate': 5.939086294416244e-05, 'epoch': 4.06}


 43%|████▎     | 1700/3940 [42:45<58:36,  1.57s/it]  

{'loss': 1.1408, 'learning_rate': 5.68527918781726e-05, 'epoch': 4.31}


 46%|████▌     | 1800/3940 [45:28<57:29,  1.61s/it]  

{'loss': 1.1435, 'learning_rate': 5.431472081218274e-05, 'epoch': 4.57}


 48%|████▊     | 1900/3940 [48:14<1:00:06,  1.77s/it]

{'loss': 1.1129, 'learning_rate': 5.17766497461929e-05, 'epoch': 4.82}


 51%|█████     | 2000/3940 [50:58<50:43,  1.57s/it]  

{'loss': 1.1164, 'learning_rate': 4.9238578680203045e-05, 'epoch': 5.08}


 53%|█████▎    | 2100/3940 [53:43<55:43,  1.82s/it]

{'loss': 1.048, 'learning_rate': 4.67005076142132e-05, 'epoch': 5.33}


 56%|█████▌    | 2200/3940 [56:28<52:39,  1.82s/it]

{'loss': 1.0399, 'learning_rate': 4.416243654822335e-05, 'epoch': 5.58}


 58%|█████▊    | 2300/3940 [59:13<46:38,  1.71s/it]

{'loss': 1.0095, 'learning_rate': 4.162436548223351e-05, 'epoch': 5.84}


 61%|██████    | 2400/3940 [1:02:00<46:28,  1.81s/it]

{'loss': 1.0179, 'learning_rate': 3.9086294416243655e-05, 'epoch': 6.09}


 63%|██████▎   | 2500/3940 [1:04:47<37:49,  1.58s/it]

{'loss': 0.9297, 'learning_rate': 3.654822335025381e-05, 'epoch': 6.34}


 66%|██████▌   | 2600/3940 [1:07:30<34:34,  1.55s/it]

{'loss': 0.9666, 'learning_rate': 3.401015228426396e-05, 'epoch': 6.6}


 69%|██████▊   | 2700/3940 [1:10:17<34:54,  1.69s/it]

{'loss': 0.9393, 'learning_rate': 3.147208121827411e-05, 'epoch': 6.85}


 71%|███████   | 2800/3940 [1:13:05<30:29,  1.60s/it]

{'loss': 0.9286, 'learning_rate': 2.8934010152284264e-05, 'epoch': 7.11}


 74%|███████▎  | 2900/3940 [1:15:51<29:27,  1.70s/it]

{'loss': 0.8979, 'learning_rate': 2.6395939086294418e-05, 'epoch': 7.36}


 76%|███████▌  | 3000/3940 [1:18:41<25:27,  1.62s/it]

{'loss': 0.8896, 'learning_rate': 2.385786802030457e-05, 'epoch': 7.61}


 79%|███████▊  | 3100/3940 [1:21:30<22:13,  1.59s/it]

{'loss': 0.9174, 'learning_rate': 2.1319796954314723e-05, 'epoch': 7.87}


 81%|████████  | 3200/3940 [1:24:15<20:19,  1.65s/it]

{'loss': 0.8749, 'learning_rate': 1.8781725888324874e-05, 'epoch': 8.12}


 84%|████████▍ | 3300/3940 [1:27:05<19:50,  1.86s/it]

{'loss': 0.8756, 'learning_rate': 1.6243654822335024e-05, 'epoch': 8.38}


 86%|████████▋ | 3400/3940 [1:29:47<14:30,  1.61s/it]

{'loss': 0.848, 'learning_rate': 1.3705583756345178e-05, 'epoch': 8.63}


 89%|████████▉ | 3500/3940 [1:32:34<12:30,  1.71s/it]

{'loss': 0.8546, 'learning_rate': 1.116751269035533e-05, 'epoch': 8.88}


 91%|█████████▏| 3600/3940 [1:35:24<10:16,  1.81s/it]

{'loss': 0.8297, 'learning_rate': 8.629441624365483e-06, 'epoch': 9.14}


 94%|█████████▍| 3700/3940 [1:38:11<06:29,  1.62s/it]

{'loss': 0.841, 'learning_rate': 6.091370558375635e-06, 'epoch': 9.39}


 96%|█████████▋| 3800/3940 [1:40:57<03:29,  1.50s/it]

{'loss': 0.8375, 'learning_rate': 3.5532994923857873e-06, 'epoch': 9.64}


 99%|█████████▉| 3900/3940 [1:43:45<01:07,  1.68s/it]

{'loss': 0.8366, 'learning_rate': 1.015228426395939e-06, 'epoch': 9.9}


100%|██████████| 3940/3940 [1:44:52<00:00,  1.82s/it][INFO|trainer.py:1885] 2023-10-02 00:02:24,755 >> 

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 3940/3940 [1:44:53<00:00,  1.60s/it]
[INFO|trainer.py:2693] 2023-10-02 00:02:24,909 >> Saving model checkpoint to models/t5-qg-bobby-2023-10-01
[INFO|configuration_utils.py:447] 2023-10-02 00:02:24,912 >> Configuration saved in models/t5-qg-bobby-2023-10-01\config.json


{'train_runtime': 6293.1183, 'train_samples_per_second': 10.025, 'train_steps_per_second': 0.626, 'train_loss': 1.2286052394034295, 'epoch': 10.0}


[INFO|modeling_utils.py:1637] 2023-10-02 00:02:28,140 >> Model weights saved in models/t5-qg-bobby-2023-10-01\pytorch_model.bin
[INFO|tokenization_utils_base.py:2157] 2023-10-02 00:02:28,142 >> tokenizer config file saved in models/t5-qg-bobby-2023-10-01\tokenizer_config.json
[INFO|tokenization_utils_base.py:2164] 2023-10-02 00:02:28,143 >> Special tokens file saved in models/t5-qg-bobby-2023-10-01\special_tokens_map.json


***** train metrics *****
  epoch                    =       10.0
  train_loss               =     1.2286
  train_runtime            = 1:44:53.11
  train_samples            =       6309
  train_samples_per_second =     10.025
  train_steps_per_second   =      0.626


In [22]:
if training_args.do_eval:
    logger.info("*** Evaluate ***")

    metrics = trainer.evaluate(
        max_length=max_length, num_beams=num_beams, metric_key_prefix="eval"
    )
    max_eval_samples = (
        data_args.max_eval_samples
        if data_args.max_eval_samples is not None
        else len(eval_dataset)
    )
    metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)

In [23]:
if training_args.do_predict:
    logger.info("*** Predict ***")

    predict_results = trainer.predict(
        predict_dataset,
        metric_key_prefix="predict",
        max_length=max_length,
        num_beams=num_beams,
    )
    predictions = [
        tokenizer.decode(ids, skip_special_tokens=True)
        for ids in predict_results.predictions
    ]
    with open(f"{training_args.output_dir}/predictions.txt", "w") as f:
        f.write("\n".join(predictions))
    sources = [
        tokenizer.decode(
            list(map(lambda t: 0 if t == -100 else t, ids)),
            skip_special_tokens=True,
        )
        for ids in predict_results.label_ids
    ]
    with open(f"{training_args.output_dir}/labels.txt", "w") as f:
        f.write("\n".join(sources))
    metrics = predict_results.metrics
    max_predict_samples = (
        data_args.max_predict_samples
        if data_args.max_predict_samples is not None
        else len(predict_dataset)
    )
    metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))

    trainer.log_metrics("test", metrics)
    trainer.save_metrics("test", metrics)

    if trainer.is_world_process_zero():
        if training_args.predict_with_generate:
            predictions = tokenizer.batch_decode(
                predict_results.predictions,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
            predictions = [pred.strip() for pred in predictions]
            output_prediction_file = os.path.join(
                training_args.output_dir, "generated_predictions.txt"
            )
            with open(output_prediction_file, "w", encoding="utf-8") as writer:
                writer.write("\n".join(predictions))

kwargs = {
    "finetuned_from": model_args.model_name_or_path,
    "tasks": "question generation",
}

10/02/2023 00:02:28 - INFO - __main__ - *** Predict ***


[INFO|trainer.py:2944] 2023-10-02 00:02:28,205 >> ***** Running Prediction *****
[INFO|trainer.py:2946] 2023-10-02 00:02:28,206 >>   Num examples = 702
[INFO|trainer.py:2949] 2023-10-02 00:02:28,207 >>   Batch size = 2
100%|██████████| 351/351 [04:02<00:00,  1.44it/s]

10/02/2023 00:06:31 - INFO - datasets.metric - Removing C:\Users\Adam\.cache\huggingface\metrics\sacrebleu\default\default_experiment-1-0.arrow


100%|██████████| 351/351 [04:02<00:00,  1.45it/s]


***** test metrics *****
  predict_bleu               =    21.8433
  predict_gen_len            =    17.6325
  predict_loss               =     1.0902
  predict_runtime            = 0:04:03.29
  predict_samples            =        702
  predict_samples_per_second =      2.885
  predict_steps_per_second   =      1.443
