In [17]:
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import numpy as np
from datasets import load_dataset, load_metric

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    Seq2SeqTrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version


In [18]:
# Require at least this transformers version before running the rest of the notebook.
check_min_version("4.13.0.dev0")
# Require a compatible `datasets` release; the second argument is the hint reported with the requirement.
require_version(
    "datasets>=1.8.0",
    "To fix: pip install -r examples/pytorch/translation/requirements.txt",
)
# Logger named after the current module.
logger = logging.getLogger(__name__)


In [19]:
@dataclass
class ModelArguments:
    """
    Selects the pretrained checkpoint to start from, with optional overrides for its config and tokenizer.

    Only ``model_name_or_path`` is required; every other field carries a default. The ``help`` entry in each
    field's metadata is the human-readable description of that option.
    """

    # Required: the checkpoint to load.
    model_name_or_path: str = field(
        metadata=dict(
            help="Path to pretrained model or model identifier from huggingface.co/models"
        )
    )

    # Optional overrides; when left as None the notebook falls back to `model_name_or_path`.
    config_name: Optional[str] = field(
        default=None,
        metadata=dict(
            help="Pretrained config name or path if not the same as model_name"
        ),
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata=dict(
            help="Pretrained tokenizer name or path if not the same as model_name"
        ),
    )

    # Download / loading options.
    cache_dir: Optional[str] = field(
        default=None,
        metadata=dict(
            help="Where to store the pretrained models downloaded from huggingface.co"
        ),
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata=dict(
            help="Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."
        ),
    )
    model_revision: str = field(
        default="main",
        metadata=dict(
            help="The specific model version to use (can be a branch name, tag name or commit id)."
        ),
    )
    use_auth_token: bool = field(
        default=False,
        metadata=dict(
            help="Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        ),
    )

In [20]:
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Every field has a default, so the class can be instantiated without arguments. After construction,
    ``__post_init__`` validates the extensions of ``train_file`` / ``validation_file`` and fills in
    ``val_max_target_length`` from ``max_target_length`` when it was left unset.
    """

    source_lang: str = field(
        default=None, metadata={"help": "Source language id for translation."}
    )
    target_lang: str = field(
        default=None, metadata={"help": "Target language id for translation."}
    )

    dataset_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the dataset to use (via the datasets library)."},
    )
    # NOTE: this must be a plain `field(...)`. Wrapping it in parentheses with a trailing comma turns the
    # default into a one-element tuple (always truthy) and hides the help metadata from the dataclass.
    e2e: Optional[bool] = field(
        default=False, metadata={"help": "Prepare data for e2e"}
    )
    dataset_config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The configuration name of the dataset to use (via the datasets library)."
        },
    )
    dataset_field_name: Optional[str] = field(
        default=None,
        metadata={"help": "The field name, useful when the data is stored as json."},
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a jsonlines)."}
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "An optional input evaluation data file to evaluate the metrics (sacreblue) on "
            "a jsonlines file."
        },
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "An optional input test data file to evaluate the metrics (sacreblue) on "
            "a jsonlines file."
        },
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets"},
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_source_length: Optional[int] = field(
        default=1024,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    max_target_length: Optional[int] = field(
        default=128,
        metadata={
            "help": "The maximum total sequence length for target text after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    val_max_target_length: Optional[int] = field(
        default=None,
        metadata={
            "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
            "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
            "during ``evaluate`` and ``predict``."
        },
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": "Whether to pad all samples to model maximum sentence length. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
            "efficient on GPU but very bad for TPU."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
            "value if set."
        },
    )
    num_beams: Optional[int] = field(
        default=None,
        metadata={
            "help": "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
            "which is used during ``evaluate`` and ``predict``."
        },
    )
    ignore_pad_token_for_loss: bool = field(
        default=True,
        metadata={
            "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
        },
    )
    source_prefix: Optional[str] = field(
        default=None,
        metadata={
            "help": "A prefix to add before every source text (useful for T5 models)."
        },
    )
    forced_bos_token: Optional[str] = field(
        default=None,
        metadata={
            "help": "The token to force as the first generated token after the :obj:`decoder_start_token_id`."
            "Useful for multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token "
            "needs to be the target language token.(Usually it is the target language token)"
        },
    )

    def __post_init__(self):
        """
        Validate file arguments and resolve the validation target length.

        Raises:
            AssertionError: if ``train_file`` or ``validation_file`` is set and the text after its last ``.``
                is not ``json``.
        """
        if self.train_file is not None:
            extension = self.train_file.split(".")[-1]
            assert extension == "json", "`train_file` should be a json file."
        if self.validation_file is not None:
            extension = self.validation_file.split(".")[-1]
            assert extension == "json", "`validation_file` should be a json file."
        # Fall back to the training target length when no validation-specific value was given.
        if self.val_max_target_length is None:
            self.val_max_target_length = self.max_target_length


In [21]:
# The three argument dataclasses handed to the parser; the parsed objects are unpacked in the same order.
argument_groups = (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)
parser = HfArgumentParser(argument_groups)

# All options come from the evaluation config file.
parsed_groups = parser.parse_json_file(json_file="data/config/t5_eval.json")
model_args, data_args, training_args = parsed_groups

In [22]:
# Explicit config / tokenizer names win; otherwise both are resolved from the model checkpoint itself.
config_source = model_args.config_name or model_args.model_name_or_path
tokenizer_source = model_args.tokenizer_name or model_args.model_name_or_path

# Keyword arguments shared by all three `from_pretrained` calls below.
hub_kwargs = dict(
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    use_auth_token=True if model_args.use_auth_token else None,
)

config = AutoConfig.from_pretrained(config_source, **hub_kwargs)

tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_source,
    use_fast=model_args.use_fast_tokenizer,
    **hub_kwargs,
)
# Register the highlight marker as an extra token.
tokenizer.add_tokens(["<hl>"])

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_args.model_name_or_path,
    from_tf=".ckpt" in model_args.model_name_or_path,
    config=config,
    **hub_kwargs,
)

# Resize the embedding matrix to the tokenizer's current size (it gained a token above).
model.resize_token_embeddings(len(tokenizer))

Embedding(32101, 768)

In [23]:
# Tokenization settings derived from the parsed data arguments.
max_target_length = data_args.max_target_length

if data_args.pad_to_max_length:
    padding = "max_length"
else:
    padding = False

prefix = data_args.source_prefix
if prefix is None:
    prefix = ""

# Load the dataset saved on disk; despite its name, `test_dataset` holds every split stored there.
test_dataset = datasets.load_from_disk(dataset_path="data/t5_bobby_dataset")
# Raw columns of the test split; they are dropped once the split has been tokenized.
column_names = test_dataset["test"].column_names

test_dataset



DatasetDict({
    train: Dataset({
        features: ['name', 'sourceName', 'sourceUrl', 'faqLabels', 'answer', 'question'],
        num_rows: 6309
    })
    test: Dataset({
        features: ['name', 'sourceName', 'sourceUrl', 'faqLabels', 'answer', 'question'],
        num_rows: 702
    })
})

In [24]:
def preprocess_function(samples):
    """
    Tokenize one batch of examples for answer-to-question generation.

    Each entry of ``samples["answer"]``, prepended with the notebook-level ``prefix``, becomes a model input;
    the entry of ``samples["question"]`` at the same position becomes its label sequence.

    Relies on the notebook-level names ``tokenizer``, ``data_args``, ``padding``, ``prefix`` and
    ``max_target_length``.

    Returns:
        The tokenizer output for the inputs, with an added ``"labels"`` entry holding the target token ids.
    """
    answers = samples["answer"]
    targets = [samples["question"][idx] for idx in range(len(answers))]
    sources = [prefix + answer for answer in answers]

    model_inputs = tokenizer(
        sources,
        max_length=data_args.max_source_length,
        padding=padding,
        truncation=True,
    )

    # Targets are tokenized inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=max_target_length,
            padding=padding,
            truncation=True,
        )

    label_ids = labels["input_ids"]
    # Only when padding was applied here: swap pad ids for -100 so padded positions can be ignored in the loss.
    if padding == "max_length" and data_args.ignore_pad_token_for_loss:
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token == pad_id else token for token in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs


In [25]:
# Prediction needs a "test" split.
if "test" not in test_dataset:
    raise ValueError("--do_predict requires a test dataset")

predict_dataset = test_dataset["test"]

# Optionally keep only the first N examples.
sample_cap = data_args.max_predict_samples
if sample_cap is not None:
    predict_dataset = predict_dataset.select(range(sample_cap))

# Options for the tokenization pass; the raw columns are removed so only the tokenizer outputs remain.
map_options = dict(
    batched=True,
    num_proc=data_args.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not data_args.overwrite_cache,
    desc="Running tokenizer on prediction dataset",
)
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
    predict_dataset = predict_dataset.map(preprocess_function, **map_options)

# Return the three model columns as torch tensors when rows are accessed.
predict_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

Loading cached processed dataset at d:\GitHub\bachelor-thesis\data\t5_bobby_dataset\test\cache-5993eb26984c59f5.arrow


In [26]:
# Display the tokenized prediction split to confirm its columns and row count.
predict_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 702
})

In [29]:

from tqdm import tqdm

# Empty list intended to hold the tab-separated prediction lines.
results = []

# Try generation on the first example only: add a batch dimension and sample three candidate sequences.
first_example = predict_dataset[0]
input_ids = first_example["input_ids"].unsqueeze(0)
attention_mask = first_example["attention_mask"].unsqueeze(0)
q1, q2, q3 = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=64,
    num_return_sequences=3,
    do_sample=True,
)

# Decode each sampled sequence to text, dropping special tokens, and print them on one tab-separated line.
predictions = [
    tokenizer.decode(token_ids, skip_special_tokens=True)
    for token_ids in (q1, q2, q3)
]
print("\t".join(predictions))

Wer sollte eine Gutschein-Nachnahme veranlassen, wenn ich mein Kind nicht in die Kita betrete?	Was passiert, wenn mein Kind nicht in die Kita kommt?	Was ist bei einer längerfristigen Nichtnutzung des Kitaplatzes zu tun?


In [None]:

# Generation settings and output location for the full prediction run.
NUM_QUESTIONS_PER_ANSWER = 3
MAX_QUESTION_LENGTH = 64
PREDICTIONS_PATH = "data/t5results/t5_predictions.txt"

# Sampling is stochastic; seed the RNGs so re-running this cell yields the same predictions file.
set_seed(training_args.seed)

# Start from an empty list here so that re-running this cell does not append duplicate lines.
results = []
for row in tqdm(predict_dataset):
    # Each row is a single example; add a leading batch dimension for `generate`.
    input_ids = row["input_ids"].unsqueeze(0)
    attention_mask = row["attention_mask"].unsqueeze(0)
    generated = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=MAX_QUESTION_LENGTH,
        num_return_sequences=NUM_QUESTIONS_PER_ANSWER,
        do_sample=True,
    )
    # One decoded string per returned sequence, joined into a single tab-separated line per row.
    predictions = [
        tokenizer.decode(ids, skip_special_tokens=True)
        for ids in generated
    ]
    results.append("\t".join(predictions))

# Create the output directory if needed and write UTF-8 explicitly, so non-ASCII characters in the
# generated text do not depend on the platform's default encoding.
os.makedirs(os.path.dirname(PREDICTIONS_PATH), exist_ok=True)
with open(PREDICTIONS_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(results))