In [1]:
# One-time environment setup: uncomment and run these on a fresh machine.
# NOTE(review): GitHub no longer serves the unauthenticated git:// protocol,
# so the nlpt5 install line uses https://.
# NOTE(review): the imports cell also uses `nlp`; presumably it is pulled in as
# a dependency of nlpt5 — confirm, or add an explicit install line.
# !pip install transformers==3.0.0
# !pip install datasets
# !pip install --user -U nltk
# !python -m nltk.downloader punkt
# !pip install git+https://github.com/adamnpeace/nlpt5.git

In [1]:
import torch
from transformers import T5Tokenizer, HfArgumentParser
import nlp
from pathlib import Path
import json
import datasets
import pandas as pd
from nlpt5 import DataProcessor
from nlpt5 import run_qg
from nlpt5 import pipeline

In [2]:
# Start from the stock t5-base tokenizer and register the two extra marker
# tokens used by this notebook.
tokenizer = T5Tokenizer.from_pretrained('t5-base')
tokenizer.add_tokens(['<sep>', '<hl>'])

# Older transformers releases (including the 3.0.0 pinned in the setup cell)
# only log an error and return without writing anything when the target
# directory does not exist, so create it before saving.
Path('t5_qg_tokenizer').mkdir(parents=True, exist_ok=True)

# Last expression of the cell: displays the tuple of files that were written.
tokenizer.save_pretrained('t5_qg_tokenizer')

('t5_qg_tokenizer/spiece.model',
 't5_qg_tokenizer/special_tokens_map.json',
 't5_qg_tokenizer/added_tokens.json')

In [3]:
# Length limits handed to DataProcessor when the datasets are built.
max_source_length, max_target_length = 512, 32

In [4]:
# Passed to DataProcessor as `model_type` when the datasets are built.
# NOTE(review): the training arguments use "t5" for their model_type — confirm
# that DataProcessor really expects the checkpoint-style name here.
model_type = "t5-base"

In [5]:
# Build the processed train/validation datasets once and cache them on disk.
# The work is skipped only when BOTH cached files already exist, so a run that
# stopped between the two saves is repaired on the next execution.
cache_files = {
    'train': 'data/dataset_train.pt',
    'validation': 'data/dataset_valid.pt',
}

if not all(Path(cache_path).exists() for cache_path in cache_files.values()):
    # Raw JSON column names -> the column names written to the CSV files.
    column_map = {'passages': 'source_text', 'clues': 'target_text'}
    text_columns = list(column_map.values())

    # Load each split, rename the two columns and drop everything else.
    df_train = pd.read_json('data/data_train.json').rename(columns=column_map)[text_columns]
    df_dev = pd.read_json('data/data_dev.json').rename(columns=column_map)[text_columns]

    # Round-trip through CSV so the `datasets` CSV loader can build both splits.
    df_dev.to_csv('df_dev.csv', index=False)
    df_train.to_csv('df_train.csv', index=False)

    dataset_raw = datasets.load_dataset(
        'csv',
        data_files={'train': 'df_train.csv', 'validation': 'df_dev.csv'},
    )

    processor = DataProcessor(tokenizer, model_type=model_type,
                              max_source_length=max_source_length,
                              max_target_length=max_target_length)

    # NOTE(review): presumably this tokenizes source/target text into the
    # columns selected below — confirm against nlpt5.DataProcessor.
    dataset = processor.process(dataset_raw)

    # Expose only these columns, as torch tensors.
    columns = ['source_ids', 'target_ids', 'attention_mask']
    dataset.set_format(type='torch', columns=columns)

    # Persist each split under its cache path.
    for split_name, cache_path in cache_files.items():
        torch.save(dataset[split_name], cache_path)

In [6]:
# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()

In [7]:
# Fine-tuning configuration for run_qg, written as themed groups. The merged
# dict keeps the keys in the order they are listed here.

# Starting checkpoint and tokenizer.
model_args = {
    "model_name_or_path": "valhalla/t5-small-e2e-qg",
    "model_type": "t5",
    "tokenizer_name_or_path": "t5_qg_tokenizer",
}

# Output directory and dataset file paths.
path_args = {
    "output_dir": "models/t5-small-e2e-qg-7k",
    "train_file_path": "data/dataset_train.pt",
    "valid_file_path": "data/dataset_valid.pt",
}

# Batch sizes, optimisation settings and the random seed.
optim_args = {
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "gradient_accumulation_steps": 8,
    "learning_rate": 1e-4,
    "num_train_epochs": 10,
    "seed": 42,
}

# Train/eval switches and logging.
run_args = {
    "do_train": True,
    "do_eval": False,
    "evaluate_during_training": False,
    "logging_steps": 100,
    "overwrite_output_dir": True,
}

args_dict = {**model_args, **path_args, **optim_args, **run_args}

# Release cached GPU memory, then launch the run.
torch.cuda.empty_cache()
run_qg(args_dict)

05/18/2021 13:02:44 - INFO - transformers.training_args -   PyTorch: setting up devices
05/18/2021 13:02:45 - INFO - nlpt5.run_qg -   Training/evaluation parameters TrainingArguments(output_dir='t5-small-e2e-qg-7ktest', overwrite_output_dir=True, do_train=True, do_eval=False, do_predict=False, evaluate_during_training=False, per_device_train_batch_size=16, per_device_eval_batch_size=16, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=8, learning_rate=0.0001, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10, max_steps=-1, warmup_steps=0, logging_dir='runs/May18_13-02-44_monolith', logging_first_step=False, logging_steps=100, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, dataloader_drop_last=False)
05/18/2021 13:02:45 - INFO - transformers.tokenization_utils_base -   Model name 't5_qg_tokenizer' not found in mod

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=359.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=359.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=359.0, style=ProgressStyle(description_wi…

05/18/2021 13:04:01 - INFO - transformers.trainer -   {'loss': 3.510298437476158, 'learning_rate': 7.727272727272727e-05, 'epoch': 2.267409470752089, 'step': 100}





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=359.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=359.0, style=ProgressStyle(description_wi…





KeyboardInterrupt: 