# Init

In [36]:
import numpy as np

import logging
logger = logging.getLogger(__name__)

from datasets import load_dataset
from datasets import Dataset

from transformers import (
    BertTokenizerFast, 
    AutoModelForCausalLM, 
    PreTrainedModel,
    PreTrainedTokenizerFast, 
    DataCollatorForLanguageModeling,
    BatchEncoding,
    GPT2LMHeadModel,
    GenerationConfig,
    get_scheduler,
    TrainingArguments,
)
from huggingface_hub import notebook_login

import torch
from torch import Tensor
from torch.optim.lr_scheduler import LambdaLR
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from tqdm.auto import tqdm, trange

from typing import Tuple
import random
from datetime import datetime
import os

import glob
import re
import shutil

torch.cuda.empty_cache()

In [3]:
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (manager-core).
Your token has been saved to E:\huggingface_cache\token
Login successful


In [None]:
tokenizer: BertTokenizerFast = BertTokenizerFast.from_pretrained('bert-base-chinese')
model: PreTrainedModel = AutoModelForCausalLM.from_pretrained('ckiplab/gpt2-base-chinese')

train_dataset = load_dataset('asadfgglie/lccc_base_zh', use_auth_token=True, split='validation')
test_dataset = load_dataset('asadfgglie/lccc_base_zh', use_auth_token=True, split='test')

def tokenize_function(example) -> BatchEncoding:
    return tokenizer(''.join('[SEP]'.join(example["dialog"]).split()), truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function)
tokenized_test_dataset = test_dataset.map(tokenize_function)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [40]:
def set_seed(seed) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [2]:
def sorted_checkpoints(args: TrainingArguments, use_mtime=False) -> list[str]:
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format('checkpoint')))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format('checkpoint'), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted

def rotate_checkpoints(args: TrainingArguments, use_mtime=False) -> None:
    """Check if we should delete older checkpoint(s)"""
    if args.save_total_limit is None:
        return
    if args.save_total_limit <= 0:
        return

    checkpoints_sorted = sorted_checkpoints(args, use_mtime)
    if len(checkpoints_sorted) <= args.save_total_limit:
        return

    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)

In [3]:
args = TrainingArguments(
    max_grad_norm=1.0,
    num_train_epochs=1,
    logging_steps=3,
    output_dir='model_log',
    logging_dir='model_log/log',
    max_steps=10,
    logging_first_step=True,
    no_cuda=True,
    evaluation_strategy='steps',
    do_train=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    seed=random.randint(0, 999),
    save_total_limit=2,
    overwrite_output_dir=True
)
args.max_eval_steps = 3
args.should_continue = False
args.tokenizer_path_or_name = 'bert-base-chinese'
args.model_path_or_name = 'ckiplab/gpt2-base-chinese'
logger.debug("Training/evaluation parameters %s", str(args))

In [None]:
tokenizer: BertTokenizerFast = BertTokenizerFast.from_pretrained(args.tokenizer_path_or_name)
model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(args.model_path_or_name)

config = GenerationConfig.from_model_config(model.generation_config)
config.max_new_tokens=50
config.do_sample=True
config.top_k=50
config.pad_token_id=tokenizer.pad_token_id
model.generation_config = config

# Train with Trainer()

In [None]:
samples = tokenized_train_dataset[:8]
samples = {k: v for k, v in samples.items() if k not in ['dialog']}
[len(x) for x in samples["input_ids"]]

In [None]:
samples.keys()

In [None]:
data_collator(samples["input_ids"])

In [None]:
training_args = TrainingArguments("test-trainer")

In [None]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

# Train without Trainer()

In [5]:
logging.basicConfig(
        format="%(asctime)s -%(levelname)s- %(name)s - %(message)s",
        datefmt="%Y/%m/%d %H:%M:%S",
        level=logging.INFO, force=True
)

In [39]:
if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) 
    and args.do_train 
    and args.overwrite_output_dir
    and not args.should_continue):
    for checkpoint in sorted_checkpoints(args):
        shutil.rmtree(checkpoint)
    os.remove(os.path.join(args.output_dir, 'eval_results.txt'))
    shutil.rmtree(args.logging_dir)

In [None]:
set_seed(args.seed)

In [25]:
tokenized_train_dataset = tokenized_train_dataset.remove_columns('dialog')
tokenized_train_dataset.set_format("torch")
train_dataloader = DataLoader(tokenized_train_dataset, shuffle=True, batch_size=args.per_device_train_batch_size, collate_fn=data_collator)

tokenized_test_dataset = tokenized_test_dataset.remove_columns('dialog')
tokenized_test_dataset.set_format("torch")
test_dataloader = DataLoader(tokenized_test_dataset, shuffle=True, batch_size=args.per_device_eval_batch_size, collate_fn=data_collator)

In [26]:
optimizer = AdamW(model.parameters(), lr=5e-5)

In [27]:
num_training_steps = min(args.num_train_epochs * len(train_dataloader), args.max_steps) if args.max_steps > 0 else args.num_train_epochs * len(train_dataloader)
lr_scheduler: LambdaLR = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [28]:
try:
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    if not args.no_cuda:
        model.to(device)
    else:
        device = torch.device("cpu")
except:
    device = torch.device("cpu")

In [29]:
def evaluate(max_eval_steps=0) -> dict[str, Tensor]:
    eval_output_dir = args.output_dir

    nb_eval_steps = 0
    eval_loss = 0.0
    model.eval()
    
    batch_bar = tqdm(leave=False, iterable=range(min(max_eval_steps, len(test_dataloader)) if max_eval_steps > 0 else len(test_dataloader)), desc="Batch evaluating step")
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        if batch['input_ids'].shape[1] > 1024: continue

        with torch.no_grad():
            outputs = model(**batch)
            loss: Tensor = outputs.loss
            eval_loss += loss.mean().item()
        
        nb_eval_steps += 1

        batch_bar.update()
        
        if max_eval_steps > 0 and nb_eval_steps > max_eval_steps:
            break
    
    batch_bar.close()

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "a") as writer:
        for key in sorted(result.keys()):
            writer.write("%s = %s\n" % (key, str(result[key])))
    
    return result

In [30]:
if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) 
    and args.do_train 
    and not args.overwrite_output_dir
    and not args.should_continue):
    raise ValueError(
        "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir))

In [31]:
if args.should_continue:
    _sorted_checkpoints = sorted_checkpoints(args)
    if len(sorted_checkpoints) == 0:
        raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
    else:
        args.model_path_or_name = _sorted_checkpoints[-1]
        args.model_train_step_from = int(_sorted_checkpoints[-1].split('-')[-1])

In [21]:
tokenizer: BertTokenizerFast = BertTokenizerFast.from_pretrained(args.tokenizer_path_or_name)
model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(args.model_path_or_name)

config = GenerationConfig.from_model_config(model.generation_config)
config.max_new_tokens=50
config.do_sample=True
config.top_k=50
config.pad_token_id=tokenizer.pad_token_id
model.generation_config = config

In [32]:
# Check if saved optimizer or scheduler states exist
if (
    args.model_path_or_name
    and os.path.isfile(os.path.join(args.model_path_or_name, "optimizer.pt"))
    and os.path.isfile(os.path.join(args.model_path_or_name, "scheduler.pt"))
):
    # Load in optimizer and scheduler states
    optimizer.load_state_dict(torch.load(os.path.join(args.model_path_or_name, "optimizer.pt")))
    lr_scheduler.load_state_dict(torch.load(os.path.join(args.model_path_or_name, "scheduler.pt")))

In [37]:
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_dataset))
logger.info("  Num Epochs = %d", args.num_train_epochs)
logger.info("  Total optimization steps = %d", num_training_steps)

tb_writer = SummaryWriter(log_dir=args.logging_dir)

epoch_bar = trange(args.num_train_epochs, desc="Epoch")
batch_bar = tqdm(train_dataloader, desc="Batch training step")
progress_bar = trange(num_training_steps, desc="Total training step")

global_step = 0
train_loss = 0
logging_loss = 0

model.zero_grad()

for epoch in epoch_bar:
    for batch in batch_bar:
        progress_bar.update()
        global_step += 1

        batch = {k: v.to(device) for k, v in batch.items()}
        
        if batch['input_ids'].shape[1] > 1024: continue

        model.train()
        outputs = model(**batch)
        loss: Tensor = outputs.loss
        loss.backward()
        train_loss += loss.to('cpu').item()

        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        if args.logging_steps > 0 and global_step % args.logging_steps == 0:
            logger.info(f"loss: {(train_loss - logging_loss) / args.logging_steps}")
            logger.info(f'lr: {np.array(lr_scheduler.get_last_lr())}')

            tb_writer.add_scalar("lr", np.array(lr_scheduler.get_last_lr()), global_step)
            tb_writer.add_scalar("loss", (train_loss - logging_loss) / args.logging_steps, global_step)
            logging_loss = train_loss

            
            checkpoint_prefix = "checkpoint"
            output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
            os.makedirs(output_dir, exist_ok=True)
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)

            torch.save(args, os.path.join(output_dir, "training_args.bin"))

            torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
            torch.save(lr_scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

            rotate_checkpoints(args)
        
        if (args.evaluation_strategy == 'steps' 
            and args.eval_steps is not None 
            and global_step % args.eval_steps == 0 
            and global_step > args.eval_delay):
            results = evaluate(args.max_eval_steps)
            for key, value in results.items():
                logger.info(f"eval_{key}: {value.item()}")
                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
        
        if args.max_steps > 0 and global_step > args.max_steps:
            break

    if args.evaluation_strategy == 'epoch' and epoch > args.eval_delay:
        results = evaluate(args.max_eval_steps)
        for key, value in results.items():
            logger.info(f"eval_{key}: {value.item()}")
            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
    
    if args.max_steps > 0 and global_step > args.max_steps:
        break
    
    batch_bar.reset()

tb_writer.close()

batch_bar.close()
epoch_bar.close()
progress_bar.close()


# Create output directory if needed
os.makedirs(args.output_dir, exist_ok=True)

logger.info("Saving model checkpoint to %s", args.output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = (
    model.module if hasattr(model, "module") else model
)  # Take care of distributed/parallel training
model_to_save.save_pretrained(args.output_dir)
tokenizer.save_pretrained(args.output_dir)

# Good practice: save your training arguments together with the trained model
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

2023/03/19 00:24:13 -INFO- __main__ - ***** Running training *****
2023/03/19 00:24:13 -INFO- __main__ -   Num examples = 20000
2023/03/19 00:24:13 -INFO- __main__ -   Num Epochs = 1
2023/03/19 00:24:13 -INFO- __main__ -   Total optimization steps = 10


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Batch training step:   0%|          | 0/2500 [00:00<?, ?it/s]

Total training step:   0%|          | 0/10 [00:00<?, ?it/s]

2023/03/19 00:24:52 -INFO- __main__ - loss: 5.4634772936503095
2023/03/19 00:24:52 -INFO- __main__ - lr: [5.e-06]


Batch evaluating step:   0%|          | 0/3 [00:00<?, ?it/s]

2023/03/19 00:25:06 -INFO- __main__ - eval_perplexity: 163.0620880126953
2023/03/19 00:25:33 -INFO- __main__ - loss: 5.02346404393514
2023/03/19 00:25:33 -INFO- __main__ - lr: [0.]


Batch evaluating step:   0%|          | 0/3 [00:00<?, ?it/s]

2023/03/19 00:25:47 -INFO- __main__ - eval_perplexity: 202.59303283691406
2023/03/19 00:26:12 -INFO- __main__ - loss: 5.480917135874431
2023/03/19 00:26:12 -INFO- __main__ - lr: [0.]
2023/03/19 00:26:14 -INFO- __main__ - Deleting older checkpoint [model_log\checkpoint-3] due to args.save_total_limit


Batch evaluating step:   0%|          | 0/3 [00:00<?, ?it/s]

2023/03/19 00:26:22 -INFO- __main__ - eval_perplexity: 158.9920654296875
2023/03/19 00:26:42 -INFO- __main__ - Saving model checkpoint to model_log


# Chat with Bot

In [42]:
args = TrainingArguments(
    output_dir='model',
    no_cuda=False,
    seed=random.randint(0, 999),
)
args.tokenizer_path_or_name = 'bert-base-chinese'
set_seed(args.seed)

In [66]:
model_name = args.output_dir
try:
    model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer: BertTokenizerFast = BertTokenizerFast.from_pretrained(model_name, padding_side='left')
except:
    model_name = sorted_checkpoints(args)[-1]

try:
    model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer: BertTokenizerFast = BertTokenizerFast.from_pretrained(model_name, padding_side='left')
except:
    raise ValueError(f'There\'s no model in {args.output_dir}')

print(f'load model: {model_name}')

load model: model


In [11]:
device = torch.device('cuda') if torch.cuda.is_available() and not args.no_cuda else torch.device('cpu')

In [None]:
print(model.generation_config)

In [61]:
model.to(device)
model.eval()

chat_history = []
for step in range(6):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    chat_history.append(input("User:"))
    print("User:", chat_history[-1])

    user_inputs = tokenizer(tokenizer.sep_token + tokenizer.sep_token.join(chat_history), return_tensors='pt')
    user_inputs: dict[str, Tensor] = {k: v.to(device) for (k, v) in user_inputs.items()}

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(generation_config=config, **user_inputs)
    
    chat_history.append(''.join(tokenizer.decode(chat_history_ids[:, user_inputs['input_ids'].shape[-1]:][0], skip_special_tokens=True).split()))

    # pretty print last ouput tokens from bot
    print("GPT: {}".format(chat_history[-1]), flush=True)

user_inputs = {k: v.to('cpu') for (k, v) in user_inputs.items()}
del user_inputs
model.to('cpu')
print()

User: 我好帥
GPT: 可以啊
User: 沒錯
GPT: 沒錯啦
User: 我最棒了
GPT: 一次沒錯！
User: 要不要去看電影?
GPT: 有錯
User: 哈哈
GPT: 一起
User: 甚麼完愣
GPT: 別的時候我沒聊一下了



In [74]:
class a:
    def __init__(self) -> None:
        self.a = 0
    def __str__(self) -> str:
        return str(vars(self))
t=TrainingArguments(output_dir='')
t.b=1
print(vars(t))

{'output_dir': '', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': <IntervalStrategy.NO: 'no'>, 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': -1, 'lr_scheduler_type': <SchedulerType.LINEAR: 'linear'>, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'passive', 'log_on_each_node': True, 'logging_dir': 'runs\\Mar19_18-59-20_DESKTOP-asadfgglie', 'logging_strategy': <IntervalStrategy.STEPS: 'steps'>, 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_strategy': <IntervalStrategy.STEPS: 'ste

In [75]:
shutil.rmtree('model')

In [76]:
BertTokenizerFast.from_pretrained('tmp', padding_side='left')

OSError: Can't load tokenizer for 'tmp'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'tmp' is the correct path to a directory containing all relevant files for a BertTokenizerFast tokenizer.