In [17]:
import os
import pathlib

from datasets import load_dataset, concatenate_datasets
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import wandb

from hf_wrapper import GPTForSequenceClassification
from tokenizer import load_tokenizer
from utils import flatten_multi_features, load_random_from_pretrained_model, compute_metrics

In [18]:
# --- Filesystem layout (all paths are relative to the notebook's working directory) ---
hf_cache = pathlib.Path('./cache')                        # cache_dir handed to datasets.load_dataset
training_checkpoints = pathlib.Path('./cache/checkpoints')  # parent of the per-run Trainer output dirs
tokenizer_prefix = pathlib.Path('./cache/tokenizers')       # directory holding the BPE vocab/merges files

# Checkpoints used as the starting point for the orthographic ("normal") and IPA models.
normal_checkpoint_location = pathlib.Path('./cache/checkpoints/russian_polish_normal_12_5_50k/ckpt.pt')
ipa_checkpoint_location = pathlib.Path('./cache/checkpoints/russian_polish_ipa_12_5_50k/ckpt.pt')

# File-name stems of the tokenizers; '-vocab.json' / '-merges.txt' are appended at load time.
normal_tokenizer_prefix = 'bpe-rus-pol-normal-number-preservation'
ipa_tokenizer_prefix = 'bpe-rus-pol-ipa-number-preservation'

# Hugging Face Hub dataset id for each language code.
dataset_name = dict(
    rus='iggy12345/russian-xnli-ipa-rosetta',
    pol='iggy12345/cdsc-e-ipa-epitran',
)

# --- Fine-tuning hyperparameters ---
epochs = 3
batch_size = 16          # used for both the train and eval per-device batch size
learning_rate = 2e-5
context_size = 1024      # max_length passed to the tokenizer when truncating

In [19]:
def load_and_preprocess(lang: str, ipa: bool, split: str, tokenizer):
    """Load one split of a language's dataset and tokenize it for classification.

    Args:
        lang: Key into ``dataset_name`` (``'rus'`` or ``'pol'``). ``'pol'``
            selects the ``sentence_A``/``sentence_B`` columns; any other value
            selects ``hypothesis``/``premise``.
        ipa: When true, read the IPA variant of each text column (the column
            name with ``-phoneme`` appended for ``'pol'``, ``-epitran``
            otherwise); when false, read the plain columns.
        split: Split name forwarded to ``load_dataset``.
        tokenizer: Callable invoked as
            ``tokenizer(features, truncation=True, max_length=context_size)``.

    Returns:
        The dataset returned by ``Dataset.map`` after batched tokenization,
        with a ``label`` entry copied from the source examples.

    NOTE(review): the suffix mapping is taken as-is from the code; the actual
    column names in the two Hub datasets are not visible here — confirm.
    """
    raw_split = load_dataset(dataset_name[lang], split=split, cache_dir=str(hf_cache))

    # Column names and IPA suffix differ between the Polish dataset and the rest.
    if lang == 'pol':
        text_columns, ipa_suffix = ('sentence_A', 'sentence_B'), 'phoneme'
    else:
        text_columns, ipa_suffix = ('hypothesis', 'premise'), 'epitran'

    if ipa:
        fields = [f'{name}-{ipa_suffix}' for name in text_columns]
    else:
        fields = list(text_columns)

    def preprocess(examples):
        # Merge the selected text columns into one input per example, then tokenize.
        batch = tokenizer(
            flatten_multi_features(examples, fields),
            truncation=True,
            max_length=context_size,
        )
        batch['label'] = examples['label']
        return batch

    return raw_split.map(preprocess, batched=True, num_proc=os.cpu_count())

In [24]:
# wandb project name; also used as the stem of the Trainer output directory.
project_name = "debug-russian-polish-small-finetuning-xnli-random-initial-epitran"

In [25]:
def train_model(ipa: bool) -> Trainer:
    """Fine-tune a 3-class sequence classifier on the combined rus + pol data.

    Loads the tokenizer and base model for the requested variant, wraps the
    base model in ``GPTForSequenceClassification``, concatenates the Russian
    and Polish train/validation splits, and runs ``Trainer.train()`` while
    logging to a wandb run named ``'ipa'`` or ``'normal'``.

    Args:
        ipa: Select the IPA checkpoint/tokenizer/columns when true, the
            orthographic ("normal") ones when false.

    Returns:
        The ``Trainer`` after training. ``load_best_model_at_end=True`` is
        set, with ``precision`` as the selection metric.

    Raises:
        Whatever the loaders or ``Trainer`` raise. The wandb run is always
        closed first (with a non-zero exit code on failure), so an error
        does not leave a dangling run in the kernel.
    """
    variant = 'ipa' if ipa else 'normal'
    checkpoint = ipa_checkpoint_location if ipa else normal_checkpoint_location

    temporary_output_dir = training_checkpoints / f"{project_name}-{variant}/"
    temporary_output_dir.mkdir(parents=True, exist_ok=True)

    bpe_stem = ipa_tokenizer_prefix if ipa else normal_tokenizer_prefix
    vocab_path = tokenizer_prefix / f'{bpe_stem}-vocab.json'
    merges_path = tokenizer_prefix / f'{bpe_stem}-merges.txt'
    tokenizer = load_tokenizer(vocab_path, merges_path)

    # NOTE(review): judging by the helper's name this yields randomly
    # initialised weights shaped like the checkpoint — confirm in utils.
    base_model = load_random_from_pretrained_model(checkpoint, 'cuda')
    # Propagate the tokenizer's padding settings onto the model config.
    base_model.config.pad_token_id = tokenizer.pad_token_id
    base_model.config.padding_side = tokenizer.padding_side
    model = GPTForSequenceClassification(base_model, num_classes=3).to('cuda')

    rus_train_dataset = load_and_preprocess('rus', ipa, 'train', tokenizer)
    pol_train_dataset = load_and_preprocess('pol', ipa, 'train', tokenizer)
    train_dataset = concatenate_datasets([rus_train_dataset, pol_train_dataset])

    rus_eval_dataset = load_and_preprocess('rus', ipa, 'validation', tokenizer)
    pol_eval_dataset = load_and_preprocess('pol', ipa, 'validation', tokenizer)
    eval_dataset = concatenate_datasets([rus_eval_dataset, pol_eval_dataset])

    training_args = TrainingArguments(
        eval_strategy="steps",
        eval_steps=1000,
        output_dir=str(temporary_output_dir),
        save_strategy='steps',
        save_steps=1000,  # same cadence as eval_steps so each evaluation has a matching checkpoint
        metric_for_best_model="precision",
        load_best_model_at_end=True,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        logging_steps=100,
        fp16=True,
        warmup_ratio=0.3,
        save_safetensors=False,
        # disable_tqdm=True,
    )

    wrun = wandb.init(entity='aaronjencks-the-ohio-state-university', project=project_name, name=variant)
    try:
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            # `tokenizer=` is deprecated on Trainer.__init__ (it emits the
            # FutureWarning seen in this notebook's output); `processing_class`
            # is the replacement keyword.
            processing_class=tokenizer,
            data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
            compute_metrics=compute_metrics,
        )

        print("Training model")
        trainer.train()
    except BaseException:
        # Close the run as failed so it is not left open in the kernel.
        wrun.finish(exit_code=1)
        raise
    else:
        wrun.finish()

    return trainer

In [30]:
def finetune_transcription(eval_lang: str, ipa: bool, model: Trainer):
    """Evaluate a trained ``Trainer`` on one language's validation split.

    Despite its name, this function performs no training: it loads the
    tokenizer for the requested variant, builds the validation dataset,
    calls ``model.evaluate`` inside a dedicated wandb run, and prints the
    resulting metrics.

    Args:
        eval_lang: ``'rus'``, ``'pol'``, or ``'both'`` (concatenation of the
            Russian and Polish validation splits).
        ipa: Select the IPA tokenizer/columns when true, the orthographic
            ones when false. Should match the variant ``model`` was trained on.
        model: The ``Trainer`` returned by ``train_model`` (the parameter is
            named ``model`` but ``evaluate`` is called on it directly).

    Returns:
        The metrics returned by ``model.evaluate`` (also printed).

    Raises:
        Whatever dataset loading or evaluation raises. The wandb run is
        closed first, with a non-zero exit code on failure.
    """
    variant = 'ipa' if ipa else 'normal'
    print('finetuning on {} {}'.format(eval_lang, variant))

    bpe_stem = ipa_tokenizer_prefix if ipa else normal_tokenizer_prefix
    vocab_path = tokenizer_prefix / f'{bpe_stem}-vocab.json'
    merges_path = tokenizer_prefix / f'{bpe_stem}-merges.txt'
    tokenizer = load_tokenizer(vocab_path, merges_path)

    if eval_lang == 'both':
        rus_eval_dataset = load_and_preprocess('rus', ipa, 'validation', tokenizer)
        pol_eval_dataset = load_and_preprocess('pol', ipa, 'validation', tokenizer)
        eval_dataset = concatenate_datasets([rus_eval_dataset, pol_eval_dataset])
    else:
        eval_dataset = load_and_preprocess(eval_lang, ipa, 'validation', tokenizer)

    wrun = wandb.init(entity='aaronjencks-the-ohio-state-university', project=project_name, name=f'{eval_lang}-{variant}')
    try:
        print(f"Final evaluation on {eval_lang}")
        results = model.evaluate(eval_dataset=eval_dataset)
        print(results)
    except BaseException:
        # Close the run as failed so it is not left open in the kernel.
        wrun.finish(exit_code=1)
        raise
    else:
        wrun.finish()

    return results


In [22]:
# Train the orthographic ("normal") variant; `model` holds the returned Trainer.
model = train_model(ipa=False)

number of parameters: 123.35M


0,1
eval/accuracy,▁▁▂▂▃▃▄▅▅▆▆▅▆▆▆▇▇▇▇▇▆▇▇█▇▇▇██▇█
eval/f1,▁▁▁▃▄▃▅▄▆▆▇▃▇▆▅▇▇▆▆█▅▇▇█▇▇▇██▇▇
eval/loss,█▇▆▅▅▅▄▅▃▄▂▅▂▃▃▂▂▃▃▂▃▂▂▁▂▂▃▁▁▁▁
eval/precision,▁▁▂▂▃▃▃▄▅▅▆▇▆▆█▇▇▆▆▇▇▇▆▇▇▇▇█▇▇█
eval/recall,▁▁▂▂▃▃▄▅▅▆▆▅▆▆▆▇▇▇▇▇▆▇▇█▇▇▇██▇█
eval/runtime,▃▃▃▃▃▃▃▃▃▃▂▁▂▁▂▁▁▁▃▃▃▃▃▃▃▁▁▁▃▃█
eval/samples_per_second,▆▆▆▆▆▆▆▆▆▆▇█▇█▇███▆▆▆▆▆▆▆███▆▆▁
eval/steps_per_second,▆▆▆▆▆▆▆▆▆▆▇█▇█▇███▆▆▆▆▆▆▆███▆▆▁
train/epoch,▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇██

0,1
eval/accuracy,0.5004
eval/f1,0.48737
eval/loss,0.98853
eval/precision,0.52566
eval/recall,0.5004
eval/runtime,3.3262
eval/samples_per_second,748.599
eval/steps_per_second,46.9
train/epoch,1.25379
train/global_step,31400.0


  trainer = Trainer(


Training model


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1000,1.1164,1.102437,0.353295,0.366824,0.353295,0.357517
2000,1.11,1.110719,0.418338,0.376648,0.418338,0.361614
3000,1.0995,1.052309,0.47106,0.428263,0.47106,0.378801
4000,1.0902,1.02527,0.495702,0.495177,0.495702,0.49158
5000,1.0568,1.012704,0.482521,0.497729,0.482521,0.471663
6000,1.0616,0.989607,0.519771,0.520287,0.519771,0.520007
7000,1.0339,0.971783,0.525215,0.511908,0.525215,0.514934
8000,1.0265,0.956076,0.540688,0.528632,0.540688,0.501979
9000,1.0052,0.94301,0.552722,0.560097,0.552722,0.555356
10000,1.0079,0.942268,0.556734,0.570033,0.556734,0.560552


0,1
eval/accuracy,▁▃▄▄▅▆▆▅▇▆▇▆▇▇▇▇▇▇▇▇▇██▇▇█████▇█████████
eval/f1,▁▁▅▅▆▅▆▇▆▆▇▇▇▇▇▇▇▆▇▇▇█▇▇▇▇████▇█████████
eval/loss,█▇▆▅▅▄▄▄▄▄▄▄▄▃▃▃▃▃▂▂▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
eval/precision,▁▄▄▅▅▆▇▇▆▆▆▇▆▇▇▇▇▇▆▇▇▇▇▇█▇▇▇█▇██████████
eval/recall,▁▄▅▄▅▆▆▆▅▆▇▆▇▆▇▇▆▇▇▇▇▇▇▇▇█▇▇██▇█████████
eval/runtime,▁▆▂▂▁▄▃▂▇▂▅▁▁▅▆█▇▅▅▅▅▅▅▅▅▅▆▇▅▅▅▅▄▅▅▅▅▅▅▅
eval/samples_per_second,▂▇▂█▅▆▇▁▅▅▄█▄▁▄▃▄▄▂▃▄▄▄▃▂▃▄▄▄▂▄▃▃▄▄▄▄▄▄▄
eval/steps_per_second,█▆█▂█▆▅▆▅▇▅▇▇██▄▄▃▄▄▁▄▂▄▄▄▄▄▄▃▄▄▂▄▄▄▄▄▄▄
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▇▇▇██

0,1
eval/accuracy,0.61834
eval/f1,0.62136
eval/loss,0.80344
eval/precision,0.62686
eval/recall,0.61834
eval/runtime,1.7301
eval/samples_per_second,2017.191
eval/steps_per_second,126.58
total_flos,0.0
train/epoch,3.0


In [32]:
# Per-language validation metrics for the orthographic ("normal") trainer.
for lang in ('rus', 'pol'):
    finetune_transcription(eval_lang=lang, ipa=False, model=model)

finetuning on rus normal


Final evaluation on rus
{'eval_loss': 0.940982460975647, 'eval_accuracy': 0.5582329317269076, 'eval_precision': 0.583023206611427, 'eval_recall': 0.5582329317269076, 'eval_f1': 0.5521867437545983, 'eval_runtime': 1.2705, 'eval_samples_per_second': 1959.805, 'eval_steps_per_second': 122.783, 'epoch': 3.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁
train/global_step,▁

0,1
eval/accuracy,0.55823
eval/f1,0.55219
eval/loss,0.94098
eval/precision,0.58302
eval/recall,0.55823
eval/runtime,1.2705
eval/samples_per_second,1959.805
eval/steps_per_second,122.783
train/epoch,3.0
train/global_step,75132.0


finetuning on pol normal


Final evaluation on pol
{'eval_loss': 0.48446381092071533, 'eval_accuracy': 0.782, 'eval_precision': 0.7528668352195749, 'eval_recall': 0.782, 'eval_f1': 0.7515182573847186, 'eval_runtime': 0.5379, 'eval_samples_per_second': 1859.147, 'eval_steps_per_second': 117.126, 'epoch': 3.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁
train/global_step,▁

0,1
eval/accuracy,0.782
eval/f1,0.75152
eval/loss,0.48446
eval/precision,0.75287
eval/recall,0.782
eval/runtime,0.5379
eval/samples_per_second,1859.147
eval/steps_per_second,117.126
train/epoch,3.0
train/global_step,75132.0


In [33]:
# Train the IPA variant. NOTE(review): this rebinds `model`, so any Trainer
# previously stored under that name is no longer reachable afterwards.
model = train_model(ipa=True)

number of parameters: 123.35M


  trainer = Trainer(


Training model


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1000,1.1209,1.123244,0.383668,0.363487,0.383668,0.362575
2000,1.1045,1.076266,0.451862,0.399139,0.451862,0.386382
3000,1.1095,1.032806,0.457307,0.400172,0.457307,0.352088
4000,1.0951,1.003933,0.473352,0.445404,0.473352,0.416336
5000,1.078,0.994341,0.485673,0.460979,0.485673,0.454481
6000,1.0824,0.990441,0.494269,0.502709,0.494269,0.470309
7000,1.0673,0.970516,0.498567,0.495594,0.498567,0.490027
8000,1.0595,0.990714,0.510029,0.466999,0.510029,0.427499
9000,1.0405,0.956619,0.522636,0.521559,0.522636,0.521578
10000,1.0274,0.965498,0.520344,0.525837,0.520344,0.502028


0,1
eval/accuracy,▁▃▄▄▅▆▆▆▆▆▇▆▆▆▇▇▇█▇▇▇▇█▇▇█▇█████████████
eval/f1,▁▂▃▄▄▅▆▆▆▅▆▆▆▇▆▇▆▇▇▇▆▇▇▇▇█▇▇████████████
eval/loss,█▇▄▃▄▃▃▄▅▃▃▃▃▃▃▂▂▂▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
eval/precision,▁▂▄▃▅▇▆▅▆▇▆▅▆▅▇▆▆▇▆▇▇▇▇▇█▇▇██████▇█▇▇███
eval/recall,▁▃▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▆▇▇▇▇▇▇▇███████████
eval/runtime,▁▁▁▁▁▁█▃▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▂▂▃▂▂▁▁▁▁▁▁▁▁▁
eval/samples_per_second,██▇██▇▁▇▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▄▄▇▇▇▇▇▇▇▇▇▇
eval/steps_per_second,███▇▁▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▅▆▅▆▆▇▇▇▇▇▇▇
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███

0,1
eval/accuracy,0.60287
eval/f1,0.60385
eval/loss,0.84936
eval/precision,0.60609
eval/recall,0.60287
eval/runtime,4.2991
eval/samples_per_second,811.804
eval/steps_per_second,50.941
total_flos,0.0
train/epoch,3.0


In [34]:
# Per-language validation metrics for the IPA trainer.
for lang in ('rus', 'pol'):
    finetune_transcription(eval_lang=lang, ipa=True, model=model)

finetuning on rus ipa


Final evaluation on rus


{'eval_loss': 0.9713320136070251, 'eval_accuracy': 0.5325301204819277, 'eval_precision': 0.5493527148366735, 'eval_recall': 0.5325301204819277, 'eval_f1': 0.5193151602512706, 'eval_runtime': 2.8267, 'eval_samples_per_second': 880.894, 'eval_steps_per_second': 55.189, 'epoch': 3.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁
train/global_step,▁

0,1
eval/accuracy,0.53253
eval/f1,0.51932
eval/loss,0.97133
eval/precision,0.54935
eval/recall,0.53253
eval/runtime,2.8267
eval/samples_per_second,880.894
eval/steps_per_second,55.189
train/epoch,3.0
train/global_step,75132.0


finetuning on pol ipa


Final evaluation on pol
{'eval_loss': 0.595771849155426, 'eval_accuracy': 0.768, 'eval_precision': 0.7431457140546661, 'eval_recall': 0.768, 'eval_f1': 0.7199687680338691, 'eval_runtime': 1.4132, 'eval_samples_per_second': 707.629, 'eval_steps_per_second': 44.581, 'epoch': 3.0}


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁
train/global_step,▁

0,1
eval/accuracy,0.768
eval/f1,0.71997
eval/loss,0.59577
eval/precision,0.74315
eval/recall,0.768
eval/runtime,1.4132
eval/samples_per_second,707.629
eval/steps_per_second,44.581
train/epoch,3.0
train/global_step,75132.0
