# Training the Diffusion-LM with normalized data

In [1]:
from spacy.lang.en import English
from spacy.lang.de import German
import spacy
import re
import unicodedata

# Folder holding the CoVoST v2 de->en TSV files and every derived artefact.
COVOST_PATH = "./covost/dataset"

# Load the small English and German spaCy pipelines, then fetch the lemmatizer
# component of each one.
nlp_en, nlp_de = (
    spacy.load(pipeline_name)
    for pipeline_name in ("en_core_web_sm", "de_core_news_sm")
)
lemmatizer_en, lemmatizer_de = (
    pipeline.get_pipe("lemmatizer") for pipeline in (nlp_en, nlp_de)
)

def replace_german_special_chars(string: str) -> str:
    """Transliterate the lowercase German letters ä, ü, ö and ß to ASCII digraphs.

    Only the four lowercase characters are handled; every other character
    (including uppercase umlauts) is returned unchanged.
    """
    digraphs = str.maketrans({
        'ä': "ae",
        'ü': "ue",
        'ö': "oe",
        'ß': "ss",
    })
    return string.translate(digraphs)

# function to remove special characters
def remove_special_characters(text):
    """Drop every character that is not on the ASCII whitelist.

    Kept: letters a-z / A-Z, digits, whitespace and the punctuation
    ``. , ! ? / : ; " '``. Everything else is removed.

    Args:
        text: the string to clean.

    Returns:
        ``text`` with all non-whitelisted characters deleted.
    """
    # The upper-case range must be ``A-Z``. ``A-z`` spans the ASCII block between
    # 'Z' and 'a' as well, which would also keep ``[ \ ] ^ _`` and the backtick.
    regex = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
    return re.sub(regex, '', text)

# function to remove accented characters
def remove_accented_chars(text):
    """Return ``text`` reduced to ASCII.

    The text is NFKD-normalized first, so accented letters split into a base
    letter plus combining marks; every non-ASCII code point is then dropped.
    Characters without an ASCII decomposition (e.g. 'ß') disappear entirely.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = [char for char in decomposed if ord(char) < 128]
    return ''.join(ascii_only)

def _normalize_sentence(text, nlp, german=False):
    """Lemmatize ``text`` with ``nlp``, lowercase it and reduce it to the ASCII whitelist.

    Args:
        text: raw sentence.
        nlp: spaCy pipeline used for tokenization and lemmatization.
        german: when true, ä/ü/ö/ß are transliterated before accents are stripped.

    Returns:
        The space-joined, lowercased lemmas with special characters removed.
    """
    lowered = ' '.join(token.lemma_ for token in nlp(text)).lower()
    if german:
        lowered = replace_german_special_chars(lowered)
    return remove_special_characters(remove_accented_chars(lowered))


# For every split, write three files next to the source TSV:
#   * "<normalized de> [SEP] <normalized en>"  (training data)
#   * "<normalized de> [SEP] <original de>"
#   * "<normalized en> [SEP] <original en>"
# The first TSV row is not skipped here, so it is normalized and written like any
# other row.
for split in ['dev', 'test', 'train']:
    covost_tsv = f'{COVOST_PATH}/covost_v2.de_en.{split}.tsv'
    training_data_txt = f'{COVOST_PATH}/covost_v2.de_en_normalized.{split}.txt'
    normalized_original_de = f'{COVOST_PATH}/covost_v2.normalized_original_de.{split}.txt'
    normalized_original_en = f'{COVOST_PATH}/covost_v2.normalized_original_en.{split}.txt'
    # Explicit UTF-8: the original German sentences contain non-ASCII characters,
    # so the platform default encoding must not decide how they are read/written.
    with open(covost_tsv, 'r', encoding='utf-8') as tsv, \
            open(training_data_txt, 'w', encoding='utf-8') as txt, \
            open(normalized_original_de, 'w', encoding='utf-8') as txt_de, \
            open(normalized_original_en, 'w', encoding='utf-8') as txt_en:
        # Iterate lazily instead of materializing the whole file with readlines().
        for line in tsv:
            _path, sentence, translation, _client_id, *_ = line.split('\t')
            sentence_normalized = _normalize_sentence(sentence, nlp_de, german=True)
            translation_normalized = _normalize_sentence(translation, nlp_en)
            txt.write(f'{sentence_normalized} [SEP] {translation_normalized}\n')
            txt_de.write(f'{sentence_normalized} [SEP] {sentence}\n')
            txt_en.write(f'{translation_normalized} [SEP] {translation}\n')

In [2]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

COVOST_PATH = "./covost/dataset"
VOCAB_SIZE = 30000

# Whitespace-pre-tokenized BPE model; unknown pieces map to "[UNK]".
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# The special tokens are handed to the trainer together with the target
# vocabulary size.
trainer = BpeTrainer(
    vocab_size=VOCAB_SIZE,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)

# The tokenizer is fitted on the normalized files of all three splits.
files = [
    f'{COVOST_PATH}/covost_v2.de_en_normalized.{split}.txt'
    for split in ["test", "train", "dev"]
]
tokenizer.train(files, trainer)

tokenizer.save(f"{COVOST_PATH}/tokenizer_{VOCAB_SIZE}_normalized.json")






### Training a surface realization model

In [None]:
from transformers import EncoderDecoderModel, Seq2SeqTrainingArguments, Seq2SeqTrainer
from tokenizers import Tokenizer, Encoding
from datasets import Dataset

COVOST_PATH = "./covost/dataset"

# `Tokenizer` here is the `tokenizers` class imported at the top of this cell,
# not a `transformers` tokenizer.
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

# Encoder and decoder are both initialised from the same BERT checkpoint.
bert2bert : EncoderDecoderModel = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

# `tokenizers.Tokenizer` has no `cls_token_id` / `pad_token_id` attributes, so the
# ids are looked up by token string instead.
bert2bert.config.decoder_start_token_id = tokenizer.token_to_id("[CLS]")
bert2bert.config.pad_token_id = tokenizer.token_to_id("[PAD]")

def get_data_loader(split):
    """Load one split of the "normalized -> original" English pairs as a ``Dataset``.

    Every line of ``covost_v2.normalized_original_en.{split}.txt`` has the form
    ``<normalized> [SEP] <original>``. The normalized text becomes ``input_ids``
    and the original text becomes ``labels``, both tokenized with the
    module-level ``tokenizer``.

    Args:
        split: split name used in the file name (e.g. ``'train'`` or ``'dev'``).

    Returns:
        A ``datasets.Dataset`` with the columns ``input_ids`` and ``labels``
        (lists of token ids, not padded).

    Raises:
        ValueError: if a line does not contain the ``[SEP]`` marker.
    """
    data = {
        'input_ids':[],
        'labels':[]
    }

    # `with` closes the file even if a malformed line raises half-way through.
    with open(f'{COVOST_PATH}/covost_v2.normalized_original_en.{split}.txt', 'r', encoding='utf-8') as reader:
        for line in reader:
            # Split on the first marker only, and drop the trailing newline so it
            # does not end up in the label text.
            normalized_sentence, sentence = line.rstrip('\n').split("[SEP]", 1)
            # `tokenizers.Tokenizer.encode` takes no `return_tensors` argument; the
            # ids are read from the returned Encoding.
            tokenized_normalized_sentence : Encoding = tokenizer.encode(normalized_sentence.strip())
            tokenized_sentence : Encoding = tokenizer.encode(sentence.strip())
            data['input_ids'].append(tokenized_normalized_sentence.ids)
            data['labels'].append(tokenized_sentence.ids)

    return Dataset.from_dict(data)

# Tokenized train / dev pairs for the surface-realization model.
train_dataset, eval_dataset = get_data_loader('train'), get_data_loader('dev')

# Checkpoints go to COVOST_PATH; the evaluation strategy is set to 'epoch'.
training_arguments = Seq2SeqTrainingArguments(
    output_dir=COVOST_PATH,
    evaluation_strategy='epoch',
)

trainer = Seq2SeqTrainer(model=bert2bert, args=training_arguments, train_dataset=train_dataset, eval_dataset=eval_dataset)

trainer.train()


## Training a Diffusion-LM Model

In [2]:
from improved_diffusion import dist_util, logger
from improved_diffusion.image_datasets import load_data
from improved_diffusion.text_datasets import load_data_text
from improved_diffusion.resample import create_named_schedule_sampler
from improved_diffusion.script_util import (
    model_and_diffusion_defaults,
    create_model_and_diffusion,
    args_to_dict,
    add_dict_to_argparser,
)
from improved_diffusion.train_util import TrainLoop
from transformers import set_seed
from functools import partial
from improved_diffusion.test_util import get_weights, compute_logp
from improved_diffusion.rounding import load_models, load_tokenizer
from tokenizers import Tokenizer
import torch.distributed as dist
import wandb
import json
import os
import torch

class Parameters(dict):
    '''A ``dict`` whose keys can also be read as attributes (``params.lr``).

    Most of the code in the Diffusion-LM paper passes an argparse-style ``args``
    object through all its functions; this class lets a plain dict play that role.

    Real ``dict`` attributes and methods (``keys``, ``update``, ...) take
    precedence over keys of the same name. Reading an attribute that is neither a
    ``dict`` attribute nor a key raises ``AttributeError``.
    '''

    def __getattr__(self, name: 'str'):
        # __getattr__ runs only after normal attribute lookup has failed, so dict
        # methods keep working. Membership is tested through the key lookup rather
        # than the value's truthiness, so falsy values such as 0, 0.0, False and ""
        # are returned instead of being reported as missing.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(f"'parameters' object has no attribute '{name}'") from None


# Folder with the normalized CoVoST text files and the trained tokenizer JSON.
COVOST_PATH = "./covost/dataset"
# Parent folder; model_path() joins the per-run directory name onto it.
DIFFUSION_MODELS_PATH = "./improved-diffusion/diffusion_models/"
# Text-specific defaults. When PARAMS is assembled below, these override the two
# other default dicts and are themselves overridden by ARGS (e.g. noise_schedule,
# in_channel, lr_anneal_steps, model_arch and padding_mode are replaced there).
TEXT_DEFAULTS: 'dict[str, str | int | float]' = dict(
    modality='text',
    dataset_name='covost',
    experiment='translation',
    noise_schedule='cosine',
    loss_type='Lsimple',
    dropout=0.1,
    weight_decay=0.0,
    # create_data_loaders() uses image_size ** 2 (= 64) as the maximum token count.
    image_size=8,
    #hidden_size=128,
    in_channel=16, ## Embedding Dimension
    lr_anneal_steps=400000, ## Training steps
    num_res_blocks=2, ## Not sure
    lr=1e-04, ## Learning rate?
    # NOTE(review): training_params() reads `batch_size`, not `bsz` - confirm
    # whether anything downstream still consumes `bsz`.
    bsz=64, ## Batch Size
    # NOTE(review): in the code visible here only model_path() reads `diff_steps`
    # (for the directory name); ARGS sets the separate key `diffusion_steps`.
    diff_steps=4000, ## Steps of diffusion
    model_arch='conv-unet',
    emb_scale_factor=1.0, 
    noise_level=0.0, 
    cache_mode='no', 
    use_bert_tokenizer='no',
    padding_mode='block',
    preprocessing_num_workers=1,
    #config='diffusion_lm/synthetic_data/configs/emnlp2020/experiments/difflm_seed0_m3_k128_trainc20000.yaml',
    #model_name_or_path='predictability/diff_models/compress_e=5_b=60_m=gpt2_wikitext-103-raw-v1_None',
    #experiment='gpt2_pre_compress',

)
'''These are the defaults from Diffusion-LMs run_train.py'''

# Generic training-loop defaults. They have the lowest priority when PARAMS is
# assembled below: any key repeated in the later dicts (e.g. seed, lr,
# lr_anneal_steps, batch_size) is overridden there. 'checkpoint_path' is replaced
# by MODEL_PATH after the merge.
DIFFUSION_DEFAULTS: "dict[str, str | float]" = {
    'seed': 101,
    'data_dir': "",
    'schedule_sampler': "uniform",
    'lr':1e-4,
    'weight_decay':0.0,
    'lr_anneal_steps':0,
    'batch_size':1,
    'microbatch':-1, # -1 disables microbatches
    'ema_rate':"0.9999", # comma-separated list of EMA values
    'log_interval':50,
    'save_interval':50000,
    'resume_checkpoint':"",
    'use_fp16':False,
    'fp16_scale_growth':1e-3,
    'gradient_clipping':-1.0,
    'eval_interval':2000,
    'checkpoint_path':"diff_models"
}
'''These are the defaults from improved-diffusions train.py'''

# Defaults supplied by improved_diffusion; the keys of this dict also select which
# PARAMS entries are forwarded to create_model_and_diffusion() further down.
MODEL_AND_DIFFUSION_DEFAULTS = model_and_diffusion_defaults()
# Per-run overrides; highest priority when PARAMS is assembled below.
ARGS: 'dict[str, str | int | float]' = {
    # NOTE(review): model_path() builds the directory name from `diff_steps`
    # (4000 in TEXT_DEFAULTS), not from this key, so the run directory says 4000
    # while 2000 is the value stored here.
    'diffusion_steps': 2000,
    'model_arch': 'transformer',
    'lr': 0.0001,
    'lr_anneal_steps': 50000,
    'seed': 102,
    'noise_schedule': 'sqrt',
    'in_channel': 256,
    'out_channel': 256, # Same as in_channel
    'modality': 'text',
    'experiment': 'translation',
    'submit': 'no',
    'padding_mode': 'pad',
    'predict_xstart': True,
    'training_mode': 'e2e',
    'notes': 'xstart_e2e',
    'batch_size': 64,
    # Also selects the tokenizer file: tokenizer_{vocab_size}_normalized.json.
    'vocab_size': 30000
}

'''Adjust these for your run.'''

# Merge all parameter sources into one attribute-accessible dict. Later entries
# win on duplicate keys, so the priority from lowest to highest is:
# DIFFUSION_DEFAULTS < MODEL_AND_DIFFUSION_DEFAULTS < TEXT_DEFAULTS < ARGS.
PARAMS = Parameters(**{
    **DIFFUSION_DEFAULTS,
    **MODEL_AND_DIFFUSION_DEFAULTS,
    **TEXT_DEFAULTS,
    **ARGS
})
'''This collects all the parameters, as in the Diffusion-Lm repo'''

### From run_train.py
# Each supported loss type fixes the `use_kl` / `learn_sigma` pair in PARAMS.
LOSS_TYPE_FLAGS = {
    'Lsimple': dict(use_kl=False, learn_sigma=False),
    'Lhybrid': dict(use_kl=False, learn_sigma=True),
    'Lvlb': dict(use_kl=True, learn_sigma=True),
}
if PARAMS['loss_type'] not in LOSS_TYPE_FLAGS:
    # An explicit error (instead of a bare `assert False`) names the offending
    # value and is not stripped when Python runs with -O.
    raise ValueError(
        f"Unsupported loss_type {PARAMS['loss_type']!r}; "
        f"expected one of {sorted(LOSS_TYPE_FLAGS)}"
    )
PARAMS.update(**LOSS_TYPE_FLAGS[PARAMS['loss_type']])

def model_path(
    modality,
    padding_mode,
    experiment,
    in_channel,
    model_arch,
    lr,
    weight_decay,
    diff_steps,
    noise_schedule,
    loss_type,
    num_res_blocks,
    dropout,
    seed,
    notes=None,
    hidden_size=None,
    **_
    ):
    '''Build the checkpoint directory path that encodes the run's hyperparameters.

    Intended to be called as ``model_path(**PARAMS)``; keys that are not part of
    the name are swallowed by ``**_``.

    Args:
        modality, padding_mode, experiment, in_channel, model_arch, lr,
        weight_decay, diff_steps, noise_schedule, loss_type, num_res_blocks,
        dropout, seed: values embedded, in this order, in the directory name.
        notes: optional free-text suffix; omitted from the name when falsy.
        hidden_size: optional value for the ``_h...`` segment. When it is not
            given, the literal placeholder ``hidden_size`` is emitted so that
            directory names stay identical to those of earlier runs.

    Returns:
        ``DIFFUSION_MODELS_PATH`` joined with the generated directory name.
    '''
    hidden_size_tag = 'hidden_size' if hidden_size is None else hidden_size
    name_parts = [
        "diff",
        f"{modality}",
        f"{padding_mode}",
        f"{experiment}{in_channel}_normalized",
        f"{model_arch}",
        f"lr{lr}",
        f"{weight_decay}",
        f"{diff_steps}",
        f"{noise_schedule}",
        f"{loss_type}",
        f"h{hidden_size_tag}",
        f"s{num_res_blocks}",
        f"d{dropout}",
        f"sd{seed}",
    ]
    if notes:
        name_parts.append(f"{notes}")
    MODEL_NAME = "_".join(name_parts)

    return os.path.join(DIFFUSION_MODELS_PATH, MODEL_NAME)

# Directory of this run; its name is derived from the hyperparameters.
MODEL_PATH = model_path(**PARAMS)

PARAMS['checkpoint_path'] = MODEL_PATH

# training_args.json and the embedding snapshot below are written directly into
# MODEL_PATH. Create it explicitly; whether logger.configure() already does so is
# not visible from this notebook.
os.makedirs(MODEL_PATH, exist_ok=True)

### Environment variables for the training script
# Both are set before logger.configure() and before any tokenizer is used.
os.environ['OPENAI_LOGDIR']=MODEL_PATH
os.environ['TOKENIZERS_PARALLELISM']='false'

set_seed(PARAMS['seed'])
dist_util.setup_dist() # DEBUG **
logger.configure()

print(PARAMS)

logger.log("creating model and diffusion...")
# Only the keys known to model_and_diffusion_defaults() are forwarded; the values
# come from the merged PARAMS, i.e. after the overrides from ARGS.
model, diffusion = create_model_and_diffusion(
    **{
        key: PARAMS[key] for key in MODEL_AND_DIFFUSION_DEFAULTS.keys()
    }
)
model.to(dist_util.dev()) #  DEBUG **
# model.cuda() #  DEBUG **

pytorch_total_params = sum(parameter.numel() for parameter in model.parameters())
logger.log(f'the parameter count is {pytorch_total_params}')

schedule_sampler = create_named_schedule_sampler(PARAMS['schedule_sampler'], diffusion)

logger.log(f'saving the hyperparameters to {PARAMS["checkpoint_path"]}/training_args.json')
with open(f'{PARAMS["checkpoint_path"]}/training_args.json', 'w') as hyperparams_file:
    json.dump(PARAMS, hyperparams_file, indent=2)

logger.log("creating data loader...")
print('load data', '*'*50)

# Tokenizer trained in the earlier cell; the file name depends on the vocab size.
tokenizer: 'Tokenizer' = Tokenizer.from_file(f'{COVOST_PATH}/tokenizer_{PARAMS.vocab_size}_normalized.json')

# Freshly initialised embedding table: one row per vocabulary entry, in_channel wide.
embedding_model = torch.nn.Embedding(tokenizer.get_vocab_size(), PARAMS['in_channel'])

# Snapshot of the initial weights, taken before any training.
torch.save(embedding_model.weight, f'{MODEL_PATH}/embedding_weights_initial.pt')

def create_data_loaders():
    '''Build the endless train/dev batch generators that are passed to ``TrainLoop``.

    Wrapped in a function so that the intermediate data can be freed afterwards.

    For the test, train and dev splits the file
    ``covost_v2.de_en_normalized.{split}.txt`` is read, its first (header) line is
    skipped and every remaining line is encoded with the module-level
    ``tokenizer``. Lines with ``PARAMS.image_size ** 2`` or more tokens are
    dropped; the rest is passed through ``_collate_batch_helper`` together with
    the ``[PAD]`` id and that maximum length.

    Returns:
        dict with the keys ``'data'`` (train split) and ``'eval_data'`` (dev
        split). Each value is a generator that yields batches forever.
    '''
    from improved_diffusion.text_datasets import TextDataset_NoCache, _collate_batch_helper
    from torch.utils.data import DataLoader
    from datasets import DatasetDict, Dataset

    split_renamings = {
        'train':'data',
        'dev': 'eval_data'
    }

    # As in Diffusion-LM
    max_seq_len = PARAMS.image_size ** 2
    pad_token_id = int(tokenizer.token_to_id('[PAD]'))

    def encode_and_filter(lines):
        '''Yield the token ids of every line that fits, then report the dropped share.'''
        ## Some lines might be too long. Can't split them up for translation.
        num_valid = 0
        num_filtered_out = 0
        for line in lines:
            encoding = tokenizer.encode(line).ids
            if len(encoding) < max_seq_len:
                num_valid += 1
                yield encoding
            else:
                num_filtered_out += 1
        num_lines = num_filtered_out + num_valid
        # Guard against an empty split, which would otherwise divide by zero.
        percentage = num_filtered_out / num_lines * 100 if num_lines else 0.0
        print(f'Finished filtering the dataset lines: {num_filtered_out} out of {num_lines} were too long. ({percentage}%)')

    def pad_function(group_lst):
        '''Batched ``Dataset.map`` callback that replaces ``input_ids`` with the collated ids.'''
        group_lst['input_ids'] = _collate_batch_helper(group_lst['input_ids'], pad_token_id, max_seq_len)
        return group_lst

    data = DatasetDict()

    for split in ["test", "train", "dev"]:
        # `with` closes the file even if tokenization fails half-way through.
        with open(f'{COVOST_PATH}/covost_v2.de_en_normalized.{split}.txt') as reader:
            ## Skip Header (the default avoids StopIteration on an empty file)
            next(reader, None)
            encoded_lines = list(encode_and_filter(reader))

        encoded_dataset = Dataset.from_dict({
            'input_ids': encoded_lines
        })

        data[split] = encoded_dataset.map(
            pad_function,
            batched=True,
            num_proc=1,
            desc='padding',
        )

    def data_generator(split, data):
        '''Yield shuffled batches of ``split`` forever, restarting the loader when exhausted.'''
        # Being a generator, this body (including the `.cpu()` call) only runs
        # when the first batch is requested, not when data_generator() is called.
        dataset = TextDataset_NoCache(
            data,
            PARAMS.image_size,
            PARAMS,
            model_arch=PARAMS['model_arch'],
            model_emb=embedding_model.cpu(),
            split=split
        )
        dataloader = DataLoader(
            dataset,
            batch_size=PARAMS['batch_size'],  # 64,
            drop_last=True,
            shuffle=True,
            num_workers=1,
        )

        while True:
            yield from dataloader

    return {
        split_renamings[split]: data_generator(split, data) for split in ['train', 'dev']
    }

# Generators for the train ('data') and dev ('eval_data') batches.
data_loaders = create_data_loaders()

# NOTE(review): per the PyTorch docs `nn.Module.cuda()` moves the module in place
# and returns it, so this name and `embedding_model` refer to the same module.
embedding_model_cuda = embedding_model.cuda()

def set_mapping_func(args, diffusion):
    '''Attach ``compute_logp``, pre-bound to ``args`` and the embedding, as ``diffusion.mapping_func``.'''
    print(f'Embedding model: {embedding_model}\n Requires Grad: {embedding_model.weight.requires_grad}')
    diffusion.mapping_func = partial(compute_logp, args, embedding_model_cuda)

set_mapping_func(PARAMS, diffusion)

def training_params(
    batch_size,
    microbatch,
    lr,
    ema_rate,
    log_interval,
    save_interval,
    resume_checkpoint,
    use_fp16,
    fp16_scale_growth,
    weight_decay,
    lr_anneal_steps,
    checkpoint_path,
    gradient_clipping,
    eval_interval, **_):
    '''Select the training-loop settings from a larger parameter mapping.

    Meant to be called as ``training_params(**PARAMS)``: the named arguments are
    returned as a new dict and every other keyword is ignored.
    '''
    selected = {
        'batch_size': batch_size,
        'microbatch': microbatch,
        'lr': lr,
        'ema_rate': ema_rate,
        'log_interval': log_interval,
        'save_interval': save_interval,
        'resume_checkpoint': resume_checkpoint,
        'use_fp16': use_fp16,
        'fp16_scale_growth': fp16_scale_growth,
        'weight_decay': weight_decay,
        'lr_anneal_steps': lr_anneal_steps,
        'checkpoint_path': checkpoint_path,
        'gradient_clipping': gradient_clipping,
        'eval_interval': eval_interval,
    }
    return selected

wandb.init(
    project=os.getenv("WANDB_PROJECT", "diffusion_lm"),
    name=PARAMS['checkpoint_path'],
)
wandb.config.update(PARAMS, allow_val_change=True)

logger.log("training...")
# Always close the W&B run: exit code 0 after a complete run, 1 when training is
# interrupted or fails (e.g. a KeyboardInterrupt from stopping the cell).
wandb_exit_code = 1
try:
    TrainLoop(
        model=model,
        diffusion=diffusion,
        schedule_sampler=schedule_sampler,
        **training_params(**PARAMS),
        **data_loaders
    ).run_loop()

    ## Saving the Embedding Model
    torch.save(embedding_model.weight, f'{MODEL_PATH}/embedding_weights.pt')
    torch.save(embedding_model_cuda.weight, f'{MODEL_PATH}/embedding_weights_cuda.pt')

    wandb_exit_code = 0
finally:
    wandb.finish(wandb_exit_code)


Logging to ./improved-diffusion/diffusion_models/diff_text_pad_translation256_normalized_transformer_lr0.0001_0.0_4000_sqrt_Lsimple_hhidden_size_s2_d0.1_sd102_xstart_e2e
{'seed': 102, 'data_dir': '', 'schedule_sampler': 'uniform', 'lr': 0.0001, 'weight_decay': 0.0, 'lr_anneal_steps': 50000, 'batch_size': 64, 'microbatch': -1, 'ema_rate': '0.9999', 'log_interval': 50, 'save_interval': 50000, 'resume_checkpoint': '', 'use_fp16': False, 'fp16_scale_growth': 0.001, 'gradient_clipping': -1.0, 'eval_interval': 2000, 'checkpoint_path': './improved-diffusion/diffusion_models/diff_text_pad_translation256_normalized_transformer_lr0.0001_0.0_4000_sqrt_Lsimple_hhidden_size_s2_d0.1_sd102_xstart_e2e', 'image_size': 8, 'num_channels': 128, 'num_res_blocks': 2, 'num_heads': 4, 'num_heads_upsample': -1, 'attention_resolutions': '16,8', 'dropout': 0.1, 'learn_sigma': False, 'sigma_small': False, 'class_cond': False, 'diffusion_steps': 2000, 'noise_schedule': 'sqrt', 'timestep_respacing': '', 'use_kl': F



Finished filtering the dataset lines: 4 out of 13509 were too long. (0.029609889703160856\%)


HBox(children=(FloatProgress(value=0.0, description='padding', max=14.0, style=ProgressStyle(description_width…


Finished filtering the dataset lines: 17 out of 127637 were too long. (0.013319021913708406\%)


HBox(children=(FloatProgress(value=0.0, description='padding', max=128.0, style=ProgressStyle(description_widt…


Finished filtering the dataset lines: 2 out of 13509 were too long. (0.014804944851580428\%)


HBox(children=(FloatProgress(value=0.0, description='padding', max=14.0, style=ProgressStyle(description_width…


Embedding model: Embedding(30000, 256)
 Requires Grad: True


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myunus-demirag[0m. Use [1m`wandb login --relogin`[0m to force relogin


training...
8
------------------------
| grad_norm | 0.969    |
| loss      | 0.959    |
| loss_q0   | 0.952    |
| loss_q1   | 0.943    |
| loss_q2   | 0.962    |
| loss_q3   | 0.981    |
| mse       | 0.959    |
| mse_q0    | 0.952    |
| mse_q1    | 0.943    |
| mse_q2    | 0.962    |
| mse_q3    | 0.981    |
| samples   | 64       |
| step      | 0        |
------------------------
8
eval on validation set
---------------------------
| eval_loss    | 0.758    |
| eval_loss_q0 | 0.778    |
| eval_loss_q1 | 0.755    |
| eval_loss_q2 | 0.721    |
| eval_loss_q3 | 0.79     |
| eval_mse     | 0.758    |
| eval_mse_q0  | 0.778    |
| eval_mse_q1  | 0.755    |
| eval_mse_q2  | 0.721    |
| eval_mse_q3  | 0.79     |
---------------------------
saving model 0...
writing to ./improved-diffusion/diffusion_models/diff_text_pad_translation256_normalized_transformer_lr0.0001_0.0_4000_sqrt_Lsimple_hhidden_size_s2_d0.1_sd102_xstart_e2e/model000000.pt
writing to ./improved-diffusion/diffusion_model

KeyboardInterrupt: 

In [7]:
# Manually close the W&B run, passing 1 as the exit code. The captured output of
# the training cell above ends in a KeyboardInterrupt.
wandb.finish(1)

0,1
eval_loss,█▂▁▁▁▁▁▁▁▁▁
eval_loss_q0,█▁▁▁▁▁▁▁▁▁▁
eval_loss_q1,█▂▂▁▁▁▁▁▁▁▁
eval_loss_q2,█▂▂▁▁▁▂▁▁▁▁
eval_loss_q3,█▂▂▂▂▁▁▁▁▁▁
eval_mse,█▂▁▁▁▁▁▁▁▁▁
eval_mse_q0,█▁▁▁▁▁▁▁▁▁▁
eval_mse_q1,█▂▂▁▁▁▁▁▁▁▁
eval_mse_q2,█▂▂▁▁▁▂▁▁▁▁
eval_mse_q3,█▂▂▂▂▁▁▁▁▁▁

0,1
eval_loss,0.10886
eval_loss_q0,0.04198
eval_loss_q1,0.06238
eval_loss_q2,0.07157
eval_loss_q3,0.19137
eval_mse,0.10886
eval_mse_q0,0.04198
eval_mse_q1,0.06238
eval_mse_q2,0.07157
eval_mse_q3,0.19137
