# Translation with Diffusion-LM and CoVoST

## Formatting the CoVoST Dataset

For simplicity, we want the dataset in a line-by-line format so that a tokenizer can be applied to it. The dataset structure is as follows: <br>
path (audio filename), sentence (transcript), translation, client_id (speaker ID), all separated by tab characters.

In [3]:
COVOST_PATH = "./covost/dataset"

# Flatten every CoVoST split from TSV into a text file with one
# "<sentence> <translation>" line per row, so a tokenizer can read it line by line.
# NOTE: the TSV header row is converted like any other row, so the first line
# of each .txt file holds the column names.
for split in ['dev', 'test', 'train']:
    tsv_path = f'{COVOST_PATH}/covost_v2.de_en.{split}.tsv'
    txt_path = f'{COVOST_PATH}/covost_v2.de_en.{split}.txt'
    # Explicit UTF-8 keeps the conversion independent of the platform's default encoding.
    with open(tsv_path, 'r', encoding='utf-8') as tsv, open(txt_path, 'w', encoding='utf-8') as txt:
        # Stream the file line by line instead of loading it all with readlines().
        for line in tsv:
            # Columns: path, sentence, translation, client_id (any extra columns are ignored).
            _path, sentence, translation, _client_id, *_ = line.split('\t')
            txt.write(f'{sentence} {translation}\n')

        

## Training a Diffusion-LM

### Creating the Tokenizer

In [2]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Directory holding the line-by-line CoVoST text files; the tokenizer is saved here too.
COVOST_PATH = "./covost/dataset"

# BPE tokenizer with an explicit unknown token.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# '[PAD]' is looked up by id later when the data loaders pad the sequences.
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.pre_tokenizer = Whitespace()
# NOTE(review): the tokenizer is trained on all three splits, including test — confirm this is intended.
files = [f'{COVOST_PATH}/covost_v2.de_en.{split}.txt' for split in ["test", "train", "dev"]]
tokenizer.train(files, trainer)

# Persist the trained tokenizer so the training and sampling cells can reload it.
tokenizer.save(f"{COVOST_PATH}/tokenizer.json")






In [4]:
from improved_diffusion import dist_util, logger
from improved_diffusion.image_datasets import load_data
from improved_diffusion.text_datasets import load_data_text
from improved_diffusion.resample import create_named_schedule_sampler
from improved_diffusion.script_util import (
    model_and_diffusion_defaults,
    create_model_and_diffusion,
    args_to_dict,
    add_dict_to_argparser,
)
from improved_diffusion.train_util import TrainLoop
from transformers import set_seed
from functools import partial
from improved_diffusion.test_util import get_weights, compute_logp
from improved_diffusion.rounding import load_models, load_tokenizer
from tokenizers import Tokenizer
import torch.distributed as dist
import wandb
import json
import os
import torch

class Parameters(dict):
    '''Dictionary whose entries can also be read as attributes.

    Most of the code in the Diffusion-LM paper passes an ``args`` object through
    all the functions and reads ``args.some_key``. This class lets a plain dict
    of parameters be used in its place.

    Regular attributes and ``dict`` methods (``keys``, ``items``, ...) take
    priority; any other name is looked up as a key. Keys holding falsy values
    (``0``, ``False``, ``''``, ``None``) are returned like any other value.

    Raises:
        AttributeError: if the name is neither an attribute nor a key.
    '''
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __getattr__(self, name: 'str'):
        # __getattr__ is only invoked after normal attribute lookup has failed,
        # so dict methods keep their priority over keys with the same name.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{name}'"
            ) from None


# Location of the CoVoST text files and the trained tokenizer.
COVOST_PATH = "./covost/dataset"
# Root directory under which each run gets its own model directory (see model_path).
DIFFUSION_MODELS_PATH = "./improved-diffusion/diffusion_models/"
TEXT_DEFAULTS: 'dict[str, str | int | float]' = dict(
    modality='text',
    dataset_name='covost',
    experiment='translation',
    noise_schedule='cosine',
    loss_type='Lsimple',
    dropout=0.1,
    weight_decay=0.0,
    image_size=8,  # the data loaders use image_size ** 2 as the maximum sequence length
    hidden_size=128,
    in_channel=16, ## Embedding Dimension
    lr_anneal_steps=400000, ## Training steps
    num_res_blocks=2, ## Not sure
    lr=1e-04, ## Learning rate?
    bsz=64, ## Batch Size
    diff_steps=4000, ## Steps of diffusion
    model_arch='conv-unet',
    emb_scale_factor=1.0, 
    noise_level=0.0, 
    cache_mode='no', 
    use_bert_tokenizer='no',
    padding_mode='block',
    preprocessing_num_workers=1,
    #config='diffusion_lm/synthetic_data/configs/emnlp2020/experiments/difflm_seed0_m3_k128_trainc20000.yaml',
    #model_name_or_path='predictability/diff_models/compress_e=5_b=60_m=gpt2_wikitext-103-raw-v1_None',
    #experiment='gpt2_pre_compress',

)
'''These are the defaults from Diffusion-LMs run_train.py'''

DIFFUSION_DEFAULTS: "dict[str, str | float]" = {
    'seed': 101,
    'data_dir': "",
    'schedule_sampler': "uniform",
    'lr':1e-4,
    'weight_decay':0.0,
    'lr_anneal_steps':0,
    'batch_size':1,
    'microbatch':-1, # -1 disables microbatches
    'ema_rate':"0.9999", # comma-separated list of EMA values
    'log_interval':50,
    'save_interval':50000,
    'resume_checkpoint':"",
    'use_fp16':False,
    'fp16_scale_growth':1e-3,
    'gradient_clipping':-1.0,
    'eval_interval':2000,
    'checkpoint_path':"diff_models"
}
'''These are the defaults from improved-diffusions train.py'''

# Defaults for the model/diffusion factory; its keys also select which
# parameters are forwarded to create_model_and_diffusion.
MODEL_AND_DIFFUSION_DEFAULTS = model_and_diffusion_defaults()
# Run-specific overrides; merged last into PARAMS, so they win over every default above.
# NOTE(review): 'diffusion_steps' is set to 2000 here, while model_path() builds the
# directory name from 'diff_steps' (4000, from TEXT_DEFAULTS) — the run directory is
# therefore labelled 4000 although 2000 steps are configured; confirm which is intended.
ARGS: 'dict[str, str | int | float]' = {
    'diffusion_steps': 2000,
    'model_arch': 'transformer',
    'lr': 0.0001,
    'lr_anneal_steps': 2000,
    'seed': 102,
    'noise_schedule': 'sqrt',
    'in_channel': 128,
    'out_channel': 128, # Same as in_channel
    'modality': 'text',
    'experiment': 'translation',
    'submit': 'no',
    'padding_mode': 'pad',
    'predict_xstart': True,
    'training_mode': 'e2e-simple',
    'notes': 'xstart_e2e-simple',
    'batch_size': 64,
    'vocab_size': 30000
}

'''Adjust these for your run.'''

# Later dictionaries override earlier ones for duplicate keys:
# DIFFUSION_DEFAULTS < MODEL_AND_DIFFUSION_DEFAULTS < TEXT_DEFAULTS < ARGS.
PARAMS = Parameters(**{
    **DIFFUSION_DEFAULTS,
    **MODEL_AND_DIFFUSION_DEFAULTS,
    **TEXT_DEFAULTS,
    **ARGS
})
'''This collects all the parameters, as in the Diffusion-Lm repo'''

### From run_train.py
# Translate the selected loss type into the `use_kl` / `learn_sigma` flags.
if PARAMS['loss_type'] == 'Lsimple':
    PARAMS.update(use_kl= False, learn_sigma= False)
elif PARAMS['loss_type'] == 'Lhybrid':
    PARAMS.update(use_kl= False, learn_sigma= True)
elif PARAMS['loss_type'] == 'Lvlb':
    PARAMS.update(use_kl= True, learn_sigma= True)
else:
    # An explicit exception instead of `assert False`: asserts are stripped under
    # `python -O`, which would let an unknown loss type pass silently.
    raise ValueError(
        f"Unknown loss_type {PARAMS['loss_type']!r}; expected 'Lsimple', 'Lhybrid' or 'Lvlb'"
    )

def model_path(
    modality,
    padding_mode,
    experiment,
    in_channel,
    model_arch,
    lr,
    weight_decay,
    diff_steps,
    noise_schedule,
    loss_type,
    hidden_size,
    num_res_blocks,
    dropout,
    seed,
    notes=None,
    **_
    ):
    '''Return the run directory below DIFFUSION_MODELS_PATH for these hyperparameters.

    The directory name is the underscore-joined list of the given values,
    starting with "diff"; `notes` is appended only when it is truthy. Any
    additional keyword arguments are accepted and ignored, so the full
    parameter dictionary can be passed with `**`.
    '''
    name_parts = [
        "diff",
        f"{modality}",
        f"{padding_mode}",
        f"{experiment}{in_channel}",
        f"{model_arch}",
        f"lr{lr}",
        f"{weight_decay}",
        f"{diff_steps}",
        f"{noise_schedule}",
        f"{loss_type}",
        f"h{hidden_size}",
        f"s{num_res_blocks}",
        f"d{dropout}",
        f"sd{seed}",
    ]
    if notes:
        name_parts.append(f"{notes}")

    return os.path.join(DIFFUSION_MODELS_PATH, "_".join(name_parts))

# Derive the run directory from the merged parameters and use it as checkpoint location.
MODEL_PATH = model_path(**PARAMS)

PARAMS['checkpoint_path'] = MODEL_PATH

### Environment variables for the training script
os.environ['OPENAI_LOGDIR']=MODEL_PATH
os.environ['TOKENIZERS_PARALLELISM']='false'

set_seed(PARAMS['seed']) 
dist_util.setup_dist() # DEBUG **
logger.configure()

print(PARAMS)

logger.log("creating model and diffusion...")
# Only the keys known to model_and_diffusion_defaults() are forwarded to the factory.
model, diffusion = create_model_and_diffusion(
    **{
        key: PARAMS[key] for key in MODEL_AND_DIFFUSION_DEFAULTS.keys()
    }
)
model.to(dist_util.dev()) #  DEBUG **
# model.cuda() #  DEBUG **

pytorch_total_params = sum(parameter.numel() for parameter in model.parameters())
logger.log(f'the parameter count is {pytorch_total_params}')

schedule_sampler = create_named_schedule_sampler(PARAMS['schedule_sampler'], diffusion)

# The sampling cell reloads this JSON file to rebuild the same configuration.
# NOTE(review): this assumes the checkpoint directory already exists at this point
# (presumably created by logger.configure() via OPENAI_LOGDIR) — confirm.
logger.log(f'saving the hyperparameters to {PARAMS["checkpoint_path"]}/training_args.json')
with open(f'{PARAMS["checkpoint_path"]}/training_args.json', 'w') as hyperparams_file:
    json.dump(PARAMS, hyperparams_file, indent=2)

logger.log("creating data loader...")
print('load data', '*'*50)

tokenizer: 'Tokenizer' = Tokenizer.from_file(f'{COVOST_PATH}/tokenizer.json')

# Stand-alone embedding table: one row per tokenizer vocabulary entry, `in_channel` columns.
# NOTE(review): this layer is created outside `model`; confirm that it is actually
# updated during training.
embedding_model = torch.nn.Embedding(tokenizer.get_vocab_size(), PARAMS['in_channel'])

# Snapshot of the freshly initialised weights, for comparison after training.
torch.save(embedding_model.weight, f'{MODEL_PATH}/embedding_weights_initial.pt')

def create_data_loaders():
    '''Build endless batch generators for the train and dev splits.

    For each of the test/train/dev text files the first line (the former TSV
    header) is skipped and every remaining line is tokenized. Encodings with
    ``max_seq_len`` or more tokens are dropped, because a translation pair
    cannot be split up. The rest is passed through ``_collate_batch_helper``
    with the '[PAD]' token id and ``max_seq_len``.

    This is a function so that intermediate data can be freed once it returns.

    Returns:
        dict mapping ``'data'`` (train split) and ``'eval_data'`` (dev split)
        to generators that cycle over a ``DataLoader`` forever.
    '''
    from improved_diffusion.text_datasets import TextDataset_NoCache
    from torch.utils.data import DataLoader
    from datasets import DatasetDict, Dataset
    from improved_diffusion.text_datasets import _collate_batch_helper

    split_renamings = {
        'train':'data',
        'dev': 'eval_data'
    }

    # As in Diffusion-LM
    max_seq_len = PARAMS.image_size ** 2
    # Loop-invariant, so resolved once instead of once per split.
    pad_token_id = int(tokenizer.token_to_id('[PAD]'))

    def keep_short_encodings(encodings):
        '''Yield only encodings shorter than max_seq_len, then print a summary.'''
        num_valid = 0
        num_filtered_out = 0
        for encoding in encodings:
            if len(encoding) < max_seq_len:
                num_valid += 1
                yield encoding
            else:
                num_filtered_out += 1
        num_lines = num_filtered_out + num_valid
        # Guard against an empty split, which would otherwise divide by zero.
        percentage = num_filtered_out / num_lines * 100 if num_lines else 0.0
        print(f'Finished filtering the dataset lines: {num_filtered_out} out of {num_lines} were too long. ({percentage}%)')

    def pad_function(group_lst):
        group_lst['input_ids'] = _collate_batch_helper(group_lst['input_ids'], pad_token_id, max_seq_len)
        return group_lst

    data = DatasetDict()

    for split in ["test", "train", "dev"]:
        # The context manager closes the file even if tokenization raises.
        with open(f'{COVOST_PATH}/covost_v2.de_en.{split}.txt', encoding='utf-8') as reader:
            ## Skip Header (tolerating an empty file)
            next(reader, None)

            # Lazy: lines are only read while the list below is being built,
            # so this must happen inside the `with` block.
            encoded = (tokenizer.encode(line).ids for line in reader)
            encoded_dataset = Dataset.from_dict({
                'input_ids': list(keep_short_encodings(encoded))
            })

        data[split] = encoded_dataset.map(
            pad_function,
            batched=True,
            num_proc=1,
            desc='padding',
        )

    def data_generator(split, data):
        '''Yield batches of `split` forever, restarting the DataLoader when it is exhausted.'''
        dataset = TextDataset_NoCache(
            data,
            PARAMS.image_size,
            PARAMS,
            model_arch=PARAMS['model_arch'],
            model_emb=embedding_model.cpu(),
            split=split
        )
        dataloader = DataLoader(
            dataset,
            batch_size=PARAMS['batch_size'],  # 64,
            drop_last=True,
            shuffle=False,
            num_workers=1,
        )

        while True:
            yield from dataloader

    return {
        split_renamings[split]: data_generator(split, data) for split in ['train', 'dev']
    }

# Generators keyed 'data' (train) and 'eval_data' (dev), as returned by create_data_loaders.
data_loaders = create_data_loaders()

# NOTE(review): nn.Module.cuda() is documented to move the module in place and return
# it, so this is presumably the same object as `embedding_model` — confirm.
embedding_model_cuda = embedding_model.cuda()

def set_mapping_func(args, diffusion):
    '''Attach `compute_logp`, partially applied to `args` and the CUDA embedding module, to `diffusion.mapping_func`.'''
    print(f'Embedding model: {embedding_model}\n Requires Grad: {embedding_model.weight.requires_grad}')
    mapping_func = partial(compute_logp, args, embedding_model_cuda)
    diffusion.mapping_func = mapping_func

set_mapping_func(PARAMS, diffusion)

def training_params(
    batch_size,
    microbatch,
    lr,
    ema_rate,
    log_interval,
    save_interval,
    resume_checkpoint,
    use_fp16,
    fp16_scale_growth,
    weight_decay,
    lr_anneal_steps,
    checkpoint_path,
    gradient_clipping,
    eval_interval, **_):
    '''Pick the training-loop settings out of a larger set of keyword arguments.

    Call it with the full parameter dictionary unpacked (`**params`): the named
    parameters are returned as a new dict and everything else is discarded.
    '''
    return {
        'batch_size': batch_size,
        'microbatch': microbatch,
        'lr': lr,
        'ema_rate': ema_rate,
        'log_interval': log_interval,
        'save_interval': save_interval,
        'resume_checkpoint': resume_checkpoint,
        'use_fp16': use_fp16,
        'fp16_scale_growth': fp16_scale_growth,
        'weight_decay': weight_decay,
        'lr_anneal_steps': lr_anneal_steps,
        'checkpoint_path': checkpoint_path,
        'gradient_clipping': gradient_clipping,
        'eval_interval': eval_interval,
    }

# Track the run in Weights & Biases; the project name can be overridden via WANDB_PROJECT.
wandb.init(
    project=os.getenv("WANDB_PROJECT", "diffusion_lm"),
    name=PARAMS['checkpoint_path'],
)
wandb.config.update(PARAMS, allow_val_change=True)

logger.log("training...")
# training_params() filters PARAMS down to the training settings; data_loaders
# contributes the 'data' and 'eval_data' keyword arguments.
TrainLoop(
    model=model,
    diffusion=diffusion,
    schedule_sampler=schedule_sampler,
    **training_params(**PARAMS),
    **data_loaders
).run_loop()

## Saving the Embedding Model
# NOTE(review): embedding_model_cuda was obtained from embedding_model.cuda(), so both
# files presumably contain the same weights — confirm.
torch.save(embedding_model.weight, f'{MODEL_PATH}/embedding_weights.pt')
torch.save(embedding_model_cuda.weight, f'{MODEL_PATH}/embedding_weights_cuda.pt')


# for data_loader in data_loaders:
#     data_loaders[data_loader].close()

wandb.finish(0)


Logging to ./improved-diffusion/diffusion_models/diff_text_pad_translation128_transformer_lr0.0001_0.0_4000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart_e2e-simple
{'seed': 102, 'data_dir': '', 'schedule_sampler': 'uniform', 'lr': 0.0001, 'weight_decay': 0.0, 'lr_anneal_steps': 2000, 'batch_size': 64, 'microbatch': -1, 'ema_rate': '0.9999', 'log_interval': 50, 'save_interval': 50000, 'resume_checkpoint': '', 'use_fp16': False, 'fp16_scale_growth': 0.001, 'gradient_clipping': -1.0, 'eval_interval': 2000, 'checkpoint_path': './improved-diffusion/diffusion_models/diff_text_pad_translation128_transformer_lr0.0001_0.0_4000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart_e2e-simple', 'image_size': 8, 'num_channels': 128, 'num_res_blocks': 2, 'num_heads': 4, 'num_heads_upsample': -1, 'attention_resolutions': '16,8', 'dropout': 0.1, 'learn_sigma': False, 'sigma_small': False, 'class_cond': False, 'diffusion_steps': 2000, 'noise_schedule': 'sqrt', 'timestep_respacing': '', 'use_kl': False, 'predict_xstart': T

HBox(children=(FloatProgress(value=0.0, description='padding', max=14.0, style=ProgressStyle(description_width…


Finished filtering the dataset lines: 45 out of 127637 were too long. (0.03525623447746343\%)


HBox(children=(FloatProgress(value=0.0, description='padding', max=128.0, style=ProgressStyle(description_widt…


Finished filtering the dataset lines: 4 out of 13509 were too long. (0.029609889703160856\%)


HBox(children=(FloatProgress(value=0.0, description='padding', max=14.0, style=ProgressStyle(description_width…


Embedding model: Embedding(30000, 128)
 Requires Grad: True


training...
8
------------------------
| ce        | 14.4     |
| ce_q0     | 14.7     |
| ce_q1     | 14.4     |
| ce_q2     | 14.2     |
| ce_q3     | 14.3     |
| grad_norm | 92.4     |
| loss      | 14.4     |
| loss_q0   | 14.7     |
| loss_q1   | 14.4     |
| loss_q2   | 14.2     |
| loss_q3   | 14.3     |
| samples   | 64       |
| step      | 0        |
------------------------
8
eval on validation set
---------------------------
| eval_ce      | 15.6     |
| eval_ce_q0   | 17.7     |
| eval_ce_q1   | 15.1     |
| eval_ce_q2   | 13.9     |
| eval_ce_q3   | 15.8     |
| eval_loss    | 15.6     |
| eval_loss_q0 | 17.7     |
| eval_loss_q1 | 15.1     |
| eval_loss_q2 | 13.9     |
| eval_loss_q3 | 15.8     |
---------------------------
saving model 0...
writing to ./improved-diffusion/diffusion_models/diff_text_pad_translation128_transformer_lr0.0001_0.0_4000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart_e2e-simple/model000000.pt
writing to ./improved-diffusion/diffusion_models/diff_text_

VBox(children=(Label(value='0.002 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.133489…

0,1
ce,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
ce_q0,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
ce_q1,█▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
ce_q2,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
ce_q3,█▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval_ce,▁
eval_ce_q0,▁
eval_ce_q1,▁
eval_ce_q2,▁
eval_ce_q3,▁

0,1
ce,0.75873
ce_q0,0.0001
ce_q1,0.016
ce_q2,0.42011
ce_q3,2.56796
eval_ce,15.58713
eval_ce_q0,17.69467
eval_ce_q1,15.11178
eval_ce_q2,13.88553
eval_ce_q3,15.78082


In [21]:
class parameters(dict):
    '''Dictionary whose entries can also be read as attributes.

    Regular attributes and ``dict`` methods take priority; any other name is
    looked up as a key. Keys holding falsy values (``0``, ``False``, ``''``,
    ``None``) are returned like any other value.

    Raises:
        AttributeError: if the name is neither an attribute nor a key.
    '''
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __getattr__(self, name: 'str'):
        # Only invoked after normal attribute lookup has failed, so dict
        # methods keep their priority over keys with the same name.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{name}'"
            ) from None

# Quick check of attribute-style access.
dict1 = parameters(
    one=1,
    two=2,
    three=3
)

dict1.one

1

In [5]:
# Load the embedding weights saved before training (model1) and after training (model2).
model1 = torch.load('./improved-diffusion/diffusion_models/diff_text_pad_translation128_transformer_lr0.0001_0.0_4000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart_e2e-simple/embedding_weights_initial.pt')
model2 = torch.load('./improved-diffusion/diffusion_models/diff_text_pad_translation128_transformer_lr0.0001_0.0_4000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart_e2e-simple/embedding_weights_cuda.pt')

# Element-wise comparison; entries that are True did not change between the two files.
print(model1==model2)

tensor([[True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        ...,
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True]])


In [20]:
# Pull one batch from the training generator and keep only its first element.
sample = next(data_loaders['data'])
sample[0].shape
sample=sample[0]

In [1]:
def exp(x):
    """Endless generator: repeatedly add the running value to itself and yield it."""
    running = x
    while True:
        running += running
        yield running

# Wrap the generator in a generator expression to show that it stays lazy.
gen = (value for value in exp(1))

print([next(gen) for _step in range(3)])

[2, 4, 8]


## Using the Infilling of Diffusion-LM

### Preparation

In [3]:
from tokenizers import Tokenizer
from diffusion_translation.parameters import Parameters
from improved_diffusion.script_util import model_and_diffusion_defaults, create_model_and_diffusion
from improved_diffusion import logger
import torch
import json


TEXT_DEFAULTS: 'dict[str, str | int | float]' = dict(
    modality='text',
    dataset_name='covost',
    experiment='translation',
    noise_schedule='cosine',
    loss_type='Lsimple',
    dropout=0.1,
    weight_decay=0.0,
    image_size=8,
    hidden_size=128,
    in_channel=16, ## Embedding Dimension
    lr_anneal_steps=400000, ## Training steps
    num_res_blocks=2, ## Not sure
    lr=1e-04, ## Learning rate?
    bsz=64, ## Batch Size
    diff_steps=4000, ## Steps of diffusion
    model_arch='conv-unet',
    emb_scale_factor=1.0, 
    noise_level=0.0, 
    cache_mode='no', 
    use_bert_tokenizer='no',
    padding_mode='block',
    preprocessing_num_workers=1,
    #config='diffusion_lm/synthetic_data/configs/emnlp2020/experiments/difflm_seed0_m3_k128_trainc20000.yaml',
    #model_name_or_path='predictability/diff_models/compress_e=5_b=60_m=gpt2_wikitext-103-raw-v1_None',
    #experiment='gpt2_pre_compress',

)
'''These are the defaults from Diffusion-LMs run_train.py'''

DIFFUSION_DEFAULTS: "dict[str, str | float]" = {
    'seed': 101,
    'data_dir': "",
    'schedule_sampler': "uniform",
    'lr':1e-4,
    'weight_decay':0.0,
    'lr_anneal_steps':0,
    'batch_size':1,
    'microbatch':-1, # -1 disables microbatches
    'ema_rate':"0.9999", # comma-separated list of EMA values
    'log_interval':50,
    'save_interval':50000,
    'resume_checkpoint':"",
    'use_fp16':False,
    'fp16_scale_growth':1e-3,
    'gradient_clipping':-1.0,
    'eval_interval':2000,
    'checkpoint_path':"diff_models"
}
'''These are the defaults from improved-diffusions train.py'''

MODEL_AND_DIFFUSION_DEFAULTS = model_and_diffusion_defaults()

# Sampling-specific settings: which trained run and checkpoint to load, how many
# samples to draw, and where to write the decoded text.
ARGS = {
    'model_path': './improved-diffusion/diffusion_models/diff_text_pad_translation128_transformer_lr0.0001_0.0_4000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart_e2e-simple',
    'model_file':'model002000.pt',
    'tokenizer': './covost/dataset/tokenizer.json',
    'num_samples':50,
    'seqlen':64,
    'batch_size':64,
    'in_channel':128,
    'clip_denoised': False,
    'eta': 1.,
    'top_p':-1.0,
    'output_directory':'improved-diffusion/diffusion_models/diff_text_pad_translation128_transformer_lr0.0001_0.0_4000_sqrt_Lsimple_h128_s2_d0.1_sd102_xstart_e2e-simple'
}

# load configurations.
# The training cell wrote its merged parameters to training_args.json in the run directory.
config_path = f'{ARGS["model_path"]}/training_args.json'
# sys.setdefaultencoding('utf-8')
with open(config_path, 'rb', ) as f:
    TRAINING_ARGS = json.load(f)

# Later dictionaries override earlier ones; TRAINING_ARGS comes last, so values stored
# with the trained run take precedence over the ARGS given above for duplicate keys.
PARAMS = Parameters(**{
    **DIFFUSION_DEFAULTS,
    **MODEL_AND_DIFFUSION_DEFAULTS,
    **TEXT_DEFAULTS,
    **ARGS,
    **TRAINING_ARGS
})

logger.configure()

# Device object taken from a one-element tensor that was moved to CUDA.
cuda = torch.full([1],1).cuda().device

tokenizer = Tokenizer.from_file(PARAMS.tokenizer)

padding_token = tokenizer.token_to_id('[PAD]')

num_samples = PARAMS.num_samples

# Embedding weights saved at the end of the training cell.
embedding_weights = torch.load(f'{PARAMS.model_path}/embedding_weights_cuda.pt')

embedding_model = torch.nn.Embedding.from_pretrained(embedding_weights)

# Rebuild the model and diffusion objects from the keys the factory knows about.
model, diffusion = create_model_and_diffusion(
    **{key: PARAMS[key] for key in MODEL_AND_DIFFUSION_DEFAULTS.keys()}
)

# Restore the trained weights from the selected checkpoint file.
model.load_state_dict(torch.load(f'{PARAMS.model_path}/{PARAMS.model_file}'))

model.to(cuda)


Logging to /tmp/openai-2022-12-07-17-09-05-733693
creating model, based on transformer
BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

LossType.E2E_Simple_MSE False
training mode is  e2e-simple
training mode is  e2e-simple


TransformerNetModel2(
  (word_embedding): Embedding(30000, 128)
  (lm_head): Linear(in_features=128, out_features=30000, bias=True)
  (time_embed): Sequential(
    (0): Linear(in_features=128, out_features=512, bias=True)
    (1): SiLU()
    (2): Linear(in_features=512, out_features=768, bias=True)
  )
  (input_up_proj): Sequential(
    (0): Linear(in_features=128, out_features=768, bias=True)
    (1): Tanh()
    (2): Linear(in_features=768, out_features=768, bias=True)
  )
  (input_transformers): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768

### Sampling

In [4]:
from improved_diffusion.test_util import denoised_fn_round
from functools import partial

# No extra keyword arguments are passed to the model during sampling.
model_kwargs = {}

# One embedding vector of size in_channel per position, seqlen positions, batch_size sequences.
sample_shape = (PARAMS.batch_size, PARAMS.seqlen, PARAMS.in_channel, )

# denoised_fn is denoised_fn_round partially applied to the parameters and the
# embedding module (moved to CUDA).
sample = diffusion.p_sample_loop(
    model,
    sample_shape,
    denoised_fn=partial(denoised_fn_round, PARAMS, embedding_model.cuda()),
    clip_denoised=PARAMS['clip_denoised'],
    model_kwargs=model_kwargs,
    top_p = PARAMS.top_p,
    device=cuda,
    progress=True
)

sample

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




tensor([[[ 1.3961, -1.6038, -0.0275,  ..., -0.0809, -0.3764, -1.3592],
         [ 0.5318,  0.0996,  0.6039,  ...,  0.5425, -0.5783, -0.4982],
         [ 0.5318,  0.0996,  0.6039,  ...,  0.5425, -0.5783, -0.4982],
         ...,
         [ 0.9512,  0.0573,  1.1934,  ..., -0.5230, -0.7619,  0.0284],
         [ 0.5318,  0.0996,  0.6039,  ...,  0.5425, -0.5783, -0.4982],
         [ 0.5318,  0.0996,  0.6039,  ...,  0.5425, -0.5783, -0.4982]],

        [[ 0.5318,  0.0996,  0.6039,  ...,  0.5425, -0.5783, -0.4982],
         [-1.1557, -1.2191,  0.9237,  ..., -2.3334, -0.4334, -0.6740],
         [ 0.5318,  0.0996,  0.6039,  ...,  0.5425, -0.5783, -0.4982],
         ...,
         [ 0.9512,  0.0573,  1.1934,  ..., -0.5230, -0.7619,  0.0284],
         [ 0.9512,  0.0573,  1.1934,  ..., -0.5230, -0.7619,  0.0284],
         [ 0.5318,  0.0996,  0.6039,  ...,  0.5425, -0.5783, -0.4982]],

        [[ 0.5318,  0.0996,  0.6039,  ...,  0.5425, -0.5783, -0.4982],
         [-1.1557, -1.2191,  0.9237,  ..., -2

### Decoding

In [21]:
from torch import Tensor
from torch.nn import Embedding
from tokenizers import Tokenizer

def k_nearest_neighbors(text_embedding_list: Tensor, embedding_model: Embedding, tokenizer: 'Tokenizer', dist: str = 'l2'):
    """Decode embedded sequences by snapping every position to its closest vocabulary embedding.

    Args:
        text_embedding_list: Iterable of embedded sequences. An item with more
            than two dimensions is flattened to (positions, embedding_dim).
        embedding_model: Embedding layer whose ``weight`` rows are the candidate
            vocabulary embeddings.
        tokenizer: Object whose ``decode`` method turns a list of token ids into text.
        dist: ``'l2'`` (default) picks the row with the smallest Euclidean
            distance; ``'cos'`` picks the row with the largest dot product
            (not normalised, despite the name).

    Returns:
        A list with one decoded string per item of ``text_embedding_list``.

    Raises:
        ValueError: If ``dist`` is neither ``'l2'`` nor ``'cos'``.
    """
    if dist not in ('cos', 'l2'):
        raise ValueError(f"Unknown dist {dist!r}; expected 'cos' or 'l2'")

    embedding_weights = embedding_model.weight
    decoded_outputs = []

    # Only the winning indices are needed, so no autograd graph is built.
    with torch.no_grad():
        for text_embedding in text_embedding_list:
            if text_embedding.dim() > 2:
                text_embedding = text_embedding.reshape(-1, text_embedding.size(-1))
            text_embedding = text_embedding.to(device=embedding_weights.device, dtype=embedding_weights.dtype)

            if dist == 'cos':
                # (vocab, positions) score matrix
                scores = embedding_weights @ text_embedding.transpose(1, 0)
            else:
                # (vocab, positions) distances without materialising a
                # (vocab, positions, dim) difference tensor; the non-matmul mode
                # keeps the result as exact as an explicit subtraction.
                scores = -torch.cdist(
                    embedding_weights,
                    text_embedding,
                    compute_mode='donot_use_mm_for_euclid_dist',
                )

            # Only the single best vocabulary entry per position is decoded.
            nearest_ids = scores.argmax(dim=0)
            decoded_outputs.append(tokenizer.decode(nearest_ids.tolist()))

    return decoded_outputs

# Decode the sampled embeddings by nearest-neighbour lookup in the embedding table.
decoded_outputs = k_nearest_neighbors(text_embedding_list=sample, embedding_model=embedding_model, tokenizer=tokenizer)

# Lazily append a newline to each decoded sequence for writelines().
decoded_outputs_in_lines = (decoded_output + '\n' for decoded_output in decoded_outputs)

with open(f'{PARAMS.output_directory}/samples.txt', 'w') as output_file:
    output_file.writelines(decoded_outputs_in_lines)

: 

: 

In [5]:
import torch
import numpy as np

# NumPy copy of the samples; it is only used for the two shape printouts below.
arr = np.concatenate([sample.cpu().numpy()], axis=0)
print(arr.shape, 'full shape')
arr = arr[: PARAMS.num_samples]
print(arr.shape, 'arr shape')

decoded_outputs = []
print('decoding for e2e', )
print(sample.shape)
print(sample.dtype)
print(model.get_logits)
# `sample` is already a tensor: copy it with detach().clone() instead of
# torch.tensor(sample), which warns when copy-constructing from a tensor.
x_t = sample.detach().clone().cuda().to(torch.float32)
if PARAMS.model_arch == 'conv-unet':
    # Flatten everything between the batch and embedding dimensions into one sequence axis.
    reshaped_x_t = x_t.view(x_t.size(0), -1, x_t.size(-1))
else:
    reshaped_x_t = x_t
# Only the arg-max token ids are used, so no autograd graph is needed.
with torch.no_grad():
    logits = model.get_logits(reshaped_x_t)  # bsz, seqlen, vocab
cands = torch.topk(logits, k=1, dim=-1)
indices = cands.indices
# NOTE(review): this decodes every sequence in the batch, not just the first
# PARAMS.num_samples that `arr` was cut down to — confirm which is intended.
for seq in cands.indices:
    print(seq)
    numpy_sequence = seq.cpu().numpy()
    tokens = tokenizer.decode(numpy_sequence.squeeze(-1))
    decoded_outputs.append(tokens)

decoded_outputs_in_lines = (decoded_output + '\n' for decoded_output in decoded_outputs)

with open(f'{PARAMS.output_directory}/samples_e2e_decoded.txt', 'w') as output_file:
    output_file.writelines(decoded_outputs_in_lines)

(64, 64, 128) full shape
(50, 64, 128) arr shape
decoding for e2e
torch.Size([64, 64, 128])
torch.float32
<bound method TransformerNetModel2.get_logits of TransformerNetModel2(
  (word_embedding): Embedding(30000, 128)
  (lm_head): Linear(in_features=128, out_features=30000, bias=True)
  (time_embed): Sequential(
    (0): Linear(in_features=128, out_features=512, bias=True)
    (1): SiLU()
    (2): Linear(in_features=512, out_features=768, bias=True)
  )
  (input_up_proj): Sequential(
    (0): Linear(in_features=128, out_features=768, bias=True)
    (1): Tanh()
    (2): Linear(in_features=768, out_features=768, bias=True)
  )
  (input_transformers): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, 

  


tensor([[25297],
        [10179],
        [24918],
        [25297],
        [25297],
        [25297],
        [25297],
        [16205],
        [25297],
        [25297],
        [13433],
        [25297],
        [25297],
        [ 5788],
        [16205],
        [10179],
        [ 5788],
        [10179],
        [25297],
        [10179],
        [25297],
        [29759],
        [25297],
        [25297],
        [25297],
        [24918],
        [24918],
        [ 3817],
        [16205],
        [13433],
        [22481],
        [ 4180],
        [10179],
        [10179],
        [22481],
        [22481],
        [16205],
        [16205],
        [25297],
        [24918],
        [25297],
        [25297],
        [25297],
        [25297],
        [25297],
        [25297],
        [16205],
        [22481],
        [25297],
        [22481],
        [ 9866],
        [24918],
        [16205],
        [16205],
        [25297],
        [25297],
        [25297],
        [22481],
        [27271