In [1]:
from transformers import CLIPTokenizer
from datasets import load_dataset
from datasets import Image as HuggingFaceImage
from linformer import Linformer
from vit_pytorch.efficient import ViT
import torch
def get_tokenizer() -> CLIPTokenizer:
    """Load the pretrained CLIP tokenizer for 'openai/clip-vit-base-patch32'."""
    pretrained_name = 'openai/clip-vit-base-patch32'
    clip_tokenizer = CLIPTokenizer.from_pretrained(pretrained_name)
    return clip_tokenizer
def prepare_data(tokenizer: CLIPTokenizer, max_length: int = 42):
    """Load the font metadata splits and turn each row into (image, tokens).

    For both the 'train' and 'test' split this builds a text prompt from the
    font properties, tokenizes it to exactly `max_length` ids, drops the
    metadata columns that are no longer needed, decodes the 'image' column as
    an image feature and switches the split to torch formatting.

    Args:
        tokenizer: tokenizer used to encode the generated prompts.
        max_length: fixed length every token sequence is padded/truncated to.

    Returns:
        The dataset dict returned by `load_dataset`, with both splits processed.
    """
    # Columns that only exist to build the prompt (or are otherwise unused).
    unused_columns = ['prompt', 'uniqueId', 'ttf_path', 'font_characteristics',
                      'font_properties', 'character', 'vit_label']

    def add_prompt(example):
        props = example['font_properties']
        # A character value of the form 'x_y' becomes 'xcase y'
        # (presumably 'upper_A' -> 'uppercase A'); other values pass through.
        parts = example['character'].split('_')
        if len(parts) > 1:
            character = parts[0] + 'case ' + parts[1]
        else:
            character = parts[0]
        example['prompt'] = f"a {props['font_serifs']} {character} with {props['width']} width {props['rounding']} corners {props['font_weight']} weight and {props['dynamics']} movement with characteristics that can be described by adjectives {example['font_characteristics']}"
        return example

    def map_tokens(example):
        # Pad AND truncate so every row has exactly `max_length` ids; without
        # truncation a long prompt yields a longer row and batching fails.
        example['tokens'] = tokenizer.encode(
            example['prompt'],
            padding='max_length',
            truncation=True,
            max_length=max_length,
        )
        return example

    dataset = load_dataset('json', data_files={'train':'train-metadata.jsonl', 'test':'test-metadata.jsonl'})

    # Both splits go through the same pipeline; `map` creates the new
    # 'prompt' and 'tokens' columns itself, so no placeholder columns are needed.
    for split_name in ('train', 'test'):
        split = dataset[split_name]
        split = split.map(add_prompt)
        split = split.map(map_tokens)
        split = split.remove_columns(unused_columns)
        split = split.cast_column('image', HuggingFaceImage())
        dataset[split_name] = split.with_format('torch')
    return dataset
def get_vit_model(image_size: int, patch_size: int, dim: int, depth: int, num_heads: int, k: int, device: str):
    """Build a single-channel, 62-class ViT whose transformer is a Linformer.

    Args:
        image_size: side length of the square input image, in pixels.
        patch_size: side length of each square patch, in pixels.
        dim: embedding width shared by the ViT and the Linformer.
        depth: number of Linformer layers.
        num_heads: number of attention heads.
        k: Linformer `k` argument.
        device: accepted but not used; the model is not moved to any device here.

    Returns:
        The constructed ViT model.
    """
    patches_per_side = image_size // patch_size
    # e.g. a 512x512px image with 32x32px patches: 16x16 patches + 1 CLS token
    num_positions = patches_per_side ** 2 + 1

    linformer_backbone = Linformer(dim=dim, seq_len=num_positions, depth=depth, heads=num_heads, k=k)
    return ViT(
        dim=dim,
        image_size=image_size,
        patch_size=patch_size,
        num_classes=62,
        transformer=linformer_backbone,
        channels=1,
    )
def get_vit(image_size, patch_size, vit_dim, vit_depth, vit_num_heads, k, device, vit_checkpoint_path):
    """Build the ViT and optionally restore its weights from a checkpoint.

    Args:
        image_size, patch_size, vit_dim, vit_depth, vit_num_heads, k, device:
            forwarded unchanged to `get_vit_model`.
        vit_checkpoint_path: path to a checkpoint containing a
            'model_state_dict' entry, or None to keep the fresh weights.

    Returns:
        The ViT model, with checkpoint weights loaded when a path was given.
    """
    vit = get_vit_model(image_size=image_size,
                        patch_size=patch_size,
                        dim=vit_dim,
                        depth=vit_depth,
                        num_heads=vit_num_heads,
                        k=k,
                        device=device)
    if vit_checkpoint_path is not None:
        # The freshly built model has not been moved to any device, so load
        # the tensors onto the CPU; this also lets a checkpoint saved on a GPU
        # be restored on a machine without one.
        # NOTE: torch.load unpickles the file - only load trusted checkpoints.
        vit_checkpoint = torch.load(vit_checkpoint_path, map_location='cpu')
        vit.load_state_dict(vit_checkpoint['model_state_dict'])
        print('Loaded ViT model from checkpoint:', vit_checkpoint_path)
    return vit


In [None]:
# Debug check: which token-sequence lengths other than 28 occur in the training split?
# NOTE: relies on `dataset` and `clip_tokenizer` created by the training-setup cell further down.
filtered = dataset['train'].filter(lambda ex: len(ex['tokens']) != 28)
unique = set()
for ex in filtered:
    num_tokens = len(ex['tokens'])
    if num_tokens == 35:
        # prepare_data() drops the 'prompt' column, so recover the text
        # from the token ids instead of reading ex['prompt'].
        print(clip_tokenizer.decode(ex['tokens']))
    unique.add(num_tokens)
print(unique)

In [3]:
import torch.nn as nn
class LinformerLM(nn.Module):
    """Text encoder: token + learned position embeddings fed through a Linformer.

    The forward pass returns the Linformer output for every position; there is
    no projection back to vocabulary logits.
    """

    def __init__(self, num_tokens, dim, seq_len, depth, k = 256, heads = 8, dim_head = None, one_kv_head = False, share_kv = False, reversible = False, dropout = 0.):
        super().__init__()
        self.token_emb = nn.Embedding(num_tokens, dim)
        self.pos_emb = nn.Embedding(seq_len, dim)
        # Everything beyond (dim, seq_len, depth) is passed straight to Linformer.
        linformer_options = dict(
            k = k,
            heads = heads,
            dim_head = dim_head,
            one_kv_head = one_kv_head,
            share_kv = share_kv,
            reversible = reversible,
            dropout = dropout,
        )
        self.linformer = Linformer(dim, seq_len, depth, **linformer_options)

    def forward(self, x):
        """Embed the token ids in `x`, add position embeddings, run the Linformer."""
        token_vectors = self.token_emb(x)
        # One position index per entry along axis 1 of the input, on the input's device.
        positions = torch.arange(x.shape[1], device=x.device)
        hidden = self.pos_emb(positions) + token_vectors
        return self.linformer(hidden)

In [2]:
from torch.optim import AdamW
from x_clip import CLIP
from vit_pytorch.extractor import Extractor
from torch.utils.data import DataLoader
import tqdm as tqdm
# Prefer the GPU, but fall back to the CPU so the cell also runs without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Tokenizer and tokenized dataset shared by the rest of this cell.
clip_tokenizer = get_tokenizer()
dataset = prepare_data(clip_tokenizer)
print(dataset)
def get_dataloaders(train_clip_dataset, test_clip_dataset, batch_size):
    """Wrap the train and test datasets in shuffled DataLoaders.

    Both loaders use the same `batch_size` and `shuffle=True`.

    Returns:
        (train_loader, test_loader)
    """
    shared_options = dict(batch_size=batch_size, shuffle=True)
    return (
        DataLoader(dataset=train_clip_dataset, **shared_options),
        DataLoader(dataset=test_clip_dataset, **shared_options),
    )
# --- Data loaders (batch size 2) -------------------------------------------
train_dataset, test_dataset = dataset['train'], dataset['test']
train_loader, valid_loader = get_dataloaders(train_dataset, test_dataset, 2)

# --- Image encoder: ViT, restored from a checkpoint ------------------------
vit_checkpoint = './vit-checkpoints/model-epoch18.pt'
image_size, patch_size = 512, 32
vit_dim, vit_depth, vit_num_heads = 128, 12, 8
k = 64

base_vit = get_vit(image_size, patch_size, vit_dim, vit_depth, vit_num_heads, k, device, vit_checkpoint)
image_encoder = Extractor(base_vit, return_embeddings_only = True)

# --- Text encoder: Linformer over token embeddings -------------------------
text_encoder = LinformerLM(
    num_tokens = 49408,
    dim = 256,
    seq_len = 42,          # same length the prompts are padded to in prepare_data
    depth = 12,
    heads = 8,
    dim_head = 64,         # dimension of each head in multi-head attention
    k = 128,               # the k that keys/values are projected to along the sequence dimension
    one_kv_head = True,    # share one key/value head across all heads
    share_kv = False,      # share the same projection for keys and values
    reversible = False,    # make network reversible, like Reformer
)

# --- CLIP wrapper joining both encoders ------------------------------------
clip = CLIP(
    image_encoder = image_encoder,
    text_encoder = text_encoder,
    dim_image = 128,       # equals vit_dim above
    dim_text = 256,        # equals the text encoder's dim above
    dim_latent = 128,
    text_encode_without_mask = True,
    use_all_token_embeds = True,
    text_has_cls_token = False,
    visual_has_cls_token = True,
).to(device)
def prepare_batch(batch):
    """Split a batch dict into `(images, tokens)`.

    Only index 0 of the images' fourth axis is kept; it ends up as a size-1
    axis at position 1 and the result is cast via 'torch.FloatTensor'.
    The tokens are returned unchanged.
    """
    raw_images = batch['image']
    tokens = batch['tokens']
    # assumes images arrive as (batch, height, width, channels) - TODO confirm
    first_channel = raw_images[:, :, :, 0]
    images = first_channel.unsqueeze(1).type('torch.FloatTensor')
    return images, tokens
# Smoke test: push one batch through CLIP and backpropagate to confirm the
# model, data and device fit together. A single batch is enough - no optimizer
# exists yet, and the training loop below calls zero_grad() before its first
# backward pass, so gradients computed here are never applied.
for batch in train_loader:
    batch_imgs, batch_tokens = prepare_batch(batch)
    batch_tokens = batch_tokens.to(device)
    batch_imgs = batch_imgs.to(device)
    loss = clip(batch_tokens, batch_imgs, return_loss=True)
    loss.backward()
    break
# Learning rate for the optimizer created below.
lr=3e-5
def get_trainable_params(model):
    """Return, as a list, the parameters of `model` whose `requires_grad` is set."""
    return list(filter(lambda param: param.requires_grad, model.parameters()))
optimizer = AdamW(get_trainable_params(clip), lr=lr) # DALLE-pytorch setup
num_batches = len(train_loader)
for epoch in range(0, 10):
    epoch_loss = 0.0
    # `tqdm` is bound to the module here (`import tqdm as tqdm`), so the
    # progress-bar class has to be reached as `tqdm.tqdm`.
    for batch in tqdm.tqdm(train_loader):
        batch_imgs, batch_tokens = prepare_batch(batch)
        batch_tokens = batch_tokens.to(device)
        batch_imgs = batch_imgs.to(device)
        loss = clip(batch_tokens, batch_imgs, return_loss=True)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # .item() detaches the value so the running mean does not keep the graph alive.
        epoch_loss += loss.item() / num_batches
    print(f'Epoch {epoch + 1}: mean training loss {epoch_loss:.4f}')

NameError: name 'get_tokenizer' is not defined

In [8]:
from transformers import AutoTokenizer, CLIPTextModel
import torch
# Load the pretrained CLIP tokenizer and text model from the same checkpoint name.
clip_checkpoint_name = 'openai/clip-vit-base-patch32'
tokenizer = AutoTokenizer.from_pretrained(clip_checkpoint_name)
text_encoder = CLIPTextModel.from_pretrained(clip_checkpoint_name)
# Copy the tokenizer's special-token ids onto the model config.
for token_attr in ('bos_token_id', 'eos_token_id', 'pad_token_id'):
    setattr(text_encoder.config, token_attr, getattr(tokenizer, token_attr))
# Run a random (2, 42) batch of token ids through the encoder and print the result.
x = torch.randint(0, tokenizer.vocab_size, (2, 42))
out = text_encoder(x)
print(x.shape, out)


Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPTextModel: ['vision_model.encoder.layers.11.self_attn.out_proj.bias', 'vision_model.encoder.layers.4.self_attn.out_proj.weight', 'vision_model.encoder.layers.5.self_attn.v_proj.weight', 'vision_model.encoder.layers.6.layer_norm1.weight', 'vision_model.encoder.layers.9.layer_norm1.weight', 'vision_model.encoder.layers.11.self_attn.k_proj.weight', 'vision_model.encoder.layers.8.self_attn.k_proj.bias', 'vision_model.encoder.layers.7.self_attn.q_proj.weight', 'vision_model.encoder.layers.1.mlp.fc2.weight', 'vision_model.encoder.layers.3.self_attn.out_proj.weight', 'vision_model.encoder.layers.3.layer_norm1.bias', 'vision_model.encoder.layers.0.self_attn.q_proj.bias', 'vision_model.encoder.layers.2.layer_norm2.bias', 'vision_model.encoder.layers.5.mlp.fc1.bias', 'vision_model.encoder.layers.11.layer_norm2.bias', 'vision_model.encoder.layers.7.self_attn.v_proj.bias', 'vision_model.embeddi

torch.Size([2, 42]) BaseModelOutputWithPooling(last_hidden_state=tensor([[[ 0.4614, -0.0276,  0.1367,  ...,  0.2076,  0.6126,  0.0962],
         [-0.3207, -1.3248,  1.0631,  ..., -1.0859,  0.1839, -0.7059],
         [ 0.5146, -1.6968,  1.6400,  ...,  0.9960,  0.6290,  0.2361],
         ...,
         [ 0.2864, -0.0054,  0.1663,  ...,  0.1414,  0.3384, -0.6460],
         [ 1.5542,  0.1783,  0.4818,  ..., -0.2123,  0.6743, -1.6258],
         [ 0.7134, -0.5206,  0.8076,  ...,  0.4765,  0.2446, -0.4237]],

        [[ 0.4480,  0.0079,  0.0966,  ...,  0.2242,  0.6600,  0.1128],
         [ 0.8891, -0.4952, -0.4421,  ...,  0.8624,  0.7505,  0.0025],
         [ 0.6614, -2.0598, -0.1620,  ...,  0.6620, -0.4753, -1.5487],
         ...,
         [ 1.1787, -0.6765,  0.0310,  ...,  0.7357, -0.0599, -1.2353],
         [ 0.9095, -0.3877,  0.4095,  ...,  0.4120,  0.4656, -0.9974],
         [ 0.6709, -1.4997,  1.2520,  ...,  0.8821,  0.3796, -0.4205]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_

In [3]:
# Print the text encoder's configuration object.
print(text_encoder.config)

CLIPTextConfig {
  "_name_or_path": "openai/clip-vit-base-patch32",
  "attention_dropout": 0.0,
  "bos_token_id": 49406,
  "dropout": 0.0,
  "eos_token_id": 49407,
  "hidden_act": "quick_gelu",
  "hidden_size": 512,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 77,
  "model_type": "clip_text_model",
  "num_attention_heads": 8,
  "num_hidden_layers": 12,
  "pad_token_id": 49407,
  "projection_dim": 512,
  "transformers_version": "4.28.0",
  "vocab_size": 49408
}



In [2]:
import os
from transformers import CLIPTextModel
from x_clip_train import get_tokenizer
import torch
from x_clip import CLIP
from vit_pytorch.extractor import Extractor

# ViT image-encoder hyperparameters.
image_size = 512
patch_size = 32
vit_dim = 128
vit_depth = 12
vit_num_heads = 8
k = 64
# No ViT checkpoint is loaded here; the CLIP checkpoint below is loaded into the whole model.
base_vit = get_vit(image_size,
                    patch_size,
                    vit_dim,
                    vit_depth,
                    vit_num_heads,
                    k,
                    device=None,
                    vit_checkpoint_path=None)
image_encoder = Extractor(
    base_vit,
    return_embeddings_only = True
)
clip_tokenizer = get_tokenizer(True)
text_encoder = CLIPTextModel.from_pretrained('openai/clip-vit-base-patch32')
# Grow the embedding matrix to match the tokenizer's current length.
text_encoder.resize_token_embeddings(len(clip_tokenizer))
path = os.path.join(os.getcwd(), 'clip-checkpoints', 'clip-epoch-9.pt')
# Load onto the CPU; load_state_dict below copies the values into the model's
# parameters on whatever device they live on.
# NOTE: torch.load unpickles the file - only load trusted checkpoints.
checkpoint = torch.load(path, map_location='cpu')
clip = CLIP(
    image_encoder = image_encoder,
    text_encoder = text_encoder,
    dim_image=128,
    dim_text=512,
    dim_latent=384,
    text_encode_without_mask=False,
    use_all_token_embeds=False,
    text_has_cls_token=True,
    visual_has_cls_token=True,
    # Read the vocabulary size from the model config rather than from an
    # attribute on the model object itself.
    num_text_tokens=text_encoder.config.vocab_size,
    text_pad_id=clip_tokenizer.pad_token_id,
    text_eos_id=clip_tokenizer.eos_token_id,
    use_mlm=True,
    mlm_mask_token_id=clip_tokenizer.mask_token_id,
    mlm_pad_token_id=clip_tokenizer.pad_token_id,
    mlm_mask_ignore_token_ids=[clip_tokenizer.bos_token_id]
).to('cuda')
clip.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch']
loss = checkpoint['loss']
print('Loaded model from checkpoint:', path)

Added special tokens:  {'mask_token': '<|mask_token|>'}


Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPTextModel: ['vision_model.encoder.layers.11.layer_norm1.weight', 'vision_model.encoder.layers.6.layer_norm1.bias', 'vision_model.encoder.layers.7.layer_norm1.weight', 'vision_model.encoder.layers.9.self_attn.q_proj.weight', 'vision_model.encoder.layers.7.self_attn.v_proj.weight', 'vision_model.encoder.layers.3.mlp.fc1.weight', 'vision_model.encoder.layers.6.layer_norm1.weight', 'vision_model.encoder.layers.10.mlp.fc2.weight', 'vision_model.encoder.layers.6.mlp.fc2.weight', 'vision_model.encoder.layers.10.mlp.fc2.bias', 'vision_model.encoder.layers.1.self_attn.out_proj.bias', 'vision_model.encoder.layers.7.mlp.fc1.weight', 'vision_model.encoder.layers.4.self_attn.k_proj.weight', 'vision_model.encoder.layers.0.mlp.fc2.weight', 'vision_model.encoder.layers.5.layer_norm1.weight', 'vision_model.encoder.layers.1.mlp.fc1.weight', 'vision_model.encoder.layers.8.self_attn.q_proj.weight', 'vi

Loaded model from checkpoint: d:\font-diffusion\clip-checkpoints\clip-epoch-9.pt


In [9]:
from x_clip_train import prepare_batch, get_dataloaders, get_tokenizer, prepare_data
from transformers import CLIPTextModel
clip_tokenizer = get_tokenizer()
# Show the special-token map before and after registering the extra tokens.
print(clip_tokenizer.special_tokens_map)
extra_special_tokens = {'mask_token':'<|mask_token|>', 'cls_token': '<|cls_token|>'}
clip_tokenizer.add_special_tokens(extra_special_tokens)
print(clip_tokenizer.special_tokens_map)
# Resize the text model's token embeddings to the tokenizer's new length.
model = CLIPTextModel.from_pretrained('openai/clip-vit-base-patch32')
model.resize_token_embeddings(len(clip_tokenizer))
# Rebuild the tokenized dataset with the extended tokenizer.
dataset = prepare_data(clip_tokenizer)


{'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}
{'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'cls_token': '<|cls_token|>', 'mask_token': '<|mask_token|>'}


Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPTextModel: ['vision_model.encoder.layers.7.self_attn.k_proj.weight', 'vision_model.encoder.layers.11.layer_norm1.weight', 'vision_model.encoder.layers.10.layer_norm2.bias', 'vision_model.encoder.layers.3.self_attn.k_proj.bias', 'vision_model.encoder.layers.6.self_attn.k_proj.weight', 'vision_model.encoder.layers.6.self_attn.q_proj.weight', 'vision_model.encoder.layers.8.self_attn.v_proj.bias', 'vision_model.encoder.layers.8.mlp.fc2.weight', 'vision_model.encoder.layers.8.self_attn.k_proj.bias', 'vision_model.encoder.layers.10.layer_norm1.bias', 'vision_model.encoder.layers.3.layer_norm1.weight', 'vision_model.post_layernorm.bias', 'vision_model.encoder.layers.8.self_attn.out_proj.weight', 'vision_model.encoder.layers.10.self_attn.q_proj.bias', 'vision_model.encoder.layers.5.mlp.fc1.bias', 'vision_model.encoder.layers.7.self_attn.q_proj.weight', 'vision_model.encoder.layers.8.self_at

  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\Michael Labarca\.cache\huggingface\datasets\json\default-c772ad4eb6d31de9\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-b26c26fef36d8e27.arrow


Map:   0%|          | 0/12090 [00:00<?, ? examples/s]

Loading cached processed dataset at C:\Users\Michael Labarca\.cache\huggingface\datasets\json\default-c772ad4eb6d31de9\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-f46cb15ec73d6a31.arrow


Map:   0%|          | 0/208 [00:00<?, ? examples/s]

In [12]:
# Decode the first training example's token ids back to text to inspect the
# prompt and its padding. The lookup is bound once and reused.
first_train_tokens = dataset['train'][0]['tokens']
clip_tokenizer.decode(first_train_tokens)

'<|startoftext|>a serif 0 with extended width rounded corners black weight and dynamic movement with characteristics that can be described by adjectives chinese restaurant asian <|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

<|cls_token|> 49408 49408
<|mask_token|> 49409 49408


In [3]:
from x_clip_train import prepare_batch, get_dataloaders, get_tokenizer, prepare_data
clip_tokenizer = get_tokenizer(True)
dataset = prepare_data(clip_tokenizer)
train_dataset = dataset['train']
test_dataset = dataset['test']
train_loader, valid_loader = get_dataloaders(train_dataset, test_dataset, 32)

# Encode only the first validation batch: all of its token rows, but just its
# first image (kept as a batch of one), then print both encodings' shapes.
for batch in valid_loader:
    batch_imgs, batch_tokens = prepare_batch(batch)
    batch_imgs = batch_imgs.to('cuda')
    batch_tokens = batch_tokens.to('cuda')
    first_image = batch_imgs[0].unsqueeze(0)
    out = clip(batch_tokens, first_image, return_encodings=True)
    text_encoding, image_encoding = out[0], out[1]
    print(text_encoding.last_hidden_state.shape)
    print(image_encoding.shape)
    break

Added special tokens:  {'mask_token': '<|mask_token|>'}


Found cached dataset json (C:/Users/Michael Labarca/.cache/huggingface/datasets/json/default-c772ad4eb6d31de9/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\Michael Labarca\.cache\huggingface\datasets\json\default-c772ad4eb6d31de9\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-b26c26fef36d8e27.arrow
Loading cached processed dataset at C:\Users\Michael Labarca\.cache\huggingface\datasets\json\default-c772ad4eb6d31de9\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-93ccb330251d5d57.arrow
Loading cached processed dataset at C:\Users\Michael Labarca\.cache\huggingface\datasets\json\default-c772ad4eb6d31de9\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-f46cb15ec73d6a31.arrow
Loading cached processed dataset at C:\Users\Michael Labarca\.cache\huggingface\datasets\json\default-c772ad4eb6d31de9\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-fa8db6b8d59f5606.arrow


torch.Size([32, 42, 512])
torch.Size([1, 257, 128])


In [4]:
from sentence_transformers import util
# Re-print the shapes of the two encodings held in `out` from the cell above.
print(out[0].last_hidden_state.shape)
print(out[1].shape)
# NOTE(review): the recorded run of this cell ends in a RuntimeError on the next
# line. Both tensors are 3-D and their last dimensions differ (128 vs 512);
# presumably they must first be reduced to 2-D embeddings of equal width before
# cos_sim can compare them - TODO confirm.
util.cos_sim(out[1], out[0].last_hidden_state)    

torch.Size([32, 42, 512])
torch.Size([1, 257, 128])


RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [42, 128] but got: [42, 32].

In [None]:
# Scratch cell (never executed): calls the text encoder with no arguments.
# NOTE(review): presumably fails because no input ids are supplied - TODO confirm or delete this cell.
text_encoder()