# Library

In [3]:
# MUST USE GRADSCALER AND AUTOCAST OR TRAINING TAKES 3X AS LONG

import os
import sys
import random
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm
from scipy import spatial
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchvision import transforms
import timm
from timm.utils import AverageMeter
from sentence_transformers import SentenceTransformer
import warnings
from transformers import AutoModel, AutoProcessor
from torch.cuda.amp import GradScaler, autocast

warnings.filterwarnings('ignore')

In [4]:
# BUILD THE image_path COLUMN FOR THE HARDCODED PROMPTS (ONE-OFF PREPROCESSING; DOES NOT CHECK THAT THE FILES EXIST)
#csv_file_path = "hardcoded_prompts.csv"
#image_folder_path = "hardcoded_images"
#df = pd.read_csv(csv_file_path)

#def generate_image_path(index):
#    image_name = f"{index}.png" 
#    return os.path.join(image_folder_path, image_name)

#df['image_path'] = df.index.map(generate_image_path)

#df.to_csv("hardcoded_prompts_final.csv", index=False)

In [5]:
#def concatenate_csv_files(directory, output_filename):
#    all_files = os.listdir(directory)
#    csv_files = [file for file in all_files if file.endswith('.csv')]

#    combined_data = pd.DataFrame()

#    for file in csv_files:
#        file_path = os.path.join(directory, file)
#        data = pd.read_csv(file_path)
#        combined_data = pd.concat([combined_data, data], ignore_index=True)

#    combined_data.to_csv(output_filename, index=False)
    
#directory = "./prompt_artifacts"
#output_filename = "prompt_artifacts_final.csv"
#concatenate_csv_files(directory, output_filename)

In [None]:
# Image preprocessor loaded from the "clip-ft" checkpoint; it is used as the
# default processor of DiffusionDataset.
clip_processor = AutoProcessor.from_pretrained("clip-ft")

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


class CFG:
    """Static settings read by the training script in this file."""

    # Seed handed to train_test_split for a reproducible split.
    seed = 21

    # Model-related settings.
    model_name = 'clip-vit-large-patch14'
    input_size = 224
    # Encoder layer index from which the vision tower becomes trainable
    # (read by load_pretrained_model).
    unfreeze = 18

    # Optimisation settings passed to train().
    batch_size = 128
    num_epochs = 25
    lr = 5e-4



def get_dataloaders(trn_df, val_df, input_size, batch_size):
    """Build the training and validation ``DataLoader`` objects.

    Args:
        trn_df: DataFrame of training rows, wrapped in a ``DiffusionDataset``.
        val_df: DataFrame of validation rows, wrapped the same way.
        input_size: Accepted for interface compatibility; not read here.
        batch_size: Number of samples per batch for both loaders.

    Returns:
        dict with the keys ``'train'`` and ``'val'`` mapping to the loaders.
    """
    train_set = DiffusionDataset(trn_df)
    valid_set = DiffusionDataset(val_df)

    # One collator instance is shared by both loaders.
    shared_collator = DiffusionCollator()
    common_kwargs = dict(
        batch_size=batch_size,
        pin_memory=True,
        num_workers=12,
        collate_fn=shared_collator,
    )

    # Only the training loader shuffles and drops the trailing partial batch.
    train_loader = DataLoader(
        dataset=train_set, shuffle=True, drop_last=True, **common_kwargs)
    valid_loader = DataLoader(
        dataset=valid_set, shuffle=False, drop_last=False, **common_kwargs)

    return {'train': train_loader, 'val': valid_loader}



def cosine_similarity(y_trues, y_preds):
    """Return the mean cosine similarity over paired vectors.

    The two sequences are walked in lockstep (stopping at the shorter one);
    each pair contributes ``1 - scipy cosine distance`` and the per-pair
    values are averaged with ``np.mean``.
    """
    per_pair = []
    for truth_vec, pred_vec in zip(y_trues, y_preds):
        distance = spatial.distance.cosine(truth_vec, pred_vec)
        per_pair.append(1 - distance)
    return np.mean(per_pair)



class DiffusionDataset(Dataset):
    """Dataset yielding ``(pixel_values, prompt)`` pairs from a DataFrame.

    Every row of ``df`` must provide an ``image_path`` column (a file PIL can
    open) and a ``prompt`` column.

    Args:
        df: DataFrame with ``image_path`` and ``prompt`` columns.
        clip_processor: Processor applied to each image after the torchvision
            transform; defaults to the module-level ``clip_processor``.
    """

    def __init__(self, df, clip_processor=clip_processor):
        self.df = df
        self.transform = transforms.Compose([
            transforms.Resize(224),
            transforms.ToTensor()])
        self.clip_processor = clip_processor

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, preprocess and return the sample at positional index ``idx``.

        Returns:
            Tuple ``(processed_image, prompt)`` where ``processed_image`` is
            the processor's ``pixel_values`` with the leading batch axis
            squeezed away and ``prompt`` is the row's ``prompt`` value.
        """
        row = self.df.iloc[idx]
        # The context manager closes the underlying file once the pixels are
        # decoded, so long-running workers do not accumulate open handles.
        # convert('RGB') guarantees three channels: RGBA / palette / grayscale
        # PNGs would otherwise reach the processor with the wrong channel
        # count. Images that are already RGB are unaffected.
        with Image.open(row['image_path']) as raw_image:
            image = self.transform(raw_image.convert('RGB'))
        # NOTE(review): ToTensor already scales pixels to [0, 1]; if the
        # processor's own rescaling is enabled the values are scaled twice.
        # Left unchanged because the loaded checkpoint was presumably trained
        # with this exact preprocessing — confirm before changing.
        processed_image = self.clip_processor(images=image, return_tensors="pt")['pixel_values'].squeeze(0)
        return processed_image, row['prompt']

class DiffusionCollator:
    """Collate ``(image_tensor, prompt)`` samples into one batch.

    Prompts are embedded with the ``all-MiniLM-L6-v2`` sentence-transformer,
    which is instantiated on the CPU.
    """

    def __init__(self):
        self.st_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')

    def __call__(self, batch):
        """Stack the image tensors and encode the prompts of ``batch``.

        Returns:
            Tuple ``(images, prompt_embeddings)``: the stacked image tensor
            and whatever ``st_model.encode`` returns for the prompts when
            asked for a tensor.
        """
        pixel_tensors, prompt_texts = zip(*batch)
        stacked_pixels = torch.stack(pixel_tensors)
        embeddings = self.st_model.encode(
            prompt_texts,
            convert_to_tensor=True,
            show_progress_bar=False,
        )
        return stacked_pixels, embeddings
    
        

class Net(nn.Module):
    """Vision tower of the "clip-ft" checkpoint followed by a linear head.

    The head maps 1024 input features to 384 output features.
    """

    def __init__(self):
        super().__init__()
        # Only the vision sub-model of the loaded checkpoint is retained.
        self.vision = AutoModel.from_pretrained("clip-ft").vision_model
        self.fc = nn.Linear(in_features=1024, out_features=384)

    def forward(self, x):
        """Project the vision tower's ``pooler_output`` through ``fc``."""
        vision_outputs = self.vision(x)
        pooled = vision_outputs['pooler_output']
        projected = self.fc(pooled)
        return projected
    
def load_pretrained_model():
    """Build a ``Net`` and freeze the leading part of its vision tower.

    The parameters of ``model.vision`` are visited in ``named_parameters()``
    order. Every parameter before the first one that belongs to encoder layer
    ``CFG.unfreeze`` is frozen; that parameter and all later ones are made
    trainable, and their names are printed. Parameters outside ``vision``
    (the ``fc`` head) are left untouched.

    Returns:
        The ``Net`` instance with its ``requires_grad`` flags updated.
    """
    model = Net()

    # Match the whole "layers.<n>." path segment. A bare substring test on the
    # number would also hit names such as "layer_norm2" or "fc2" (or layer 18
    # when looking for 8) and unfreeze far more than intended.
    unfreeze_marker = f"layers.{CFG.unfreeze}."

    trainable_model_weights = False
    for pn, p in model.vision.named_parameters():
        if not trainable_model_weights and unfreeze_marker in pn:
            # Start unfreezing here; the flag stays set for all later params.
            trainable_model_weights = True
        p.requires_grad = trainable_model_weights
        if trainable_model_weights:
            print(f"{pn} is set to be trainable.")

    return model

def _forward_batch(model, criterion, X, y):
    """Run one autocast forward pass and return ``(loss, outputs)``."""
    with autocast():
        outputs = model(X)
        # A target of +1 for every pair asks the loss to pull outputs toward y.
        target = torch.ones(X.size(0), device=X.device)
        loss = criterion(outputs, y, target)
    return loss, outputs


def _batch_cosine(outputs, y):
    """Return the mean cosine similarity of a batch as computed on the CPU."""
    # Outputs may be half precision under autocast; cast up so the NumPy/SciPy
    # dot products are not accumulated in float16.
    return cosine_similarity(
        outputs.detach().float().cpu().numpy(),
        y.detach().float().cpu().numpy())


def train(trn_df, val_df, model_name, input_size, batch_size, num_epochs, lr,
          init_checkpoint="clip_vit_model5.pt", start_run=5, patience=3):
    """Fine-tune the model with mixed precision and early stopping.

    Args:
        trn_df: Training DataFrame, forwarded to ``get_dataloaders``.
        val_df: Validation DataFrame, forwarded to ``get_dataloaders``.
        model_name: Accepted for interface compatibility; not read here.
        input_size: Forwarded to ``get_dataloaders``.
        batch_size: Batch size for both loaders.
        num_epochs: Maximum number of epochs.
        lr: Learning rate for AdamW.
        init_checkpoint: Path of the state dict loaded before training.
        start_run: Initial value of the run counter used in checkpoint file
            names. The counter is incremented at the start of every epoch, so
            each improving epoch writes its own file.
        patience: Number of consecutive non-improving epochs tolerated before
            training stops.

    Side effects:
        Prints per-epoch metrics and writes ``clip_vit_model{run}.pt`` and
        ``clip_vit_opt{run}.pt`` whenever validation cosine similarity
        improves.
    """
    dataloaders = get_dataloaders(trn_df, val_df, input_size, batch_size)

    net = load_pretrained_model()
    # map_location="cpu" lets a checkpoint saved on a GPU load on any machine;
    # load_state_dict copies the values into the model's own tensors.
    net.load_state_dict(torch.load(init_checkpoint, map_location="cpu"))
    net.to(device)

    # Keep `net` around for checkpointing: the compiled wrapper shares its
    # parameters but reports state_dict keys under an "_orig_mod." prefix,
    # which the plain Net.load_state_dict() call above would reject on resume.
    model = torch.compile(net)

    optimizer = optim.AdamW(
        (p for p in net.parameters() if p.requires_grad),
        lr=lr,
        # The fused implementation needs CUDA parameters.
        fused=(device.type == "cuda"))

    scaler = GradScaler()
    criterion = nn.CosineEmbeddingLoss()

    best_score = -1.0
    stale_epochs = 0
    run = start_run

    for epoch in range(num_epochs):
        run += 1
        train_meters = {'loss': AverageMeter(), 'cos': AverageMeter()}
        model.train()

        for X, y in tqdm(dataloaders['train'], leave=False):
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()

            loss, X_out = _forward_batch(model, criterion, X, y)

            # Scale the loss so small half-precision gradients do not underflow.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            train_meters['loss'].update(loss.item(), n=X.size(0))
            train_meters['cos'].update(_batch_cosine(X_out, y), n=X.size(0))

        print(f'Epoch {epoch + 1} / trn/loss={train_meters["loss"].avg:.4f}, trn/cos={train_meters["cos"].avg:.4f}')

        val_meters = {'loss': AverageMeter(), 'cos': AverageMeter()}
        model.eval()

        for X, y in tqdm(dataloaders['val'], leave=False):
            X, y = X.to(device), y.to(device)

            with torch.no_grad():
                loss, X_out = _forward_batch(model, criterion, X, y)

            val_meters['loss'].update(loss.item(), n=X.size(0))
            val_meters['cos'].update(_batch_cosine(X_out, y), n=X.size(0))

        print(f'Epoch {epoch + 1} / val/loss={val_meters["loss"].avg:.4f}, val/cos={val_meters["cos"].avg:.4f}')

        if val_meters['cos'].avg > best_score:
            best_score = val_meters['cos'].avg
            # An improvement restarts the patience window, so only consecutive
            # non-improving epochs count toward early stopping.
            stale_epochs = 0
            torch.save(net.state_dict(), f"clip_vit_model{run}.pt")
            torch.save(optimizer.state_dict(), f"clip_vit_opt{run}.pt")
        else:
            stale_epochs += 1
            if stale_epochs >= patience:
                print("Early stopping")
                break
            

# Load the image/prompt table and hold out 10% of the rows for validation;
# the split is seeded from CFG so it is reproducible.
df = pd.read_csv('filtered_image_data.csv')
trn_df, val_df = train_test_split(df, test_size=0.1, random_state=CFG.seed)

# Start fine-tuning with the settings collected in CFG.
train(
    trn_df=trn_df,
    val_df=val_df,
    model_name=CFG.model_name,
    input_size=CFG.input_size,
    batch_size=CFG.batch_size,
    num_epochs=CFG.num_epochs,
    lr=CFG.lr,
)

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


encoder.layers.18.self_attn.k_proj.weight is set to be trainable.
encoder.layers.18.self_attn.k_proj.bias is set to be trainable.
encoder.layers.18.self_attn.v_proj.weight is set to be trainable.
encoder.layers.18.self_attn.v_proj.bias is set to be trainable.
encoder.layers.18.self_attn.q_proj.weight is set to be trainable.
encoder.layers.18.self_attn.q_proj.bias is set to be trainable.
encoder.layers.18.self_attn.out_proj.weight is set to be trainable.
encoder.layers.18.self_attn.out_proj.bias is set to be trainable.
encoder.layers.18.layer_norm1.weight is set to be trainable.
encoder.layers.18.layer_norm1.bias is set to be trainable.
encoder.layers.18.mlp.fc1.weight is set to be trainable.
encoder.layers.18.mlp.fc1.bias is set to be trainable.
encoder.layers.18.mlp.fc2.weight is set to be trainable.
encoder.layers.18.mlp.fc2.bias is set to be trainable.
encoder.layers.18.layer_norm2.weight is set to be trainable.
encoder.layers.18.layer_norm2.bias is set to be trainable.
encoder.laye

  0%|          | 0/3501 [00:00<?, ?it/s]

Epoch 1 / trn/loss=0.4677, trn/cos=0.5323


  0%|          | 0/390 [00:00<?, ?it/s]

Epoch 1 / val/loss=0.4625, val/cos=0.5375


  0%|          | 0/3501 [00:00<?, ?it/s]