In [3]:
%%capture
!pip install git+https://github.com/openai/CLIP.git
!pip install loraclip

In [4]:
import argparse
import torch
import random
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T
import torch.nn.functional as F
import clip
import loraclip

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

In [7]:
def load_loraclip(args):
  model, preprocess = loraclip.load(args.clip_model_name, device=args.device, r=args.lora_rank, lora_mode=args.lora_mode)
  loraclip.print_trainable_parameters(model)
  return model, preprocess


def setup_args():
  parser = argparse.ArgumentParser()
  parser.add_argument("--clip-model-name", type=str, default="ViT-B/16")
  parser.add_argument("--device", type=str, default="mps")
  parser.add_argument("--lora-rank", type=int, default=8)
  parser.add_argument("--lora-mode", type=str, default="vision", choices=["vision", "text", "vision+text"])
  args, unknown = parser.parse_known_args()
  return args

In [8]:
args = setup_args()
clip_model, clip_preprocess = load_loraclip(args)

Model loaded
Unexpected keys: ['lora_text_projection']
 
trainable params: 1397248 || all params: 150472705 || trainable%: 0.9285723945748168


In [9]:
for name, parameter in clip_model.named_parameters():
    print(f"{name}: requires_grad = {parameter.requires_grad}")

positional_embedding: requires_grad = False
logit_scale: requires_grad = False
text_projection: requires_grad = False
visual.class_embedding: requires_grad = True
visual.positional_embedding: requires_grad = True
visual.proj: requires_grad = True
visual.conv1.weight: requires_grad = False
visual.ln_pre.weight: requires_grad = False
visual.ln_pre.bias: requires_grad = False
visual.transformer.resblocks.0.attn.in_proj_weight: requires_grad = False
visual.transformer.resblocks.0.attn.in_proj_weight_lora_A: requires_grad = True
visual.transformer.resblocks.0.attn.in_proj_weight_lora_B: requires_grad = True
visual.transformer.resblocks.0.attn.in_proj_bias: requires_grad = False
visual.transformer.resblocks.0.attn.out_proj.weight: requires_grad = False
visual.transformer.resblocks.0.attn.out_proj.bias: requires_grad = False
visual.transformer.resblocks.0.attn.out_proj.lora_A: requires_grad = True
visual.transformer.resblocks.0.attn.out_proj.lora_B: requires_grad = True
visual.transformer.res

In [6]:
trainable_params = sum(p.numel() for p in lora_clip.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in lora_clip.parameters())
print(f"Trainable Parameters: {trainable_params}")
print(f"Total Parameters: {total_params}")

Trainable Parameters: 1316864
Total Parameters: 150675969
