In [1]:
!git clone https://github.com/openai/CLIP.git
# CLIP: Contrastive Language-Image Pre-Training
# Learning Transferable Visual Models From Natural Language Supervision (paper)

!git clone https://github.com/CompVis/taming-transformers.git
# Taming Transformers for High-Resolutions Image Synthesis (paper)

Cloning into 'CLIP'...
remote: Enumerating objects: 256, done.[K
remote: Counting objects: 100% (154/154), done.[K
remote: Compressing objects: 100% (44/44), done.[K
remote: Total 256 (delta 126), reused 110 (delta 110), pack-reused 102 (from 1)[K
Receiving objects: 100% (256/256), 8.86 MiB | 14.14 MiB/s, done.
Resolving deltas: 100% (140/140), done.
Cloning into 'taming-transformers'...
remote: Enumerating objects: 1342, done.[K
remote: Counting objects: 100% (1/1), done.[K
remote: Total 1342 (delta 0), reused 0 (delta 0), pack-reused 1341 (from 2)[K
Receiving objects: 100% (1342/1342), 409.77 MiB | 23.27 MiB/s, done.
Resolving deltas: 100% (282/282), done.


In [2]:
# install some extra libraries

!pip install --no-deps ftfy regex tqdm  # install text fixing, regex, and progress bar libraries without dependencies
!pip install omegaconf==2.0.0 pytorch-lightning==1.0.8  # install specific versions for configuration and training framework
!pip uninstall torchtext --yes  # remove torchtext as it may conflict or is not needed
!pip install einop  # install einop, a lightweight wrapper for Einstein summation operations

Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1
Collecting omegaconf==2.0.0
  Downloading omegaconf-2.0.0-py3-none-any.whl.metadata (3.5 kB)
Collecting pytorch-lightning==1.0.8
  Downloading pytorch_lightning-1.0.8-py3-none-any.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3->pytorch-lightning==1.0.8)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3->pytorch-lightning==1.0.8)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1

In [4]:
# import libraries

import numpy as np  # numerical operations and arrays
import torch, os, imageio, pdb, math  # torch for deep learning, os for file paths, imageio for reading/writing images, pdb for debugging, math for mathematical functions
import torchvision  # PyTorch vision utilities
import torchvision.transforms as T  # image transformations
import torchvision.transforms.functional as TF  # functional image transformations

import PIL  # Python Imaging Library for handling images
import matplotlib.pyplot as plt  # plotting and visualisation

import yaml  # for reading YAML configuration files
from omegaconf import OmegaConf  # structured configuration management

from CLIP import clip  # import CLIP model (Contrastive Language–Image Pretraining)

# import warnings
# warnings.filterwarnings('ignore')  # (optional) suppress warnings


In [5]:
# helper functions

def show_from_tensor(tensor):
  """Display an image tensor with matplotlib.

  Args:
    tensor: 3-D torch tensor laid out channels-first (it is transposed with
      (1, 2, 0) into H x W x C before plotting). Values are expected in
      [0, 1]; anything outside that range is clamped to it.

  Returns:
    None. The image is rendered through `plt.imshow` / `plt.show`.
  """
  # Clamp before scaling: casting values outside [0, 255] to uint8 overflows
  # and produces wrong pixels instead of saturating. clamp/mul return new
  # tensors, so the caller's tensor is never modified.
  img = tensor.clamp(0, 1).mul(255).byte()  # scale [0,1] -> [0,255] and convert to byte type
  img = img.cpu().numpy().transpose((1, 2, 0))  # move to CPU and change to H x W x C format for plotting

  plt.figure(figsize=(10, 7))  # set figure size
  plt.axis('off')  # remove axes for clean display
  plt.imshow(img)  # show image
  plt.show()

def norm_data(data):
  """Map `data` into [0, 1]: limit it to [-1, 1], then shift and halve."""
  bounded = data.clip(-1, 1)  # anything outside [-1, 1] is pinned to the nearest bound
  shifted = bounded + 1  # now in [0, 2]
  return shifted / 2

# parameters

# --- optimisation settings ---
learning_rate = 0.5   # learning rate for the optimiser
wd = 0.1              # weight decay (regularisation parameter)
batch_size = 1        # number of samples per training iteration
total_iter = 100      # total number of optimisation steps

# --- image settings ---
noise_factor = 0.1    # scale of noise added to the image
size1, size2, channels = 225, 400, 3   # height, width, channels
im_shape = [size1, size2, channels]    # the same dimensions kept together as a list


In [6]:
### clip model ###

# Choose the device before loading so the model is placed on it explicitly,
# instead of relying on clip.load's implicit default and defining `device`
# only afterwards.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # choose GPU if available, else CPU

# Load the ViT-B/32 variant with jit=False; the second return value is discarded.
clipmodel, _ = clip.load("ViT-B/32", device=device, jit=False)
clipmodel.eval()  # set the model to evaluation mode
print(clip.available_models())  # print all available CLIP model variants

print("CLIP model visual input resolution: ", clipmodel.visual.input_resolution)  # display expected image resolution

torch.cuda.empty_cache()  # clear unused GPU memory

100%|███████████████████████████████████████| 338M/338M [00:22<00:00, 15.9MiB/s]


['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']
Clip model visual input resolution:  224
