In [None]:
# Print the current working directory (runs `pwd` in a shell).
!pwd

/content


In [None]:
!cd /content/drive/MyDrive/unsplash-dataset/curated-data/

/content/drive/MyDrive/unsplash-dataset/curated-data


In [None]:
# Print the current working directory (runs `pwd` in a shell).
!pwd

/content/drive/MyDrive/unsplash-dataset/curated-data


In [None]:
# Clone the CLIP repository into the current working directory.
# NOTE(review): cloning by itself does not install the `clip` package or add it
# to sys.path -- confirm that an install step exists before `import clip`.
# NOTE(review): `git clone` refuses to run if a `CLIP` directory already exists,
# so re-running this cell is expected to fail -- confirm that is acceptable.
!git clone https://github.com/openai/CLIP.git

Cloning into 'CLIP'...
remote: Enumerating objects: 251, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 251 (delta 3), reused 2 (delta 0), pack-reused 243[K
Receiving objects: 100% (251/251), 8.93 MiB | 15.98 MiB/s, done.
Resolving deltas: 100% (127/127), done.


**List the Photos folder images**

In [None]:
import math
import numpy as np
import pandas as pd

In [None]:
from pathlib import Path
# Folder that holds the downloaded photos.
photos_path = Path("/content/drive/My Drive/unsplash-dataset/curated-data/photos")

# Sort the listing: glob() yields entries in arbitrary, filesystem-dependent
# order, while the batching further down slices this list by position and
# skips batches already saved on disk. A stable order keeps batch contents
# identical across kernel restarts, so resumed runs stay consistent.
photos_files = sorted(photos_path.glob("*.jpg"))

# Print some statistics
print(f"Photos found: {len(photos_files)}")

Photos found: 1000


**Load the CLIP API from OpenAI**

In [None]:
# Print the current working directory (runs `pwd` in a shell).
!pwd

/content/drive/MyDrive/unsplash-dataset/curated-data


In [None]:
import clip
import torch
from PIL import Image
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load returns the model together with the callable that is applied to
# each photo before encoding.
model, preprocess = clip.load("ViT-B/32", device=device)

#COMPUTE FEATURE VECTOR FOR THE IMAGES
def compute_clip_features(photos_batch):
    """Encode a batch of photo files into L2-normalised CLIP feature vectors.

    Args:
        photos_batch: iterable of image file paths accepted by ``Image.open``.
            Must be non-empty, because ``torch.stack`` raises on an empty list.

    Returns:
        A numpy array with one row per photo, each row divided by its own
        norm along the last dimension.

    Raises:
        Whatever ``Image.open``, ``preprocess`` or the model raise for an
        unreadable or invalid image; nothing is caught here.
    """
    photos_preprocessed = []
    for photo_file in photos_batch:
        # Open each image in a context manager and preprocess it immediately,
        # so the file handle is released before the next image is opened
        # instead of keeping the whole batch of files open at once.
        with Image.open(photo_file) as photo:
            photos_preprocessed.append(preprocess(photo))

    #batch processing
    photos_preprocessed = torch.stack(photos_preprocessed).to(device)
    with torch.no_grad():
        #encode photos for feature vectors
        photos_features = model.encode_image(photos_preprocessed)
        # Normalise each vector to unit length.
        photos_features /= photos_features.norm(dim=-1, keepdim=True)
    #feature vectors back to the CPU and convert to numpy
    return photos_features.cpu().numpy()

100%|███████████████████████████████████████| 338M/338M [00:04<00:00, 86.8MiB/s]


In [17]:
#defining preprocessing batch size
batch_size = 50

#path for output feature vectors
features_path = Path("/content/drive/My Drive/unsplash-dataset/curated-data/features")
# np.save and DataFrame.to_csv do not create missing directories; without this
# every batch would fail and only be reported as a generic problem.
features_path.mkdir(parents=True, exist_ok=True)

#number of batches
batches = math.ceil(len(photos_files) / batch_size)

#Process each batch
for i in range(batches):
    print(f"Processing batch {i+1}/{batches}")

    # Per-batch outputs: zero-padded batch index, one file for the photo ids
    # and one for the feature vectors.
    batch_ids_path = features_path / f"{i:010d}.csv"
    batch_features_path = features_path / f"{i:010d}.npy"

    # Resume support: a batch counts as done only when BOTH outputs exist.
    # Checking the .npy alone would permanently skip a batch whose id file
    # failed to write, leaving features without matching ids.
    if batch_features_path.exists() and batch_ids_path.exists():
        continue

    try:
        batch_files = photos_files[i*batch_size : (i+1)*batch_size]
        #compute features
        batch_features = compute_clip_features(batch_files)
        np.save(batch_features_path, batch_features)

        #photos id csv for backup
        photo_ids = [photo_file.name.split(".")[0] for photo_file in batch_files]
        photo_ids_data = pd.DataFrame(photo_ids, columns=['photo_id'])
        photo_ids_data.to_csv(batch_ids_path, index=False)
    except Exception as error:
        # Keep going with the remaining batches, but report what went wrong.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit still stop the run.
        print(f'Problem with batch {i}: {error!r}')

Processing batch 1/20
Processing batch 2/20
Processing batch 3/20
Processing batch 4/20
Processing batch 5/20
Processing batch 6/20
Processing batch 7/20
Processing batch 8/20
Processing batch 9/20
Processing batch 10/20
Processing batch 11/20
Processing batch 12/20
Processing batch 13/20
Processing batch 14/20
Processing batch 15/20
Processing batch 16/20
Processing batch 17/20
Processing batch 18/20
Processing batch 19/20
Processing batch 20/20


In [19]:
#load feature vector files
# Only the per-batch files (all-digit stems such as 0000000003) are merged.
# The merged outputs below are written into this same directory, so a plain
# "*.npy" / "*.csv" glob would pick them up again when this cell is re-run and
# duplicate every row.
batch_feature_files = sorted(
    features_file
    for features_file in features_path.glob("*.npy")
    if features_file.stem.isdigit()
)
features_list = [np.load(features_file) for features_file in batch_feature_files]

# Concatenate the features into one array
features = np.concatenate(features_list)

# Load all the photo IDs, in the same batch order as the features
batch_ids_files = sorted(
    ids_file
    for ids_file in features_path.glob("*.csv")
    if ids_file.stem.isdigit()
)
photo_ids = pd.concat([pd.read_csv(ids_file) for ids_file in batch_ids_files])

# Row k of `features` must describe row k of `photo_ids`; fail loudly instead
# of saving merged files that are out of step with each other.
assert len(features) == len(photo_ids), (
    f"{len(features)} feature rows but {len(photo_ids)} photo ids"
)

# Store the merged files
np.save(features_path / "features.npy", features)
photo_ids.to_csv(features_path / "photo_ids.csv", index=False)
print(features_path)

/content/drive/My Drive/unsplash-dataset/curated-data/features
