In [32]:
from pathlib import Path
import pandas as pd

# Read the photos table
photos = pd.read_csv(os.path.join(Path('unsplash-dataset/full'), "photos.tsv000"), sep='\t', header=0)

# Print some stats
print(f'Photos in the dataset: {photos.shape[0]}')

Photos in the dataset: 1981321


In [33]:
from urllib.request import urlretrieve

donwload_path = Path('unsplash-dataset/photos')
photo_urls = photos[['photo_id', 'photo_image_url']].values.tolist()

def download_photo(photo):
    photo_id = photo[0]
    photo_url = photo[1] + "?w=640"
    photo_path = donwload_path / (photo_id + ".jpg")

    if not photo_path.exists():
        try:
            urlretrieve(photo_url, photo_path)
        except:
            pass

In [35]:
from multiprocessing.pool import ThreadPool

# Parallelize the download using a thread pool
pool = ThreadPool(128)
photos = pool.map(download_photo, photo_urls)

# Display some statistics
display(f'Photos downloaded: {len(photos)}')

'Photos downloaded: 1981321'

In [51]:
import clip
import torch
from PIL import Image

# Load the open CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

def compute_clip_vectors(files):
    photos = [Image.open(file) for file in files]
    
    with torch.no_grad():
        photos_preprocessed = torch.stack([preprocess(photo) for photo in photos]).to(device)

        photos_encoded = model.encode_image(photos_preprocessed)
        photos_encoded /= photos_encoded.norm(dim=-1, keepdim=True)

    return photos_encoded.cpu().numpy()

In [45]:
import glob
photos_files = glob.glob(str(donwload_path / "*.jpg"))

1


In [149]:
import os
import pandas as pd

batch_size = 1024
batches = math.ceil(len(photos_files) / batch_size)

vectors_path = Path('unsplash-dataset/vectors')

for i in range(batches):

    i = 1289
    batch_path = vectors_path / f"{i:08d}.pickle"
    
    if not batch_path.exists():
        try:
            batch_files = photos_files[i*batch_size : (i+1)*batch_size]
            batch_vectors = compute_clip_vectors(batch_files)

            batch_data = pd.DataFrame(batch_vectors, columns=[f"{x:03d}" for x in range(512)])
            batch_data['photo_id'] = [os.path.basename(file).split(".")[0] for file in batch_files]

            batch_data.to_pickle(batch_path)
        except:
            print(f'Problem with batch {i}')

    break

batch_data

Unnamed: 0,000,001,002,003,004,005,006,007,008,009,...,503,504,505,506,507,508,509,510,511,photo_id
0,-0.026382,-0.001537,-0.039764,-0.007259,-0.034943,-0.056519,0.006962,0.003796,0.066467,0.034515,...,0.020462,-0.016800,-0.054810,0.040375,-0.038300,0.003374,0.012909,0.019714,-0.013550,X-4y73KHCrQ
1,0.009781,-0.005615,0.044586,0.009995,0.027634,0.008247,0.040741,0.026642,0.040436,0.003319,...,-0.013969,-0.003824,-0.028610,0.019501,0.007492,0.045776,0.017715,0.014061,-0.005169,IXINb0iDn8U
2,-0.015793,0.014610,-0.037201,-0.013939,0.008636,-0.062988,-0.004726,-0.008469,0.070007,0.064148,...,0.018646,0.040344,0.015175,-0.027222,-0.029053,-0.017792,0.011330,-0.010277,0.004509,AXl0P6vmEJ4
3,-0.012199,0.018448,-0.003237,-0.026581,0.024857,-0.003727,0.051208,0.056091,-0.031204,0.025009,...,0.014008,0.080872,-0.034821,-0.023117,-0.017075,-0.028076,0.091553,-0.032379,0.021896,RytfSySq40k
4,0.020020,-0.006714,0.014793,-0.048279,0.024612,-0.029602,0.041229,0.045593,-0.026093,0.007683,...,0.025146,0.080627,-0.033447,-0.008224,-0.016174,0.000379,0.022095,0.026047,-0.017685,wt9KLJxzer0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019,-0.016983,0.026688,-0.041168,-0.009697,-0.023239,-0.024658,-0.003515,0.041718,-0.054871,0.045563,...,-0.022278,0.061890,0.017349,0.021912,-0.015083,0.012863,-0.016159,-0.018372,-0.011635,60QBdU8ezB8
1020,-0.011566,0.078064,-0.001796,-0.006283,0.058411,-0.011925,0.062866,0.061005,-0.025299,0.019272,...,0.009819,0.044434,-0.017120,-0.021774,0.009285,-0.014893,0.045410,0.041992,0.007767,iyqj9IKpQK0
1021,0.004204,0.027954,-0.006706,0.024338,0.039307,-0.015533,-0.025452,0.025574,0.055237,0.006435,...,0.027420,-0.017273,-0.005264,0.022263,0.037018,-0.034241,0.004894,-0.026505,0.026657,hSHJYwA5QzM
1022,-0.013092,0.033997,0.016418,0.000260,0.025070,0.001532,-0.002144,0.058105,-0.051483,0.023911,...,0.018539,0.052246,-0.010925,-0.000822,0.015320,-0.029312,0.026566,0.001087,-0.013748,peFHn8r4t1o


In [156]:
vec_files = sorted(glob.glob(str(vectors_path / '*.pickle')))

data = pd.concat([pd.read_pickle(f) for f in vec_files])

In [166]:
keys = list(data['photo_id'])
features = data[[f"{x:03d}" for x in range(512)]].to_numpy()


In [167]:
print(len(keys), features.shape, features.dtype)

1981161 (1981161, 512) float16


In [169]:
data['photo_id'].to_csv('unsplash-dataset/clip/keys.csv', index=False)

In [170]:
import numpy as np
np.save('unsplash-dataset/clip/features.npy', features)

NameError: name 'np' is not defined