```
Required python dependencies

transformers==4.33.3
timm==0.9.7
cassio>=0.1.3
datasets==2.13.1
gradio==3.36.1
jupyter>=1.0.0
numpy==1.24.4
panns_inference==0.1.1
python-dotenv==1.0.0
scipy>=1.10
torch==2.0.1
torchvision==0.15.2
```

In [2]:
DATASET_TARGET_SIZES = [100, 500, 1000, 2000, 5000]

In [3]:
from PIL import Image
import torch
import torchvision
import torchvision.transforms as T
from torchvision.transforms import (
    Compose,
    Resize,
    CenterCrop,
    ToTensor,
    Normalize
)
import numpy as np
from transformers import pipeline

from tqdm.auto import tqdm

In [4]:
preprocess = Compose([
    Resize(256),
    CenterCrop(224),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [5]:
DATA_DIRECTORY = "data"

datasets = {
    "CIFAR10": torchvision.datasets.CIFAR10(
        DATA_DIRECTORY,
        transform=preprocess,
        download=True,
    ),
}

Files already downloaded and verified


In [6]:
labels_dict = {
    0: "airplane",
    1: "automobile",
    2: "bird",
    3: "cat",
    4: "deer",
    5: "dog",
    6: "frog",
    7: "horse",
    8: "ship",
    9: "truck",
}

In [7]:
img_per_class = (1 + max(DATASET_TARGET_SIZES)) // len(labels_dict)
n_classes = len(labels_dict)
sel_img_map = {i: [] for i in range(n_classes)}
labels_map = {i: [] for i in range(n_classes)}
sel_img_arr_map = {i: [] for i in range(n_classes)}

# Iterate through each class and select a number of images
for (img, cl_label), img_arr in zip(datasets["CIFAR10"], datasets["CIFAR10"].data):

    # are all classes completed?
    if all(len(ims) >= img_per_class for ims in sel_img_map.values()):
        break

    if len(sel_img_map[cl_label]) < img_per_class:
        # add this one
        sel_img_map[cl_label].append(img)
        labels_map[cl_label].append(cl_label)
        sel_img_arr_map[cl_label].append(img_arr)

selected_images = []
labels = []
selected_img_array = []
for cl_label in range(n_classes):
    selected_images += sel_img_map[cl_label]
    labels += labels_map[cl_label]
    selected_img_array += sel_img_arr_map[cl_label]
print("Total selected images:", len(selected_images))

Total selected images: 5000


In [None]:
GPU_AVAILABLE = torch.cuda.device_count() > 0

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torchvision.models.squeezenet1_1(
    weights=torchvision.models.SqueezeNet1_1_Weights.IMAGENET1K_V1
).to(device).eval()

if GPU_AVAILABLE:
    print("Loaded the image embedding model on the GPU.")
else:
    print("Loaded the image embedding model on the CPU.")

In [9]:
def get_vector_metadata(label_indices, class_list):
    """Return list of {"label": <class name>}."""
    return [{"label": class_list[index]} for index in label_indices]

# Creating Vector IDs
# Each vector ID will have a prefix corresponding to CIFAR10
def get_vector_ids(batch_number, batch_size, prefix):
    """Return vector ids."""
    start_index = batch_number
    end_index = start_index + batch_size
    ids = np.arange(start_index, end_index)

    # create id based on prefix
    # eg. if id == 5, prefix == "CIFAR10", then create "CIFAR10.5" as vector id.
    ids_with_prefix = [f"{prefix}.{str(x)}" for x in ids]
    return ids_with_prefix


def get_vectors_from_batch(data_processed, label_indices,batch_number, dataset):
    """Return list of tuples like (vector_id, vector_values, vector_metadata)."""
    num_records = len(data_processed)
    prefix = dataset.__class__.__name__
    with torch.no_grad():
        # generate image embeddings with PyTorch model
        vector_values = model(data_processed).tolist()
    # return respective IDs/metadata for each image embedding
    vector_metadata = get_vector_metadata(label_indices, dataset.classes)
    vector_ids = get_vector_ids(batch_number, num_records, prefix)
    return list(zip(vector_ids, vector_values, vector_metadata))

dataset = datasets["CIFAR10"]
# Move the data to the respective device
preprocessed_data = torch.stack(selected_images).to(device)

In [10]:
# further labeling
tr = T.ToPILImage()
lab_model=pipeline("image-classification")

def label_img(img_data):
    res = lab_model(tr(img_data))
    if res:
        pt = res[0]['label'].split(',')[0]
        if pt:
            return pt
        else:
            return None
    else:
        return None

No model was supplied, defaulted to google/vit-base-patch16-224 and revision 5dca96d (https://huggingface.co/google/vit-base-patch16-224).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [11]:
BATCH_SIZE = 100
SAMPLES_TO_PROCESS = len(selected_images)

full_items = []

for i in tqdm(range(0, SAMPLES_TO_PROCESS, BATCH_SIZE)):
    # Find end of batch
    i_end = min(i + BATCH_SIZE, SAMPLES_TO_PROCESS)
    # Generate embeddings for all the images in the batch
    # with the corresponding vector id and metadata lists
    batch_vectors = get_vectors_from_batch(
        preprocessed_data[i:i_end],
        labels[i:i_end],
        i,
        dataset,
    )

    for deltai, (vector_id, embedding, metadata) in enumerate(batch_vectors):
        img_label = label_img(selected_img_array[i+deltai])
        full_items.append(
            {
                'id': vector_id,
                'embedding': embedding,
                'label': metadata['label'],
                'metadata': {'content': img_label},
            }
        )

  0%|          | 0/50 [00:00<?, ?it/s]

In [12]:
def _permute(lst, n):
    return list(np.random.permutation(lst)[:n])

In [13]:
# permutations and save all at once
all_datasets = {
    size: _permute(full_items, size)
    for size in DATASET_TARGET_SIZES
}

In [14]:
import json

def make_json(itm):
    return itm

def _make_csv_em(v):
    return '"'+str(v)+'"'

def _make_csv_md(md):
    return '"'+json.dumps(md, separators=[',', ':']).replace('"', '\\"')+'"'

def make_csv_dict(itm):
    return {
        'id': itm['id'],
        'embedding': _make_csv_em(itm['embedding']),
        'label': itm['label'],
        'metadata': _make_csv_md(itm['metadata']),
    }

#### manual hacking of the CSV output for dsbulk:

In [15]:
md = {'k1': 'v1', 'k2': 'v2'}

print(_make_csv_md(md))

"{\"k1\":\"v1\",\"k2\":\"v2\"}"


`"{\"k1\":\"v1\",\"k2\":\"v2\"}"`

In [16]:
v = [0.1, 0.3333333333333339, 0.3]
print(_make_csv_em(v))

"[0.1, 0.3333333333333339, 0.3]"


#### manual hacking of the JSON output to keep the numbers short for dsbulk:

In [None]:
## inspired by:
## https://stackoverflow.com/questions/54370322/how-to-limit-the-number-of-float-digits-jsonencoder-produces

orig_dict = {'a': 'letter', 'v': [2.5328171253204346, 4.954568386077881]}

print(json.dumps(orig_dict, indent=4, default=json_handler))

In [None]:
class RoundingFloat(float):
    __repr__ = staticmethod(lambda x: format(x, '.5f'))

json.encoder.c_make_encoder = None
if hasattr(json.encoder, 'FLOAT_REPR'):
    # Python 2
    json.encoder.FLOAT_REPR = RoundingFloat.__repr__
else:
    # Python 3
    json.encoder.float = RoundingFloat

In [None]:
print(json.dumps(orig_dict, indent=4, default=json_handler))

## Hackings completed, we can print

In [17]:
import csv

In [50]:
csv.register_dialect('dsbulk', escapechar='\\')

for size, items in all_datasets.items():
    fname = 'cifar10-data-%s.csv' % size
    with open(fname, 'w', newline='') as csvfile:
        fieldnames = ['id', 'embedding', 'label', 'metadata']
        csvfile.write(f"{','.join(fieldnames)}\n")
        # writer = csv.DictWriter(csvfile, dialect='dsbulk', fieldnames=fieldnames)
    
        # writer.writeheader()
        for itm in items:
            citm = make_csv_dict(itm)
            csvfile.write(f"{citm['id']},{citm['embedding']},{citm['label']},{citm['metadata']}\n")

for size, items in all_datasets.items():
    fname = 'cifar10-data-%s-no-embedding.csv' % size
    with open(fname, 'w', newline='') as csvfile:
        fieldnames = ['id', 'label', 'metadata']
        csvfile.write(f"{','.join(fieldnames)}\n")
        # writer = csv.DictWriter(csvfile, dialect='dsbulk', fieldnames=fieldnames)
    
        # writer.writeheader()
        for itm in items:
            citm = make_csv_dict(itm)
            csvfile.write(f"{citm['id']},{citm['label']},{citm['metadata']}\n")

In [19]:
import json

In [51]:
for size, items in all_datasets.items():
    fname = 'cifar10-data-%s.json' % size
    with open(fname, 'w') as jsonfile:
        json.dump(items, jsonfile, indent=1)

for size, items in all_datasets.items():
    fname = 'cifar10-data-%s-no-embedding.json' % size
    with open(fname, 'w') as jsonfile:
        no_e_items = [
            {
                k: v
                for k, v in itm.items()
                if k != 'embedding'
            }
            for itm in items
        ]
        json.dump(no_e_items, jsonfile, indent=1)