In [None]:
# Print the version of the `python3` executable found on PATH.
# NOTE(review): this is not necessarily the interpreter backing the kernel — confirm.
!python3 --version

In [None]:
# Print the version (and location) of the pip module bound to `python3` on PATH.
!python3 -m pip --version

In [None]:
!python3 -m pip install -r requirements/dev.txt

In [None]:
!ls data/UGallery -sh

# Data processing procedure

In [None]:
# Load the IPython autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [None]:
import json
from collections import defaultdict
from math import ceil
from os import cpu_count
from os.path import join

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

from utils.ugallery.data import load_embedding, load_embedding_legacy, concatenate_embedding
from utils.ugallery.entities import Inventory, User
from utils.ugallery.hashing import HashesContainer
from utils.ugallery.sampling import pre_hash, strategy_1, strategy_2, strategy_3, strategy_4, strategy_5, strategy_6

## Creating visual clusters

In [None]:
# Notebook-wide configuration, filled in stage by stage.
SETTINGS = {}

# Clustering: number of KMeans fits compared by Silhouette score, and the
# `n_init` / `n_jobs` values forwarded to each KMeans instance.
SETTINGS["clustering:n_times"] = 20
SETTINGS["clustering:n_init"] = 8
SETTINGS["clustering:n_jobs"] = cpu_count()

# Data sources: True selects the public files, False the private (legacy) ones.
# "embeddings:public" also decides the suffix of the output file names.
SETTINGS["embeddings:public"] = True
SETTINGS["inventory:public"] = True

# Output: when True, artwork ids are exported as embedding indexes.
SETTINGS["output:index_mode"] = False

In [None]:
# Locations of the embedding files: the public file lives directly in the
# dataset folder, the legacy (private) files in its "private" subfolder.
_ugallery_dir = join("data", "UGallery")
_private_dir = join(_ugallery_dir, "private")

resnet50_embedding_path = join(_ugallery_dir, "ugallery_resnet50_embeddings.npy")

# Legacy embeddings come as a features file plus a separate ids file
resnet50_legacy_embedding_path = join(_private_dir, "flatten_1.npy")
resnet50_legacy_ids_path = join(_private_dir, "ids")
resnext101_legacy_embedding_path = join(_private_dir, "features.npy")
resnext101_legacy_ids_path = join(_private_dir, "ids.npy")

In [None]:
# Select which embedding files to load based on the "embeddings:public" setting.
# The private variant loads two legacy embeddings (features file + ids file each).
if not SETTINGS["embeddings:public"]:
    EMBEDDINGS = {
        "ResNet50 (private)": load_embedding_legacy(
            resnet50_legacy_embedding_path,
            resnet50_legacy_ids_path,
        ),
        "ResNeXt-101 (private)": load_embedding_legacy(
            resnext101_legacy_embedding_path,
            resnext101_legacy_ids_path,
        ),
    }
else:
    EMBEDDINGS = {"ResNet50 (public)": load_embedding(resnet50_embedding_path)}

In [None]:
# Report the shape of every loaded embedding before merging them
for name in EMBEDDINGS:
    loaded = EMBEDDINGS[name]
    print(f"{name} embedding shape: {loaded.features.shape}")

# Combine all loaded embeddings into the single `embedding` used from here on
n_embeddings = len(EMBEDDINGS)
print(f"Merge {n_embeddings} embeddings into one...")
embedding = concatenate_embedding(EMBEDDINGS)
merged_shape = embedding.features.shape
print(f"Merged embedding shape: {merged_shape}")

In [None]:
# 0. z-score normalization of embedding
# NOTE(review): this overwrites `embedding.features` in place, so re-running the
# cell normalizes already-normalized data.
scaler = StandardScaler()
normalized_features = scaler.fit_transform(embedding.features)
embedding.features = normalized_features
print(f"z-score normalization shape: {embedding.features.shape}")

In [None]:
# 1. Conduct PCA to reduce dimension (keeps 200 components)
# NOTE(review): like the normalization step, this replaces `embedding.features`
# in place; run the cells once, in order.
pca = PCA(n_components=200)
reduced_features = pca.fit_transform(embedding.features)
embedding.features = reduced_features
print(f"PCA reduction shape: {embedding.features.shape}")

In [None]:
# 2. Perform k-means clustering with 100 clusters SETTINGS["clustering:n_times"]
# times and keep the clusterer with the highest Silhouette coefficient
kmeans_params = {
    "n_clusters": 100,
    "max_iter": 2000,
    "n_init": SETTINGS["clustering:n_init"],
}
# `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0, where passing
# it raises a TypeError. Forward it only when the installed KMeans still exposes
# the parameter, so the cell runs on both old and current versions.
if "n_jobs" in KMeans().get_params():
    kmeans_params["n_jobs"] = SETTINGS["clustering:n_jobs"]

best_score = float("-inf")
best_clusterer = None

for i in range(SETTINGS["clustering:n_times"]):
    clusterer = KMeans(**kmeans_params).fit(embedding.features)
    score = silhouette_score(embedding.features, clusterer.labels_)
    if score > best_score:
        # Keep the fitted estimator itself: its labels are used for the visual clusters
        best_clusterer = clusterer
        best_score = score
        print(f"Silhouette score ({i + 1}): {score} - New highest!")
    else:
        print(f"Silhouette score ({i + 1}): {score}")

print(f">> Best Silhouette score: {best_score}")

In [None]:
# 3. Label each image with its respective visual cluster
# Row `position` of the clustered features belongs to the artwork
# `embedding.index2id[position]`; build the lookup in both directions.
cluster_labels = best_clusterer.labels_
id2cluster = {
    embedding.index2id[position]: cluster_label
    for position, cluster_label in enumerate(cluster_labels)
}
cluster2id = defaultdict(list)
for position, cluster_label in enumerate(cluster_labels):
    cluster2id[cluster_label].append(embedding.index2id[position])

# Count the clusters that actually received at least one artwork
distinct_clusters = set(id2cluster.values())
n_clusters = len(distinct_clusters)
print(f"There are n_clusters: {n_clusters}")

## Sampling triples

In [None]:
# Load the inventory and purchases tables; the private files use the legacy
# format and are therefore loaded with `legacy=True`.
if not SETTINGS["inventory:public"]:
    inventory_path = join("data", "UGallery", "private", "valid_artworks.csv")
    purchases_path = join("data", "UGallery", "private", "valid_sales.csv")
    inventory = Inventory(inventory_path, purchases_path, legacy=True)
else:
    inventory_path = join("data", "UGallery", "ugallery_inventory.csv")
    purchases_path = join("data", "UGallery", "ugallery_purchases.csv")
    inventory = Inventory(inventory_path, purchases_path)

In [None]:
# Sampling budget. The validation set is a fixed fraction of the training set;
# `round` keeps the total an integer count instead of a float.
TOTAL_SAMPLES_TRAIN = 10_000_000
VALIDATION_RATIO = 0.05
TOTAL_SAMPLES_VALID = round(TOTAL_SAMPLES_TRAIN * VALIDATION_RATIO)

# The budget is split evenly across the sampling strategies, rounding up so the
# per-strategy counts add up to at least the requested totals.
N_STRATEGIES = 6
N_SAMPLES_PER_STRATEGY_TRAIN = ceil(TOTAL_SAMPLES_TRAIN / N_STRATEGIES)
N_SAMPLES_PER_STRATEGY_VALID = ceil(TOTAL_SAMPLES_VALID / N_STRATEGIES)

In [None]:
# Artwork -> artist lookup, read from the inventory table columns
artworks_table = inventory.inventory
id2artist = {
    artwork: artist
    for artwork, artist in zip(artworks_table["artwork_id"], artworks_table["artist_id"])
}

# Reverse lookup: artist -> list of their artworks
artist2id = defaultdict(list)
for artwork_id, artist_id in id2artist.items():
    artist2id[artist_id].append(artwork_id)

In [None]:
# Build the per-user data on `inventory` from the cluster and artist lookups.
# NOTE(review): presumably this populates `inventory.users`, which is read when
# exporting the evaluation baskets — confirm in utils.ugallery.entities.
inventory.build_users(id2cluster, id2artist)

In [None]:
# Single container handed to every sampling strategy below; its `collisions`
# counter is printed once all strategies have run.
hashes_container = HashesContainer()

### 1) Predicting missing item in purchase basket

In [None]:
# Strategy 1: draw training triples first, then validation triples. Both calls
# receive the same shared `hashes_container`.
s1_args = (inventory, hashes_container, id2cluster, id2artist)
s1_train = strategy_1(N_SAMPLES_PER_STRATEGY_TRAIN, *s1_args)
s1_validation = strategy_1(N_SAMPLES_PER_STRATEGY_VALID, *s1_args)

### 2) Predicting next purchase basket

In [None]:
# Strategy 2: draw training triples first, then validation triples. Both calls
# receive the same shared `hashes_container`.
s2_args = (inventory, hashes_container, id2cluster, id2artist)
s2_train = strategy_2(N_SAMPLES_PER_STRATEGY_TRAIN, *s2_args)
s2_validation = strategy_2(N_SAMPLES_PER_STRATEGY_VALID, *s2_args)

### 3) Recommending visually similar artworks from favorite artists

In [None]:
# Strategy 3: needs the lookups in both directions (artwork <-> cluster and
# artwork <-> artist). Training triples are drawn before validation triples.
s3_args = (
    inventory, hashes_container,
    id2cluster, id2artist, cluster2id, artist2id,
)
s3_train = strategy_3(N_SAMPLES_PER_STRATEGY_TRAIN, *s3_args)
s3_validation = strategy_3(N_SAMPLES_PER_STRATEGY_VALID, *s3_args)

### 4) Recommending profile items from the same user profile

In [None]:
# Strategy 4: only the inventory and the shared hash container are required.
# Training triples are drawn before validation triples.
s4_args = (inventory, hashes_container)
s4_train = strategy_4(N_SAMPLES_PER_STRATEGY_TRAIN, *s4_args)
s4_validation = strategy_4(N_SAMPLES_PER_STRATEGY_VALID, *s4_args)

### 5) Recommending profile items given an artificially created user profile

In [None]:
# Strategy 5: only the inventory and the shared hash container are required.
# Training triples are drawn before validation triples.
s5_args = (inventory, hashes_container)
s5_train = strategy_5(N_SAMPLES_PER_STRATEGY_TRAIN, *s5_args)
s5_validation = strategy_5(N_SAMPLES_PER_STRATEGY_VALID, *s5_args)

### 6) Artificial profile with a single item: recommend visually similar items from the same artist

In [None]:
# Strategy 6: uses the artist lookups in both directions.
# Training triples are drawn before validation triples.
s6_args = (inventory, hashes_container, id2artist, artist2id)
s6_train = strategy_6(N_SAMPLES_PER_STRATEGY_TRAIN, *s6_args)
s6_validation = strategy_6(N_SAMPLES_PER_STRATEGY_VALID, *s6_args)

In [None]:
# Report the collision counter accumulated by the shared container during sampling.
# NOTE(review): presumably counts candidate triples rejected as duplicates —
# confirm in utils.ugallery.hashing.
print(f"Total collisions: {hashes_container.collisions}")

## Store data

Tuples will be stored as artwork indexes in the embedding (when `SETTINGS["output:index_mode"]` is enabled) instead of using the hashes, to improve performance and memory usage.

### Training data

In [None]:
# Gather the training samples of every strategy, in strategy order
training_samples_by_strategy = (
    s1_train,
    s2_train,
    s3_train,
    s4_train,
    s5_train,
    s6_train,
)
# Flatten them into a single list of (profile, positive, negative) tuples.
# The samples still hold artwork ids at this point.
TRAINING_DATA = []
for strategy_samples in training_samples_by_strategy:
    TRAINING_DATA.extend(
        (sample[0], sample[1], sample[2])
        for sample in strategy_samples
    )
print(f"There are {len(TRAINING_DATA)} training samples")

In [None]:
# Search for duplicated hashes by enrolling every training triple in a fresh container
training_hash_check = HashesContainer()
for triple in TRAINING_DATA:
    # The enrollment is a side effect, so it must not live inside an `assert`
    # statement: asserts are stripped entirely when Python runs with -O.
    is_new_hash = training_hash_check.enroll(pre_hash(triple))
    if not is_new_hash:
        raise AssertionError(f"Duplicated hash found in training data: {triple!r}")
print("No duplicated hashes found")

In [None]:
# In index mode, replace every artwork id by its position in the embedding
if SETTINGS["output:index_mode"]:
    indexed_training_data = []
    for sample in TRAINING_DATA:
        profile_indexes = [embedding.id2index[artwork_id] for artwork_id in sample[0]]
        positive_index = embedding.id2index[sample[1]]
        negative_index = embedding.id2index[sample[2]]
        indexed_training_data.append((profile_indexes, positive_index, negative_index))
    TRAINING_DATA = indexed_training_data

# One row per triple: user profile, positive item, negative item
print("Creating training output DataFrame")
df_train = pd.DataFrame(TRAINING_DATA, columns=["profile", "pi", "ni"])
df_train.head()

In [None]:
# The file name records which embeddings (public or private) produced the samples
train_filename = "train_public.csv" if SETTINGS["embeddings:public"] else "train_private.csv"
output_train = join("data", "UGallery", train_filename)
df_train.to_csv(output_train, index=False)

### Validation data

In [None]:
# Gather the validation samples of every strategy, in strategy order
validation_samples_by_strategy = (
    s1_validation,
    s2_validation,
    s3_validation,
    s4_validation,
    s5_validation,
    s6_validation,
)
# Flatten them into a single list of (profile, positive, negative) tuples.
# The samples still hold artwork ids at this point.
VALIDATION_DATA = []
for strategy_samples in validation_samples_by_strategy:
    VALIDATION_DATA.extend(
        (sample[0], sample[1], sample[2])
        for sample in strategy_samples
    )
print(f"There are {len(VALIDATION_DATA)} validation samples")

In [None]:
# Search for duplicated hashes by enrolling every validation triple in a fresh container
validation_hash_check = HashesContainer()
for triple in VALIDATION_DATA:
    # The enrollment is a side effect, so it must not live inside an `assert`
    # statement: asserts are stripped entirely when Python runs with -O.
    is_new_hash = validation_hash_check.enroll(pre_hash(triple))
    if not is_new_hash:
        raise AssertionError(f"Duplicated hash found in validation data: {triple!r}")

print("No duplicated hashes found")

In [None]:
# In index mode, replace every artwork id by its position in the embedding
if SETTINGS["output:index_mode"]:
    VALIDATION_DATA = [
        (
            [embedding.id2index[i] for i in triple[0]],
            embedding.id2index[triple[1]],
            embedding.id2index[triple[2]],
        )
        for triple in VALIDATION_DATA
    ]
# One row per triple: user profile, positive item, negative item.
# The message previously said "training", copied over from the training cell.
print("Creating validation output DataFrame")
df_validation = pd.DataFrame(VALIDATION_DATA, columns=["profile", "pi", "ni"])
df_validation.head()

In [None]:
# The file name records which embeddings (public or private) produced the samples
validation_filename = (
    "validation_public.csv"
    if SETTINGS["embeddings:public"]
    else "validation_private.csv"
)
output_validation = join("data", "UGallery", validation_filename)
df_validation.to_csv(output_validation, index=False)

### Test data (evaluation)

In [None]:
# Choose how artwork ids are exported: as embedding indexes in index mode,
# otherwise copied unchanged into a plain list.
if SETTINGS["output:index_mode"]:
    def _export_artworks(artwork_ids):
        """Return the embedding index of each artwork id, in order."""
        return [embedding.id2index[artwork_id] for artwork_id in artwork_ids]
else:
    def _export_artworks(artwork_ids):
        """Return the artwork ids as a new list, in order."""
        return [artwork_id for artwork_id in artwork_ids]

# One entry per user that has a non-empty evaluation basket
evaluation_baskets = {
    uid: {
        "profile": _export_artworks(user.profile),
        "evaluation_basket": _export_artworks(user.evaluation_basket),
        "evaluation_timestamp": user.evaluation_timestamp,
    }
    for uid, user in inventory.users.items()
    if user.evaluation_basket
}

print(f"There are {len(evaluation_baskets)} evaluation baskets/users")

In [None]:
# The file name records which embeddings (public or private) produced the data
evaluation_filename = (
    "evaluation_public.json"
    if SETTINGS["embeddings:public"]
    else "evaluation_private.json"
)
output_evaluation = join("data", "UGallery", evaluation_filename)

# Write the baskets as indented JSON
with open(output_evaluation, "w") as file:
    json.dump(evaluation_baskets, file, indent=4)