In [1]:
# Record the version of the `python3` interpreter found on PATH.
!python3 --version

Python 3.8.1


In [2]:
# Record the pip version (and install location) used by the `python3` on PATH.
!python3 -m pip --version

pip 20.0.2 from /home/aaossa/.local/lib/python3.8/site-packages/pip (python 3.8)


In [3]:
!python3 -m pip install -r requirements/dev.txt

Defaulting to user installation because normal site-packages is not writeable


In [4]:
!ls data/Ugallery -sh

total 482M
4.0K README.md		     540K ugallery_inventory.csv
4.0K Readme.txt		     420K ugallery_purchases.csv
240K evaluation_public.json  260M ugallery_resnet50_embeddings.npy
   0 private		      11M validation_public.csv
211M train_public.csv


# Data processing procedure

In [5]:
# Enable the autoreload extension in mode 2 (reload all modules before running
# a cell), so edits to the imported project modules take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
import json
from collections import defaultdict
from math import ceil
from os import cpu_count
from os.path import join

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm

from utils.ugallery.data import load_embedding, load_embedding_legacy, concatenate_embedding
from utils.ugallery.entities import Inventory, User
from utils.ugallery.hashing import HashesContainer
from utils.ugallery.sampling import pre_hash, strategy_1, strategy_2, strategy_3, strategy_4, strategy_5, strategy_6

  from tqdm.autonotebook import tqdm


## Creating visual clusters

In [7]:
# TODO(Antonio): It seems the private version has a disconnection between the embedding data and the inventory data
# TODO(Antonio): Check if the same happens in the public version

In [8]:
# Notebook-wide configuration, keyed as "<section>:<option>".
SETTINGS = dict()

# K-means options; the trailing comments record the intended full-run values.
SETTINGS["clustering:n_times"] = 1  # Should be 20
SETTINGS["clustering:n_init"] = 1  # Should be 8
SETTINGS["clustering:n_jobs"] = cpu_count()  # 8 is ideal

# Choose between the public files (True) and the private ones (False).
SETTINGS["embeddings:public"] = True
SETTINGS["inventory:public"] = True

In [9]:
# All dataset files live under one directory. Its name is spelled "Ugallery",
# the spelling used by the directory listing and by the cells that write the
# output files. The previous mix of "UGallery" and "Ugallery" only resolves to
# the same place on a case-insensitive file system.
DATA_DIR = join("data", "Ugallery")
PRIVATE_DIR = join(DATA_DIR, "private")

# Public ResNet50 embedding.
resnet50_embedding_path = join(DATA_DIR, "ugallery_resnet50_embeddings.npy")

# Private (legacy) embeddings: each one is a features file plus an ids file.
resnet50_legacy_embedding_path = join(PRIVATE_DIR, "flatten_1.npy")
resnet50_legacy_ids_path = join(PRIVATE_DIR, "ids")
resnext101_legacy_embedding_path = join(PRIVATE_DIR, "features.npy")
resnext101_legacy_ids_path = join(PRIVATE_DIR, "ids.npy")

In [10]:
# Pick the embedding sources according to the "embeddings:public" setting.
# EMBEDDINGS maps a display name to whatever the loader returns.
if not SETTINGS["embeddings:public"]:
    # Private run: two legacy embeddings, each loaded from a features/ids pair.
    resnet50_private = load_embedding_legacy(
        resnet50_legacy_embedding_path,
        resnet50_legacy_ids_path,
    )
    resnext101_private = load_embedding_legacy(
        resnext101_legacy_embedding_path,
        resnext101_legacy_ids_path,
    )
    EMBEDDINGS = {
        "ResNet50 (private)": resnet50_private,
        "ResNeXt-101 (private)": resnext101_private,
    }
else:
    # Public run: a single embedding file.
    EMBEDDINGS = {"ResNet50 (public)": load_embedding(resnet50_embedding_path)}

In [11]:
# Report the shape of every loaded embedding before merging them.
for name in EMBEDDINGS:
    source_shape = EMBEDDINGS[name].features.shape
    print(f"{name} embedding shape: {source_shape}")

# Combine all loaded embeddings into the single `embedding` used from here on.
n_embeddings = len(EMBEDDINGS)
print(f"Merge {n_embeddings} embeddings into one...")
embedding = concatenate_embedding(EMBEDDINGS)
merged_shape = embedding.features.shape
print(f"Merged embedding shape: {merged_shape}")

ResNet50 (public) embedding shape: (13297, 2048)
Merge 1 embeddings into one...
Merged embedding shape: (13297, 2048)


In [12]:
# 0. Standardize the embedding features (z-score normalization).
# The result replaces `embedding.features`.
scaler = StandardScaler()
standardized_features = scaler.fit_transform(embedding.features)
embedding.features = standardized_features
print(f"z-score normalization shape: {embedding.features.shape}")

z-score normalization shape: (13297, 2048)


In [13]:
# 1. Conduct PCA to reduce dimension
# A fixed `random_state` keeps the reduction reproducible: with the default
# `svd_solver="auto"`, scikit-learn may choose the randomized solver for an
# input of this size, and an unseeded run then differs between executions.
# The seed comes from the optional "pca:random_state" setting (0 if absent).
# NOTE: this overwrites `embedding.features`; reload the embedding before
# re-running this cell, otherwise the reduction is applied twice.
pca = PCA(n_components=200, random_state=SETTINGS.get("pca:random_state", 0))
embedding.features = pca.fit_transform(embedding.features)
print(f"PCA reduction shape: {embedding.features.shape}")

PCA reduction shape: (13297, 200)


In [14]:
# 2. Perform k-means clustering with 100 clusters, repeated
# SETTINGS["clustering:n_times"] times (20 in the full run), and keep the
# clusterer with the highest Silhouette coefficient
kmeans_params = {
    "n_clusters": 100,
    "max_iter": 2000,
    "n_init": SETTINGS["clustering:n_init"],
}
# `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0, where
# passing it raises a TypeError. Forward it only if this version accepts it.
if "n_jobs" in KMeans().get_params():
    kmeans_params["n_jobs"] = SETTINGS["clustering:n_jobs"]

# Seed every repetition so results are reproducible. Each repetition gets a
# different seed, otherwise all of them would yield the same clustering.
# The base seed comes from the optional "clustering:random_state" setting.
base_seed = SETTINGS.get("clustering:random_state", 0)

best_score = float("-inf")
best_clusterer = None

for i in range(SETTINGS["clustering:n_times"]):
    clusterer = KMeans(random_state=base_seed + i, **kmeans_params).fit(embedding.features)
    score = silhouette_score(embedding.features, clusterer.labels_)
    if score > best_score:
        best_clusterer = clusterer
        best_score = score
        print(f"Silhouette score ({i + 1}): {score} - New highest!")
    else:
        print(f"Silhouette score ({i + 1}): {score}")

# Fail here, with a clear message, instead of later on `best_clusterer.labels_`.
if best_clusterer is None:
    raise RuntimeError(
        'No clustering was run: SETTINGS["clustering:n_times"] must be at least 1'
    )

print(f">> Best Silhouette score: {best_score}")

Silhouette score (1): 0.004584513773707563 - New highest!
>> Best Silhouette score: 0.004584513773707563


In [15]:
# 3. Assign every artwork to the visual cluster chosen by the best clusterer.
# id2cluster: artwork id -> cluster label
# cluster2id: cluster label -> list of artwork ids in that cluster
id2cluster = {}
cluster2id = defaultdict(list)
for position, cluster_label in enumerate(best_clusterer.labels_):
    # Labels are positional, so translate the position to its artwork id.
    artwork = embedding.index2id[position]
    id2cluster[artwork] = cluster_label
    cluster2id[cluster_label].append(artwork)

distinct_labels = set(id2cluster.values())
n_clusters = len(distinct_labels)
print(f"There are n_clusters: {n_clusters}")

There are n_clusters: 100


## Sampling triples

In [16]:
# Load the inventory and purchases, public or private depending on the
# "inventory:public" setting. The data directory is spelled "Ugallery", as in
# the directory listing and the output cells; the previous "UGallery" spelling
# only resolves to the same place on a case-insensitive file system.
if SETTINGS["inventory:public"]:
    inventory_path = join("data", "Ugallery", "ugallery_inventory.csv")
    purchases_path = join("data", "Ugallery", "ugallery_purchases.csv")
    inventory = Inventory(inventory_path, purchases_path)
else:
    # The private files are loaded with `legacy=True`.
    inventory_path = join("data", "Ugallery", "private", "valid_artworks.csv")
    purchases_path = join("data", "Ugallery", "private", "valid_sales.csv")
    inventory = Inventory(inventory_path, purchases_path, legacy=True)

In [17]:
# Overall sample budgets. The validation budget is 5% of the training budget,
# rounded to an integer: a raw float product can land slightly above the
# intended count, and `ceil` would then add one extra sample per strategy.
TOTAL_SAMPLES_TRAIN = 10_000_000
TOTAL_SAMPLES_VALID = round(TOTAL_SAMPLES_TRAIN * 0.05)

# The budget is split evenly across the six sampling strategies, rounding up.
N_STRATEGIES = 6
N_SAMPLES_PER_STRATEGY_TRAIN = ceil(TOTAL_SAMPLES_TRAIN / N_STRATEGIES)
N_SAMPLES_PER_STRATEGY_VALID = ceil(TOTAL_SAMPLES_VALID / N_STRATEGIES)

In [18]:
# id2artist: artwork id -> artist id, read from the inventory columns.
inventory_records = inventory.inventory
id2artist = {
    artwork: artist
    for artwork, artist in zip(
        inventory_records["artwork_id"],
        inventory_records["artist_id"],
    )
}

# artist2id: artist id -> list of that artist's artwork ids (inverse mapping).
artist2id = defaultdict(list)
for artwork, artist in id2artist.items():
    artist2id[artist].append(artwork)

In [19]:
# Build the user data on `inventory` from the artwork->cluster and
# artwork->artist mappings. Presumably this populates `inventory.users`, which
# the evaluation cell reads later (TODO confirm in utils.ugallery.entities).
inventory.build_users(id2cluster, id2artist)

In [20]:
# One container shared by every sampling strategy call below (training and
# validation). Its `collisions` counter is printed once sampling is done.
hashes_container = HashesContainer()

### 1) Predicting missing item in purchase basket

In [21]:
# Strategy 1: sample the training triples first, then the validation triples.
# Both calls receive the same shared `hashes_container` (presumably so that
# validation does not repeat training triples; TODO confirm in sampling.py).
s1_arguments = (inventory, hashes_container, id2cluster, id2artist)
s1_train = strategy_1(N_SAMPLES_PER_STRATEGY_TRAIN, *s1_arguments)
s1_validation = strategy_1(N_SAMPLES_PER_STRATEGY_VALID, *s1_arguments)

Strategy 1) Predicting missing item in purchase basket
Valid users: 625 | Samples/user: 2667
Target: 1666667 | Total samples: 1666875


HBox(children=(FloatProgress(value=0.0, description='Valid users', max=2919.0, style=ProgressStyle(description…


Hash collisions: 137535
Samples: 1666875


HBox(children=(FloatProgress(value=0.0, description='Check S1', max=1666875.0, style=ProgressStyle(description…


Strategy 1) Predicting missing item in purchase basket
Valid users: 625 | Samples/user: 134
Target: 83334 | Total samples: 83750


HBox(children=(FloatProgress(value=0.0, description='Valid users', max=2919.0, style=ProgressStyle(description…


Hash collisions: 14964
Samples: 83750


HBox(children=(FloatProgress(value=0.0, description='Check S1', max=83750.0, style=ProgressStyle(description_w…




### 2) Predicting next purchase basket

In [22]:
# Strategy 2: sample the training triples first, then the validation triples,
# both against the same shared `hashes_container`.
s2_arguments = (inventory, hashes_container, id2cluster, id2artist)
s2_train = strategy_2(N_SAMPLES_PER_STRATEGY_TRAIN, *s2_arguments)
s2_validation = strategy_2(N_SAMPLES_PER_STRATEGY_VALID, *s2_arguments)

Strategy 2) Predicting next purchase basket
Valid users: 455 | Samples/user: 3664
Target: 1666667 | Total samples: 1667120


HBox(children=(FloatProgress(value=0.0, description='Valid users', max=2919.0, style=ProgressStyle(description…


Hash collisions: 384577
Strategy 2) Predicting next purchase basket
Samples: 1667120


HBox(children=(FloatProgress(value=0.0, description='Check S2', max=1667120.0, style=ProgressStyle(description…


Strategy 2) Predicting next purchase basket
Valid users: 455 | Samples/user: 184
Target: 83334 | Total samples: 83720


HBox(children=(FloatProgress(value=0.0, description='Valid users', max=2919.0, style=ProgressStyle(description…


Hash collisions: 49397
Strategy 2) Predicting next purchase basket
Samples: 83720


HBox(children=(FloatProgress(value=0.0, description='Check S2', max=83720.0, style=ProgressStyle(description_w…




### 3) Recommending visually similar artworks from favorite artists

In [23]:
# Strategy 3: besides the forward mappings, it also takes the inverse
# cluster->artworks and artist->artworks mappings.
s3_arguments = (
    inventory,
    hashes_container,
    id2cluster,
    id2artist,
    cluster2id,
    artist2id,
)
s3_train = strategy_3(N_SAMPLES_PER_STRATEGY_TRAIN, *s3_arguments)
s3_validation = strategy_3(N_SAMPLES_PER_STRATEGY_VALID, *s3_arguments)

Strategy 3) Recommending visually similar artworks from favorite artists
Valid users: 2214 | Samples/user: 753
Target: 1666667 | Total samples: 1667142


HBox(children=(FloatProgress(value=0.0, description='Valid users', max=2919.0, style=ProgressStyle(description…


Hash collisions: 51238
Strategy 3) Recommending visually similar artworks from favorite artists
Samples: 1667142


HBox(children=(FloatProgress(value=0.0, description='Check S3', max=1667142.0, style=ProgressStyle(description…


Strategy 3) Recommending visually similar artworks from favorite artists
Valid users: 2214 | Samples/user: 38
Target: 83334 | Total samples: 84132


HBox(children=(FloatProgress(value=0.0, description='Valid users', max=2919.0, style=ProgressStyle(description…


Hash collisions: 4697
Strategy 3) Recommending visually similar artworks from favorite artists
Samples: 84132


HBox(children=(FloatProgress(value=0.0, description='Check S3', max=84132.0, style=ProgressStyle(description_w…




### 4) Recommending profile items from the same user profile

In [24]:
# Strategy 4: needs only the inventory and the shared hashes container.
s4_arguments = (inventory, hashes_container)
s4_train = strategy_4(N_SAMPLES_PER_STRATEGY_TRAIN, *s4_arguments)
s4_validation = strategy_4(N_SAMPLES_PER_STRATEGY_VALID, *s4_arguments)

Strategy 4) Recommending profile items from the same user profile
Valid users: 2919 | Samples/user: 571
Target: 1666667 | Total samples: 1666749


HBox(children=(FloatProgress(value=0.0, description='Valid users', max=2919.0, style=ProgressStyle(description…


Hash collisions: 89864
Strategy 4) Recommending profile items from the same user profile
Samples: 1666749


HBox(children=(FloatProgress(value=0.0, description='Check S4', max=1666749.0, style=ProgressStyle(description…


Strategy 4) Recommending profile items from the same user profile
Valid users: 2919 | Samples/user: 29
Target: 83334 | Total samples: 84651


HBox(children=(FloatProgress(value=0.0, description='Valid users', max=2919.0, style=ProgressStyle(description…


Hash collisions: 8691
Strategy 4) Recommending profile items from the same user profile
Samples: 84651


HBox(children=(FloatProgress(value=0.0, description='Check S4', max=84651.0, style=ProgressStyle(description_w…




### 5) Recommending profile items given an artificially created user profile

In [25]:
# Strategy 5: needs only the inventory and the shared hashes container.
s5_arguments = (inventory, hashes_container)
s5_train = strategy_5(N_SAMPLES_PER_STRATEGY_TRAIN, *s5_arguments)
s5_validation = strategy_5(N_SAMPLES_PER_STRATEGY_VALID, *s5_arguments)

Strategy 5) Recommending profile items given an artificially created user profile
Target: 1666667 | Total samples: 1666667


HBox(children=(FloatProgress(value=0.0, description='Valid artificial profiles', max=1666667.0, style=Progress…


Hash collisions: 95155
Strategy 5) Recommending profile items given an artificially created user profile
Samples: 1666667


HBox(children=(FloatProgress(value=0.0, description='Check S5', max=1666667.0, style=ProgressStyle(description…


Strategy 5) Recommending profile items given an artificially created user profile
Target: 83334 | Total samples: 83334


HBox(children=(FloatProgress(value=0.0, description='Valid artificial profiles', max=83334.0, style=ProgressSt…


Hash collisions: 6215
Strategy 5) Recommending profile items given an artificially created user profile
Samples: 83334


HBox(children=(FloatProgress(value=0.0, description='Check S5', max=83334.0, style=ProgressStyle(description_w…




### 6) Artificial profile with a single item: recommend visually similar items from the same artist

In [26]:
# Strategy 6: takes the artwork->artist mapping and its inverse.
s6_arguments = (inventory, hashes_container, id2artist, artist2id)
s6_train = strategy_6(N_SAMPLES_PER_STRATEGY_TRAIN, *s6_arguments)
s6_validation = strategy_6(N_SAMPLES_PER_STRATEGY_VALID, *s6_arguments)

Strategy 6) Artificial profile with a single item: recommend visually similar items from the same artist
Target: 1666667 | Total samples: 1666667


HBox(children=(FloatProgress(value=0.0, description='Valid artificial profiles', max=1666667.0, style=Progress…


Hash collisions: 45235
Strategy 6) Artificial profile with a single item: recommend visually similar items from the same artist
Samples: 1666667


HBox(children=(FloatProgress(value=0.0, description='Check S6', max=1666667.0, style=ProgressStyle(description…


Strategy 6) Artificial profile with a single item: recommend visually similar items from the same artist
Target: 83334 | Total samples: 83334


HBox(children=(FloatProgress(value=0.0, description='Valid artificial profiles', max=83334.0, style=ProgressSt…


Hash collisions: 2519
Strategy 6) Artificial profile with a single item: recommend visually similar items from the same artist
Samples: 83334


HBox(children=(FloatProgress(value=0.0, description='Check S6', max=83334.0, style=ProgressStyle(description_w…




In [27]:
# Report the collision counter of the container shared by all the strategy
# calls above (training and validation).
print(f"Total collisions: {hashes_container.collisions}")

Total collisions: 890087


## Store data

Tuples will be stored as artwork indexes in the embedding instead of using the hashes, to improve performance and memory usage.

### Training data

In [28]:
# Collect the training samples produced by each of the six strategies.
train_samples_by_strategy = (
    s1_train,
    s2_train,
    s3_train,
    s4_train,
    s5_train,
    s6_train,
)
# Flatten them into one list of 3-tuples, keeping the first three fields of
# every sample. Items are still artwork ids at this point; the conversion to
# embedding indexes is done in a later cell.
TRAINING_DATA = []
for strategy_samples in train_samples_by_strategy:
    TRAINING_DATA.extend(
        (sample[0], sample[1], sample[2]) for sample in strategy_samples
    )
print(f"There are {len(TRAINING_DATA)} training samples")

There are 10001220 training samples


In [29]:
# Search for duplicated hashes
# Every training triple is enrolled in a fresh container; a falsy return from
# `enroll` is treated as "hash already present", as the original check did.
# The call is kept out of an `assert` statement: asserts are stripped under
# `python -O`, which would skip the enrollment and the check altogether.
training_hash_check = HashesContainer()
for triple in TRAINING_DATA:
    if not training_hash_check.enroll(pre_hash(triple)):
        raise AssertionError(f"Duplicated hash found for training sample: {triple}")
print("No duplicated hashes found")

No duplicated hashes found


In [30]:
# Replace every artwork id by its index in the embedding.
# NOTE: TRAINING_DATA is rebound to the converted triples, so this cell must
# not be re-run without first rebuilding TRAINING_DATA from the strategies.
id2index = embedding.id2index
TRAINING_DATA = [
    ([id2index[artwork] for artwork in profile], id2index[pi], id2index[ni])
    for profile, pi, ni in TRAINING_DATA
]
print("Creating training output DataFrame")
train_columns = ["profile", "pi", "ni"]
df_train = pd.DataFrame(TRAINING_DATA, columns=train_columns)
df_train.head()

Creating training output DataFrame


Unnamed: 0,profile,pi,ni
0,"[11034, 11523]",10928,1195
1,"[11034, 10928]",11523,12448
2,"[11523, 10928]",11034,11842
3,"[11523, 10928]",11034,2082
4,"[11034, 10928]",11523,13288


In [31]:
# The output file name records whether public or private embeddings were used.
train_filename = (
    "train_public.csv" if SETTINGS["embeddings:public"] else "train_private.csv"
)
output_train = join("data", "Ugallery", train_filename)
df_train.to_csv(output_train, index=False)

### Validation data

In [32]:
# Collect the validation samples produced by each of the six strategies.
validation_samples_by_strategy = (
    s1_validation,
    s2_validation,
    s3_validation,
    s4_validation,
    s5_validation,
    s6_validation,
)
# Flatten them into one list of 3-tuples, keeping the first three fields of
# every sample. Items are still artwork ids at this point; the conversion to
# embedding indexes is done in a later cell.
VALIDATION_DATA = []
for strategy_samples in validation_samples_by_strategy:
    VALIDATION_DATA.extend(
        (sample[0], sample[1], sample[2]) for sample in strategy_samples
    )
print(f"There are {len(VALIDATION_DATA)} validation samples")

There are 502921 validation samples


In [33]:
# Search for duplicated hashes
# Every validation triple is enrolled in a fresh container; a falsy return
# from `enroll` is treated as "hash already present", as the original check
# did. The call is kept out of an `assert` statement: asserts are stripped
# under `python -O`, which would skip the enrollment and the check altogether.
validation_hash_check = HashesContainer()
for triple in VALIDATION_DATA:
    if not validation_hash_check.enroll(pre_hash(triple)):
        raise AssertionError(f"Duplicated hash found for validation sample: {triple}")

print("No duplicated hashes found")

No duplicated hashes found


In [34]:
# Convert ids to indexes
# Every artwork id (profile items, `pi` and `ni`) is replaced by its index in
# the embedding.
# NOTE: VALIDATION_DATA is rebound to the converted triples, so this cell must
# not be re-run without first rebuilding VALIDATION_DATA from the strategies.
VALIDATION_DATA = [
    (
        [embedding.id2index[i] for i in triple[0]],
        embedding.id2index[triple[1]],
        embedding.id2index[triple[2]],
    )
    for triple in VALIDATION_DATA
]
# The message previously said "training" (copy-paste from the training cell).
print("Creating validation output DataFrame")
df_validation = pd.DataFrame(VALIDATION_DATA, columns=["profile", "pi", "ni"])
df_validation.head()

Creating training output DataFrame


Unnamed: 0,profile,pi,ni
0,"[11523, 10928]",11034,3831
1,"[11034, 10928]",11523,7317
2,"[11523, 10928]",11034,9098
3,"[11523, 10928]",11034,1805
4,"[11523, 10928]",11034,9159


In [35]:
# The output file name records whether public or private embeddings were used.
validation_filename = (
    "validation_public.csv"
    if SETTINGS["embeddings:public"]
    else "validation_private.csv"
)
output_validation = join("data", "Ugallery", validation_filename)
df_validation.to_csv(output_validation, index=False)

### Test data (evaluation)

In [36]:
# Build one evaluation entry per user that has a non-empty evaluation basket.
# Artwork ids in the profile and the basket are stored as embedding indexes.
evaluation_baskets = {}
for uid, user in inventory.users.items():
    if not user.evaluation_basket:
        # Users without an evaluation basket are left out.
        continue
    evaluation_baskets[uid] = {
        "profile": [embedding.id2index[artwork] for artwork in user.profile],
        "evaluation_basket": [
            embedding.id2index[artwork] for artwork in user.evaluation_basket
        ],
        "evaluation_timestamp": user.evaluation_timestamp,
    }

print(f"There are {len(evaluation_baskets)} evaluation baskets/users")

There are 1073 evaluation baskets/users


In [37]:
# The output file name records whether public or private embeddings were used.
evaluation_filename = (
    "evaluation_public.json"
    if SETTINGS["embeddings:public"]
    else "evaluation_private.json"
)
output_evaluation = join("data", "Ugallery", evaluation_filename)

# Write the evaluation baskets as indented JSON.
with open(output_evaluation, "w") as output_file:
    json.dump(evaluation_baskets, output_file, indent=4)