In [1]:
!python3 --version

Python 3.8.1


In [2]:
!python3 -m pip --version

pip 20.0.2 from /home/aaossa/.local/lib/python3.8/site-packages/pip (python 3.8)


In [3]:
import sys

print(sys.version)

3.8.1 (default, Jan 25 2020, 13:45:32) 
[GCC 7.4.0]


In [4]:
!python3 -m pip install -r requirements/dev.txt

Defaulting to user installation because normal site-packages is not writeable


In [5]:
!ls data/UGallery

Readme.txt		ugallery_purchases.csv
ugallery_inventory.csv	ugallery_resnet50_embeddings.npy


In [6]:
!head -n 5 data/UGallery/ugallery_inventory.csv

artwork_id_hash,artist_id_hash,upload_timestamp
9338d925a4f391d049f1cb55be83206d,ac648ab20e0c330dcfaa912644abcc2f,0
6ff620bdd4b7143ef7ef9a43ae35379f,dfc6c382d6584b22b5ba75c62cdb0c56,0
ec4708be07b9b92dfd3c98b92d5b273e,e836b4d62ec611a38ce9dd6e394a65e1,1
bb8fe17afcd2b8a8700155b79980a7d9,942050a4fd56327fb69ecb8b81948ded,2


In [7]:
!head -n 5 data/UGallery/ugallery_purchases.csv

user_id_hash,purchase_timestamp,purchased_artwork_ids_hash
faaecc910173fcca8b146c66db26b99f,1416,['f196009db1ba9607150abf0570e0fffe']
90d6d470c21861aaa739b2811ac0df3c,1418,['ccccb1b02d3130e435e05a4eea7d11fd']
67c7793eefa4aca1cd9a18029b26efc6,1420,['7bbc45a178aef2c041f4376ecdc26b23']
8c872e88b91f7077527d8c7bf8892fbd,1424,['5775ae42d3cef7ea7f56b800a8d7cffc']


In [8]:
!cat data/UGallery/Readme.txt

Due to privacy and copyright restrictions, we are only able to release part of the user transactions data, which consists of 6535 transactions of 2919 users on 6030 items.

Files under this folder:

* ugallery_purchases.csv  
Each line is a tuple in the form of (user_id_hash, purchase_timestamp, artwork_id_hash). Note that each id_hash is an 32-char string.
This file has the purchases in time for each user, the test data is the last purchase of each user.

* ugallery_inventory.csv  
Each line is a tuple in the form of (artwork_id_hash, artist_id_hash, upload_timestamp). Note that each id_hash is an 32-char string.
This file has the time an item is added to the website inventory, because these are physical artworks, the availability of the items must be simulated in order to make the recommendations.

* ugallery_resnet50_embeddings.npy
This is a numpy array of shape (13297, 2), each row is of shape (2,) where the first value is the artwork_id_hash, and the second one is the

In [9]:
# ugallery_data_utils.py
import numpy as np


def load_embeddings(embedding_path, embedding_shape=(13297, 2048)):
    data = np.load(embedding_path, allow_pickle=True)
    # Generate indexes and contiguous embedding
    embedding = np.empty(shape=embedding_shape)
    artwork_id2index = dict()
    artwork_index2id = dict()
    for i, (artwork_id_hash, artwork_embedding) in enumerate(data):
        artwork_id2index[artwork_id_hash] = i
        artwork_index2id[i] = artwork_id_hash
        embedding[i] = artwork_embedding
    return embedding, artwork_id2index, artwork_index2id

# Data processing procesure

In [10]:
import random
from collections import defaultdict
from math import ceil
from os.path import join

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [11]:
from tqdm.notebook import tqdm

## Creating visual clusters

In [12]:
# Load embeddings from files
embeddings_path = join("data", "UGallery", "ugallery_resnet50_embeddings.npy")
loaded_data = load_embeddings(embeddings_path)
embeddings, artwork_id2index, artwork_index2id = loaded_data
print(f"embeddings shape: {embeddings.shape}")

embeddings shape: (13297, 2048)


In [13]:
# z-score normalization of embedding
embeddings = StandardScaler().fit_transform(embeddings)
print(f"z-score normalization result shape: {embeddings.shape}")

z-score normalization result shape: (13297, 2048)


In [14]:
# 1. Conduct PCA to reduce from R2048 to R200
embeddings = PCA(n_components=200).fit_transform(embeddings)
print(f"PCA reduction result shape: {embeddings.shape}")

PCA reduction result shape: (13297, 200)


In [15]:
# 2. Perform k-means clustering with 100 clusters 20 times
# and keep the clusterer with the highest Silhouette coefficient
best_score = float("-inf")
best_clusterer = None

for i in range(20):
    clusterer = KMeans(
        n_clusters=100,
        max_iter=2000,
        n_init=8,
        n_jobs=8
    ).fit(embeddings)
    clusterer_labels = clusterer.predict(embeddings)
    clusterer_score = silhouette_score(embeddings, clusterer_labels)
    if clusterer_score > best_score:
        best_score = clusterer_score
        best_clusterer = clusterer
        print(f"Silhouette score ({i}): {clusterer_score} - New highest!")
    else:
        print(f"Silhouette score ({i}): {clusterer_score}")
    break # TODO(Antonio): Remove this constrain

print(f">> Best Silhouette score: {best_score}")

Silhouette score (0): 0.00813901169841024 - New highest!
>> Best Silhouette score: 0.00813901169841024


In [16]:
# 3. Label each image with its respective visual cluster
clusterer_labels = best_clusterer.predict(embeddings)
print(f"Best clusterer Silhouette score: {best_score}")

artwork_id2cluster = dict()
artwork_cluster2id = defaultdict(list)
for i, label in enumerate(clusterer_labels):
    artwork_id = artwork_index2id[i]
    artwork_id2cluster[artwork_id] = label
    artwork_cluster2id[label].append(artwork_id)

n_clusters = len(set(artwork_id2cluster.values()))
print(f"There are n_clusters: {n_clusters}")

Best clusterer Silhouette score: 0.00813901169841024
There are n_clusters: 100


## Sampling triples

In [17]:
# ugallery_data_utils.py
class User:
    def __init__(self, user_id_hash):
        self.user_id_hash = user_id_hash
        self.baskets = dict()
        self.gt_baskets = dict()
        self.evaluation_basket = None
        self.liked_artists = None
        self.liked_clusters = None
        self.profile = None

    def add_purchase(self, purchased_items, timestamp):
        assert timestamp not in self.baskets
        timestamp = int(timestamp)
        # Baskets are assumed to contain each item once
        self.baskets[timestamp] = set(purchased_items)
        self.gt_baskets[timestamp] = set(purchased_items)

    def assert_baskets(self):
        assert isinstance(self.baskets, dict)
        assert isinstance(self.evaluation_basket, set)
        assert isinstance(self.gt_baskets, dict)
        if len(self.evaluation_basket):
            misses = 0
            for gt_timestamp, gt_basket in self.gt_baskets.items():
                if gt_timestamp in self.baskets:
                    assert gt_basket == self.baskets[gt_timestamp]
                else:
                    misses += 1
                    assert gt_basket == self.evaluation_basket
            assert misses == 1
        else:
            assert self.baskets == self.gt_baskets

    def create_profile(self):
        assert self.profile is None
        self.profile = list(
            set(item for basket in self.baskets.values() for item in basket))

    def create_likes(self, artwork_id2cluster, artwork_id2artist):
        self.liked_clusters = set(artwork_id2cluster[item]
                                  for item in self.profile)
        self.liked_artists = set(artwork_id2artist[item]
                                 for item in self.profile)

    def save_basket_for_evaluation(self):
        assert self.evaluation_basket is None
        # If user has a single basket, use it for training only
        if len(self.baskets) == 1:
            self.evaluation_basket = set()
            return
        # All baskets are still available in self.gt_baskets
        last_basket_timestamp = max(self.baskets.keys())
        assert last_basket_timestamp in self.baskets
        last_basket = self.baskets.pop(last_basket_timestamp)
        assert last_basket is not None
        self.evaluation_basket = last_basket

    def strategy_1_valid_baskets(self, min_size=0):
        return [(timestamp, basket)
                for timestamp, basket in self.baskets.items()
                if len(basket) >= min_size]

    def strategy_2_valid_partitions(self):
        sorted_timestamps = sorted(self.baskets)
        valid_partitions = []
        for i in range(1, len(sorted_timestamps)):
            profile = {
                item
                for timestamp in sorted_timestamps[:i]
                for item in self.baskets[timestamp]
            }
            timestamp = sorted_timestamps[i]
            basket = self.baskets[timestamp]
            valid_partitions.append((timestamp, profile, basket))
        return valid_partitions

    def strategy_3_valid_liked(self, artwork_cluster2id, artwork_artist2id):
        liked_clusters_artworks = {
            artwork
            for cluster in self.liked_clusters
            for artwork in artwork_cluster2id[cluster]
        }
        liked_artists_artworks = {
            artwork
            for artist in self.liked_artists
            for artwork in artwork_artist2id[artist]
        }
        positive_candidates = liked_clusters_artworks & liked_artists_artworks
        valid_liked = positive_candidates - set(self.profile)
        return valid_liked

In [18]:
# ugallery_data_utils.py
import pandas as pd


class Inventory:
    def __init__(self, inventory_path, purchases_path):
        self.users = None
        # Build dataframes to manage data
        self.inventory = pd.read_csv(inventory_path)
        self.purchases = pd.read_csv(purchases_path)
        self.inventory.rename(
            columns={"upload_timestamp": "timestamp"},
            inplace=True,
        )
        self.purchases.rename(
            columns={"purchase_timestamp": "timestamp"},
            inplace=True,
        )

        # Check if artwork_id_hash has duplicates (inventory)
        assert not self.inventory["artwork_id_hash"].duplicated().any()
        # Check for missing values in data (inventory)
        assert not self.inventory.isnull().values.any()
        # Check for missing values in data (purchases)
        assert not self.purchases.isnull().values.any()
        # Process purchased artworks column (purchases)
        purchases_to_list = lambda p: p[1:-1].replace("'", "").split(", ")
        self.purchases["purchased_artwork_ids_hash"] = self.purchases[
            "purchased_artwork_ids_hash"].map(purchases_to_list)
        # Check if all purchases contain elements
        assert all(p for p in self.purchases["purchased_artwork_ids_hash"])

        # Find non-unique items
        purchased_items = self.purchases["purchased_artwork_ids_hash"].sum()
        self.non_unique_items, seen = set(), set()
        for item in purchased_items:
            if item not in seen:
                seen.add(item)
            else:
                self.non_unique_items.add(item)
        # Create list with items ids
        self.items = self.inventory["artwork_id_hash"].unique()


    def available_at_t(self, up_to_timestamp=None):
        inventory = set()
        # Forward time by timestamp
        for step, timestamp, row in self.__forward_time(up_to_timestamp):
            # Add item to inventory
            if step == "Add item":
                item = row["artwork_id_hash"]
                if item in inventory:
                    # Item already present
                    pass
                inventory.add(item)
            # Remove item if purchased item is not unique
            elif step == "Sell items":
                for item in row["purchased_artwork_ids_hash"]:
                    if item not in inventory:
                        # Item already sold or not present
                        if item not in self.non_unique_items:
                            # Item already sold or nor present
                            pass
                    if item not in self.non_unique_items:
                        inventory.discard(item)
        return inventory
    
    def build_users(self, artwork_id2cluster, artwork_id2artist):
        self.__build_users(artwork_id2cluster, artwork_id2artist)
        assert isinstance(self.users, dict)
        # Check if all users were built
        users_in_df = set(self.purchases["user_id_hash"].unique())
        users_in_dict = set(self.users.keys())
        assert users_in_df == users_in_dict
        # Check if all users are present in dict
        assert all(self.purchases["user_id_hash"].isin(self.users.keys()))
        # Check if user profiles were created
        assert all(user.profile for user in self.users.values())
        # Check if all users have evaluation basket or a single purchase
        assert all(user.evaluation_basket is not None
                   for user in self.users.values())
        assert all(user.assert_baskets for user in self.users.values())

    def __build_users(self, artwork_id2cluster, artwork_id2artist):
        self.users = dict()
        purchases = 0
        inventory = 0
        for step, timestamp, row in self.__forward_time():
            if step != "Sell items":
                inventory += 1
                continue
            purchases += 1
            if row["user_id_hash"] not in self.users:
                user_id_hash = row["user_id_hash"]
                self.users[user_id_hash] = User(user_id_hash)
            user = self.users[user_id_hash]
            user.add_purchase(
                row["purchased_artwork_ids_hash"],
                row["timestamp"],
            )
        assert purchases == len(self.purchases)
        assert inventory == len(self.inventory)
        for _, user in self.users.items():
            user.save_basket_for_evaluation()
            user.create_profile()
            user.create_likes(artwork_id2cluster, artwork_id2artist)

    def __forward_time(self, up_to_timestamp=None):
        # Sort data by timestamp
        df_inventory = self.inventory.sort_values(by=["timestamp"])
        df_purchases = self.purchases.sort_values(by=["timestamp"])
        # Limits of iteration
        i_inventory, max_inventory = 0, len(df_inventory.index)
        i_purchases, max_purchases = 0, len(df_purchases.index)
        # First row of dataframes
        row_inventory = df_inventory.loc[i_inventory, :]
        row_purchases = df_purchases.loc[i_purchases, :]

        while row_inventory is not None or row_purchases is not None:
            # If next timestamp is an upload
            time_inventory = getattr(row_inventory, "timestamp", float("inf"))
            time_purchases = getattr(row_purchases, "timestamp", float("inf"))
            if time_inventory <= time_purchases:
                yield ("Add item", time_inventory, row_inventory)
                i_inventory += 1
                if i_inventory >= max_inventory:
                    row_inventory = None
                else:
                    row_inventory = df_inventory.loc[i_inventory, :]
            # If next timestamp is a purchase
            elif time_purchases < time_inventory:
                yield ("Sell items", time_purchases, row_purchases)
                i_purchases += 1
                if i_purchases >= max_purchases:
                    row_purchases = None
                else:
                    row_purchases = df_purchases.loc[i_purchases, :]
            # If limit was given
            if up_to_timestamp is not None:
                if min(time_inventory, time_purchases) > up_to_timestamp:
                    break



In [19]:
TOTAL_SAMPLES_TRAIN = 10_000_000
TOTAL_SAMPLES_TEST = TOTAL_SAMPLES_TRAIN * 0.05

N_STRATEGIES = 6
N_SAMPLES_PER_STRATEGY = ceil(TOTAL_SAMPLES_TRAIN / N_STRATEGIES)

In [20]:
inventory_path = join("data", "UGallery", "ugallery_inventory.csv")
purchases_path = join("data", "UGallery", "ugallery_purchases.csv")
inventory = Inventory(inventory_path, purchases_path)

In [21]:
artwork_id2artist = dict(
    zip(
        inventory.inventory["artwork_id_hash"],
        inventory.inventory["artist_id_hash"],
    ))
artwork_artist2id = defaultdict(list)
for artwork_id, artist_id in artwork_id2artist.items():
    artwork_artist2id[artist_id].append(artwork_id)

In [22]:
inventory.build_users(artwork_id2cluster, artwork_id2artist)

### 1) Predicting missing item in purchase basket

In [23]:
# 1. Predicting missing item in purchase basket


def generate_samples_strategy_1(n_samples, users):
    print("Strategy 1) Predicting missing item in purchase basket")
    # Count valid users
    valid_users = 0
    for user in users.values():
        if user.strategy_1_valid_baskets(min_size=2):
            valid_users += 1

    samples_per_user = ceil(n_samples / valid_users)
    print(
        f"Valid users: {valid_users} | Samples/user: {samples_per_user}\n"
        f"Target: {n_samples} | Total samples: {valid_users * samples_per_user}"
    )

    samples = []
    for user in tqdm(users.values(), desc="Valid users"):
        # Pick items from baskets with more than one item
        valid_baskets = user.strategy_1_valid_baskets(min_size=2)
        if not valid_baskets: continue
        # Pick visual clusters and artists liked by the user
        liked_clusters = set(artwork_id2cluster[item] for item in user.profile)
        liked_artists = set(artwork_id2artist[item] for item in user.profile)

        n = samples_per_user
        while n > 0:
            ni = random.choice(inventory.items)
            if artwork_id2cluster[ni] in liked_clusters: continue
            if artwork_id2artist[ni] in liked_artists: continue
            timestamp, basket = random.choice(valid_baskets)
            pi = random.choice(tuple(basket))
            profile = {item for item in basket if item != pi}
            samples.append((profile, pi, ni, timestamp, user.user_id_hash))
            n -= 1

    return samples


def sanity_checks_strategy_1(samples, users):
    print("Strategy 1) Predicting missing item in purchase basket")
    print(f"Samples: {len(samples)}")
    for (profile, pi, ni, timestamp, uid) in tqdm(samples, desc="Check S1"):
        user = users[uid]
        gt_basket = user.baskets[timestamp]
        assert len(profile) + 1 == len(gt_basket)
        # Positive item
        assert pi in gt_basket
        # (Might not be true)
        assert pi not in profile
        # Negative item
        assert ni not in user.profile
        assert artwork_id2cluster[ni] not in user.liked_clusters
        assert artwork_id2artist[ni] not in user.liked_artists
        # Profile
        assert profile.issubset(gt_basket)
        assert profile.issubset(user.profile)

In [24]:
samples_s1 = generate_samples_strategy_1(
    N_SAMPLES_PER_STRATEGY,
    inventory.users,
)
sanity_checks_strategy_1(
    samples_s1,
    inventory.users,
)

Strategy 1) Predicting missing item in purchase basket
Valid users: 625 | Samples/user: 2667
Target: 1666667 | Total samples: 1666875


HBox(children=(FloatProgress(value=0.0, description='Valid users', max=2919.0, style=ProgressStyle(description…


Strategy 1) Predicting missing item in purchase basket
Samples: 1666875


HBox(children=(FloatProgress(value=0.0, description='Check S1', max=1666875.0, style=ProgressStyle(description…




### 2) Predicting next purchase basket

In [25]:
# 2. Predicting next purchase basket


def generate_samples_strategy_2(n_samples, users):
    print("Strategy 2) Predicting next purchase basket")
    # Count valid users
    valid_users = 0
    for user in users.values():
        if user.strategy_2_valid_partitions():
            valid_users += 1

    samples_per_user = ceil(n_samples / valid_users)
    print(
        f"Valid users: {valid_users} | Samples/user: {samples_per_user}\n"
        f"Target: {n_samples} | Total samples: {valid_users * samples_per_user}"
    )

    samples = []
    for user in tqdm(users.values(), desc="Valid users"):
        valid_partitions = user.strategy_2_valid_partitions()
        if not valid_partitions: continue
        # Pick visual clusters and artists liked by the user
        liked_clusters = user.liked_clusters
        liked_artists = user.liked_artists

        n = samples_per_user
        while n > 0:
            ni = random.choice(inventory.items)
            if artwork_id2cluster[ni] in liked_clusters: continue
            if artwork_id2artist[ni] in liked_artists: continue
            timestamp, profile, basket = random.choice(valid_partitions)
            pi = random.choice(tuple(basket))
            # TODO(Antonio): Not sure about this (item purchased twice)
            if pi in profile: continue
            samples.append((profile, pi, ni, timestamp, user.user_id_hash))
            n -= 1

    return samples


def sanity_checks_strategy_2(samples, users):
    print("Strategy 2) Predicting next purchase basket")
    print(f"Samples: {len(samples)}")
    for (profile, pi, ni, timestamp, uid) in tqdm(samples, desc="Check S2"):
        user = users[uid]
        gt_basket = user.baskets[timestamp]
        previous_baskets = set(item
                               for b_timestamp, basket in user.baskets.items()
                               if b_timestamp < timestamp for item in basket)
        # Positive item
        assert pi in gt_basket
        assert profile == previous_baskets
        # (Might not be true)
        assert pi not in profile
        # Negative item
        assert ni not in user.profile
        assert artwork_id2cluster[ni] not in user.liked_clusters
        assert artwork_id2artist[ni] not in user.liked_artists
        # Profile
        assert not gt_basket.issubset(profile)
        assert profile.issubset(user.profile)

In [26]:
samples_s2 = generate_samples_strategy_2(
    N_SAMPLES_PER_STRATEGY,
    inventory.users,
)
sanity_checks_strategy_2(
    samples_s2,
    inventory.users,
)

Strategy 2) Predicting next purchase basket
Valid users: 455 | Samples/user: 3664
Target: 1666667 | Total samples: 1667120


HBox(children=(FloatProgress(value=0.0, description='Valid users', max=2919.0, style=ProgressStyle(description…


Strategy 2) Predicting next purchase basket
Samples: 1667120


HBox(children=(FloatProgress(value=0.0, description='Check S2', max=1667120.0, style=ProgressStyle(description…




### 3) Recommending visually similar artworks from favorite artists

In [27]:
# 3. Recommending visually similar artworks from favorite artists


def generate_samples_strategy_3(n_samples, users):
    print(("Strategy 3) Recommending visually similar "
           "artworks from favorite artists"))

    # Count valid users
    valid_users = 0
    for user in users.values():
        if user.strategy_3_valid_liked(artwork_cluster2id, artwork_artist2id):
            valid_users += 1

    samples_per_user = ceil(n_samples / valid_users)
    print(
        f"Valid users: {valid_users} | Samples/user: {samples_per_user}\n"
        f"Target: {n_samples} | Total samples: {valid_users * samples_per_user}"
    )

    samples = []
    for user in tqdm(users.values(), desc="Valid users"):
        valid_liked = tuple(user.strategy_3_valid_liked(artwork_cluster2id, artwork_artist2id))
        if not valid_liked: continue
        # Pick visual clusters and artists liked by the user
        liked_clusters = user.liked_clusters
        liked_artists = user.liked_artists

        n = samples_per_user
        while n > 0:
            ni = random.choice(inventory.items)
            if artwork_id2cluster[ni] in liked_clusters: continue
            if artwork_id2artist[ni] in liked_artists: continue
            pi = random.choice(valid_liked)
            samples.append((user.profile, pi, ni, user.user_id_hash))
            n -= 1

    return samples


def sanity_checks_strategy_3(samples, users):
    print(("Strategy 3) Recommending visually similar "
           "artworks from favorite artists"))
    print(f"Samples: {len(samples)}")
    for (profile, pi, ni, uid) in tqdm(samples, desc="Check S3"):
        user = users[uid]

        # Positive item
        assert pi not in user.profile
        assert artwork_id2cluster[pi] in user.liked_clusters 
        assert artwork_id2artist[pi] in user.liked_artists 
        # Negative item
        assert ni not in user.profile
        assert artwork_id2cluster[ni] not in user.liked_clusters
        assert artwork_id2artist[ni] not in user.liked_artists
        # Profile
        assert set(profile) == set(user.profile)

In [28]:
samples_s3 = generate_samples_strategy_3(
    N_SAMPLES_PER_STRATEGY,
    inventory.users,
)
sanity_checks_strategy_3(
    samples_s3,
    inventory.users,
)

Strategy 3) Recommending visually similar artworks from favorite artists
Valid users: 2247 | Samples/user: 742
Target: 1666667 | Total samples: 1667274


HBox(children=(FloatProgress(value=0.0, description='Valid users', max=2919.0, style=ProgressStyle(description…


Strategy 3) Recommending visually similar artworks from favorite artists
Samples: 1667274


HBox(children=(FloatProgress(value=0.0, description='Check S3', max=1667274.0, style=ProgressStyle(description…




### 4) Recommending profile items from the same user profile