In [1]:
import json
import os
import random
from math import ceil

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from tqdm.autonotebook import tqdm

from utils.ugallery.data import (
    get_transactions_dataframes, add_aggregation_columns,
    mark_evaluation_rows, get_holdout, map_ids_to_indexes,
)
from utils.ugallery.hashing import pre_hash, HashesContainer
from utils.ugallery.sampling import StrategyHandler
from utils.ugallery.similarity import HybridScorer, VisualSimilarityHandler


# Parameters
# Every input and output file lives under the same data directory.
DATA_DIR = os.path.join("data", "UGallery")

# Parameters (reproducibility): seed handed to `random` and NumPy; None skips seeding
RNG_SEED = 0

# Parameters (input files)
EMBEDDING_FN = os.path.join(DATA_DIR, "ugallery_embedding.npy")
INVENTORY_PATH = os.path.join(DATA_DIR, "valid_artworks.csv")
PURCHASES_PATH = os.path.join(DATA_DIR, "valid_sales.csv")

# Parameters (output files)
OUTPUT_TRAIN_PATH = os.path.join(DATA_DIR, "train.csv")
OUTPUT_VALID_PATH = os.path.join(DATA_DIR, "validation.csv")
OUTPUT_EVAL_PATH = os.path.join(DATA_DIR, "evaluation.csv")

# Parameters (visual clustering)
PCA_COMPONENTS = 200
CLUSTERING_RNG = None  # random_state given to KMeans; a non-None value stops the search after one run
CLUSTERING_N_CLUSTERS = 100
CLUSTERING_N_TIMES = 5  # alternative value noted by the author: 20
CLUSTERING_N_INIT = 1  # alternative value noted by the author: 8
CLUSTERING_N_JOBS = os.cpu_count()  # alternative values noted by the author: 4 or 8

# Parameters (sampling)
ARTIST_BOOST = 0.2
CONFIDENCE_MARGIN = 0.18
FINE_GRAINED_THRESHOLD = 0.7
FAKE_COEF = 0.
# All four sampling coefficients must lie inside the closed interval [0, 1].
assert all(
    0. <= coefficient <= 1.
    for coefficient in (ARTIST_BOOST, CONFIDENCE_MARGIN, FINE_GRAINED_THRESHOLD, FAKE_COEF)
)
# Total number of samples requested for each split (shared among the strategies)
TOTAL_SAMPLES_TRAIN = 10_000_000
TOTAL_SAMPLES_VALID = 500_000


  from tqdm.autonotebook import tqdm


In [2]:
%%time
# Seed both the Python and the NumPy global generators when a seed is configured
if RNG_SEED is not None:
    print(f"\nUsing random seed...")
    random.seed(RNG_SEED)
    np.random.seed(RNG_SEED)


# Read the stored embedding array
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only load embedding files that come from a trusted source.
print(f"\nLoading embedding from file... ({EMBEDDING_FN})")
embedding = np.load(EMBEDDING_FN, allow_pickle=True)


# Each embedding row is unpacked as an (id, vector) pair: vectors are stacked
# into a dense matrix and ids (as strings) are mapped to their row position.
print("\nExtracting data into variables...")
n_rows = embedding.shape[0]
n_dims = embedding[0, 1].shape[0]  # vector length, read from the first row
features = np.zeros(shape=(n_rows, n_dims))
id2index = {}
for row, (item_id, item_vector) in enumerate(embedding):
    features[row] = item_vector
    id2index[str(item_id)] = row
print(f">> Features shape: {features.shape}")


# Creating visual clusters
# Step 0: standardize every embedding dimension before reducing it.
print("\nCreating visual clusters: 0. z-score normalization of embedding")
features = StandardScaler().fit_transform(features)
print(f">> Features shape: {features.shape}")

# Step 1: keep only the first PCA_COMPONENTS principal components.
print("\nCreating visual clusters: 1. Conduct PCA to reduce dimension")
features = PCA(n_components=PCA_COMPONENTS).fit_transform(features)
print(f">> Features shape: {features.shape}")


# Step 2: fit k-means up to CLUSTERING_N_TIMES times and keep the fit with the
# highest silhouette score.
print("\nCreating visual clusters: 2. Perform k-means clustering")
kmeans_kwargs = dict(
    n_clusters=CLUSTERING_N_CLUSTERS,
    max_iter=2000,
    n_init=CLUSTERING_N_INIT,
    random_state=CLUSTERING_RNG,
)
# `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0, where passing
# it raises TypeError. Only forward it when the installed KMeans still exposes it.
if "n_jobs" in KMeans().get_params():
    kmeans_kwargs["n_jobs"] = CLUSTERING_N_JOBS
best_score = float("-inf")
best_clusterer = None
progress_line_open = False  # True once a "\r"-terminated progress line was printed
for i in range(CLUSTERING_N_TIMES):
    clusterer = KMeans(**kmeans_kwargs).fit(features)
    score = silhouette_score(features, clusterer.labels_)
    if score > best_score:
        best_clusterer = clusterer
        best_score = score
    # With a fixed random_state every run would be identical: one fit is enough.
    if CLUSTERING_RNG is not None:
        break
    print((f">> Silhouette score ({i + 1:02}/{CLUSTERING_N_TIMES}): "
           f"{score:.4f} (Best: {best_score:.4f})"), flush=True, end="\r")
    progress_line_open = True
# Terminate the in-place progress line; otherwise the summary below is written
# over it and leftover characters corrupt the reported score.
if progress_line_open:
    print()
# Fail here, with context, instead of later when `best_clusterer.labels_` is read.
assert best_clusterer is not None, (
    "No clustering was selected: check CLUSTERING_N_TIMES and the silhouette scores"
)
print(f">> Best Silhouette score: {best_score}")


# Load transactions CSVs
print(f"\nLoading transactions from files...")
inventory_df, purchases_df = get_transactions_dataframes(
    INVENTORY_PATH, PURCHASES_PATH,
    display_stats=False,
)
# Check if every purchased artwork is present in inventory.
# Each purchases row is expected to hold a sequence of artwork ids (a basket),
# as the previous `.sum()`-based concatenation also required. Set operations
# replace the quadratic list concatenation and the per-id linear scan.
inventory_artworks = set(inventory_df["artwork_id"])
purchased_artworks = {
    artwork_id
    for basket in purchases_df["artwork_id"]
    for artwork_id in basket
}
missing_artworks = purchased_artworks - inventory_artworks
assert not missing_artworks, (
    f"{len(missing_artworks)} purchased artworks are missing from inventory, "
    f"e.g. {list(missing_artworks)[:5]}"
)
print(f">> Inventory: {inventory_df.shape} | Purchases: {purchases_df.shape}")

# Translate artwork ids through id2index so both frames work with indexes only
inventory_df = map_ids_to_indexes(inventory_df, id2index)
purchases_df = map_ids_to_indexes(purchases_df, id2index)
print(">> Mapping applied")

# Add the basket aggregation columns and make sure both are strictly positive
purchases_df = add_aggregation_columns(purchases_df)
for count_column in ("n_baskets", "n_items"):
    assert all(purchases_df[count_column] > 0)
print(f">> Purchases: {purchases_df.shape}")

# Flag the purchases reserved for the evaluation procedure;
# the resulting "evaluation" column has to be a plain boolean column
purchases_df = mark_evaluation_rows(purchases_df)
evaluation_dtype = purchases_df["evaluation"].dtype
assert evaluation_dtype.name == "bool"
print(f">> Purchases: {purchases_df.shape}")

# Separate the held-out rows from the remaining purchases; neither part may be empty
evaluation_df, purchases_df = get_holdout(purchases_df)
assert not purchases_df.empty
assert not evaluation_df.empty
print(f">> Evaluation: {evaluation_df.shape} | Purchases: {purchases_df.shape}")

# The aggregation columns are computed again on the reduced purchases frame
purchases_df = add_aggregation_columns(purchases_df)
print(f">> Purchases: {purchases_df.shape}")

# Look up the visual cluster label of every inventory artwork by its index
inventory_df["cluster_id"] = inventory_df["artwork_id"].apply(
    lambda artwork_idx: best_clusterer.labels_[artwork_idx],
)
print(f">> Inventory: {inventory_df.shape}")


# Index -> data lookups: artist and visual cluster of every embedding row
print("\nCreating mappings from index to data")
artist_by_idx = np.full((features.shape[0],), -1)  # -1 where inventory has no artist for the row
for artwork_idx, artist in zip(inventory_df["artwork_id"], inventory_df["artist_id"]):
    artist_by_idx[artwork_idx] = artist
cluster_by_idx = best_clusterer.labels_


# Data -> index lookups: artwork indexes grouped by artist and by visual cluster
print("\nCreating mappings from data to index")
artworks_per_artist = inventory_df.groupby("artist_id")["artwork_id"].apply(list)
artistId2artworkIndexes = artworks_per_artist.to_dict()
clustId2artIndexes = {}
for artwork_idx, cluster_label in enumerate(cluster_by_idx):
    clustId2artIndexes.setdefault(cluster_label, []).append(artwork_idx)


print("\nCreating helpers instances...")
# Container used to detect duplicated samples through their hashes
hashes_container = HashesContainer()
# Scoring helpers: visual similarity handler and the hybrid scorer built on it
vissimhandler = VisualSimilarityHandler(best_clusterer.labels_, features)
hybrid_scorer = HybridScorer(vissimhandler, artist_by_idx, artist_boost=ARTIST_BOOST)


# Sampling constants: how many samples each strategy has to contribute
print("\nCalculating important values...")
N_REAL_STRATEGIES = 2
N_FAKE_STRATEGIES = 2
print(f">> There are {N_REAL_STRATEGIES} real strategies and {N_FAKE_STRATEGIES} fake strategies")
# FAKE_COEF is the share of samples assigned to fake strategies; real ones get the rest.
real_share = 1 - FAKE_COEF
N_SAMPLES_PER_REAL_STRAT_TRAIN = ceil(real_share * TOTAL_SAMPLES_TRAIN / N_REAL_STRATEGIES)
N_SAMPLES_PER_REAL_STRAT_VALID = ceil(real_share * TOTAL_SAMPLES_VALID / N_REAL_STRATEGIES)
N_SAMPLES_PER_FAKE_STRAT_TRAIN = ceil(FAKE_COEF * TOTAL_SAMPLES_TRAIN / N_FAKE_STRATEGIES)
N_SAMPLES_PER_FAKE_STRAT_VALID = ceil(FAKE_COEF * TOTAL_SAMPLES_VALID / N_FAKE_STRATEGIES)
# Users are counted among the remaining purchases, items among the embedding rows
N_USERS = purchases_df["customer_id"].nunique()
N_ITEMS = len(embedding)
print(f">> N_USERS = {N_USERS} | N_ITEMS = {N_ITEMS}")


# Actual sampling section
# The four strategies run in a fixed order and all receive the same
# hashes_container, so the call sequence below must not be reordered.
print("\nCreating samples using custom strategies")
strategy_handler = StrategyHandler(
    vissimhandler, hybrid_scorer,
    clustId2artIndexes, cluster_by_idx,
    artistId2artworkIndexes, artist_by_idx,
    threshold=FINE_GRAINED_THRESHOLD,
    confidence_margin=CONFIDENCE_MARGIN,
)

print(">> Strategy #1: Given real profile, recommend profile")
# Per-user quotas for the strategies that start from real profiles (#1 and #3)
per_user_train = ceil(N_SAMPLES_PER_REAL_STRAT_TRAIN / N_USERS)
per_user_valid = ceil(N_SAMPLES_PER_REAL_STRAT_VALID / N_USERS)
# Training samples
samples_train_1 = strategy_handler.strategy_1(purchases_df.copy(), per_user_train, hashes_container)
assert N_SAMPLES_PER_REAL_STRAT_TRAIN <= len(samples_train_1)
# Validation samples
samples_valid_1 = strategy_handler.strategy_1(purchases_df.copy(), per_user_valid, hashes_container)
assert N_SAMPLES_PER_REAL_STRAT_VALID <= len(samples_valid_1)
print(
    f">> Strategy #1 Training samples ({len(samples_train_1)}) "
    f"and validation samples ({len(samples_valid_1)})"
)

print(">> Strategy #2: Given fake profile, recommend profile")
# Per-item quotas for the strategies that start from fake profiles (#2 and #4)
per_item_train = ceil(N_SAMPLES_PER_FAKE_STRAT_TRAIN / N_ITEMS)
per_item_valid = ceil(N_SAMPLES_PER_FAKE_STRAT_VALID / N_ITEMS)
# Training samples
samples_train_2 = strategy_handler.strategy_2(embedding, per_item_train, hashes_container)
assert N_SAMPLES_PER_FAKE_STRAT_TRAIN <= len(samples_train_2)
# Validation samples
samples_valid_2 = strategy_handler.strategy_2(embedding, per_item_valid, hashes_container)
assert N_SAMPLES_PER_FAKE_STRAT_VALID <= len(samples_valid_2)
print(
    f">> Strategy #2: Training samples ({len(samples_train_2)}) "
    f"and validation samples ({len(samples_valid_2)})"
)

print(">> Strategy #3: Given real profile, recommend items according to hybrid scorer")
# Training samples
samples_train_3 = strategy_handler.strategy_3(purchases_df.copy(), per_user_train, hashes_container)
assert N_SAMPLES_PER_REAL_STRAT_TRAIN <= len(samples_train_3)
# Validation samples
samples_valid_3 = strategy_handler.strategy_3(purchases_df.copy(), per_user_valid, hashes_container)
assert N_SAMPLES_PER_REAL_STRAT_VALID <= len(samples_valid_3)
print(
    f">> Strategy #3: Training samples ({len(samples_train_3)}) "
    f"and validation samples ({len(samples_valid_3)})"
)

print(">> Strategy #4: Given fake profile, recommend items according to hybrid scorer")
# Training samples
samples_train_4 = strategy_handler.strategy_4(embedding, per_item_train, hashes_container)
assert N_SAMPLES_PER_FAKE_STRAT_TRAIN <= len(samples_train_4)
# Validation samples
samples_valid_4 = strategy_handler.strategy_4(embedding, per_item_valid, hashes_container)
assert N_SAMPLES_PER_FAKE_STRAT_VALID <= len(samples_valid_4)
print(
    f">> Strategy #4: Training samples ({len(samples_train_4)}) "
    f"and validation samples ({len(samples_valid_4)})"
)

# Report the collision counters kept by the helpers
print(f">> Total hash collisions: {hashes_container.collisions}")
print(f">> Total visual collisions: {vissimhandler.count}")


# Flatten the per-strategy sample lists into one list per split
print("\nMerging strategies samples into a single list")
train_by_strategy = [samples_train_1, samples_train_2, samples_train_3, samples_train_4]
for strategy_no, strategy_samples in enumerate(train_by_strategy, start=1):
    # Show the size and the first sample (None when the strategy produced nothing)
    first_sample = strategy_samples[0] if strategy_samples else None
    print(f">> Strategy {strategy_no}: Size: {len(strategy_samples):07d} | Sample: {first_sample}")
TRAINING_DATA = []
for strategy_samples in train_by_strategy:
    TRAINING_DATA.extend(strategy_samples)
print(f">> Training samples: {len(TRAINING_DATA)}")
# Same procedure for the validation split
valid_by_strategy = [samples_valid_1, samples_valid_2, samples_valid_3, samples_valid_4]
for strategy_no, strategy_samples in enumerate(valid_by_strategy, start=1):
    first_sample = strategy_samples[0] if strategy_samples else None
    print(f">> Strategy {strategy_no}: Size: {len(strategy_samples):07d} | Sample: {first_sample}")
VALIDATION_DATA = []
for strategy_samples in valid_by_strategy:
    VALIDATION_DATA.extend(strategy_samples)
print(f">> Validation samples: {len(VALIDATION_DATA)}")


# Search for duplicated hashes and sanity-check every generated sample.
# Each sample is unpacked as (profile, pi, ni, ui).
print(f"\nNaive triples validation and looking for duplicates...")
validation_hash_check = HashesContainer()
all_samples = [
    triple
    for subset in (TRAINING_DATA, VALIDATION_DATA)
    for triple in subset
]
customer_ids = purchases_df["customer_id"].unique()
# Set membership is O(1); `ui in customer_ids` on the ndarray scanned every
# customer once per sample.
known_customers = set(customer_ids)
user_data = dict()  # customer id -> set of artworks purchased by that customer
for triple in tqdm(all_samples, desc="Naive validation"):
    profile, pi, ni, ui = triple
    # Enroll outside of the assert so the bookkeeping still happens when
    # assertions are disabled (python -O).
    is_new_hash = validation_hash_check.enroll(pre_hash((profile, pi, ni)))
    assert is_new_hash, f"Duplicated sample: {triple}"
    assert 0 <= pi < N_ITEMS, f"Positive item out of range: {triple}"
    assert 0 <= ni < N_ITEMS, f"Negative item out of range: {triple}"
    assert pi != ni, f"Positive and negative items are equal: {triple}"
    assert not vissimhandler.same(pi, ni), f"Items flagged as same by vissimhandler: {triple}"
    # Samples carrying ui == -1 skip the user-dependent checks
    # (presumably the ones built from fake profiles — TODO confirm).
    if ui == -1:
        continue
    assert ui in known_customers, f"Unknown customer: {triple}"
    if ui not in user_data:
        user = purchases_df[purchases_df["customer_id"] == ui]
        user_data[ui] = set(np.concatenate(user["artwork_id"].values))
    user_artworks = user_data[ui]
    assert all(i in user_artworks for i in profile), f"Profile item not purchased by user: {triple}"
    # The positive item must score strictly higher than the negative one.
    spi = hybrid_scorer.get_score(ui, user_artworks, pi)
    sni = hybrid_scorer.get_score(ui, user_artworks, ni)
    assert spi > sni, f"Positive score ({spi}) is not above negative score ({sni}): {triple}"
print(">> No duped hashes found")


print("\nCreating output files (train and valid)...")
# Training dataframe: the user column is discarded and each profile is
# serialized as a space-separated string before writing the CSV
df_train = pd.DataFrame(TRAINING_DATA, columns=["profile", "pi", "ni", "ui"])
df_train = df_train.drop(columns="ui")
df_train["profile"] = df_train["profile"].map(
    lambda profile_items: " ".join(str(item) for item in profile_items)
)
print(f">> Saving training samples ({OUTPUT_TRAIN_PATH})")
df_train.to_csv(OUTPUT_TRAIN_PATH, index=False)

# Validation dataframe: same layout as the training one
df_validation = pd.DataFrame(VALIDATION_DATA, columns=["profile", "pi", "ni", "ui"])
df_validation = df_validation.drop(columns="ui")
df_validation["profile"] = df_validation["profile"].map(
    lambda profile_items: " ".join(str(item) for item in profile_items)
)
print(f">> Saving validation samples in ({OUTPUT_VALID_PATH})")
df_validation.to_csv(OUTPUT_VALID_PATH, index=False)


print("\nCreating output files (evaluation)...")
# Rename columns to the names used in the evaluation file
inventory_df = inventory_df.rename(columns={"cluster_id": "visual_cluster_id"})
purchases_df = purchases_df.rename(
    columns={"artwork_id": "shopping_cart", "customer_id": "user_id"},
)
# Helper columns are not part of the evaluation file
purchases_df = purchases_df.drop(columns=["n_baskets", "n_items", "evaluation"])
# Tag every frame with the kind of event its rows describe
for event_frame, event_name in (
    (inventory_df, "inventory"),
    (purchases_df, "purchase"),
    (evaluation_df, "evaluation"),
):
    event_frame["event"] = event_name

# Evaluation dataframe: outer-merge the three event frames on timestamp so
# that rows from every frame are kept.
df_evaluation = pd.merge(inventory_df, purchases_df, on="timestamp", how="outer")
df_evaluation = pd.merge(df_evaluation, evaluation_df, on="timestamp", how="outer")
# Merge event columns: the first merge suffixed the clashing "event" columns
# as event_x / event_y, the second one contributed the plain "event" column.
df_evaluation["event"] = df_evaluation["event"].fillna(df_evaluation["event_x"])
df_evaluation["event"] = df_evaluation["event"].fillna(df_evaluation["event_y"])
df_evaluation = df_evaluation.drop(["event_x", "event_y"], axis=1)
# Merge user_id columns (suffixed by the second merge; presumably the purchases
# and evaluation user ids — verify against evaluation_df's columns)
df_evaluation["user_id"] = df_evaluation["user_id_x"].fillna(df_evaluation["user_id_y"])
df_evaluation = df_evaluation.drop(["user_id_x", "user_id_y"], axis=1)
# Order rows chronologically and rebuild a clean positional index
df_evaluation = df_evaluation.sort_values(by=["timestamp"])
df_evaluation = df_evaluation.reset_index(drop=True)
# Move timestamp and event to first columns
df_evaluation.insert(0, "event", df_evaluation.pop("event"))
df_evaluation.insert(0, "timestamp", df_evaluation.pop("timestamp"))
# Report the path the file is actually written to (the message used to show
# the validation path).
print(f">> Saving evaluation data in ({OUTPUT_EVAL_PATH})")
df_evaluation.to_csv(OUTPUT_EVAL_PATH, index=False)


# Finished
print("\nDone")



Using random seed...

Loading embedding from file... (data/UGallery/ugallery_embedding.npy)

Extracting data into variables...
>> Features shape: (13297, 4096)

Creating visual clusters: 0. z-score normalization of embedding
>> Features shape: (13297, 4096)

Creating visual clusters: 1. Conduct PCA to reduce dimension
>> Features shape: (13297, 200)

Creating visual clusters: 2. Perform k-means clustering
>> Best Silhouette score: 0.01127261796461595113)

Loading transactions from files...
>> Inventory: (7742, 3) | Purchases: (4897, 3)
>> Mapping applied
>> Purchases: (4897, 5)
>> Purchases: (4897, 6)
>> Evaluation: (728, 4) | Purchases: (4169, 6)
>> Purchases: (4169, 6)
>> Inventory: (7742, 4)

Creating mappings from index to data

Creating mappings from data to index

Creating helpers instances...

Calculating important values...
>> There are 2 real strategies and 2 fake strategies
>> N_USERS = 2919 | N_ITEMS = 13297

Creating samples using custom strategies
>> Strategy #1: Given re

HBox(children=(FloatProgress(value=0.0, description='Strategy 1', max=2919.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Strategy 1', max=2919.0, style=ProgressStyle(description_…


>> Strategy #1 Training samples (5000247) and validation samples (251034)
>> Strategy #2: Given fake profile, recommend profile


HBox(children=(FloatProgress(value=0.0, description='Strategy 2', max=13297.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Strategy 2', max=13297.0, style=ProgressStyle(description…


>> Strategy #2: Training samples (0) and validation samples (0)
>> Strategy #3: Given real profile, recommend items according to hybrid scorer


HBox(children=(FloatProgress(value=0.0, description='Strategy 3', max=2919.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Strategy 3', max=2919.0, style=ProgressStyle(description_…


>> Strategy #3: Training samples (5000247) and validation samples (251034)
>> Strategy #4: Given fake profile, recommend items according to hybrid scorer


HBox(children=(FloatProgress(value=0.0, description='Strategy 4', max=13297.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Strategy 4', max=13297.0, style=ProgressStyle(description…


>> Strategy #4: Training samples (0) and validation samples (0)
>> Total hash collisions: 11542792
>> Total visual collisions: 1134

Merging strategies samples into a single list
>> Strategy 1: Size: 5000247 | Sample: (array([  409, 13294]), 13294, 1610, 269)
>> Strategy 2: Size: 0000000 | Sample: None
>> Strategy 3: Size: 5000247 | Sample: (array([  409, 13294]), 9969, 5248, 269)
>> Strategy 4: Size: 0000000 | Sample: None
>> Training samples: 10000494
>> Strategy 1: Size: 0251034 | Sample: (array([  409, 13294]), 13294, 11918, 269)
>> Strategy 2: Size: 0000000 | Sample: None
>> Strategy 3: Size: 0251034 | Sample: (array([  409, 13294]), 12811, 12471, 269)
>> Strategy 4: Size: 0000000 | Sample: None
>> Validation samples: 502068

Naive triples validation and looking for duplicates...


HBox(children=(FloatProgress(value=0.0, description='Naive validation', max=10502562.0, style=ProgressStyle(de…


>> No duped hashes found

Creating output files (train and valid)...
>> Saving training samples (data/UGallery/train.csv)
>> Saving validation samples in (data/UGallery/validation.csv)

Creating output files (evaluation)...
>> Saving evaluation data in (data/UGallery/validation.csv)

Done
CPU times: user 11min 38s, sys: 1min 40s, total: 13min 18s
Wall time: 11min 27s
