In [None]:
import os
import random
from math import ceil

import numpy as np
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm

from utils.data.pinterest import (
    get_interactions_dataframe, mark_evaluation_rows,
    get_holdout,
)
from utils.hashing import pre_hash, HashesContainer
from utils.sampling import StrategyHandler
from utils.similarity import HybridScorer, VisualSimilarityHandler


# Mode
# Set the boolean on the first assignment below to pick the training flavour:
#   True  -> CuratorNet-like training
#   False -> VBPR-like training
# NOTE: the second assignment rebinds MODE_PROFILE to the *string* "profile"
# or "user" (it is interpolated into the output file names further down).
# From that point on it must be compared against those strings; a bare
# truthiness or `is True` test no longer distinguishes the two modes.
MODE_PROFILE = True
MODE_PROFILE = "profile" if MODE_PROFILE else "user"

# Parameters
RNG_SEED = 0  # seeds `random` and `numpy.random`; None leaves them unseeded
EMBEDDING_FN = os.path.join("data", "Pinterest", "pinterest_embedding.npy")
PCA_COMPONENTS = 200  # n_components given to IncrementalPCA
CLUSTERING_RNG = None  # random_state for MiniBatchKMeans; if set, only one fit is run
CLUSTERING_N_CLUSTERS = 100
CLUSTERING_N_TIMES = 5  # 20
CLUSTERING_N_INIT = 1  # 8
INTERACTIONS_PATH = os.path.join("data", "Pinterest", "pinterest.csv")
OUTPUT_TRAIN_PATH = os.path.join("data", "Pinterest", f"{MODE_PROFILE}-train.csv")
OUTPUT_VALID_PATH = os.path.join("data", "Pinterest", f"{MODE_PROFILE}-validation.csv")
OUTPUT_EVAL_PATH = os.path.join("data", "Pinterest", f"{MODE_PROFILE}-evaluation.csv")

# Parameters (sampling)
ARTIST_BOOST = 0.2
CONFIDENCE_MARGIN = 0.18
FINE_GRAINED_THRESHOLD = 0.7
# Share of the sample budget assigned to the "fake profile" strategies.
# Both modes currently use 0.; the comparison is against the string value
# because MODE_PROFILE is no longer a boolean at this point.
FAKE_COEF = 0. if MODE_PROFILE == "profile" else 0.
# Every sampling coefficient must be a fraction in [0, 1]
assert all(
    0. <= var <= 1.
    for var in [ARTIST_BOOST, CONFIDENCE_MARGIN, FINE_GRAINED_THRESHOLD, FAKE_COEF]
)
TOTAL_SAMPLES_TRAIN = 3_000_000  # 10_000_000
TOTAL_SAMPLES_VALID = 150_000  # 500_000

# Parameters (checkpoints)
# When a LOAD_* flag is True the corresponding array is read from disk
# instead of being recomputed (and re-saved) by the pipeline cell.
LOAD_PCA_EMBEDDING = True
PCA_EMBEDDING_FN = os.path.join("data", "Pinterest", "pinterest_pca.npy")
LOAD_PCA_LABELS = True
PCA_LABELS_FN = os.path.join("data", "Pinterest", "pinterest_labels.npy")


In [None]:
%%time
# ~26 min
# Seed both global RNGs used by this notebook (skipped when RNG_SEED is None)
if RNG_SEED is not None:
    print("\nUsing random seed...")
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(RNG_SEED)


# Read the raw embedding from disk
print(f"\nLoading embedding from file... ({EMBEDDING_FN})")
# NOTE: allow_pickle=True can execute pickled payloads -- only load trusted files
embedding = np.load(EMBEDDING_FN, allow_pickle=True)


# Split the embedding into a feature matrix plus an id -> row-index lookup
print("\nExtracting data into variables...")
features = []
id2index = {}
for raw_id, vector_embedding in embedding:
    _id = str(raw_id)
    if _id in id2index:
        # Repeated id: keep only its first vector
        continue
    id2index[_id] = len(features)
    features.append(vector_embedding)
features = np.asarray(features)
print(f">> Features shape: {features.shape}")

# The raw embedding is no longer needed; drop the reference to free memory
del embedding


# Number of items found in the raw embedding. id2index was built from those
# rows, so any replacement feature matrix must have exactly as many rows.
n_embedded_items = features.shape[0]

if LOAD_PCA_EMBEDDING:
    # Loading visual clusters
    print("\nLoading visual clusters: After scaling (0) and PCA (1)")
    # NOTE: allow_pickle=True can execute pickled payloads -- only load trusted files
    features = np.load(PCA_EMBEDDING_FN, allow_pickle=True)
    # Guard against a stale checkpoint: a row-count mismatch would silently
    # misalign every item index used downstream.
    assert features.shape[0] == n_embedded_items, (
        f"Stale PCA checkpoint ({PCA_EMBEDDING_FN}): found {features.shape[0]} rows, "
        f"expected {n_embedded_items}"
    )
else:
    # Creating visual clusters (~58 min)
    print("\nCreating visual clusters: 0. z-score normalization of embedding")
    # features = StandardScaler().fit_transform(features)
    features = StandardScaler().partial_fit(features).transform(features)
    print(f">> Features shape: {features.shape}")

    print("\nCreating visual clusters: 1. Conduct (incremental) PCA to reduce dimension")
    features = IncrementalPCA(n_components=PCA_COMPONENTS).fit_transform(features)
    # Persist the reduced matrix so later runs can set LOAD_PCA_EMBEDDING = True
    np.save(PCA_EMBEDDING_FN, features, allow_pickle=True)
print(f">> Features shape: {features.shape}")


if not LOAD_PCA_LABELS:
    # Fit the clustering from scratch (~17 min)
    print("\nCreating visual clusters: 2. Perform k-means clustering")
    best_score, best_clusterer = float("-inf"), None
    for attempt in range(1, CLUSTERING_N_TIMES + 1):
        candidate = MiniBatchKMeans(
            n_clusters=CLUSTERING_N_CLUSTERS,
            batch_size=2000,
            max_iter=2000,
            n_init=CLUSTERING_N_INIT,
            random_state=CLUSTERING_RNG,
            # verbose=1, ###
        ).fit(features)
        score = silhouette_score(features, candidate.labels_, sample_size=40_000)
        # Keep the fit with the highest silhouette score seen so far
        if score > best_score:
            best_score, best_clusterer = score, candidate
        # With a fixed random_state only a single fit is performed
        if CLUSTERING_RNG is not None:
            break
        progress = (f">> Silhouette score ({attempt:02}/{CLUSTERING_N_TIMES}): "
                    f"{score:.4f} (Best: {best_score:.4f})")
        print(progress, flush=True, end="\r")
    clusterer_labels = best_clusterer.labels_
    # Persist the labels so later runs can set LOAD_PCA_LABELS = True
    np.save(PCA_LABELS_FN, clusterer_labels, allow_pickle=True)
else:
    # Reuse the labels stored by a previous run and re-score them
    print("\nLoading visual clusters: After KMeans (2)")
    clusterer_labels = np.load(PCA_LABELS_FN, allow_pickle=True)
    best_score = silhouette_score(features, clusterer_labels, sample_size=40_000)
print(f">> Best Silhouette score: {best_score}")


# Load interactions CSVs
print(f"\nLoading interactions from files...")
interactions_df = get_interactions_dataframe(
    INTERACTIONS_PATH,
    display_stats=True,
)

# Apply id2index, to work with indexes only
# NOTE(review): id2index keys are strings (built with str(_id)); this presumes
# the item_id column holds strings as well -- confirm against the loader.
interactions_df["item_id"] = interactions_df["item_id"].map(id2index)
print(f">> Mapping applied ({interactions_df['item_id'].isna().sum()} missing values)")

# Select available images only. The explicit copy makes the filtered frame
# independent, so the column assignment below does not write into a slice
# (which would raise pandas' SettingWithCopyWarning).
interactions_df = interactions_df[interactions_df["item_id"].notnull()].copy()
# Transform indexes to int
interactions_df["item_id"] = interactions_df["item_id"].astype(int)

# Store mapping from user id to index (0-index, no skipping).
# np.argsort returns a permutation of 0..n-1, so the new ids are dense and
# unique; note they are NOT the sort rank of each user id. Kept as-is so the
# mapping stays consistent with previously exported files.
unique_user_ids = interactions_df["user_id"].unique()
new_user_ids = np.argsort(unique_user_ids)
user_id_map = dict(zip(
    unique_user_ids,
    new_user_ids,
))

# Mark interactions used for evaluation procedure
interactions_df = mark_evaluation_rows(interactions_df, threshold=10)
# Check if new column exists and has boolean dtype
assert interactions_df["evaluation"].dtype.name == "bool"
print(f">> Interactions: {interactions_df.shape}")

# Store index for sorting (used later to restore the original row order)
interactions_df["index"] = interactions_df.index
# Split interactions data according to evaluation column
evaluation_df, interactions_df = get_holdout(interactions_df)
assert not interactions_df.empty
assert not evaluation_df.empty
print(f">> Evaluation: {evaluation_df.shape} | Interactions: {interactions_df.shape}")


# Index -> data lookups
print("\nCreating mappings from index to data")
# Every item is assigned the placeholder artist id -1
artist_by_idx = np.full(features.shape[:1], -1)
cluster_by_idx = clusterer_labels


# Data -> indexes lookups
print("\nCreating mappings from data to index")
# The single placeholder artist (-1) owns every item seen in the interactions
artistId2artworkIndexes = {-1: interactions_df["item_id"].unique()}
# Group item indexes by their cluster label
clustId2artIndexes = {}
for item_idx, cluster_label in enumerate(cluster_by_idx):
    clustId2artIndexes.setdefault(cluster_label, []).append(item_idx)


print("\nCreating helpers instances...")
# Container used to detect duplicated samples through their hashes
hashes_container = HashesContainer()
# Similarity / scoring helpers shared by the sampling strategies
vissimhandler = VisualSimilarityHandler(clusterer_labels, features)
hybrid_scorer = HybridScorer(vissimhandler, artist_by_idx, artist_boost=ARTIST_BOOST)


# Derived sampling quantities
print("\nCalculating important values...")
N_REAL_STRATEGIES, N_FAKE_STRATEGIES = 2, 2
print(f">> There are {N_REAL_STRATEGIES} real strategies and {N_FAKE_STRATEGIES} fake strategies")
# Per-strategy quotas: the (1 - FAKE_COEF) share of each budget is split
# evenly among real strategies, the FAKE_COEF share among fake ones.
N_SAMPLES_PER_REAL_STRAT_TRAIN, N_SAMPLES_PER_REAL_STRAT_VALID = (
    ceil((1 - FAKE_COEF) * total / N_REAL_STRATEGIES)
    for total in (TOTAL_SAMPLES_TRAIN, TOTAL_SAMPLES_VALID)
)
N_SAMPLES_PER_FAKE_STRAT_TRAIN, N_SAMPLES_PER_FAKE_STRAT_VALID = (
    ceil(FAKE_COEF * total / N_FAKE_STRATEGIES)
    for total in (TOTAL_SAMPLES_TRAIN, TOTAL_SAMPLES_VALID)
)
# Population sizes used to turn the quotas into per-user / per-item counts
N_ITEMS = len(features)
N_USERS = interactions_df["user_id"].nunique()
print(f">> N_USERS = {N_USERS} | N_ITEMS = {N_ITEMS}")


# Actual sampling section
print("\nCreating samples using custom strategies")
# After the configuration cell MODE_PROFILE holds the string "profile" or
# "user"; both are truthy, so passing it directly never disables the flag.
# Derive an explicit boolean (a plain True is also accepted).
# NOTE(review): assumes StrategyHandler treats user_as_items as a boolean flag -- confirm.
strategy_handler = StrategyHandler(
    vissimhandler, hybrid_scorer,
    clustId2artIndexes, cluster_by_idx,
    artistId2artworkIndexes, artist_by_idx,
    threshold=FINE_GRAINED_THRESHOLD,
    confidence_margin=CONFIDENCE_MARGIN,
    user_as_items=MODE_PROFILE in (True, "profile"),
)


def _sample_train_and_valid(strategy, get_source, n_units, min_train, min_valid):
    """Run one sampling strategy for the training and then the validation split.

    Args:
        strategy: Bound strategy method, called as
            ``strategy(source, samples_per_unit, hashes_container)``.
        get_source: Zero-argument callable returning the first argument for
            ``strategy``; it is invoked once per split.
        n_units: Number of users or items the quota is spread over.
        min_train: Minimum number of training samples required.
        min_valid: Minimum number of validation samples required.

    Returns:
        Tuple ``(train_samples, valid_samples)``.

    Raises:
        AssertionError: If a split yields fewer samples than its minimum.
    """
    # Both splits share hashes_container; training is always sampled first.
    train = strategy(get_source(), ceil(min_train / n_units), hashes_container)
    assert len(train) >= min_train
    valid = strategy(get_source(), ceil(min_valid / n_units), hashes_container)
    assert len(valid) >= min_valid
    return train, valid


print(">> Strategy #1: Given real profile, recommend profile")
# Real strategies receive a fresh copy of the interactions on every call
samples_train_1, samples_valid_1 = _sample_train_and_valid(
    strategy_handler.strategy_1, interactions_df.copy, N_USERS,
    N_SAMPLES_PER_REAL_STRAT_TRAIN, N_SAMPLES_PER_REAL_STRAT_VALID,
)
print(f">> Strategy #1 Training samples ({len(samples_train_1)}) and validation samples ({len(samples_valid_1)})")

print(">> Strategy #2: Given fake profile, recommend profile")
# Fake strategies work directly on the feature matrix
samples_train_2, samples_valid_2 = _sample_train_and_valid(
    strategy_handler.strategy_2, lambda: features, N_ITEMS,
    N_SAMPLES_PER_FAKE_STRAT_TRAIN, N_SAMPLES_PER_FAKE_STRAT_VALID,
)
print(f">> Strategy #2: Training samples ({len(samples_train_2)}) and validation samples ({len(samples_valid_2)})")

print(">> Strategy #3: Given real profile, recommend items according to hybrid scorer")
samples_train_3, samples_valid_3 = _sample_train_and_valid(
    strategy_handler.strategy_3, interactions_df.copy, N_USERS,
    N_SAMPLES_PER_REAL_STRAT_TRAIN, N_SAMPLES_PER_REAL_STRAT_VALID,
)
print(f">> Strategy #3: Training samples ({len(samples_train_3)}) and validation samples ({len(samples_valid_3)})")

print(">> Strategy #4: Given fake profile, recommend items according to hybrid scorer")
samples_train_4, samples_valid_4 = _sample_train_and_valid(
    strategy_handler.strategy_4, lambda: features, N_ITEMS,
    N_SAMPLES_PER_FAKE_STRAT_TRAIN, N_SAMPLES_PER_FAKE_STRAT_VALID,
)
print(f">> Strategy #4: Training samples ({len(samples_train_4)}) and validation samples ({len(samples_valid_4)})")

# Log out detected collisions
print(f">> Total hash collisions: {hashes_container.collisions}")
print(f">> Total visual collisions: {vissimhandler.count}")


# Flatten the per-strategy training samples into one list
print("\nMerging strategies samples into a single list")
TRAINING_DATA = []
for position, samples in enumerate(
    (samples_train_1, samples_train_2, samples_train_3, samples_train_4), start=1,
):
    # Show the size of each strategy's output plus its first sample (if any)
    preview = samples[0] if samples else None
    print(f">> Strategy {position}: Size: {len(samples):07d} | Sample: {preview}")
    TRAINING_DATA.extend(samples)
print(f">> Training samples: {len(TRAINING_DATA)}")
# Same flattening for the validation samples
VALIDATION_DATA = []
for position, samples in enumerate(
    (samples_valid_1, samples_valid_2, samples_valid_3, samples_valid_4), start=1,
):
    preview = samples[0] if samples else None
    print(f">> Strategy {position}: Size: {len(samples):07d} | Sample: {preview}")
    VALIDATION_DATA.extend(samples)
print(f">> Validation samples: {len(VALIDATION_DATA)}")


# Search for duplicated hashes
print(f"\nNaive triples validation and looking for duplicates...")
validation_hash_check = HashesContainer()
all_samples = [
    triple
    for subset in (TRAINING_DATA, VALIDATION_DATA)
    for triple in subset
]
# After the configuration cell MODE_PROFILE is the string "profile" or "user";
# both are truthy, so a bare `if MODE_PROFILE:` would never reach the
# user-mode branch. Test the mode explicitly (a plain True is also accepted).
profile_mode = MODE_PROFILE in (True, "profile")
user_ids = interactions_df["user_id"].unique()
# Set for constant-time membership tests instead of scanning the array per sample
known_user_ids = set(user_ids.tolist())
# Cache of user id -> set of item indexes that user interacted with
user_data = dict()
for triple in tqdm(all_samples, desc="Naive validation"):
    profile, pi, ni, ui = triple
    # Profile mode identifies a sample by its profile, user mode by its user id
    if profile_mode:
        sample_hash = pre_hash((profile, pi, ni))
    else:
        sample_hash = pre_hash((ui, pi, ni), contains_iter=False)
    # Enroll outside the assert so the registration is not part of the assertion
    # expression; presumably enroll() returns a falsy value for a repeated hash.
    is_new_hash = validation_hash_check.enroll(sample_hash)
    assert is_new_hash
    # Positive and negative items must be valid, distinct and visually different
    assert 0 <= pi < N_ITEMS
    assert 0 <= ni < N_ITEMS
    assert pi != ni
    assert not vissimhandler.same(pi, ni)
    # Samples with ui == -1 carry no real user, so the user checks are skipped
    if ui == -1:
        continue
    assert ui in known_user_ids
    if ui not in user_data:
        user = interactions_df[interactions_df["user_id"] == ui]
        user_data[ui] = set(np.hstack(user["item_id"].values))
    user_artworks = user_data[ui]
    # The profile may only contain items the user actually interacted with
    assert all(i in user_artworks for i in profile)
    # The positive item must score strictly higher than the negative one
    spi = hybrid_scorer.get_score(ui, user_artworks, pi)
    sni = hybrid_scorer.get_score(ui, user_artworks, ni)
    assert spi > sni
print(">> No duped hashes found")


print("\nCreating output files (train and valid)...")
# Samples may carry ui == -1 (the validation step skips the user checks for
# such rows). That sentinel has no entry in user_id_map, so a plain .map()
# would turn it into NaN and make the whole column float. Keep it as -1.
ui_to_index = {-1: -1, **user_id_map}


def _samples_to_frame(samples):
    """Build the CSV-ready frame for a list of (profile, pi, ni, ui) samples.

    Args:
        samples: Iterable of 4-tuples ``(profile, pi, ni, ui)``.

    Returns:
        DataFrame with columns ``profile`` (space-separated item indexes),
        ``pi``, ``ni`` and ``ui`` (user id translated through ``ui_to_index``).
    """
    frame = pd.DataFrame(samples, columns=["profile", "pi", "ni", "ui"])
    frame["ui"] = frame["ui"].map(ui_to_index)
    frame["profile"] = frame["profile"].map(lambda items: " ".join(map(str, items)))
    return frame


# Training dataframe
df_train = _samples_to_frame(TRAINING_DATA)
print(f">> Saving training samples ({OUTPUT_TRAIN_PATH})")
df_train.to_csv(OUTPUT_TRAIN_PATH, index=False)

# Validation dataframe
df_validation = _samples_to_frame(VALIDATION_DATA)
print(f">> Saving validation samples in ({OUTPUT_VALID_PATH})")
df_validation.to_csv(OUTPUT_VALID_PATH, index=False)


print("\nCreating output files (evaluation)...")
# Tag every row with the split it belongs to
interactions_df["event"] = "interaction"
evaluation_df["event"] = "evaluation"

# Evaluation dataframe: training interactions followed by held-out rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_evaluation = pd.concat([interactions_df, evaluation_df])
# Map user_id columns
df_evaluation["user_id"] = df_evaluation["user_id"].map(user_id_map)
# Restore the original row order using the saved "index" column
df_evaluation = df_evaluation.sort_values(by=["index"])
df_evaluation = df_evaluation.reset_index(drop=True)
# Move the event column to the front
df_evaluation.insert(0, "event", df_evaluation.pop("event"))
# The helper "index" column is not part of the exported file
df_evaluation = df_evaluation.drop(["index"], axis=1)
print(f">> Saving evaluation data in ({OUTPUT_EVAL_PATH})")
df_evaluation.to_csv(OUTPUT_EVAL_PATH, index=False)


# Finished
print("\nDone")
