In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import os
# Change working directory to the root of the project, try to be cautious with this implementation since it will break if the cell is run more than once
os.chdir(r"..")


import pandas as pd
import numpy as np
from resources.constants import *
from src import load_dataframes

OUTFIT_EMBEDDINGS_DF_PATH = r"resources\data\outfit_embeddings_triplets_50_df.pkl"
REPRESENTATION_COLUMN = "outfit_embeddings"

pictures_df = pd.read_csv(PICTURE_TRIPLETS_CSV_PATH, sep=CSV_SEPARATOR)
user_triplets_df = pd.read_csv(USER_ACTIVITY_TRIPLETS_CSV_PATH, sep=CSV_SEPARATOR)

embeddings_df = pd.read_pickle(OUTFIT_EMBEDDINGS_DF_PATH)

In [2]:
outfits_df = load_dataframes.load_pickle(DATA_SAVE_PATH, OUTFITS_PATH)
outfit_id_to_outfit_group_dict = outfits_df.groupby("id").first().reset_index().set_index("id")[["group"]].to_dict()["group"]
outfit_group_to_outfit_embeddings_dict = embeddings_df.groupby("group").first().reset_index().set_index("group")[REPRESENTATION_COLUMN].to_dict()

In [3]:
user_rentals_df = user_triplets_df.groupby("customer.id").agg({"outfit.id": list, "meta.validFrom": list, "derived.bookingTime": list,}).reset_index()
user_rentals_df["group"] = user_rentals_df["outfit.id"].apply(lambda x: [outfit_id_to_outfit_group_dict[outfit_id] for outfit_id in x])
user_rentals_df["outfit_embeddings"] = user_rentals_df["group"].apply(lambda x: [outfit_group_to_outfit_embeddings_dict[group] for group in x])
user_rentals_df["mean_outfit_embeddings"] = user_rentals_df["outfit_embeddings"].apply(lambda x: np.mean(x, axis=0))

In [4]:
most_popular_items = user_triplets_df["outfit.id"].value_counts()
top_items = most_popular_items.sort_values(ascending=False).head(30).index
top_item_embeddings = np.stack([outfit_group_to_outfit_embeddings_dict[outfit_id_to_outfit_group_dict[outfit_id]] for outfit_id in top_items])
mean_top_item_embeddings = np.mean(top_item_embeddings, axis=0)

In [5]:
# Define a distance function
from scipy.spatial.distance import cosine, euclidean
import numpy as np
from tqdm import tqdm

distance_function = euclidean

def calculate_mean_distance_to_all_outfits(anchor_point, embeddings, disable_progress_bar=True, leave=False):
    distances = []
    for outfit in tqdm(embeddings, disable=disable_progress_bar, leave=leave):
        distances.append(distance_function(anchor_point, outfit))
    return sum(distances) / len(distances)

embeddings = np.stack(embeddings_df[REPRESENTATION_COLUMN].values)
mean_all_embedding = embeddings.mean(axis=0)

mean_distance_to_all_outfits = calculate_mean_distance_to_all_outfits(mean_all_embedding, embeddings, disable_progress_bar=False)
mean_distance_to_all_outfits

                                        

0.43788078168279454

In [6]:
# import time
# from sklearn.cluster import KMeans
# import numpy as np
# import pandas as pd
# import random
# from IPython.display import clear_output

# from src.display_images import display_image_ids


# REPRESENTATION_COLUMN = "outfit_embeddings"
# NUM_SAMPLES_PER_CLUSTER = 27

# # Note: not implemented for more than 2 clusters yet
# NUM_CLUSTERS = 2

# def cluster_current_split(current_cluster_df, collect_cluster_samples=True):
#     outfit_representations = np.stack(current_cluster_df[REPRESENTATION_COLUMN].values)
#     kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=0, n_init="auto").fit(outfit_representations)
#     current_cluster_df["cluster"] = kmeans.labels_

#     cluster_samples = []
#     for i in range(NUM_CLUSTERS):
#         cluster_outfits = current_cluster_df[current_cluster_df["cluster"] == i].copy()
#         cluster_embeddings = np.stack(cluster_outfits[REPRESENTATION_COLUMN].values)
        
#         # Properly represent the diversity of the cluster by applying KMeans to the embeddings
#         if collect_cluster_samples:
#             num_cluster_samples = min(NUM_SAMPLES_PER_CLUSTER, len(cluster_outfits))
#             cluster_kmeans = KMeans(n_clusters=num_cluster_samples, random_state=1, n_init="auto").fit(cluster_embeddings)
#             cluster_outfits["representation_cluster"] = cluster_kmeans.labels_
#             cluster_representation = cluster_outfits.groupby("representation_cluster").first().reset_index()
#             cluster_samples.append(cluster_representation)
        
#     return current_cluster_df, cluster_samples


# CONVERGE_THRESHOLD = 30

# outfits_to_lead_picture_id_dict = embeddings_df.set_index("id")["lead_picture_id"].to_dict()
# recursive_cluster_list = [embeddings_df.dropna().copy()]

# run_num = 0
# converged_cluster_embeddings = []
# while len(recursive_cluster_list) > 0:
#     recursive_cluster_df = recursive_cluster_list.pop(0)
#     current_cluster_df, _ = cluster_current_split(recursive_cluster_df, collect_cluster_samples=False)

#     clear_output(wait=True)
#     for cluster_num in range(NUM_CLUSTERS):
#         cluster_i_df = current_cluster_df[current_cluster_df["cluster"] == cluster_num].copy()
#         cluster_i_df = cluster_i_df.drop(columns=["cluster"])

#         if len(cluster_i_df) < CONVERGE_THRESHOLD:
#             converged_cluster_embeddings.append(cluster_i_df)
#             print(f"Cluster {run_num}.{cluster_num} has converged")
#             continue

#         #print(f"Cluster {run_num}.{cluster_num} has {len(cluster_i_df)} outfits")
#         recursive_cluster_list.append(cluster_i_df)

#     print(f"Run {run_num} completed, {len(recursive_cluster_list)} clusters left")
#     run_num += 1


# from tqdm.notebook import tqdm

# converged_embeddings_list = []
# for converged_df in tqdm(converged_cluster_embeddings):
#     df_embeddings = np.stack(converged_df[REPRESENTATION_COLUMN].values)
#     mean_all_embedding = df_embeddings.mean(axis=0)
#     converged_embeddings_list.append(mean_all_embedding)

Cluster 576.0 has converged
Cluster 576.1 has converged
Run 576 completed, 0 clusters left


  0%|          | 0/578 [00:00<?, ?it/s]

In [87]:
converged_embeddings_list = []
for cluster_num in embeddings_df["cluster"].unique():
    cluster_df = embeddings_df[embeddings_df["cluster"] == cluster_num]
    df_embeddings = np.stack(cluster_df[REPRESENTATION_COLUMN].values)
    mean_all_embedding = df_embeddings.mean(axis=0)
    converged_embeddings_list.append(mean_all_embedding)

In [7]:
test_cluster_embeddings = np.stack(converged_embeddings_list)
#test_cluster_embeddings = np.append(test_cluster_embeddings, mean_all_embedding.reshape(1, -1), axis=0)
def find_closest_embedding_in_list(embedding, embedding_list, distance_function):
    distances = []
    for other_embedding in embedding_list:
        distance = distance_function(embedding, other_embedding)
        distances.append(distance)
    return min(distances)

results_list = []
for i, user_row in user_rentals_df[user_rentals_df["outfit_embeddings"].apply(len) > 4].iterrows():
    mean_embedding = user_row["mean_outfit_embeddings"]
    outfit_embeddings = user_row["outfit_embeddings"]

    distances = []
    for outfit_embedding in outfit_embeddings:
        distance = distance_function(mean_embedding, outfit_embedding)
        distances.append(distance)
    distance_to_embedding_space_mean_point = euclidean(mean_all_embedding, mean_embedding)
    distance_to_closest_converged = find_closest_embedding_in_list(mean_embedding, test_cluster_embeddings, distance_function)
    distance_to_popular_mean = distance_function(mean_top_item_embeddings, mean_embedding)

    results_list.append([user_row["customer.id"], len(user_row["outfit.id"]), sum(distances) / len(distances), distance_to_embedding_space_mean_point, distance_to_closest_converged, distance_to_popular_mean, mean_embedding])
    print(f"User with {len(user_row['outfit.id'])} rentals has a mean distance of {sum(distances) / len(distances):.3f} to all outfits. {distance_to_embedding_space_mean_point:.3f} distance to mean. \
          {distance_to_closest_converged:.3f} distance to closest converged cluster, {distance_to_popular_mean:.3f} distance distance to most popular cluster. {i} / {len(user_rentals_df)} completed..")

user_investigation_df = pd.DataFrame(results_list, columns=["customer.id", "number_of_rentals", "mean_distance_to_mean_outfit", "distance_to_embedding_space_mean_point", "distance_to_closest_converged", "distance_to_popular_mean", "embedding"])

User with 11 rentals has a mean distance of 0.222 to all outfits. 0.140 distance to mean.           0.084 distance to closest converged cluster, 0.137 distance distance to most popular cluster. 1 / 2249 completed..
User with 5 rentals has a mean distance of 0.249 to all outfits. 0.220 distance to mean.           0.110 distance to closest converged cluster, 0.201 distance distance to most popular cluster. 2 / 2249 completed..
User with 10 rentals has a mean distance of 0.132 to all outfits. 0.123 distance to mean.           0.048 distance to closest converged cluster, 0.060 distance distance to most popular cluster. 4 / 2249 completed..
User with 58 rentals has a mean distance of 0.195 to all outfits. 0.138 distance to mean.           0.066 distance to closest converged cluster, 0.076 distance distance to most popular cluster. 5 / 2249 completed..
User with 38 rentals has a mean distance of 0.212 to all outfits. 0.119 distance to mean.           0.073 distance to closest converged clust

In [8]:
def check_if_closer_than_existing(embedding, all_embeddings, distances_dict, excluded_index=None):
    for i, existing_embedding in enumerate(all_embeddings):
        # Functionality to exclude a specific index, used to make sure the outfit itself is not considered
        if i == excluded_index:
            continue

        distance = distance_function(embedding, existing_embedding)
        if distance < distances_dict[i]:
            distances_dict[i] = distance


# Calculate the shortest distance to a converged cluster for each outfit
all_embeddings = embeddings_df[REPRESENTATION_COLUMN].values
closest_converged_distance = {i: 100.0 for i in range(len(all_embeddings))}

for converged_embedding in tqdm(converged_embeddings_list):
    check_if_closer_than_existing(converged_embedding, all_embeddings, closest_converged_distance)


  0%|          | 0/578 [00:00<?, ?it/s]

In [124]:
mean_distance_to_a_convergence_point = sum(closest_converged_distance.values()) / len(closest_converged_distance)
print(f"Mean distance to a convergence point is {mean_distance_to_a_convergence_point:.3f}, compared to {mean_distance_to_all_outfits:.3f} to all outfits. Which is a ratio of {mean_distance_to_a_convergence_point / mean_distance_to_all_outfits:.3f}")

Mean distance to a convergence point is 0.189, compared to 0.438 to all outfits. Which is a ratio of 0.431


In [10]:
user_investigation_df["difference_between_mean_and_converged"] = user_investigation_df["distance_to_embedding_space_mean_point"] - user_investigation_df["distance_to_closest_converged"]
user_investigation_df["difference_between_mean_and_popular"] = user_investigation_df["distance_to_embedding_space_mean_point"] - user_investigation_df["distance_to_popular_mean"]

In [126]:
user_investigation_df["difference_between_mean_and_converged"].mean(), user_investigation_df["difference_between_mean_and_popular"].mean()

(np.float64(0.1582476986294526), np.float64(0.13734611792905843))

In [12]:
print(f'{user_investigation_df["distance_to_closest_converged"].mean()}, {user_investigation_df["distance_to_embedding_space_mean_point"].mean()} ratio: {user_investigation_df["distance_to_closest_converged"].mean() / user_investigation_df["distance_to_embedding_space_mean_point"].mean()}')
user_investigation_df["distance_to_closest_converged"].describe()

0.06091972109991225, 0.1150618992971312 ratio: 0.5294517253065293


count    1711.000000
mean        0.060920
std         0.020129
min         0.025907
25%         0.048536
50%         0.056662
75%         0.067713
max         0.370272
Name: distance_to_closest_converged, dtype: float64

In [128]:
print(f'{user_investigation_df["distance_to_closest_converged"].mean()}, {user_investigation_df["distance_to_embedding_space_mean_point"].mean()} ratio: {user_investigation_df["distance_to_closest_converged"].mean() / user_investigation_df["distance_to_embedding_space_mean_point"].mean()}')
user_investigation_df["distance_to_closest_converged"].describe()

0.06191222600712221, 0.2201599246365748 ratio: 0.28121478561242585


count    1711.000000
mean        0.061912
std         0.020465
min         0.029599
25%         0.049958
50%         0.057333
75%         0.068152
max         0.377091
Name: distance_to_closest_converged, dtype: float64

In [129]:
print(f'{user_investigation_df["distance_to_closest_converged"].mean()}, {user_investigation_df["distance_to_popular_mean"].mean()} ratio: {user_investigation_df["distance_to_closest_converged"].mean() / user_investigation_df["distance_to_popular_mean"].mean()}')
user_investigation_df["distance_to_popular_mean"].describe()

0.06191222600712221, 0.08281380670751635 ratio: 0.7476075363349156


count    1711.000000
mean        0.082814
std         0.037073
min         0.023466
25%         0.057058
50%         0.076359
75%         0.100049
max         0.452490
Name: distance_to_popular_mean, dtype: float64

In [130]:
user_investigation_df["distance_to_embedding_space_mean_point"].describe()

count    1711.000000
mean        0.220160
std         0.034278
min         0.154366
25%         0.197715
50%         0.213665
75%         0.234487
max         0.550456
Name: distance_to_embedding_space_mean_point, dtype: float64

In [96]:
user_investigation_df["mean_distance_to_mean_outfit"].describe()

count    1711.000000
mean        0.173582
std         0.036401
min         0.073594
25%         0.153517
50%         0.170742
75%         0.189770
max         0.699909
Name: mean_distance_to_mean_outfit, dtype: float64

In [14]:
import scipy.spatial as spatial

point_tree = spatial.cKDTree(np.stack(all_embeddings))
point_near_mean_point = point_tree.query_ball_point(mean_all_embedding, 0.05)
print(len(point_near_mean_point))

3


In [15]:
def calculate_number_of_points_within_distance(point_tree, point, distance):
    return len(point_tree.query_ball_point(point, distance))

user_investigation_df["items_within_popular_radius"] = user_investigation_df.apply(lambda x: calculate_number_of_points_within_distance(point_tree, x["embedding"], x["distance_to_popular_mean"]), axis=1)
user_investigation_df["items_within_convergence_radius"] = user_investigation_df.apply(lambda x: calculate_number_of_points_within_distance(point_tree, x["embedding"], x["distance_to_closest_converged"]), axis=1)

In [133]:
user_investigation_df["items_within_popular_radius"].describe()

count    1711.000000
mean      113.123904
std       182.711362
min         0.000000
25%         4.000000
50%        30.000000
75%       150.000000
max      1364.000000
Name: items_within_popular_radius, dtype: float64

In [134]:
user_investigation_df["items_within_convergence_radius"].describe()

count    1711.000000
mean        3.344243
std         3.714854
min         0.000000
25%         1.000000
50%         2.000000
75%         5.000000
max        26.000000
Name: items_within_convergence_radius, dtype: float64

In [98]:
# for converged_point in converged_embeddings_list:
#     point_near_converged = point_tree.query_ball_point(converged_point, 0.2)
#     print(len(point_near_converged))

from sklearn.neighbors import NearestNeighbors

neigh = NearestNeighbors(n_neighbors=5, metric=distance_function)
neigh.fit(np.stack(all_embeddings))

distances, indices = neigh.kneighbors(np.stack(all_embeddings))

In [101]:
closest_neighbor_distances = distances[:,1]

In [102]:
closest_neighbor_distances.mean()

np.float64(0.1636780545746244)

In [69]:
closest_neighbor_distances.sort()

In [80]:
embeddings_df.iloc[np.argmax(closest_neighbor_distances)]

group                                 group.fffaf834a25fe349be65d2b1b69418c8
id                                   outfit.8776f5d660d948c9bf3243220d6b1f16
name                                            Artsy Empreinte Monogram Bag
description                This elegant and stylish leather bag features ...
owner                                  user.4badce9cc5dd4014bef7409db4c2f830
timeCreated                                              2018-11-13 13:15:52
retailPrice                                                       30000.0000
meta.validTo                                             9999-01-01 00:00:00
outfit_tags                  [Everyday, Louis Vuitton, Black, Leather, Bags]
tag_categories                  [Occasion, Brand, Color, Material, Category]
Outfit_size                                                           [None]
category                                                              [Bags]
embeddings                 [[-0.10126, -0.1192, 0.2998, -0.10455, 0.964, ...