In [None]:
import os

import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm

from models import CuratorNet, CuratorNet2, VBPR
from utils.data import extract_embedding
from utils.metrics import (
    auc_exact,
    nDCG,
    precision,
    recall,
    reciprocal_rank,
)


# Evaluation procedure

### Google Colaboratory setup

Clone repository contents in VM and install dependencies using the script:

```python
# (1) Replace contents of VM
!rm -rf sample_data
# (Replace username and password/token with your own; never commit real credentials to the notebook)
!git clone --single-branch --branch master https://username:password@github.com/aaossa/CuratorNet-experiments.git
!cp -a CuratorNet-experiments/. .
!rm -r CuratorNet-experiments/
# Setup VM using script
!chmod +x ./scripts/colaboratory.sh
!./scripts/colaboratory.sh requirements/dev.txt
```

Mount Google Drive in case the data is available there:

```python
# (2) Mount Google Drive
from google.colab import drive
drive.mount("/content/drive")
```

Extract data in the right folder:

```python
# (3) Bring actual data to VM
# Extract data from mounted drive to data folder
!tar -xvzf "/content/drive/My Drive/dataset/dataset.tar.gz" -C data/dataset
```

**Important:** Restart the runtime after completing the steps above, so that the versions declared in the requirements file are the ones actually loaded.

In [None]:
# Shell escape: print the NVIDIA GPU status for this runtime (the command must be available on PATH)
!nvidia-smi

In [None]:
# Dataset to evaluate on; one of:
#   UGallery | Wikimedia | Pinterest | Tradesy
DATASET = "UGallery"
assert DATASET in ("UGallery", "Wikimedia", "Pinterest", "Tradesy")

# Recommender to evaluate; one of:
#   CuratorNet | CuratorNet2 | VBPR
MODEL = "CuratorNet"
assert MODEL in ("CuratorNet", "CuratorNet2", "VBPR")

# Feature extractor name (it is interpolated into the embedding filename)
FEATURE_EXTRACTOR = "resnet50"


In [None]:
# Evaluation mode, derived from the selected model:
#   "profile" -> CuratorNet-like models (CuratorNet, CuratorNet2)
#   "user"    -> VBPR-like models
MODE_PROFILE = "profile" if MODEL in ("CuratorNet", "CuratorNet2") else "user"

# Checkpoint name without the ".tar" extension, or None to skip loading weights.
# It must start with "<MODEL>_<DATASET>_" (ex. 'CuratorNet_UGallery_2020-08-07-23-59-50')
CHECKPOINT = None
assert CHECKPOINT is None or CHECKPOINT.startswith(f"{MODEL}_{DATASET}_")


In [None]:
# Paths (general)
# NOTE: CHECKPOINT_PATH is only meaningful when CHECKPOINT is not None
CHECKPOINT_PATH = os.path.join("checkpoints", MODEL, f"{CHECKPOINT}.tar")
EMBEDDING_PATH = os.path.join("data", DATASET, f"{DATASET.lower()}_embedding-{FEATURE_EXTRACTOR}.npy")
EVALUATION_PATH = os.path.join("data", DATASET, f"{MODE_PROFILE}-evaluation.csv")

# Paths (images): one folder per dataset, None when no folder is configured
IMAGES_DIR = {
    "UGallery": os.path.join("/", "mnt", "workspace", "Ugallery", "images"),
    "Wikimedia": os.path.join("/", "mnt", "data2", "wikimedia", "images", "img"),
    "Pinterest": os.path.join("/", "mnt", "data2", "pinterest_iccv", "images"),
}.get(DATASET)
if DATASET == "Tradesy":
    print("Tradesy dataset not supported at the moment.")

# General constants
RNG_SEED = 0  # Seed for the RNG; None disables seeding
USE_GPU = True  # Use CUDA when it is available


In [None]:
# Freezing RNG seed if needed
# Only PyTorch's global RNG is seeded here; set RNG_SEED to None to skip seeding.
if RNG_SEED is not None:
    print(f"\nUsing random seed... ({RNG_SEED})")
    torch.manual_seed(RNG_SEED)


In [None]:
# Load embedding from file
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects, which
# can execute code; only load embedding files that come from a trusted source.
print(f"\nLoading embedding from file... ({EMBEDDING_PATH})")
embedding = np.load(EMBEDDING_PATH, allow_pickle=True)

# Extract features and "id2index" mapping
# The second returned value is discarded here; the third one (item_index2fn) is used
# later in the notebook to look up an image filename from an item index.
print("\nExtracting data into variables...")
features, _, item_index2fn = extract_embedding(embedding, verbose=True)
print(f">> Features shape: {features.shape}")
del embedding  # Release some memory


In [None]:
def string_to_list(s):
    """Parse a whitespace-separated string of integers into a list of ints."""
    return list(map(int, s.split()))


def _parse_id_cell(cell):
    """Parse a CSV cell holding ids; non-string cells are returned unchanged."""
    return string_to_list(cell) if isinstance(cell, str) else cell


def _concat_lists(lists):
    """Concatenate an iterable of lists into a single flat list, keeping order."""
    return [item for sublist in lists for item in sublist]


# Load evaluation dataframe
print("\nLoad evaluation dataframe")
evaluation_df = pd.read_csv(EVALUATION_PATH)
# Transform lists from str to int
evaluation_df["profile"] = evaluation_df["profile"].apply(_parse_id_cell)
evaluation_df["predict"] = evaluation_df["predict"].apply(_parse_id_cell)
# Group evaluations by profile and user.
# Lists are not hashable, so profiles are turned into tuples while grouping.
evaluation_df["profile"] = evaluation_df["profile"].map(tuple)
# The "predict" lists of each group are concatenated with an explicit aggregator:
# passing the builtin `sum` relies on pandas aliasing it to its own implementation
# (deprecated), and calling `sum` directly on a group of lists fails (`0 + list`).
evaluation_df = (
    evaluation_df
    .groupby(["profile", "user_id"])
    .agg({"predict": _concat_lists})
    .reset_index()
)
evaluation_df["profile"] = evaluation_df["profile"].map(list)
print(f">> Evaluation: {evaluation_df.shape}")


In [None]:
# Create device instance
print("\nDevice initialization")
device = torch.device("cuda:0" if torch.cuda.is_available() and USE_GPU else "cpu")
if torch.cuda.is_available() != USE_GPU:
    print((f"\nNotice: Not using GPU - "
           f"Cuda available ({torch.cuda.is_available()}) "
           f"does not match USE_GPU ({USE_GPU})"
    ))

# Loading checkpoint (always onto the CPU; the model is moved to `device` afterwards)
checkpoint = None  # Stays None when no checkpoint is configured
if CHECKPOINT is not None:
    print("\nLoading checkpoint")
    checkpoint = torch.load(CHECKPOINT_PATH, map_location=torch.device("cpu"))
    print(f">> Best epoch: {checkpoint['epoch']} | Best accuracy: {checkpoint['accuracy']}")

# Model initialization
print("\nModel initialization")
model = None
if MODEL == "CuratorNet":
    model = CuratorNet(
        torch.Tensor(features),  # Pretrained visual features
        input_size=features.shape[1],  # Network input size
    ).to(device)
elif MODEL == "CuratorNet2":
    model = CuratorNet2(
        torch.Tensor(features),  # Pretrained visual features
        input_size=features.shape[1],  # Network input size
    ).to(device)
elif MODEL == "VBPR":
    # VBPR dimensions are read from the checkpoint weights, so a checkpoint is
    # mandatory for this model (previously this failed with a NameError).
    if checkpoint is None:
        raise ValueError(
            "VBPR requires a checkpoint to infer its dimensions; "
            "set CHECKPOINT to the name of a trained checkpoint."
        )
    n_users = checkpoint["model"]["gamma_users.weight"].size(0)
    n_items = checkpoint["model"]["gamma_items.weight"].size(0)
    dim_gamma = checkpoint["model"]["gamma_users.weight"].size(1)
    dim_theta = checkpoint["model"]["theta_users.weight"].size(1)
    model = VBPR(
        n_users, n_items,  # Number of users and items
        torch.Tensor(features),  # Pretrained visual features
        dim_gamma, dim_theta,  # Size of internal spaces
    ).to(device)

# Load state dict
if checkpoint is not None:
    model.load_state_dict(checkpoint["model"])

# Change model mode to eval
print("\nChanging model mode to eval")
model.eval()


In [None]:
# Predict all
# If True, the positions of the ground-truth items are taken from the ranking of
# every item, including the already consumed (profile) items.
# If False, consumed (profile) items are discounted: each ground-truth position is
# shifted up by the number of profile items ranked before it.
PREDICT_ALL = False


In [None]:
%%time
# Metrics
N_EVALS = len(evaluation_df.index)
N_ITEMS = len(features)


def _metric_buffer(dtype=torch.float64):
    """Allocate a zero-filled tensor with one slot per evaluation row on `device`."""
    return torch.zeros([N_EVALS], dtype=dtype, device=device)


# Area Under the Curve (AUC)
AUC = _metric_buffer()
# Reciprocal Rank (RR)
RR = _metric_buffer()
# Recall
R20, R100, R200 = _metric_buffer(), _metric_buffer(), _metric_buffer()
# Precision
P20, P100, P200 = _metric_buffer(), _metric_buffer(), _metric_buffer()
# Normalized discounted cumulative gain (nDCG)
N20, N100, N200 = _metric_buffer(), _metric_buffer(), _metric_buffer()
# Number of items in the profile of each evaluation row
PROFILE_SIZES = _metric_buffer(dtype=torch.int64)

# NOTE: evaluation_df is already grouped by (profile, user_id) when it is loaded,
# so it is iterated as-is (no regrouping, no in-place changes to the dataframe).

# Gradients are never needed while evaluating
with torch.no_grad():
    cache = model.generate_cache()

    for i, row in tqdm(enumerate(evaluation_df.itertuples()), total=N_EVALS):
        # Load data into tensors
        profile = torch.tensor(row.profile).to(device, non_blocking=True).unsqueeze(0)
        user_id = torch.tensor([int(row.user_id)]).to(device, non_blocking=True)
        predict = torch.tensor(row.predict).to(device, non_blocking=True)
        # Prediction
        if MODE_PROFILE == "profile":
            scores = model.recommend_all(profile, cache=cache)
        elif MODE_PROFILE == "user":
            scores = model.recommend_all(user_id, cache=cache).squeeze()
        # Ranking (sorted once, reused for ground-truth and profile lookups)
        ranked_items = torch.argsort(scores, descending=True)
        pos_of_evals = (ranked_items[..., None] == predict).any(-1).nonzero().flatten()
        if not PREDICT_ALL:
            pos_of_profi = (ranked_items[..., None] == profile).any(-1).nonzero().flatten()
            # Shift of each eval item = number of profile items ranked before it
            shift = (pos_of_profi[None, :] < pos_of_evals[:, None]).sum(1)
            # Apply shift
            pos_of_evals -= shift
        # Store metrics
        # NOTE(review): auc_exact receives the full catalogue size even when profile
        # items are discounted from the positions - confirm this is intended.
        AUC[i] = auc_exact(pos_of_evals, N_ITEMS)
        RR[i] = reciprocal_rank(pos_of_evals)
        R20[i] = recall(pos_of_evals, 20)
        P20[i] = precision(pos_of_evals, 20)
        N20[i] = nDCG(pos_of_evals, 20)
        R100[i] = recall(pos_of_evals, 100)
        P100[i] = precision(pos_of_evals, 100)
        N100[i] = nDCG(pos_of_evals, 100)
        R200[i] = recall(pos_of_evals, 200)
        P200[i] = precision(pos_of_evals, 200)
        N200[i] = nDCG(pos_of_evals, 200)
        PROFILE_SIZES[i] = len(row.profile)


In [None]:
# Display stats: mean of every metric over all evaluation rows
for _label, _values in (
    ("AUC", AUC),
    ("RR", RR),
    ("R20", R20),
    ("P20", P20),
    ("NDCG20", N20),
    ("R100", R100),
    ("P100", P100),
    ("NDCG100", N100),
    ("R200", R200),
    ("P200", P200),
    ("NDCG200", N200),
):
    print(f"AVG {_label} = {_values.mean()}")


## Relevant plots

In [None]:
import numpy as np


def smart_group(value):
    """Round a non-negative number down to a multiple of its leading power of ten.

    Only the most significant digit is kept, e.g. 7 -> 7, 37 -> 30, 245 -> 200.
    The number of digits is counted on the integer part with exact integer
    arithmetic, so large integers are not affected by floating point rounding.

    Args:
        value: Non-negative number (profile sizes in this notebook).

    Returns:
        `value` floored to its leading digit; 0 when `value` is 0.

    Raises:
        ValueError: If `value` is negative.
    """
    if value == 0:
        return 0
    if value < 0:
        raise ValueError(f"smart_group expects a non-negative value, got {value!r}")
    # Largest power of ten that is not greater than the integer part of value
    # (1 for values below 10, which are therefore only floored to an integer)
    magnitude = 10 ** (len(str(int(value))) - 1)
    return magnitude * (value // magnitude)

In [None]:
import pandas as pd


# One row per evaluation, one column per metric.
# Each tensor is moved to host memory in a single transfer instead of calling
# `.item()` on every element (one device synchronization per value).
metrics_df = pd.DataFrame({
    _name: _values.detach().cpu().numpy()
    for _name, _values in (
        ("PROFILE_SIZES", PROFILE_SIZES), ("AUC", AUC), ("RR", RR),
        ("R20", R20), ("P20", P20), ("N20", N20),
        ("R100", R100), ("P100", P100), ("N100", N100),
    )
})
# Bucket profile sizes by their leading digit (see smart_group)
metrics_df["PROFILE_SIZES_STEPS"] = metrics_df["PROFILE_SIZES"].map(smart_group)
# Metrics at k=200 are computed by the evaluation loop too; they are appended
# after the existing columns so the position of those columns does not change.
for _name, _values in (("R200", R200), ("P200", P200), ("N200", N200)):
    metrics_df[_name] = _values.detach().cpu().numpy()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Metric drawn on the y axis (a column of metrics_df)
METRIC = "AUC"
# Keep only the rows whose profile size bucket is in [0, 100)
metrics_df_plot = metrics_df.loc[
    lambda df: (df["PROFILE_SIZES_STEPS"] >= 0) & (df["PROFILE_SIZES_STEPS"] < 100)
].copy()
# Distribution of METRIC across users, grouped by profile size bucket
plt.figure(figsize=(24, 9))
_plot_args = dict(x="PROFILE_SIZES_STEPS", y=METRIC, data=metrics_df_plot)
ax = sns.violinplot(inner=None, **_plot_args)
# The individual points are not drawn for the Pinterest dataset
if DATASET != "Pinterest":
    ax = sns.swarmplot(color="black", edgecolor="gray", **_plot_args)


In [None]:
# Area Under the Curve distribution across users
# Box plot of the per-evaluation AUC values; outliers are drawn as red "+" markers
metrics_df["AUC"].plot.box(sym="r+")

In [None]:
# Position of the first relevant item (1 / reciprocal_rank), distribution across users
_first_hit_position = 1 / metrics_df["RR"]
graph = _first_hit_position.plot.box(sym="r+")
# The y axis spans the whole catalogue
plt.ylim(0, features.shape[0])
# Red line marks the 10% of the dataset
graph.axhline(features.shape[0] / 10, color="red")

In [None]:
# First relevant item position (1 / reciprocal_rank) histogram
# 50 equal-width bins over the per-evaluation positions
graph = (1 / metrics_df["RR"]).plot.hist(bins=50)

## Results inspection

In [None]:
# Positional index (into evaluation_df) of the evaluation row to inspect
ROW = 0


In [None]:
# Row in evaluation dataframe
row = evaluation_df.iloc[ROW]

# Load data into tensors
profile = torch.tensor(row.profile).to(device, non_blocking=True).unsqueeze(0)
user_id = torch.tensor([int(row.user_id)]).to(device, non_blocking=True)
predict = torch.tensor(row.predict).to(device, non_blocking=True)
# Prediction (gradients are not needed for inspection)
with torch.no_grad():
    if MODE_PROFILE == "profile":
        scores = model.recommend_all(profile)
    elif MODE_PROFILE == "user":
        scores = model.recommend_all(user_id).squeeze()
# Ranking (sorted once, reused below)
ranked_items = torch.argsort(scores, descending=True)
pos_of_evals = (ranked_items[..., None] == predict).any(-1).nonzero().flatten()
if not PREDICT_ALL:
    pos_of_profi = (ranked_items[..., None] == profile).any(-1).nonzero().flatten()
    # Per-item shift: number of profile items ranked before each ground-truth item.
    # (A single global `.sum()` compares tensors of different sizes and applies
    # the same offset to every item, unlike the evaluation loop.)
    shift = (pos_of_profi[None, :] < pos_of_evals[:, None]).sum(1)
    pos_of_evals -= shift

# Display metrics
print(f"| {'-' * 15} | {'-' * 7} |")
print(f"| {'Metric':^15} | {'Score':^7} |")
print(f"| {'-' * 15} | {'-' * 7} |")
print(f"| {'AUC':^15} | {auc_exact(pos_of_evals, N_ITEMS):.5f} |")
print(f"| {'RR':^15} | {reciprocal_rank(pos_of_evals):.5f} |")
for k in [20, 100, 500]:
    print(f"| {'-' * 15} | {'-' * 7} |")
    print(f"| {f'Recall@{k}':^15} | {recall(pos_of_evals, k):.5f} |")
    print(f"| {f'Precision@{k}':^15} | {precision(pos_of_evals, k):.5f} |")
    print(f"| {f'nDCG@{k}':^15} | {nDCG(pos_of_evals, k):.5f} |")
print(f"| {'-' * 15} | {'-' * 7} |")

# Profile and prediction (as flat numpy arrays for display)
profile = profile.cpu().numpy().flatten()
predict = predict.cpu().numpy().flatten()
# Ranking: top-K recommended items
K = 20
ranking = ranked_items.cpu().numpy().flatten()
if not PREDICT_ALL:
    # Drop consumed items, unless they are also part of the ground truth
    ranking = ranking[(~np.isin(ranking, profile)) | (np.isin(ranking, predict))]
ranking = ranking[:K]
print()
print(f"Size of profile: {profile.size}")
print(f"Position of actual items: {pos_of_evals.cpu().numpy()}")


In [None]:
import matplotlib.image as mpimg
import matplotlib.pyplot as plt


# Number of images per row in every grid
COLUMNS = 10
# Groups of item indexes to display, keyed by the label used in the figure title
ELEMENTS = {
    "Consumed": profile,
    "Recommendation": ranking,
    "Ground truth": predict,
}
# If True, every image is titled with its filename
SHOW_FILENAME = False

# Images cannot be located without a folder (IMAGES_DIR is None for some datasets)
if IMAGES_DIR is None:
    raise ValueError(f"No image directory configured for dataset '{DATASET}'; cannot display images.")

for label, items in ELEMENTS.items():
    # At least one row, so that an empty group still yields a valid figure size
    n_rows = max(1, (len(items) - 1) // COLUMNS + 1)
    fig = plt.figure(figsize=(COLUMNS * 2, 4 * n_rows))
    plt.title(f"{label.title()} (n={len(items)})")
    plt.axis("off")
    for i, img_id in enumerate(items, start=1):
        img_fn = item_index2fn[img_id]
        image = mpimg.imread(os.path.join(IMAGES_DIR, img_fn))
        ax = fig.add_subplot(n_rows, COLUMNS, i)
        if SHOW_FILENAME:
            ax.set_title(img_fn)
        if label == "Recommendation":
            # Highlight recommended items that are ground truth (green) or were
            # already consumed (red); ground truth takes precedence.
            if img_id in predict:
                highlight = ("green", "Ground truth")
            elif img_id in profile:
                highlight = ("red", "Consumed")
            else:
                highlight = None
            if highlight is not None:
                color, default_title = highlight
                ax.patch.set_edgecolor(color)
                ax.patch.set_linewidth(5)
                ax.set_title(img_fn if SHOW_FILENAME else default_title, color=color)
        ax.set_xticks([])
        ax.set_yticks([])
        ax.imshow(image)
