In [None]:
import numpy as np
import pandas as pd
import gc
import tqdm
import os
import random
import pickle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

# Training hyperparameters passed to the siamese.fit calls below.
epochs = 100
batch_size = 16
# Margin handed to loss(margin=...) when the model is compiled.
margin = 1

In [None]:
def make_pairs(data):
    """Build same-label / different-label embedding pairs within each user.

    For every row of a user, one pair is formed with a randomly drawn row of
    the same label (pair label 0) and, when possible, one pair with a randomly
    drawn row of the other label (pair label 1). The non-matching label is
    chosen as ``0 if label == 1 else 1``, i.e. labels are treated as binary.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``user_id``, ``img_embedding`` and ``label`` columns. ``label``
        must hold non-negative integers (it is used as a list index).

    Returns
    -------
    tuple
        ``(pairs, labels)`` where ``pairs`` is ``np.array`` of ``[x1, x2]``
        entries and ``labels`` is a float32 array (0 = same label,
        1 = different label).
    """
    pairs = []
    labels = []

    # A single groupby pass replaces re-filtering the whole frame once per
    # user. sort=False keeps users in order of first appearance (the order
    # unique() produced), so the sequence of random draws is unchanged.
    # Rows whose user_id is NaN are dropped by groupby.
    for _, sets in tqdm.tqdm(data.groupby("user_id", sort=False)):
        x = sets.img_embedding.reset_index(drop=True)
        y = sets.label.reset_index(drop=True)

        num_classes = max(y) + 1
        # digit_indices[c] holds the row positions whose label equals c.
        digit_indices = [np.where(y == i)[0] for i in range(num_classes)]

        for idx1 in range(len(x)):
            x1 = x[idx1]
            label1 = y[idx1]

            # A label with a single member cannot form a matching pair; such a
            # row is skipped entirely (no non-matching pair is added either).
            if len(digit_indices[label1]) == 1:
                continue
            # The partner is drawn from all rows with this label, so it can be
            # the row itself.
            idx2 = random.choice(digit_indices[label1])
            pairs.append([x1, x[idx2]])
            labels.append(0)

            # Only label 0 occurs for this user: no non-matching pair exists.
            if len(digit_indices) == 1:
                continue
            label2 = 0 if label1 == 1 else 1
            if len(digit_indices[label2]) == 0:
                continue
            idx2 = random.choice(digit_indices[label2])
            pairs.append([x1, x[idx2]])
            labels.append(1)

    return np.array(pairs), np.array(labels).astype("float32")

In [None]:
def make_test_pairs(data,group_b):
    """Pair every embedding of a user in ``data`` with every embedding of the
    same user in ``group_b``.

    Pairs are emitted user by user (users in order of first appearance in
    ``data``); within a user, the ``data`` row is the outer loop and the
    ``group_b`` row the inner loop. Users absent from ``group_b`` contribute
    no pairs.

    Parameters
    ----------
    data, group_b : pandas.DataFrame
        Both need ``user_id`` and ``img_embedding`` columns.

    Returns
    -------
    tuple
        ``(pairs, labels)``; ``labels`` is a float32 array of zeros with one
        entry per pair (a placeholder, every pair gets 0).
    """
    # Group the candidate frame once instead of filtering it for every user.
    candidates_by_user = {
        uid: rows.img_embedding.tolist()
        for uid, rows in group_b.groupby("user_id", sort=False)
    }

    pairs = []
    for uid, rows_a in tqdm.tqdm(data.groupby("user_id", sort=False)):
        x_b = candidates_by_user.get(uid, [])
        for x1 in rows_a.img_embedding.tolist():
            for x2 in x_b:
                pairs.append([x1, x2])

    labels = np.zeros(len(pairs), dtype="float32")
    return np.array(pairs), labels

In [None]:
def euclidean_distance(vects):
    """Return the row-wise Euclidean distance between two tensors.

    ``vects`` is unpacked into two tensors. The squared differences are summed
    over axis 1 (keeping that axis), and the sum is clamped to at least the
    Keras backend epsilon before the square root is taken.
    """
    left, right = vects
    squared_diff = tf.math.square(left - right)
    sum_square = tf.math.reduce_sum(squared_diff, axis=1, keepdims=True)
    floor = tf.keras.backend.epsilon()
    return tf.math.sqrt(tf.math.maximum(sum_square, floor))

# Two 2048-wide inputs; the model works directly on the supplied vectors.
# `shape` is given as a tuple, the documented form, rather than a bare int.
input_1 = layers.Input(shape=(2048,))
input_2 = layers.Input(shape=(2048,))
# Distance between the two inputs -> batch normalisation -> sigmoid unit.
merge_layer = layers.Lambda(euclidean_distance)([input_1, input_2])
normal_layer = tf.keras.layers.BatchNormalization()(merge_layer)
output_layer = layers.Dense(1, activation="sigmoid")(normal_layer)
siamese = keras.Model(inputs=[input_1, input_2], outputs=output_layer)

def loss(margin=1):
    """Create a contrastive-loss function bound to ``margin``.

    The returned function computes
    ``mean((1 - y_true) * y_pred**2 + y_true * max(margin - y_pred, 0)**2)``.
    """
    def contrastive_loss(y_true, y_pred):
        # Term applied where y_true is 0: penalise large predictions.
        similar_term = (1 - y_true) * tf.math.square(y_pred)
        # Term applied where y_true is 1: penalise predictions below the margin.
        shortfall = tf.math.maximum(margin - (y_pred), 0)
        dissimilar_term = (y_true) * tf.math.square(shortfall)
        return tf.math.reduce_mean(similar_term + dissimilar_term)

    return contrastive_loss

# Compile with the contrastive loss (using the module-level margin) and RMSprop,
# tracking accuracy, then print the layer summary.
siamese.compile(loss=loss(margin=margin), optimizer="RMSprop", metrics=["accuracy"])
siamese.summary()

In [None]:
def prepare_candidates(customers_id, n_candidates = 10):
    """Build a long (user_id, item_id) table of candidate items per user.

    For each user, the items recorded in the module-level ``purchase_dict_4w``
    are ordered by descending count. Users with fewer than ``n_candidates``
    items are padded with the most frequent items of the module-level
    ``ali_4w`` frame; users without any record get those frequent items only.
    Padding does not check for items the user already has, so duplicates are
    possible.

    Parameters
    ----------
    customers_id : iterable
        User ids to build candidates for.
    n_candidates : int, default 10
        Target number of candidates per user.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``item_id``, one row per candidate.
    """
    # Most frequent items overall, sized by n_candidates. Padding uses this
    # local list rather than a separately built global that was capped at 10.
    dummy_list = list((ali_4w['item_id'].value_counts()).index)[:n_candidates]

    prediction_dict = {}
    for cust_id in customers_id:
        if cust_id in purchase_dict_4w:
            ranked = sorted(purchase_dict_4w[cust_id].items(), key=lambda x: x[1], reverse=True)
            own_items = [item for item, _ in ranked]
            if len(own_items) > n_candidates:
                s = own_items[:n_candidates]
            else:
                s = own_items + dummy_list[:(n_candidates - len(own_items))]
        else:
            s = dummy_list
        prediction_dict[cust_id] = s

    negatives_df = pd.DataFrame({
        'user_id': list(prediction_dict.keys()),
        'negatives': list(prediction_dict.values()),
    })
    # One row per (user, candidate item).
    negatives_df = (
        negatives_df
        .explode('negatives')
        .rename(columns = {'negatives': 'item_id'})
    )
    return negatives_df

In [None]:
def recall_at_k(true_labels, pred_labels, k):
    """Recall@k: share of the distinct relevant items found in the top-k predictions.

    Parameters
    ----------
    true_labels : sequence
        Relevant items (hashable); duplicates are counted once.
    pred_labels : sequence
        Ranked predictions; only the first ``k`` are considered.
    k : int
        Cut-off.

    Returns
    -------
    float
        ``|set(pred_labels[:k]) & set(true_labels)| / |set(true_labels)|``,
        or 0.0 when there are no relevant items.
    """
    # The full ground truth is kept: cutting it to its first k entries would
    # make the score depend on the arbitrary order of true_labels.
    relevant = set(true_labels)
    if not relevant:
        return 0.0
    true_positives = len(relevant.intersection(pred_labels[:k]))
    return true_positives / len(relevant)

def calculate_recall_at_k(true_labels_list, pred_labels_list, k):
    """Mean recall@k over users.

    The two lists are paired by position, so they must describe the same
    users in the same order. Returns 0.0 when there is nothing to average.
    """
    recall_scores = [
        recall_at_k(true_labels, pred_labels, k)
        for true_labels, pred_labels in zip(true_labels_list, pred_labels_list)
    ]
    # Guard the empty case instead of dividing by zero.
    if not recall_scores:
        return 0.0
    return sum(recall_scores) / len(recall_scores)

def dcg_at_k(r, k):
    """Discounted cumulative gain of the first ``k`` relevance values in ``r``.

    The value at 0-based position ``i`` is divided by ``log2(i + 2)``.
    """
    gains = np.asarray(r)[:k]
    ranks_plus_one = np.arange(2, len(gains) + 2)
    return np.sum(gains / np.log2(ranks_plus_one))

def ndcg_at_k(true_labels, pred_scores, k):
    """Normalised DCG@k of a relevance list given in ranked order.

    ``true_labels`` holds the relevance values in ranked order. The ideal DCG
    comes from the same values sorted in descending order. ``pred_scores`` is
    accepted but not used. Returns 0 when the ideal DCG is 0.
    """
    ideal_order = sorted(true_labels, reverse=True)
    idcg = dcg_at_k(ideal_order, k)
    dcg = dcg_at_k(true_labels, k)
    if idcg == 0:
        return 0
    return dcg / idcg

def calculate_ndcg_at_k(y_true, y_pred, k):
    """Mean NDCG@k over users with binary relevance.

    Each predicted item gets relevance 1 if it is in that user's true items,
    else 0. ``y_true`` and ``y_pred`` are paired by position. Returns 0.0 when
    there is nothing to average, rather than NaN with a warning.
    """
    ndcg_scores = []
    for true_items, pred_items in zip(y_true, y_pred):
        relevance = [1 if item in true_items else 0 for item in pred_items]
        ndcg_scores.append(ndcg_at_k(relevance, relevance, k))
    if not ndcg_scores:
        return 0.0
    return np.mean(ndcg_scores)

def average_precision_at_k(true_labels, pred_labels, k):
    """Average precision@k for one user.

    Precision is accumulated at every rank within the top ``k`` where the
    prediction is a true label; the sum is divided by
    ``min(k, len(true_labels))``. Returns 0.0 when that divisor is 0
    (no true labels or ``k == 0``).
    """
    denominator = min(k, len(true_labels))
    # Guard the empty case instead of dividing by zero.
    if denominator <= 0:
        return 0.0
    num_correct = 0
    precision_sum = 0
    for i, pred in enumerate(pred_labels[:k]):
        if pred in true_labels:
            num_correct += 1
            precision_sum += num_correct / (i + 1)
    return precision_sum / denominator

def map_at_k(true_labels_list, pred_labels_list, k):
    """Mean average precision@k over users.

    The two lists are paired by position, so they must describe the same
    users in the same order. Returns 0.0 when there is nothing to average.
    """
    average_precisions = [
        average_precision_at_k(true_labels, pred_labels, k)
        for true_labels, pred_labels in zip(true_labels_list, pred_labels_list)
    ]
    # Guard the empty case instead of dividing by zero.
    if not average_precisions:
        return 0.0
    return sum(average_precisions) / len(average_precisions)

# Similar-Style

In [None]:
# Load the user interaction table (its user_id, item_id and outfit_id columns are used below).
# pd.read_pickle unpickles arbitrary objects: only load files from a trusted source.
ali_user = pd.read_pickle('ali_user.pkl')

In [None]:
# Build the item_id -> img_embedding lookup table from two pickles.
# pd.read_pickle unpickles arbitrary objects: only load files from a trusted source.
embeddings = pd.read_pickle('path/ali_embeddings.pkl')
img_id = pd.read_pickle('path/filenames.pkl')
# Only the first 430000 names are kept — presumably the number of embeddings; TODO confirm.
img_id = img_id[:430000]
img_feat = pd.DataFrame(img_id, columns=['item_id'])
img_feat['img_embedding'] = embeddings
# Drop items without an embedding.
img_feat = img_feat[~img_feat.img_embedding.isna()].reset_index(drop=True)

In [None]:
# Keep interactions whose item has an embedding.
ali_user = ali_user[ali_user.item_id.isin(list(img_feat.item_id))]
# Count remaining rows per outfit and keep outfits with at least two of them.
outfit = ali_user.groupby('outfit_id').item_id.agg('count').reset_index().sort_values(['item_id'], ascending=False)
outfit = outfit[outfit.item_id >=2]
ali_user = ali_user[ali_user.outfit_id.isin(list(outfit.outfit_id))]

In [None]:
# Load the per-item similarity table (its top_sim column is used later) and drop top_dist.
# Presumably top_sim is each item's nearest neighbour by Euclidean distance — TODO confirm.
sim = pd.read_csv('img_feat_res_euc.csv')
sim = sim.drop(['top_dist'],axis=1)

In [None]:
# Attach the similarity columns to every interaction.
ali_user = pd.merge(ali_user, sim, on='item_id',how='left')
# Shuffle rows; no random_state is given, so the split differs between runs.
ali_user = ali_user.sample(frac = 1)
# Split by position: first 50% -> train, next 10% -> test, remaining 40% -> validation.
ali_train, ali_test, ali_val = np.split(ali_user, [int(0.5*len(ali_user)), int(0.6*len(ali_user))])

In [None]:
# Positive matches: training rows whose top_sim item appears as an item of the
# same user in the validation split.
df_val = ali_val.drop_duplicates(subset=['user_id','item_id'])
pos = pd.merge(ali_train, df_val, left_on=['user_id','top_sim'], right_on=['user_id','item_id'],how='inner')
# top_sim_x is the training-side (left) top_sim column after the merge.
pos = pos[['user_id','top_sim_x']].rename(columns={'top_sim_x':'item_id'})

In [None]:
# Per-user interaction counts over the validation split:
# purchase_dict_4w[user_id][item_id] -> number of rows.
purchase_dict_4w = {}
ali_4w = ali_val
for cust_id, art_id in zip(ali_4w['user_id'], ali_4w['item_id']):
    per_user_counts = purchase_dict_4w.setdefault(cust_id, {})
    per_user_counts[art_id] = per_user_counts.get(art_id, 0) + 1

# Ten most frequent items overall.
item_frequency = ali_4w['item_id'].value_counts()
dummy_list_4w = list(item_frequency.index)[:10]

In [None]:
# For every user with at least one positive match, build 5 candidate items and label them 1.
user_id = pos['user_id'].unique()
positives = prepare_candidates(user_id, 5)
positives['label'] = 1

In [None]:
# Negatives: for users that have positives, training rows whose top_sim item
# is not one of that user's positive candidates.
df2 = positives.drop_duplicates(subset=['user_id','item_id'])
lgb_neg = ali_train[ali_train.user_id.isin(list(df2.user_id))]
# The left merge leaves label as NaN where (user_id, top_sim) has no positive candidate.
negatives = lgb_neg.merge(df2, left_on =['user_id','top_sim'], right_on=['user_id','item_id'],how='left')
negatives = negatives[negatives.label.isna()]
negatives['label'] = negatives.label.fillna(0)
# The negative item is the top_sim item, renamed to item_id.
negatives = negatives[['user_id','top_sim','label']].rename(columns={'top_sim':'item_id'})

In [None]:
# Stack positives and negatives, keeping the first row per (user_id, item_id);
# positives come first in the concat, so they win ties.
sia_full = pd.concat([positives, negatives],ignore_index=True)
sia_full = sia_full.drop_duplicates(['user_id','item_id'])
# Attach embeddings; items without one get NaN from the left merge.
sia_full = pd.merge(sia_full, img_feat, on='item_id',how='left')
sia_full['label'] = sia_full.label.astype(int)
# Shuffle (unseeded) and split 80% / 20% by position.
sia_full = sia_full.sample(frac=1)
sia_train, sia_val = np.split(sia_full, [int(0.8*len(sia_full))])

In [None]:
# Build training / validation pairs and split each pair into its two sides,
# one per model input.
pairs_train, labels_train = make_pairs(sia_train)
pairs_val, labels_val = make_pairs(sia_val)
train_1 = pairs_train[:, 0] 
train_2 = pairs_train[:, 1]
val_1 = pairs_val[:, 0]
val_2 = pairs_val[:, 1]

In [None]:
# Train the model on the pair sides, validating on the held-out pairs.
history = siamese.fit(
    [train_1, train_2],
    labels_train,
    validation_data=([val_1, val_2], labels_val),
    batch_size=batch_size,
    epochs=epochs,
)

In [None]:
def plt_metric(history, metric, title, has_valid=True):
    """Plot one metric from a Keras ``History.history`` dict against epochs.

    Arguments:
        history: mapping from metric name to its per-epoch values.
        metric: key to plot; ``"val_" + metric`` is plotted too when
            ``has_valid`` is true.
        title: figure title.
        has_valid: whether to add the validation curve and a legend.
    """
    curve_keys = [metric]
    if has_valid:
        curve_keys.append("val_" + metric)
    for key in curve_keys:
        plt.plot(history[key])
    if has_valid:
        plt.legend(["train", "validation"], loc="upper left")
    plt.title(title)
    plt.ylabel(metric)
    plt.xlabel("epoch")
    plt.show()


# Plot training and validation accuracy per epoch.
plt_metric(history=history.history, metric="accuracy", title="Model accuracy")

# Plot the training and validation contrastive loss per epoch.
plt_metric(history=history.history, metric="loss", title="Constrastive Loss")

In [None]:
# Candidate pool for the test users, built from the validation split.
val_full = ali_val.copy()
# Replace each validation item with its top_sim item.
val_full = val_full.drop('item_id',axis=1).rename(columns={'top_sim':'item_id'})
# The 30 most frequent of those items, as strings (despite the "top5" name).
top5_ppl = val_full.item_id.value_counts().to_frame().index.astype('str')[:30]

# One row per test user, expanded to one row per popular item.
top5_ppl_train = ali_test.drop_duplicates(['user_id'], keep='last').drop('item_id', axis=1)
top5_ppl_train['item_id'] = [list(top5_ppl) for _ in range(len(top5_ppl_train))]
top5_ppl_train = top5_ppl_train.explode('item_id')
top5_ppl_train.head()  # not the cell's last expression, so nothing is displayed

# Personal candidates: the last 55 rows per user of the substituted validation frame.
trsc_train = val_full.copy()
trsc_train = trsc_train.groupby('user_id').tail(55)
trsc_train.head()  # not the cell's last expression, so nothing is displayed


# Personal plus popular candidates, restricted to users present in the test split.
ali_pred = pd.concat([trsc_train, top5_ppl_train])
ali_pred = ali_pred[ali_pred.user_id.isin(list(set(ali_test.user_id)))]

In [None]:
# Query side: for every user with candidates, that user's last 5 validation rows.
group_a = ali_pred[['user_id']].drop_duplicates(subset='user_id')
click = ali_val.groupby('user_id').tail(5)
group_a = group_a.merge(click, on='user_id',how='left')
# Users without validation rows fall back to the most frequent validation item.
ppl = ali_val.item_id.value_counts().to_frame().index.astype('str')[0]
group_a['item_id'] = group_a['item_id'].fillna(ppl)

# Attach embeddings to both sides (NaN where an item has none).
group_a = group_a.merge(img_feat, on='item_id',how='left')
ali_pred = ali_pred.merge(img_feat,on='item_id',how='left')

In [None]:
# Pair every query embedding with every candidate embedding of the same user,
# then split each pair into its two sides, one per model input.
pairs_test, labels_test = make_test_pairs(group_a, ali_pred)
test_1 = pairs_test[:, 0]
test_2 = pairs_test[:, 1]

In [None]:
# Score all test pairs in chunks and concatenate the model outputs.
predictions = []
# A dedicated name keeps the training `batch_size` (used by siamese.fit) intact.
predict_batch_size = 5000
for bucket in tqdm.tqdm(range(0, len(test_1), predict_batch_size)):
    outputs = siamese.predict(
        [test_1[bucket: bucket + predict_batch_size], test_2[bucket: bucket + predict_batch_size]]
    )
    predictions.append(outputs)
predictions = np.concatenate(predictions)
len(predictions)

In [None]:
# Ground truth: list of test-split item_ids per user.
test = ali_test.groupby('user_id')[['item_id']].aggregate(lambda x: x.tolist())
sia_pred = ali_pred.copy()
# NOTE(review): this assignment needs len(predictions) == len(ali_pred).
# make_test_pairs emits (group_a rows) x (ali_pred rows) pairs per user, so this
# presumably only lines up when every user has exactly one group_a row — confirm.
sia_pred['pred'] = predictions
sia_pred = sia_pred.sort_values(['user_id','pred'],ascending=False)
# NOTE(review): the ranking below orders (user_id, item_id) groups by their count of
# non-null img_embedding rows; 'pred' is not used after this point — confirm intended.
sia_pred = sia_pred.groupby(['user_id','item_id']).img_embedding.agg('count').reset_index().sort_values('img_embedding',ascending=False)
# Keep the first 20 items per user in that order.
sia_pred = sia_pred.groupby('user_id').head(20)

In [None]:
# Collapse to one row per user holding the list of recommended item_ids.
sia_pred_lst = (
    sia_pred
    .groupby('user_id')[['item_id']]
    .aggregate(lambda x: x.tolist())
)

In [None]:
# The metric helpers pair the two lists by position, so both are built over the
# same users in the same order (users present in both tables).
eval_users = sia_pred_lst.index.intersection(test.index)
prediction = list(sia_pred_lst.loc[eval_users, 'item_id'])
true = list(test.loc[eval_users, 'item_id'])

In [None]:
# Recall@k for k = 5, 10, 20 (rounded to 5 decimals).
print(round(calculate_recall_at_k(true, prediction, 5),5))
print(round(calculate_recall_at_k(true, prediction, 10),5))
print(round(calculate_recall_at_k(true, prediction, 20),5))

# MAP@k for k = 5, 10, 20.
print(round(map_at_k(true, prediction, 5),5))
print(round(map_at_k(true, prediction, 10),5))
print(round(map_at_k(true, prediction, 20),5))

# NDCG@k for k = 5, 10, 20.
print(round(calculate_ndcg_at_k(true, prediction,5),5))
print(round(calculate_ndcg_at_k(true, prediction,10),5))
print(round(calculate_ndcg_at_k(true, prediction,20),5))

# Compatible-Style

In [None]:
# Reload the interaction table and repeat the filtering: items with an
# embedding, outfits with at least two remaining rows.
# pd.read_pickle unpickles arbitrary objects: only load files from a trusted source.
ali_user = pd.read_pickle('ali_user.pkl')
ali_user = ali_user[ali_user.item_id.isin(list(img_feat.item_id))]
outfit = ali_user.groupby('outfit_id').item_id.agg('count').reset_index().sort_values(['item_id'], ascending=False)
outfit = outfit[outfit.item_id >=2]
ali_user = ali_user[ali_user.outfit_id.isin(list(outfit.outfit_id))]
# Unseeded shuffle, then a 50% / 10% / 40% positional split into train / test / validation.
ali_user = ali_user.sample(frac = 1) # shuffle rows
ali_train, ali_test, ali_val = np.split(ali_user, [int(0.5*len(ali_user)), int(0.6*len(ali_user))])

In [None]:
# Training rows that also occur in validation with the same (user, outfit, item);
# indexed by the original training index.
df_val = ali_val.drop_duplicates(subset=['user_id','outfit_id','item_id',])
identical = pd.merge(ali_train.reset_index(), df_val, on=['user_id','outfit_id','item_id',],how='inner').set_index('index')
# Positives: training rows whose (user, outfit) occurs in validation, excluding
# the identical rows found above.
df_val = ali_val.drop_duplicates(subset=['user_id','outfit_id'])
pos = pd.merge(ali_train.reset_index(), df_val, on=['user_id','outfit_id'],how='inner').set_index('index')
pos = pos[~pos.index.isin(list(identical.index))].reset_index(drop=True)
# Keep the training-side item (item_id_x); drop the validation-side one.
pos = pos.rename(columns={'item_id_x':'item_id'}).drop('item_id_y',axis=1)

In [None]:
# Negatives: training rows whose (user_id, item_id) is not among the positives.
neg = ali_train.reset_index(drop=True).set_index(['user_id','item_id'])
pos = pos.reset_index(drop=True).set_index(['user_id','item_id'])
neg = neg[~neg.index.isin(list(pos.index))].reset_index()
pos = pos.reset_index()
# Restrict to users that have positives and at least 7 negative rows.
neg = neg[neg.user_id.isin(list(set(pos.user_id)))]
neg = neg.groupby('user_id').filter(lambda x: len(x)>=7)

In [None]:
# Keep users with at least 5 positive rows.
pos = pos.groupby('user_id').filter(lambda x: len(x)>=5)
# Up to 10 rows per user on each side. assign() returns a new frame, which
# avoids writing the label column onto a groupby slice.
positives = pos.groupby('user_id').head(10).assign(label=1)
negatives = neg.groupby('user_id').head(10).assign(label=0)

In [None]:
# Stack positives and negatives, keeping the first row per (user_id, item_id);
# positives come first in the concat, so they win ties.
sia_full = pd.concat([positives, negatives],ignore_index=True)
sia_full = sia_full.drop_duplicates(['user_id','item_id'])
# Attach embeddings, shuffle (unseeded) and split 80% / 20% by position.
sia_full = pd.merge(sia_full, img_feat, on='item_id',how='left')
sia_full = sia_full.sample(frac=1)
sia_train, sia_val = np.split(sia_full, [int(0.8*len(sia_full))])

In [None]:
# Build training / validation pairs and split each pair into its two sides,
# one per model input.
pairs_train, labels_train = make_pairs(sia_train)
pairs_val, labels_val = make_pairs(sia_val)
train_1 = pairs_train[:, 0] 
train_2 = pairs_train[:, 1]
val_1 = pairs_val[:, 0]
val_2 = pairs_val[:, 1]

In [None]:
# Train on the compatible-style pairs.
# NOTE(review): `siamese` is the same model object fitted in the previous section,
# so training continues from those weights — confirm whether a fresh model is intended.
history = siamese.fit(
    [train_1, train_2],
    labels_train,
    validation_data=([val_1, val_2], labels_val),
    batch_size=batch_size,
    epochs=epochs,
)

In [None]:
# Plot training and validation accuracy per epoch.
plt_metric(history=history.history, metric="accuracy", title="Model accuracy")

# Plot the training and validation contrastive loss per epoch.
plt_metric(history=history.history, metric="loss", title="Constrastive Loss")

In [None]:
# Query side: for every test user, that user's first 5 validation rows.
group_a = ali_test[['user_id']].drop_duplicates(subset='user_id')
click = ali_val.groupby('user_id').head(5) #1 for plots purpose
group_a = group_a.merge(click, on='user_id',how='left')
# Users without validation rows fall back to the most frequent training item.
ppl = ali_train.item_id.value_counts().to_frame().index.astype('str')[0]
group_a['item_id'] = group_a['item_id'].fillna(ppl)

In [None]:
# For every test user, list the distinct (outfit, item) rows of that user's validation split.
pred_outfit = ali_val.drop_duplicates(['user_id','outfit_id','item_id'])
ali_pred = ali_test[['user_id']].drop_duplicates(subset='user_id')
ali_pred = pd.merge(ali_pred, pred_outfit, on='user_id',how='left')

# One representative row per outfit_id from the full table, joined on (user_id, outfit_id).
outfit_full = ali_user.drop_duplicates(subset='outfit_id')
ali_pred = pd.merge(ali_pred,outfit_full, on=['user_id','outfit_id'],how='left')
# Keep the item from outfit_full (item_id_y) in place of the validation item (item_id_x).
ali_pred = ali_pred.drop('item_id_x',axis=1).rename(columns={'item_id_y':'item_id'})

# Remove candidates that coincide with a validation (user, item, outfit) row.
identical = pred_outfit.reset_index(drop=True).set_index(['user_id','item_id','outfit_id'])
ali_pred = ali_pred.reset_index(drop=True).set_index(['user_id','item_id','outfit_id'])
ali_pred = ali_pred[~ali_pred.index.isin(list(identical.index))]
ali_pred = ali_pred.reset_index()

In [None]:
# The 30 most frequent validation items, as strings (despite the "top5" name).
val_full = ali_val.copy()
top5_ppl = val_full.item_id.value_counts().to_frame().index.astype('str')[:30]

# One row per candidate user, expanded to one row per popular item.
top5_ppl_train = ali_pred.drop_duplicates(['user_id'], keep='last').drop('item_id', axis=1)
top5_ppl_train['item_id'] = [list(top5_ppl) for _ in range(len(top5_ppl_train))]
top5_ppl_train = top5_ppl_train.explode('item_id')

# Outfit-based plus popular candidates; rows without an item are dropped.
ali_pred = pd.concat([ali_pred, top5_ppl_train])
ali_pred = ali_pred[~ali_pred.item_id.isna()]
ali_pred = ali_pred.drop(['outfit_id'],axis=1)

In [None]:
# Attach embeddings to the query and candidate sides (NaN where an item has none).
group_a = group_a.merge(img_feat, on='item_id',how='left')
ali_pred = ali_pred.merge(img_feat,on='item_id',how='left')

In [None]:
# Pair every query embedding with every candidate embedding of the same user,
# then split each pair into its two sides, one per model input.
pairs_test, labels_test = make_test_pairs(group_a, ali_pred)
test_1 = pairs_test[:, 0]
test_2 = pairs_test[:, 1]

In [None]:
# Score all test pairs in chunks and concatenate the model outputs.
predictions = []
# A dedicated name keeps the training `batch_size` (used by siamese.fit) intact.
predict_batch_size = 5000
for bucket in tqdm.tqdm(range(0, len(test_1), predict_batch_size)):
    outputs = siamese.predict(
        [test_1[bucket: bucket + predict_batch_size], test_2[bucket: bucket + predict_batch_size]]
    )
    predictions.append(outputs)
predictions = np.concatenate(predictions)

In [None]:
sia_pred = ali_pred.copy()
# NOTE(review): this assignment needs len(predictions) == len(ali_pred).
# make_test_pairs emits (group_a rows) x (ali_pred rows) pairs per user, so this
# presumably only lines up when every user has exactly one group_a row — confirm.
sia_pred['pred'] = predictions
sia_pred = sia_pred.sort_values(['user_id','pred'],ascending=False)
# NOTE(review): the ranking below orders (user_id, item_id) groups by their count of
# non-null img_embedding rows; 'pred' is not used after this point — confirm intended.
sia_pred = sia_pred.groupby(['user_id','item_id']).img_embedding.agg('count').reset_index().sort_values('img_embedding',ascending=False)
# Keep the first 20 items per user in that order.
sia_pred = sia_pred.groupby('user_id').head(20)

In [None]:
# Collapse to one row per user holding the list of recommended item_ids.
sia_pred_lst = (
    sia_pred
    .groupby('user_id')[['item_id']]
    .aggregate(lambda x: x.tolist())
)

In [None]:
# Rebuild the ground truth from the current test split: ali_test was re-drawn
# for this section, so a `true` list left over from an earlier split is stale.
test = ali_test.groupby('user_id')[['item_id']].aggregate(lambda x: x.tolist())
# The metric helpers pair the two lists by position, so both are built over the
# same users in the same order (users present in both tables).
eval_users = sia_pred_lst.index.intersection(test.index)
prediction = list(sia_pred_lst.loc[eval_users, 'item_id'])
true = list(test.loc[eval_users, 'item_id'])

In [None]:
# Recall@k for k = 5, 10, 20 (rounded to 5 decimals).
print(round(calculate_recall_at_k(true, prediction, 5),5))
print(round(calculate_recall_at_k(true, prediction, 10),5))
print(round(calculate_recall_at_k(true, prediction, 20),5))

# MAP@k for k = 5, 10, 20.
print(round(map_at_k(true, prediction, 5),5))
print(round(map_at_k(true, prediction, 10),5))
print(round(map_at_k(true, prediction, 20),5))

# NDCG@k for k = 5, 10, 20.
print(round(calculate_ndcg_at_k(true, prediction,5),5))
print(round(calculate_ndcg_at_k(true, prediction,10),5))
print(round(calculate_ndcg_at_k(true, prediction,20),5))

# Contradictory-Style

In [None]:
# Reload the interaction table and repeat the filtering: items with an
# embedding, outfits with at least two remaining rows.
# pd.read_pickle unpickles arbitrary objects: only load files from a trusted source.
ali_user = pd.read_pickle('ali_user.pkl')
ali_user = ali_user[ali_user.item_id.isin(list(img_feat.item_id))]
outfit = ali_user.groupby('outfit_id').item_id.agg('count').reset_index().sort_values(['item_id'], ascending=False)
outfit = outfit[outfit.item_id >=2]
ali_user = ali_user[ali_user.outfit_id.isin(list(outfit.outfit_id))]

In [None]:
# Item metadata: headerless comma-separated file; malformed lines are skipped.
ali_item = pd.read_csv('path/item_data.txt', header=None, delimiter=',',on_bad_lines='skip',
                      names=['item_id', 'cate_id', 'imgLink', 'title'])
# Keep one metadata row per item that has an embedding.
ali_cate = ali_item[ali_item.item_id.isin(list(img_feat.item_id))]
ali_cate = ali_cate.drop_duplicates('item_id')
# Keep categories with more than two such items.
cate = ali_cate.groupby('cate_id').filter(lambda x: x['cate_id'].count()>2)
# Attach cate_id to the interactions (NaN for items outside the kept categories).
ali_user = ali_user.merge(cate, on='item_id',how='left').reset_index(drop=True)
ali_user = ali_user.drop(['imgLink','title'],axis=1)
# Unseeded shuffle, then a 50% / 10% / 40% positional split into train / test / validation.
ali_user = ali_user.sample(frac = 1) # shuffle rows
ali_train, ali_test, ali_val = np.split(ali_user, [int(0.5*len(ali_user)), int(0.6*len(ali_user))])

In [None]:
# Training rows that also occur in validation with the same (user, outfit, item);
# indexed by the original training index.
df_val = ali_val.drop_duplicates(subset=['user_id','outfit_id','item_id',])
identical = pd.merge(ali_train.reset_index(), df_val, on=['user_id','outfit_id','item_id',],how='inner').set_index('index')
# In this section the outfit matches are the negatives: training rows whose
# (user, outfit) occurs in validation, excluding the identical rows above.
df_val = ali_val.drop_duplicates(subset=['user_id','outfit_id'])
neg = pd.merge(ali_train.reset_index(), df_val, on=['user_id','outfit_id'],how='inner').set_index('index')
neg = neg[~neg.index.isin(list(identical.index))].reset_index(drop=True)
# Keep the training-side item (item_id_x); drop the validation-side one.
neg = neg.rename(columns={'item_id_x':'item_id'}).drop('item_id_y',axis=1)

In [None]:
# Positives: training rows whose (user_id, item_id) is not among the negatives.
pos = ali_train.reset_index(drop=True).set_index(['user_id','item_id'])
neg = neg.reset_index(drop=True).set_index(['user_id','item_id'])
pos = pos[~pos.index.isin(list(neg.index))].reset_index()
neg = neg.reset_index()
# Restrict to users that have negatives and at least 7 positive rows.
pos = pos[pos.user_id.isin(list(set(neg.user_id)))]
pos = pos.groupby('user_id').filter(lambda x: len(x)>=7)
# cate_id_x is the training-side category column left by the earlier merge.
neg = neg[['user_id','item_id','outfit_id','cate_id_x']].rename(columns={'cate_id_x':'cate_id'})

In [None]:
# Negatives whose (user_id, cate_id) also occurs in df_val.
neg_cate = pd.merge(neg.reset_index(), df_val, on=['user_id','cate_id'],how='inner').set_index('index')
# NOTE(review): 'index' here is neg's positional index, while identical.index holds
# original training index labels — the two index spaces look unrelated; confirm.
neg_cate = neg_cate[~neg_cate.index.isin(list(identical.index))].reset_index(drop=True)
neg_cate = neg_cate.rename(columns={'item_id_x':'item_id'}).drop('item_id_y',axis=1)
neg_cate = neg_cate[['user_id','outfit_id_x','item_id','cate_id']].rename(columns={'outfit_id_x':'outfit_id'})

In [None]:
neg_cate = neg_cate.reset_index(drop=True).set_index(['user_id','item_id'])
# NOTE(review): pos is not re-indexed by (user_id, item_id) here, so its index is
# compared against neg_cate's (user_id, item_id) MultiIndex — presumably nothing
# matches and pos_cate equals pos plus an 'index' column; confirm.
pos_cate = pos[~pos.index.isin(list(neg_cate.index))].reset_index()
neg_cate = neg_cate.reset_index()

In [None]:
# One row per (user_id, item_id) on each side.
neg_cate = neg_cate.drop_duplicates(['user_id','item_id'])
pos_cate = pos_cate.drop_duplicates(['user_id','item_id'])
# Keep users that appear on both sides; the second filter uses the already-filtered neg_cate.
neg_cate = neg_cate[neg_cate.user_id.isin(list(set(pos_cate.user_id)))]
pos_cate = pos_cate[pos_cate.user_id.isin(list(set(neg_cate.user_id)))]

In [None]:
# Up to 10 rows per user on each side. assign() returns a new frame, which
# avoids writing the label column onto a groupby slice.
positives = pos_cate.groupby('user_id').head(10).assign(label=1)
negatives = neg_cate.groupby('user_id').head(10).assign(label=0)

In [None]:
# Stack positives and negatives, keeping the first row per (user_id, item_id);
# positives come first in the concat, so they win ties.
sia_full = pd.concat([positives, negatives],ignore_index=True)
sia_full = sia_full.drop_duplicates(['user_id','item_id'])
# Attach embeddings, shuffle (unseeded) and split 80% / 20% by position.
sia_full = pd.merge(sia_full, img_feat, on='item_id',how='left')
sia_full = sia_full.sample(frac=1)
sia_train, sia_val = np.split(sia_full, [int(0.8*len(sia_full))])

In [None]:
# Build training / validation pairs and split each pair into its two sides,
# one per model input.
pairs_train, labels_train = make_pairs(sia_train)
pairs_val, labels_val = make_pairs(sia_val)
train_1 = pairs_train[:, 0] 
train_2 = pairs_train[:, 1]
val_1 = pairs_val[:, 0]
val_2 = pairs_val[:, 1]

In [None]:
# Train on the contradictory-style pairs.
# NOTE(review): `siamese` is the same model object fitted in the previous sections,
# so training continues from those weights — confirm whether a fresh model is intended.
history = siamese.fit(
    [train_1, train_2],
    labels_train,
    validation_data=([val_1, val_2], labels_val),
    batch_size=batch_size,
    epochs=epochs,
)

In [None]:
# Plot training and validation accuracy per epoch.
plt_metric(history=history.history, metric="accuracy", title="Model accuracy")

# Plot the training and validation contrastive loss per epoch.
plt_metric(history=history.history, metric="loss", title="Constrastive Loss")

In [None]:
# Start the candidate list from each test user's own positive training rows.
ali_pred = ali_test[['user_id']].drop_duplicates(subset='user_id')
ali_pred = ali_pred.merge(pos, on='user_id',how='left')
ali_pred = ali_pred.drop_duplicates(['user_id','item_id'])
# imgLink/title were already removed from ali_user before the split, so they are
# normally absent here; errors='ignore' keeps this drop from raising KeyError.
ali_pred = ali_pred.drop(['imgLink','title'],axis=1, errors='ignore')
# Users without positive rows only carry a NaN item after the left merge.
ali_pred = ali_pred[~ali_pred.item_id.isna()]

In [None]:
# Count validation rows per (category, item) and sort by that count, descending.
cate_ppl = (
    ali_val.groupby(['cate_id','item_id']).user_id.agg('count')
    .reset_index()
    .sort_values(['user_id'], ascending=False)
)
# Per category, the item list in descending-count order.
top_cate_ppl = cate_ppl.groupby('cate_id')[['item_id']].aggregate(lambda x: x.tolist()).reset_index()

# Keep at most 10 items per category. Replacing the whole column avoids
# element-wise chained-index writes.
top_cate_ppl['item_id'] = top_cate_ppl['item_id'].map(lambda items: items[:10])

top_prod_pred = ali_pred[['user_id']].drop_duplicates(['user_id'], keep='last')

# First 10 category rows (groupby order, i.e. sorted by cate_id).
top5_cate_ppl = top_cate_ppl.iloc[:10,:]
# Pair every kept category with every user. Repeating both frames and
# concatenating them side by side only yields all combinations when the two
# lengths are coprime, so an explicit cross join is used.
df_combined = top5_cate_ppl.merge(top_prod_pred, how='cross')

# One row per (category, item, user).
cate_ppl = df_combined.explode('item_id')

In [None]:
# Remove category candidates for (user, category) combinations that already
# occur in that user's validation rows.
cate_clicked = ali_val.drop_duplicates(['user_id','cate_id'])
cate_clicked = cate_clicked[cate_clicked.user_id.isin(list(set(ali_pred.user_id)))].reset_index().set_index(['user_id','cate_id'])
cate_ppl = cate_ppl.reset_index(drop=True).set_index(['user_id','cate_id'])
cate_ppl = cate_ppl[~cate_ppl.index.isin(list(cate_clicked.index))]
cate_ppl = cate_ppl.reset_index()

In [None]:
# Attach an outfit_id to the category candidates via one representative
# (outfit_id, item_id) row per outfit; items without such a row get NaN.
outfit_full = ali_user.drop_duplicates(subset='outfit_id')
outfit_full = outfit_full[['outfit_id','item_id']]
cate_ppl = pd.merge(cate_ppl,outfit_full,on='item_id',how='left')
# Own positive rows first, then the category candidates.
ali_pred = pd.concat([ali_pred, cate_ppl],ignore_index=True)

# Cap at the first 85 candidate rows per user.
ali_pred = ali_pred.groupby('user_id').head(85)

In [None]:
# Query side: for every test user, that user's first positive row.
group_a = ali_test[['user_id']].drop_duplicates(subset='user_id')
click = pos.groupby('user_id').head(1)
group_a = group_a.merge(click, on='user_id',how='left')
# Users without a positive row fall back to the most frequent training item.
ppl = ali_train.item_id.value_counts().to_frame().index.astype('str')[0]
group_a['item_id'] = group_a['item_id'].fillna(ppl)
# Keep only users that have candidates.
group_a = group_a[group_a.user_id.isin(list(set(ali_pred.user_id)))]

In [None]:
# Attach embeddings to the query and candidate sides (NaN where an item has none).
group_a = group_a.merge(img_feat, on='item_id',how='left')
ali_pred = ali_pred.merge(img_feat,on='item_id',how='left')

In [None]:
# Pair every query embedding with every candidate embedding of the same user,
# then split each pair into its two sides, one per model input.
pairs_test, labels_test = make_test_pairs(group_a, ali_pred)
test_1 = pairs_test[:, 0]
test_2 = pairs_test[:, 1]

In [None]:
# Score all test pairs in chunks and concatenate the model outputs.
predictions = []
# A dedicated name keeps the training `batch_size` (used by siamese.fit) intact.
predict_batch_size = 5000
for bucket in tqdm.tqdm(range(0, len(test_1), predict_batch_size)):
    outputs = siamese.predict(
        [test_1[bucket: bucket + predict_batch_size], test_2[bucket: bucket + predict_batch_size]]
    )
    predictions.append(outputs)
predictions = np.concatenate(predictions)

In [None]:
sia_pred = ali_pred.copy()
# NOTE(review): this assignment needs len(predictions) == len(ali_pred).
# make_test_pairs emits (group_a rows) x (ali_pred rows) pairs per user, so this
# presumably only lines up when every user has exactly one group_a row — confirm.
sia_pred['pred'] = predictions
sia_pred = sia_pred.sort_values(['user_id','pred'],ascending=False)
# NOTE(review): the ranking below orders (user_id, item_id) groups by their count of
# non-null img_embedding rows; 'pred' is not used after this point — confirm intended.
sia_pred = sia_pred.groupby(['user_id','item_id']).img_embedding.agg('count').reset_index().sort_values('img_embedding',ascending=False)
# Keep the first 20 items per user in that order.
sia_pred = sia_pred.groupby('user_id').head(20)

In [None]:
# Collapse to one row per user holding the list of recommended item_ids.
sia_pred_lst = (
    sia_pred
    .groupby('user_id')[['item_id']]
    .aggregate(lambda x: x.tolist())
)

In [None]:
# Rebuild the ground truth from the current test split: ali_test was re-drawn
# for this section, so a `true` list left over from an earlier split is stale.
test = ali_test.groupby('user_id')[['item_id']].aggregate(lambda x: x.tolist())
# The metric helpers pair the two lists by position, so both are built over the
# same users in the same order (users present in both tables).
eval_users = sia_pred_lst.index.intersection(test.index)
prediction = list(sia_pred_lst.loc[eval_users, 'item_id'])
true = list(test.loc[eval_users, 'item_id'])

In [None]:
# Recall@k for k = 5, 10, 20 (rounded to 5 decimals).
print(round(calculate_recall_at_k(true, prediction, 5),5))
print(round(calculate_recall_at_k(true, prediction, 10),5))
print(round(calculate_recall_at_k(true, prediction, 20),5))

# MAP@k for k = 5, 10, 20.
print(round(map_at_k(true, prediction, 5),5))
print(round(map_at_k(true, prediction, 10),5))
print(round(map_at_k(true, prediction, 20),5))

# NDCG@k for k = 5, 10, 20.
print(round(calculate_ndcg_at_k(true, prediction,5),5))
print(round(calculate_ndcg_at_k(true, prediction,10),5))
print(round(calculate_ndcg_at_k(true, prediction,20),5))