In [1]:
import numpy as np
import pickle
import pandas as pd
from tqdm import tqdm
from math import log
from scipy.sparse import coo_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats import mode

In [2]:
def gini_coefficient(sorted_array):
    """Return the rank-weighted Gini statistic of ``sorted_array``.

    Computes ``sum((2 * i - n - 1) * x_i) / (n * sum(x))`` where ``i`` is the
    1-based position of ``x_i`` and ``n`` is the number of values.

    The function does not sort its input; as the parameter name suggests, the
    caller is expected to pass values that are already ordered.
    """
    count = len(sorted_array)
    # 1-based positions as floats, turned into the weights 2*i - n - 1.
    positions = np.arange(count) + 1.
    weights = positions * 2 - count - 1
    weighted_total = np.sum(weights * sorted_array)
    return weighted_total / (count * np.sum(sorted_array))

In [3]:
# Dataset metadata: the unpickled object is indexed with 'num_user' and 'num_item'.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load info.pkl from a trusted source.
with open('./info.pkl', 'rb') as f:
    info = pickle.load(f)
num_user, num_item = info['num_user'], info['num_item']

# Training interactions table (the 'userId' / 'itemId' columns are read later).
train_df = pd.read_csv('./train_df.csv')

# Per-user item collections for the train and test splits; both are indexed
# by user id further down in the notebook.
train_like, test_like = (
    list(np.load(path, allow_pickle=True))
    for path in ('./user_train_like.npy', './user_test_like.npy')
)

In [4]:
# Dense (num_user x num_item) interaction matrix built from the
# (userId, itemId) pairs of train_df; every pair contributes a 1.
pos_user_array, pos_item_array = train_df['userId'].values, train_df['itemId'].values
train_mat = coo_matrix(
    (np.ones(len(pos_user_array)), (pos_user_array, pos_item_array)),
    shape=(num_user, num_item),
).toarray()

# Row sums give the interaction count per user, column sums per item.
user_pop = train_mat.sum(axis=1)
item_pop = train_mat.sum(axis=0)
# Interaction-weighted average of item_pop over each user's row; the 1e-7
# keeps the division defined for users whose row sums to zero.
user_item_pop = (train_mat * item_pop[np.newaxis, :]).sum(axis=1) / (train_mat.sum(axis=1) + 1e-7)

# Score matrix, indexed as Rec[user, item] by the ranking code below.
Rec = np.load('./Rec_Meta_Weighted_VAE_DCsim.npy')

In [5]:
# Ranking cutoffs (the k in precision@k / recall@k / NDCG@k), in ascending order.
top1, top2, top3, top4 = 1, 5, 10, 20
k_set = [top1, top2, top3, top4]
def user_precision_recall_ndcg(new_user_prediction, test):
    """Compute precision, recall and NDCG for one user at every cutoff in ``k_set``.

    Args:
        new_user_prediction: ranked item ids for the user, best first. Only
            the first ``max(k_set)`` entries are inspected; a shorter ranking
            is accepted and simply contributes fewer hits.
        test: the user's held-out item ids (any iterable with a length).

    Returns:
        A tuple ``(precision, recall, ndcg)`` of float arrays, each holding
        one value per cutoff in ``k_set``, in the same order as ``k_set``.
    """
    max_k = max(k_set)
    ranked = np.asarray(new_user_prediction)[:max_k]

    # One vectorised membership test instead of re-checking every position
    # once per cutoff.
    hits = np.isin(ranked, list(test))
    dcg_list = hits.astype(int).tolist()

    # Number of true positives among the first k recommendations.
    hit_counts = [float(hits[:k].sum()) for k in k_set]

    # precision@k divides by the cutoff itself.
    precision = [count / k for count, k in zip(hit_counts, k_set)]

    # recall@k divides by the number of held-out items; an empty test set
    # uses 1 as the denominator so the division stays defined.
    num_relevant = len(test) or 1
    recall = [count / num_relevant for count in hit_counts]

    # NDCG@k: the ideal ranking has one relevant item per held-out item.
    # A fresh ideal list is passed on every call so nothing is shared
    # between the cutoffs.
    ndcg = [NDCG_at_k(dcg_list, [1] * len(test), k) for k in k_set]

    return np.array(precision), np.array(recall), np.array(ndcg)

def NDCG_at_k(predicted_list, ground_truth, k):
    """Return NDCG@k for a list of relevance values.

    Args:
        predicted_list: relevance of each recommended item in ranked order
            (position 0 is the top recommendation).
        ground_truth: relevance values of the ideal ranking. It is not
            modified; a zero-padded copy is used when it is shorter than ``k``.
        k: cutoff; only the first ``k`` positions of each list are scored.

    Returns:
        ``DCG@k / IDCG@k`` with a ``1 / log2(position + 1)`` discount
        (positions are 1-based), or ``0.0`` when the ideal DCG is zero.
    """
    dcg = np.sum([rel / log(rank + 2, 2) for rank, rel in enumerate(predicted_list[:k])])

    # Pad a copy so the caller's list is left untouched.
    ideal = list(ground_truth[:k])
    ideal.extend([0] * (k - len(ideal)))
    idcg = np.sum([rel / log(rank + 2, 2) for rank, rel in enumerate(ideal)])

    # No relevant item in the ideal ranking: avoid 0/0 and report 0.
    if idcg == 0:
        return 0.0
    return dcg / idcg

In [6]:
# Per-user metric rows; each appended entry holds one value per cutoff in k_set.
user_precision, user_recall, user_ndcg = [], [], []

for u in tqdm(range(num_user)):
    # Give the user's training items a very low score so they cannot be
    # recommended again. Only row u is touched, so this can be done per user.
    Rec[u, train_like[u]] = -100000.0
    scores = Rec[u, :]

    # Pick the 20 highest-scoring items, then order them by descending score.
    top_iid = np.argpartition(scores, -20)[-20:]
    top_iid = top_iid[np.argsort(scores[top_iid])[::-1]]

    # calculate the metrics
    if len(test_like[u]) == 0:
        # No held-out items: store -1 placeholders (one shared list object).
        precision_u = recall_u = ndcg_u = [-1, -1, -1, -1]
    else:
        precision_u, recall_u, ndcg_u = user_precision_recall_ndcg(top_iid, test_like[u])
    user_precision.append(precision_u)
    user_recall.append(recall_u)
    user_ndcg.append(ndcg_u)

100%|██████████| 6040/6040 [00:01<00:00, 3636.19it/s]


In [7]:
# Per-user NDCG at the last cutoff in k_set (column index 3); users without
# test items carry the -1 placeholder stored by the evaluation loop.
ndcg = np.array(user_ndcg)[:, 3]

In [8]:
# Mean NDCG per cutoff across all users.
# NOTE(review): users with no test items were stored as -1 rows by the
# evaluation loop, so they are part of this average — confirm that is intended.
np.mean(user_ndcg, axis=0)

array([0.40198675, 0.35519147, 0.33807983, 0.33396327])

In [9]:
# Precomputed arrays loaded from disk. MSV_DeepSVDD is later argsorted and the
# result used to index the per-user NDCG vector, so these are presumably one
# value per user — TODO confirm the shape and meaning of each file.
MSV_Jsim = np.load('./MSV_Jsim.npy')
MSV_LOF = np.load('./MSV_LOF.npy')
MSV_Asim = np.load('./MSV_Asim.npy')
MSV_DeepSVDD = np.load('./MSV_DeepSVDD.npy')

In [10]:
def distribution_plot(x, y, bins=20):
    """Print the mean of ``y`` within ``bins`` equal-width bins of ``x``.

    Despite its name the function draws nothing: it prints a list with one
    mean per bin and returns ``None``.

    Args:
        x: 1-D array of values that define the bins; the range
            ``[min(x), max(x)]`` is split into ``bins`` equal-width intervals.
        y: 1-D array aligned with ``x``; its values are averaged per bin.
        bins: number of bins.

    Bins are half-open ``[start, end)`` so a value lying exactly on an
    interior edge is counted once; the last bin also includes everything up
    to ``max(x)``. An empty bin is reported as ``0``.
    """
    x_min = np.min(x)
    x_max = np.max(x)
    step = (x_max - x_min) / bins

    mean_array = []
    for i in range(bins):
        start = x_min + step * i
        if i == bins - 1:
            # Last bin: closed on the right, so max(x) is never dropped by
            # floating-point rounding of the computed upper edge.
            in_bin = x >= start
        else:
            in_bin = (x >= start) & (x < x_min + step * (i + 1))
        tmp = y[in_bin]
        # Plain floats keep the printed list identical across NumPy versions.
        mean_array.append(float(np.mean(tmp)) if len(tmp) > 0 else 0)
    print(mean_array)
    

In [11]:
# Suffixes of the './Rec_Meta_Weighted_VAE_DCsim_<value>.npy' score files
# evaluated by the sweep loop below.
# NOTE(review): the integer and fractional values presumably belong to two
# different hyper-parameter sweeps — TODO confirm.
num = [10, 20, 40, 50, 60, 70, 80, 90, 0.08, 0.11, 0.14, 0.17, 0.23, 0.26, 0.29, 0.32]

In [12]:
# Evaluate every score file named by `num`: print the suffix, the mean NDCG
# per cutoff, and the mean NDCG@k_set[3] within 5 equal-width rank bins of
# users ordered by MSV_DeepSVDD.
for i, tag in enumerate(num):
    print(tag)
    Rec = np.load('./Rec_Meta_Weighted_VAE_DCsim_' + str(tag) + '.npy')

    user_precision, user_recall, user_ndcg = [], [], []
    for u in range(num_user):
        # Give the user's training items a very low score so they are not
        # recommended again; only row u is affected.
        Rec[u, train_like[u]] = -100000.0
        scores = Rec[u, :]

        # 20 highest-scoring items, ordered by descending score.
        top_iid = np.argpartition(scores, -20)[-20:]
        top_iid = top_iid[np.argsort(scores[top_iid])[::-1]]

        # calculate the metrics
        if len(test_like[u]) == 0:
            # No held-out items: store -1 placeholders (one shared list object).
            precision_u = recall_u = ndcg_u = [-1, -1, -1, -1]
        else:
            precision_u, recall_u, ndcg_u = user_precision_recall_ndcg(top_iid, test_like[u])
        user_precision.append(precision_u)
        user_recall.append(recall_u)
        user_ndcg.append(ndcg_u)

    # Per-user NDCG at the last cutoff, then the mean over users per cutoff.
    ndcg = np.array(user_ndcg)[:, 3]
    print(np.mean(user_ndcg, axis=0))

    # Order users by ascending MSV_DeepSVDD value and summarise NDCG by rank bin.
    user_sort_idx = np.argsort(MSV_DeepSVDD)
    distribution_plot(np.arange(num_user), ndcg[user_sort_idx], bins=5)
    # NOTE(review): the value returned here is neither stored nor printed, and
    # the array passed in is ordered by MSV_DeepSVDD rather than by NDCG —
    # confirm what this call is meant to report.
    gini_coefficient(ndcg[user_sort_idx])
    print('')

10
[0.3986755  0.35069208 0.33366668 0.33120702]
[0.24767185623617496, 0.28439327347530396, 0.2933603794511913, 0.342553392291518, 0.488056195892837]

20
[0.40049669 0.355382   0.33896416 0.33568521]
[0.2522690939967766, 0.2870789792672504, 0.2973870542263071, 0.3466855609922721, 0.4950053557626626]

40
[0.39834437 0.35718986 0.34058535 0.33808343]
[0.2568219914378601, 0.2897855586017568, 0.2980710030028633, 0.3500312783201907, 0.49570733534541783]

50
[0.40281457 0.35946006 0.34161787 0.33922607]
[0.26063604169277044, 0.29025103369867006, 0.2982474612045863, 0.3516962701133355, 0.4952995314083472]

60
[0.40115894 0.35765629 0.34084493 0.33952147]
[0.26164440791901883, 0.28995528801383014, 0.29951063396617267, 0.35262089056544477, 0.4938761123111245]

70
[0.39884106 0.35627887 0.33990769 0.33875733]
[0.2615703558663046, 0.2901285422831612, 0.2987570948541314, 0.35148352112284414, 0.4918471426788819]

80
[0.3968543  0.35579184 0.33836187 0.33787874]
[0.26043054222549566, 0.2900164374446