In [1]:
SAVE_NAME = 'MRM_ALL_Embedding200_L2_retrain'

In [2]:
import os
import sys
import pandas as pd
import numpy as np
import random
import time
import tensorflow as tf
import math
from IPython.display import clear_output

In [3]:
# Helper function
def newPath(path):
    if not os.path.isdir(path):
        os.mkdir(path)
        
def writeProgress(msg, count, total):
    sys.stdout.write(msg + "{:.2%}\r".format(count/total))
    sys.stdout.flush()

In [4]:
def relu(x):
    return np.maximum(0,x)  

def softmax(x):
    exp_x = np.exp(x)
    softmax_x = exp_x / np.sum(exp_x)
    return softmax_x 

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Load numpy array

In [5]:
all_npy = np.load('./npy/all_4876.npy')
movie_genre = np.load('./npy/movie_genre.npy')
usr_following = np.load('./npy/user_followings.npy')
usr_genre = np.load('./npy/user_genre.npy')

print('All features:', all_npy.shape)
print('Movie genre:', movie_genre.shape)
print('User following:', usr_following.shape)
print('User genre:', usr_genre.shape)

All features: (165, 4876)
Movie genre: (165, 20)
User following: (1582, 165)
User genre: (1582, 20)


In [6]:
usr_nb = len(usr_following) # the number of users
movie_nb = len(movie_genre)  # the number of movies

print(usr_nb, movie_nb)

usr_test_amount = 150
movie_test_amount = 32

print(usr_test_amount, movie_test_amount)

latent_dim = 128 # latent dims
ft_dim = all_npy.shape[1] # feature dims
embedding_dims = 200

print(latent_dim, ft_dim, embedding_dims)

1582 165
150 32
128 4876 200


### Normalize usr_genre

In [7]:
usr_genre_norm = np.zeros(usr_genre.shape)
for i in range(len(usr_genre)):
    usr_genre_norm[i] = usr_genre[i]/np.max(usr_genre[i])
print(usr_genre_norm.shape)
# print('Before:', usr_genre)
# print('After:', usr_genre_norm)

(1582, 20)


# Training & testing split

## Prepare

In [8]:
#The number of followers for each movie
moive_followers = np.sum(usr_following, axis=0)
# print(moive_followers)

print('Min number of followers:', np.min(moive_followers))
print('Max number of followers:', np.max(moive_followers))
print('Avg of followers:', np.mean(moive_followers))

asc = np.sort(moive_followers)
# print(asc)
desc = np.flip(asc)
# print(desc)

over5 = 0
for num in moive_followers:
    if num >= 5:
        over5 += 1
print('The num of followers over 5:', over5)

Min number of followers: 1
Max number of followers: 520
Avg of followers: 142.0969696969697
The num of followers over 5: 163


In [9]:
print('Over 50:', np.sum(moive_followers >= 50))
print('Over 100:', np.sum(moive_followers >= 100))
print('Over 150:', np.sum(moive_followers >= 150))
print('Over 200:', np.sum(moive_followers >= 200))
print('Over 250:', np.sum(moive_followers >= 250))
print('Over 300:', np.sum(moive_followers >= 300))

Over 50: 125
Over 100: 89
Over 150: 58
Over 200: 42
Over 250: 31
Over 300: 21


In [10]:
over200_idx = np.nonzero(moive_followers >= 200)[0]
print(over200_idx.shape, over200_idx)

random.seed(42)
movie_test_idx = sorted(random.sample(list(over200_idx), movie_test_amount))
print(len(movie_test_idx), movie_test_idx) # 32 [0, 2, 3, 12, 24, 28, 30, 44, 49, 55, 57, 58, 60, 66, 78, 80, 81, 84, 86, 87, 102, 112, 119, 122, 123, 125, 127, 128, 129, 144, 161, 164]

(42,) [  0   2   3   4   9  12  24  28  30  34  40  44  49  55  57  58  60  66
  68  78  80  81  84  86  87  99 101 102 112 119 122 123 125 126 127 128
 129 134 144 156 161 164]
32 [0, 2, 3, 12, 24, 28, 30, 44, 49, 55, 57, 58, 60, 66, 78, 80, 81, 84, 86, 87, 102, 112, 119, 122, 123, 125, 127, 128, 129, 144, 161, 164]


In [11]:
#The number of following movie for each user
each_user = np.sum(usr_following, axis=1)
# print(each_user)

print('Min number of followings:', np.min(each_user))
print('Max number of followings:', np.max(each_user))
print('Avg of followers:', np.mean(each_user))

asc = np.sort(each_user)
# print(each_user)
# print(asc)
desc = np.flip(asc)
# print(desc)

Min number of followings: 10
Max number of followings: 133
Avg of followers: 14.820480404551201


In [12]:
print('Over 10:', np.sum(each_user >= 10))
print('Over 12:', np.sum(each_user >= 12))
print('Over 14:', np.sum(each_user >= 14))
print('Over 16:', np.sum(each_user >= 16))
print('Over 18:', np.sum(each_user >= 18))
print('Over 20:', np.sum(each_user >= 20))

Over 10: 1582
Over 12: 937
Over 14: 613
Over 16: 440
Over 18: 315
Over 20: 229


In [13]:
usr_idx = [i for i in range(len(usr_following))]
print(len(usr_idx))

random.seed(42)
test_idx = sorted(random.sample(usr_idx, usr_test_amount))
print(len(test_idx), test_idx[:10]) # 150 [13, 51, 54, 61, 65, 88, 93, 96, 114, 130]

1582
150 [13, 51, 54, 61, 65, 88, 93, 96, 114, 130]


## Setup 

In [14]:
# init
train_t = []
train_f = []
test_t = []
test_f = []

for i in range(usr_nb):
    # init
    t_for_train = []
    f_for_train = []
    t_for_test = []
    f_for_test = []
    
    if i not in test_idx: #if not in test id, just append it to true or false list
        for j in range(movie_nb):
            if usr_following[i][j] == 1:
                t_for_train.append(j)
            else:
                f_for_train.append(j)
                
        train_t.append(t_for_train)
        train_f.append(f_for_train)
#         print(len(t_for_train) + len(f_for_train))
        
    else: #if in test id, choose half of true and other 
        temp_t = []
        temp_f = []
        
        for j in range(movie_nb):
            if usr_following[i][j] == 1:
                temp_t.append(j)
            else:
                temp_f.append(j)
        
        # random choose half true and half false for test 
        t_for_test = random.sample(temp_t, math.ceil(0.5*len(temp_t)))
        f_for_test  = random.sample(temp_f, movie_test_amount-len(t_for_test))
        
        test_t.append(t_for_test)
        test_f.append(f_for_test)
        
        #the others for training
        t_for_train = [item for item in temp_t if not item in t_for_test]
        f_for_train = [item for item in temp_f if not item in f_for_test]
        train_t.append(t_for_train)
        train_f.append(f_for_train)
        
    if not (len(t_for_train) + len(f_for_train) + len(t_for_test) + len(f_for_test)) == movie_nb:
        print('Error!!!')
        break

In [15]:
print('The length of train_t:',len(train_t))
print('The length of train_f:',len(train_f))
print('The length of test_t:',len(test_t))
print('The length of test_f:',len(test_f))

The length of train_t: 1582
The length of train_f: 1582
The length of test_t: 150
The length of test_f: 150


## Stat

In [16]:
#average num of following for training user
total_train = 0
for t in train_t:
    total_train += len(t)
avg = total_train / usr_nb
print('Training:', avg)

#average num of following for testing user
total_test = 0
for t in test_t:
    total_test += len(t)
avg = total_test / usr_test_amount
print('Testing:', avg)

Training: 14.139064475347661
Testing: 7.1866666666666665


In [17]:
all_auxilary = [i for i in range(movie_nb)]

# Get latent factor and Each weight

In [18]:
# reload params if crash
params = np.load('./weight/' + SAVE_NAME + '.npz')
print(params)
U = params['U']
Y = params['Y']
A = params['A']
E = params['E']
Au = params['Wu']
Ay = params['Wy']
Aa = params['Wa']
Av = params['Wv']
B = params['B']

print('User latent shape: ',U.shape)
print('photo latent shape: ', Y.shape)
print('Auxilary latent shape: ',A.shape)
print('Embedding shape:', E.shape)
print('Wu weight shape:', Au.shape)
print('Wy weight shape:', Ay.shape)
print('Wa weight shape:', Aa.shape)
print('Wv weight shape:', Av.shape)
print('Beta shape:',B.shape)

<numpy.lib.npyio.NpzFile object at 0x7f10159fdc88>
User latent shape:  (1582, 128)
photo latent shape:  (165, 128)
Auxilary latent shape:  (165, 128)
Embedding shape: (200, 4876)
Wu weight shape: (1582, 165, 128)
Wy weight shape: (1582, 165, 128)
Wa weight shape: (1582, 165, 128)
Wv weight shape: (1582, 165, 200)
Beta shape: (1582, 200)


# Testing Part

In [19]:
'''
usr_test_amount = 150
movie_test_amount = 32
'''

#with Embedding
result = np.zeros((usr_test_amount, movie_nb))
RS = np.zeros((usr_test_amount, movie_nb))

#test_idx --> Test 的 index length = 150
test_yes_id = []

for s in range(usr_test_amount):
    print(s, test_idx[s])

    yes = []
    sample = random.sample(train_t[test_idx[s]],len(train_t[test_idx[s]]))
    #sample=result_yes_id[now]
    test_yes_id.append(sample)
    alpha = np.zeros([len(sample)])
    
    for a in range(len(sample)):
        r = np.max(movie_genre[sample[a]] * usr_genre_norm[test_idx[s]]) #sample a 的category vec *user_category vec
        
# #         ''' Observe each part in attention
#         WuUu = np.sum(np.dot(Au[test_idx[s]],np.expand_dims(U[test_idx[s]],0).T))
#         WyYy = np.sum(np.dot(Ay[sample[a]],np.expand_dims(Y[sample[a]],0).T))
#         WaAa = np.sum(np.dot(Aa[test_idx[s]],np.expand_dims(A[sample[a]],0).T))
#         WvVy = np.sum(np.dot(np.dot(Av[test_idx[s]], E),np.expand_dims(all_npy[sample[a]],0).T))
#         print('The sum of each par -->',
#               '\nw1:',testW1,
#               '\nWuU:',WuUu,
#               '\nwyY:',WyYy,
#               '\nWaA:',WaAa,
#               '\nWvV:',WvVy)
# #         '''
        
        alpha[a] = np.sum((relu(np.dot(Au[test_idx[s]],np.expand_dims(U[test_idx[s]],0).T) +
                                np.dot(Ay[sample[a]],np.expand_dims(Y[sample[a]],0).T) +
                                np.dot(Aa[test_idx[s]],np.expand_dims(A[sample[a]],0).T) +
                                np.dot(np.dot(Av[test_idx[s]], E),np.expand_dims(all_npy[sample[a]],0).T))))*r
        
    mul = np.zeros((1,latent_dim))
    
    print("{:<15}{}".format('alpha:', alpha))
    print("{:<15}{}".format('softmax alpha:', softmax(alpha)))
    print('==================================================')
    
    for i in range(len(sample)):
        mul += alpha[i] * A[sample[i]] #attention alpha * Ai part 
    new_mul = mul + U[test_idx[s]]  #(U+auxilary)
    
    for k in range(movie_nb):
        result[s][k] = np.dot(new_mul,Y[k].T) #(U+auxilary)*photo latent factor
        RS[s][k] = np.dot(new_mul,Y[k].T) + np.dot(B[test_idx[s]], np.dot(E, all_npy[k].T))

0 13
alpha:         [0.02774524 0.02953485 0.08315664 0.02260261 0.03408557 0.07617806
 0.00969344]
softmax alpha: [0.14100926 0.14126184 0.14904332 0.14028597 0.14190615 0.14800683
 0.13848663]
1 51
alpha:         [0.19391839 0.07012048 0.13753379 0.03810934 0.02937639 0.09551304]
softmax alpha: [0.18385665 0.16244808 0.17377681 0.15733028 0.15596231 0.16662587]
2 54
alpha:         [0.08465813 0.06927584 0.07905364 0.05089013 0.07589684 0.11613613]
softmax alpha: [0.16752669 0.16496947 0.16659042 0.1619641  0.16606535 0.17288397]
3 61
alpha:         [0.28550511 0.23000379 0.16383496 0.25330815 0.22539796]
softmax alpha: [0.21090681 0.19952011 0.18674541 0.2042244  0.19860327]
4 65
alpha:         [0.13224631 0.09930894 0.14849889 0.08164523 0.06131635 0.07902678]
softmax alpha: [0.17198853 0.16641595 0.17480662 0.16350224 0.16021198 0.16307468]
5 88
alpha:         [0.02002347 0.02278649 0.00864983 0.02789286 0.00745877 0.03862086
 0.01520124 0.00779309 0.03073421]
softmax alpha: [0.111

In [20]:
#取出test的資料
print(RS.shape)

testRS = np.zeros((usr_test_amount, movie_test_amount)) #shape 150 * 32
target = np.zeros((usr_test_amount, movie_test_amount)) #shape 150 * 32
        
for z in range(usr_test_amount):
    user_id = test_idx[z]
    # positive target YouTuber list
    youtube_t = test_t[z] 
    # not target YouTuber list
    youtube_f = test_f[z]
    
#     print(user_id)
#     print(youtube_t)
#     print(youtube_f)
    
    #前面放target的RS
    for i in range(len(youtube_t)):
        testRS[z][i] = RS[z][youtube_t[i]]
        target[z][i] = 1
        
    for i in range(len(youtube_f)):
        testRS[z][i+len(youtube_t)] = RS[z][youtube_f[i]]
    
#     print(testRS[z])
#     print(target[z])
#     print('==============================')

(150, 165)


In [21]:
print(target.shape, testRS.shape)
sumtarget = np.sum(target)
print('num of positive data in testing:', sumtarget) # whole matrix: 4800

(150, 32) (150, 32)
num of positive data in testing: 1078.0


# Metrics

## Top N

In [22]:
def F1_score(prec,rec):
    f1 = (2*prec*rec)/(prec+rec)
    return f1

def topN(RSls, n):
    maxn = np.argsort(RSls)[::-1][:n]
    return maxn

## NDCG

In [23]:
all_sort = []

for i in range(usr_test_amount):
    all_sort.append(topN(list(testRS[i]),len(testRS[i])))
    
all_sort = np.asarray(all_sort)
print(all_sort.shape)

(150, 32)


In [24]:
def DCG(prec_list): #找出前n名的[1,1,1,0,...]
    dcg = 0
    for i in range(len(prec_list)):
        dcg += (2**prec_list[i]-1)/math.log2(i+2)
    return dcg

def NDCG(target, testRS, num_ndcg): #target是真正的喜好
    total_ndcg = 0
    
    for m in range(usr_test_amount): # the number of testing users
#         print(target[m][:num_ndcg])
        idcg = DCG(target[m][:num_ndcg])
        
        pre_list = []
        for s in all_sort[m][:num_ndcg]:
            #print(m,s,target[m][s])
            pre_list.append(target[m][s]) #把prec_list 的 score加進去
#         print(pre_list)
        dcg = DCG(pre_list)
        ndcg = dcg/idcg
#         print(ndcg)
        total_ndcg += ndcg
        
    avg_ndcg = total_ndcg/usr_test_amount
    return avg_ndcg

## MAP

In [25]:
from sklearn.metrics import average_precision_score

def MAP(target,testRS):
    total_prec = 0
    for u in range(usr_test_amount):
        y_true = target[u]
        y_scores = testRS[u]
        total_prec += average_precision_score(y_true, y_scores)
        
    Map_value = total_prec/usr_test_amount
    
    return Map_value

## Print out results

In [26]:
# Top N
N = [1, 3, 5]
correct = 0

for n in N:
    print('Top', n)
    
    for i in range(len(testRS)):
        topn = topN(testRS[i], n)
        sum_target = int(np.sum(target[i]))
        
        TP = 0
        for i in topn:
            if i < sum_target:
                TP += 1
                
        correct += TP

    print('Num of TP:', correct)

    prec = correct/(len(testRS)*n)
    recall = correct/sumtarget
    
    print('prec:', prec)
    print('recall:', recall)
    print('F1_score:', F1_score(prec, recall))
    
    print('*****')

print('\n==============================\n')

# NDCG
num_ndcgs = [10]
for num_ndcg in num_ndcgs:
    print('NDCG@', num_ndcg)
    print('NDCG score:', NDCG(target, testRS, num_ndcg))
    print('*****')

print('\n==============================\n')

# MAP
print('MAP:', MAP(target,testRS))

Top 1
Num of TP: 69
prec: 0.46
recall: 0.0640074211502783
F1_score: 0.11237785016286646
*****
Top 3
Num of TP: 255
prec: 0.5666666666666667
recall: 0.23654916512059368
F1_score: 0.3337696335078534
*****
Top 5
Num of TP: 526
prec: 0.7013333333333334
recall: 0.48794063079777367
F1_score: 0.575492341356674
*****


NDCG@ 10
NDCG score: 0.4160738701382943
*****


MAP: 0.4069408103797797
