In [1]:
import os
import sys
import json
import pandas as pd
import numpy as np
import random
import time
import tensorflow as tf
import math
from IPython.display import clear_output

from helper import lineNotify as LN
from helper import writeProgress, newPath, write_json

In [2]:
def relu(x):
    """Rectified linear unit: element-wise maximum of ``x`` and zero."""
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified

def softmax(x):
    """Return the softmax of ``x``, normalised over ALL elements of the input.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps ``np.exp`` from overflowing to ``inf``
    (and the quotient from becoming ``nan``) when ``x`` holds large values.

    Args:
        x: array-like of real numbers.

    Returns:
        Array with the same shape as ``x`` whose entries sum to 1; an empty
        array when ``x`` is empty.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(x)
    exp_x = np.exp(x - np.max(x))
    return exp_x / np.sum(exp_x)

def sigmoid(x):
    """Logistic function ``1 / (1 + e^-x)``, applied element-wise."""
    decay = np.exp(-x)
    denominator = 1 + decay
    return 1 / denominator

# Load numpy arrays

In [3]:
# Read the precomputed matrices from ./npy and report their shapes.
all_npy = np.load('./npy/all_2372.npy')
movie_genre = np.load('./npy/movie_genre.npy')
usr_following = np.load('./npy/user_followings.npy')
# Previously loaded from './npy/user_genre.npy'; the "like" variant is used now.
usr_genre = np.load('./npy/user_genre_like.npy')

print(
    *(
        '{} {}'.format(label, matrix.shape)
        for label, matrix in (
            ('All features:', all_npy),
            ('Movie genre:', movie_genre),
            ('User following:', usr_following),
            ('User genre:', usr_genre),
        )
    ),
    sep='\n',
)

All features: (165, 2372)
Movie genre: (165, 20)
User following: (1582, 165)
User genre: (1582, 20)


In [4]:
# Dataset sizes come straight from the first axis of the loaded matrices.
usr_nb = usr_following.shape[0]  # the number of users
movie_nb = movie_genre.shape[0]  # the number of movies
print(usr_nb, movie_nb)

# How many users are held out for testing, and how many movies each of those
# users is evaluated on.
usr_test_amount, movie_test_amount = 150, 82
print(usr_test_amount, movie_test_amount)

# Model dimensions: latent factor size and raw feature vector length.
latent_dim = 64
ft_dim = all_npy.shape[1]
# embedding_dims = 260
print(latent_dim, ft_dim)

1582 165
150 82
64 2372


### Normalize usr_genre

In [5]:
# Scale every user's genre counts by that user's largest count.
# A row whose maximum is 0 would divide by zero and fill the row with NaN, so
# those rows are left at 0 via the `where` mask (`out` supplies the zeros).
row_max = np.max(usr_genre, axis=1, keepdims=True)
usr_genre_norm = np.divide(
    usr_genre,
    row_max,
    out=np.zeros(usr_genre.shape),
    where=row_max != 0,
)
print(usr_genre_norm.shape)
print('Before:', usr_genre)
print('After:', usr_genre_norm)

(1582, 20)
Before: [[ 2  2  0 ...  1  0  0]
 [34 49 14 ...  1  0  0]
 [10 17 17 ...  4  0  0]
 ...
 [79 43  0 ...  8 12  0]
 [ 6  6  0 ...  0  1  0]
 [67 53  1 ...  3  5  0]]
After: [[0.05263158 0.05263158 0.         ... 0.02631579 0.         0.        ]
 [0.41463415 0.59756098 0.17073171 ... 0.01219512 0.         0.        ]
 [0.24390244 0.41463415 0.41463415 ... 0.09756098 0.         0.        ]
 ...
 [0.30620155 0.16666667 0.         ... 0.03100775 0.04651163 0.        ]
 [0.24       0.24       0.         ... 0.         0.04       0.        ]
 [0.74444444 0.58888889 0.01111111 ... 0.03333333 0.05555556 0.        ]]


# Training & testing split

## Prepare

In [6]:
# Followers per movie = column sums of the user x movie following matrix.
moive_followers = usr_following.sum(axis=0)

print('Min number of followers:', moive_followers.min())
print('Max number of followers:', moive_followers.max())
print('Avg of followers:', moive_followers.mean())

Min number of followers: 1
Max number of followers: 520
Avg of followers: 142.0969696969697


In [7]:
# Number of movies each user follows = row sums of the following matrix.
each_user = np.sum(usr_following, axis=1)
# print(each_user)

print('Min number of followings:', np.min(each_user))
print('Max number of followings:', np.max(each_user))
# This is the mean of followings per user; the label used to say "followers",
# which is the per-movie statistic reported in the previous cell.
print('Avg of followings:', np.mean(each_user))

Min number of followings: 10
Max number of followings: 133
Avg of followers: 14.820480404551201


In [8]:
# Candidate pool: every user row index.
usr_idx = list(range(len(usr_following)))
print(len(usr_idx))

# Fixed seed so the same users are held out on every run.
random.seed(42)
test_idx = random.sample(usr_idx, usr_test_amount)
test_idx.sort()
print(len(test_idx), test_idx[:10])
# 150 [13, 51, 54, 61, 65, 88, 93, 96, 114, 130]

1582
150 [13, 51, 54, 61, 65, 88, 93, 96, 114, 130]


## Setup 

In [13]:
# Build, for every user, the lists of followed ("t") and not-followed ("f")
# movie indices. Users in test_idx additionally have part of each list moved
# into the test split.
random.seed(42)
train_t, train_f = [], []
test_t, test_f = [], []

for i in range(usr_nb):
    following_row = usr_following[i]
    temp_t = [movie for movie in range(movie_nb) if following_row[movie] == 1]
    temp_f = [movie for movie in range(movie_nb) if following_row[movie] != 1]

    if i in test_idx:
        # Held-out user: half of the followed movies (rounded up) go to the
        # test split, topped up with random non-followed movies so the test
        # set has movie_test_amount entries.
        t_for_test = random.sample(temp_t, math.ceil(0.5*len(temp_t)))
        f_for_test = random.sample(temp_f, movie_test_amount-len(t_for_test))

        test_t.append(t_for_test)
        test_f.append(f_for_test)

        # Whatever was not drawn for testing stays in the training split.
        t_for_train = [movie for movie in temp_t if movie not in t_for_test]
        f_for_train = [movie for movie in temp_f if movie not in f_for_test]
    else:
        # Training-only user: everything goes to the training split.
        t_for_test, f_for_test = [], []
        t_for_train, f_for_train = temp_t, temp_f

    train_t.append(t_for_train)
    train_f.append(f_for_train)

    # Sanity check: the four lists must partition all movies.
    accounted = len(t_for_train) + len(f_for_train) + len(t_for_test) + len(f_for_test)
    if accounted != movie_nb:
        print('Error!!!')
        break

In [10]:
# # init
# random.seed(42)
# train_t = []
# train_f = []
# test_t = []
# test_f = []
# all_movie = [i for i in range(movie_nb)]

# for i in range(usr_nb):
#     # init
#     t_for_train = []
#     f_for_train = []
#     t_for_test = []
#     f_for_test = []
    
#     if i not in test_idx: #if not in test id, just append it to true or false list
#         for j in range(movie_nb):
#             if usr_following[i][j] == 1:
#                 t_for_train.append(j)
#             else:
#                 f_for_train.append(j)
                
#         train_t.append(t_for_train)
#         train_f.append(f_for_train)
# #         print(len(t_for_train) + len(f_for_train))
        
#     else: #if in test id, choose half of true and other 
#         test = random.sample(all_movie, movie_test_amount)
# #         print('Row:', len(test), sum(test))
#         for j in range(movie_nb):
#             if j in test:
#                 if usr_following[i][j] == 1:
#                     t_for_test.append(j)
#                 else:
#                     f_for_test.append(j)
#             else:
#                 if usr_following[i][j] == 1:
#                     t_for_train.append(j)
#                 else:
#                     f_for_train.append(j)
                    
#         test_t.append(t_for_test)
#         test_f.append(f_for_test)
        
#         train_t.append(t_for_train)
#         train_f.append(f_for_train)
        
#     if not (len(t_for_train) + len(f_for_train) + len(t_for_test) + len(f_for_test)) == movie_nb:
#         print('Error!!!')
#         break

In [14]:
# Report how many per-user lists each split holds.
print(
    *(
        '{} {}'.format(label, len(split))
        for label, split in (
            ('The length of train_t:', train_t),
            ('The length of train_f:', train_f),
            ('The length of test_t:', test_t),
            ('The length of test_f:', test_f),
        )
    ),
    sep='\n',
)

The length of train_t: 1582
The length of train_f: 1582
The length of test_t: 150
The length of test_f: 150


## Statistics

In [15]:
# Mean number of followed movies kept in the training lists (over all users).
total_train = sum(map(len, train_t))
avg = total_train / usr_nb
print('Training:', avg)

# Mean number of followed movies held out per test user.
total_test = sum(map(len, test_t))
avg = total_test / usr_test_amount
print('Testing:', avg)
'''
Training: 14.54677623261694
Testing: 2.8866666666666667
'''

Training: 14.139064475347661
Testing: 7.1866666666666665


'\nTraining: 14.54677623261694\nTesting: 2.8866666666666667\n'

In [16]:
# Every movie index, 0 .. movie_nb-1.
all_auxilary = list(range(movie_nb))

# Recommendation model

# Testing Part

# Metrics

In [17]:
# Top N
def F1_score(prec,rec):
    """Harmonic mean of precision and recall.

    Args:
        prec: precision value.
        rec: recall value.

    Returns:
        ``2 * prec * rec / (prec + rec)``, or ``0.0`` when both inputs are
        zero (no true positives), where the formula would divide by zero.
    """
    if prec + rec == 0:
        return 0.0
    f1 = 2*((prec*rec)/(prec+rec))
    return f1

def topN(RSls, n):
    """Return the indices of the ``n`` highest scores in ``RSls``, best first."""
    ascending = np.argsort(RSls)
    descending = ascending[::-1]
    return descending[:n]

# NDCG
def allSortPrepare(testRS):
    """Rank every column of each test user's score row, best first.

    Processes the first ``usr_test_amount`` rows of ``testRS``, prints the
    shape of the resulting index array and returns it.
    """
    rankings = [
        topN(list(testRS[user]), len(testRS[user]))
        for user in range(usr_test_amount)
    ]
    all_sort = np.asarray(rankings)
    print(all_sort.shape)
    return all_sort

def DCG(prec_list):
    """Discounted cumulative gain of a ranked relevance list.

    ``prec_list`` holds the relevance of the items in ranked order, e.g.
    ``[1, 1, 1, 0, ...]``. The item at 0-based position ``p`` contributes
    ``(2**rel - 1) / log2(p + 2)``.
    """
    dcg = 0
    for position, relevance in enumerate(prec_list):
        gain = 2**relevance - 1
        discount = math.log2(position + 2)
        dcg += gain/discount
    return dcg

def NDCG(target, testRS, num_ndcg, all_sort):
    """Mean NDCG@``num_ndcg`` over the test users.

    Args:
        target: per-user ground-truth relevance rows (the true preferences).
        testRS: per-user predicted scores; unused here, the ranking is taken
            from ``all_sort``. Kept for call compatibility.
        num_ndcg: cut-off rank.
        all_sort: per-user column indices ordered by descending score.

    Returns:
        The mean of ``DCG / IDCG`` over users that have at least one relevant
        item within the cut-off; ``nan`` when no user qualifies.

    Iterates over the first ``usr_test_amount`` users (notebook global) and
    prints how many users contributed to the mean.
    """
    total_ndcg = []

    for m in range(usr_test_amount): # the number of testing users
        # The ideal ranking puts the most relevant items first. Sort the
        # ground truth instead of trusting its stored order, which is only
        # ideal when the positives happen to occupy the leading columns.
        ideal_order = np.sort(target[m])[::-1]
        idcg = DCG(ideal_order[:num_ndcg])

        # Relevance of the items the model actually ranked in the top num_ndcg.
        pre_list = [target[m][s] for s in all_sort[m][:num_ndcg]]
        dcg = DCG(pre_list)

        if idcg == 0:
            # No relevant item for this user: NDCG is undefined, so skip it.
            continue
        total_ndcg.append(dcg/idcg)

    print('len(total_ndcg):', len(total_ndcg))
    if not total_ndcg:
        # np.mean([]) would also give nan, but with a RuntimeWarning.
        return np.nan
    avg_ndcg = np.mean(total_ndcg)
    return avg_ndcg

# MAP
from sklearn.metrics import average_precision_score

def MAP(target,testRS):
    """Mean average precision over the first ``usr_test_amount`` users.

    ``target`` holds the ground-truth labels per user and ``testRS`` the
    predicted scores; each user's average precision comes from
    ``average_precision_score``.
    """
    ap_sum = 0
    for user in range(usr_test_amount):
        labels, scores = target[user], testRS[user]
        ap_sum += average_precision_score(labels, scores)

    return ap_sum/usr_test_amount

In [18]:
def metrics(testRS, target, sumtarget, all_sort):
    """Print Top-N precision/recall/F1 and NDCG, then send the summary via LN.

    Args:
        testRS: per-user predicted scores for the evaluated movies.
        target: per-user ground-truth labels aligned column-wise with testRS.
        sumtarget: total number of positives in ``target`` (recall denominator).
        all_sort: per-user column indices ordered by descending score, as
            produced by ``allSortPrepare``.

    Returns:
        None. Results are printed and the collected text is passed to ``LN``.
    """
    msg = ''
    print('\n==============================\n')
    # Top N
    N = [1, 3]

    for n in N:
        print('Top', n)
        correct = 0

        for row in range(len(testRS)):
            # Count a hit by looking the recommended column up in the ground
            # truth rather than relying on positives being stored first.
            for col in topN(testRS[row], n):
                if target[row][col] > 0:
                    correct += 1

        prec = correct/(len(testRS)*n) # users * n recommendations
        recall = correct/sumtarget
        f1 = F1_score(prec, recall)
        print('TP:', correct)
        print('prec:', prec)
        print('recall:', recall)
        print('F1_score:', f1)
        print('*****')
        msg += '\nTop {}\nTP: {}\nprec: {}\nrecall: {}\nF1_score: {}\n'.format(n, correct, prec, recall, f1)
    print('\n==============================\n')

    # NDCG
    num_ndcgs = [10]
    for num_ndcg in num_ndcgs:
        print('NDCG@', num_ndcg)
        # Compute once and reuse: NDCG prints as a side effect, so calling it
        # for both the print and the message duplicated output and work.
        ndcg_score = NDCG(target, testRS, num_ndcg, all_sort)
        print('NDCG score:', ndcg_score)
        print('*****')
        msg += '\nNDCG@{}: {}'.format(num_ndcg, ndcg_score)
    print('\n==============================\n')

    LN(msg)
#     # MAP
#     print('MAP:', MAP(target,testRS))
#     print('\n==============================\n')

In [19]:
def testing(U, Y, A, E, Au, Ay, Aa, Av, B):
    """Score all movies for every held-out user and report ranking metrics.

    For each test user an attention weight is computed for every movie in the
    user's training positives (``train_t``). The auxiliary latent vectors of
    those movies are combined with the normalised weights and added to the
    user's latent vector, and the result is scored against every movie. The
    scores of the held-out positives (``test_t``) and sampled negatives
    (``test_f``) are then laid out positives-first and passed to ``metrics``.

    Args:
        U: user latent factors, indexed by user.
        Y: movie latent factors, indexed by movie.
        A: auxiliary latent factors, indexed by movie.
        E: embedding matrix applied to raw feature vectors from ``all_npy``.
        Au, Ay, Aa, Av: attention weights, indexed ``[user][movie]``.
        B: per-user weights applied to the embedded movie features.

    Returns:
        None. Shapes and metrics are printed; ``metrics`` also notifies.

    Reads the notebook globals ``usr_test_amount``, ``movie_nb``,
    ``movie_test_amount``, ``latent_dim``, ``test_idx``, ``train_t``,
    ``test_t``, ``test_f``, ``movie_genre``, ``usr_genre_norm`` and ``all_npy``.
    """
    RS = np.zeros((usr_test_amount, movie_nb))

    # The embedded feature vector depends only on the movie, so compute it
    # once per movie instead of once per (user, movie) pair.
    movie_embed = [np.dot(E, all_npy[k].T) for k in range(movie_nb)]

    # test_idx holds the row indices of the held-out users.
    for s in range(usr_test_amount):
        user = test_idx[s]
        sample = train_t[user]
        alpha = np.zeros([len(sample)])

        for a in range(len(sample)):
            item = sample[a]
            # Largest element of (movie genre vector * user genre vector).
            r = np.max(movie_genre[item] * usr_genre_norm[user])

            alpha_a = (np.dot(Au[user][item], np.expand_dims(U[user], 0).T) +
                       np.dot(Ay[user][item], np.expand_dims(Y[item], 0).T) +
                       np.dot(Aa[user][item], np.expand_dims(A[item], 0).T) +
                       np.dot(Av[user][item], np.dot(E, np.expand_dims(all_npy[item], 0).T)))

            # relu part (a tanh variant was tried: np.sum(np.tanh(alpha_a)) * r)
            alpha[a] = np.sum(relu(alpha_a)) * r

        # Normalise the attention weights; the small constant keeps the
        # denominator non-zero when every weight is 0.
        added_alpha = np.add(alpha, 0.0000000001)
        norm_alpha = added_alpha / np.sum(added_alpha)

        mul = np.zeros((1, latent_dim))
        for i in range(len(sample)):
            mul += norm_alpha[i] * A[sample[i]]  # attention alpha * A_i part
        new_mul = mul + U[user]  # (U + auxiliary)

        for k in range(movie_nb):
            # (U + auxiliary) * movie latent factor, plus the feature term.
            score = np.dot(new_mul, Y[k].T) + np.dot(B[user], movie_embed[k])
            # new_mul has a leading axis of length 1, so the score is a
            # 1-element array; store its single element as a scalar.
            RS[s][k] = score[0]

    # Extract the scores of the test movies.
    print(RS.shape)

    testRS = np.zeros((usr_test_amount, movie_test_amount))  # users x test movies
    target = np.zeros((usr_test_amount, movie_test_amount))  # users x test movies

    for z in range(usr_test_amount):
        held_out_pos = test_t[z]  # held-out followed movies
        held_out_neg = test_f[z]  # sampled non-followed movies

        # Positives occupy the leading columns ...
        for i in range(len(held_out_pos)):
            testRS[z][i] = RS[z][held_out_pos[i]]
            target[z][i] = 1

        # ... followed by the negatives.
        for i in range(len(held_out_neg)):
            testRS[z][i+len(held_out_pos)] = RS[z][held_out_neg[i]]

    print(target.shape, testRS.shape)
    sumtarget = np.sum(target)
    print('num of positive data in testing:', sumtarget)

    # for metrics
    metrics(testRS, target, sumtarget, allSortPrepare(testRS))

# Evaluate weight

# Get latent factors and each weight

In [24]:
def changes(SAVE_NAME):
    """Load one saved weight archive and run the evaluation on it.

    Args:
        SAVE_NAME: base name of the archive; the file read is
            ``./weight/grid/<SAVE_NAME>.npz``.

    Returns:
        None. The name is printed and sent via ``LN``, the array shapes are
        printed, and ``testing`` reports the metrics.
    """
    print(SAVE_NAME)
    LN(SAVE_NAME)
    # Open the archive in a ``with`` block so its file handle is closed once
    # the arrays have been read; each array is loaded when it is indexed.
    with np.load('./weight/grid/' + SAVE_NAME + '.npz') as params:
        U = params['U']
        Y = params['Y']
        A = params['A']
        E = params['E']
        Au = params['Wu']
        Ay = params['Wy']
        Aa = params['Wa']
        Av = params['Wv']
        B = params['B']

    print('User latent shape: ',U.shape)
    print('photo latent shape: ', Y.shape)
    print('Auxilary latent shape: ',A.shape)
    print('Embedding shape:', E.shape)
    print('Wu weight shape:', Au.shape)
    print('Wy weight shape:', Ay.shape)
    print('Wa weight shape:', Aa.shape)
    print('Wv weight shape:', Av.shape)
    print('Beta shape:',B.shape)

    testing(U, Y, A, E, Au, Ay, Aa, Av, B)
    print('==================================================')

In [27]:
# Base names of the saved weight archives to evaluate.
SAVE_NAMES = [
    'MRM_E240_20',
    'MRM_E240_33',
]

In [28]:
# Evaluate every listed weight archive in turn; `changes` loads the file,
# prints the array shapes and runs the test metrics.
for SAVE_NAME in SAVE_NAMES:
    changes(SAVE_NAME)

MRM_E240_20
User latent shape:  (1582, 64)
photo latent shape:  (165, 64)
Auxilary latent shape:  (165, 64)
Embedding shape: (240, 2372)
Wu weight shape: (1582, 165, 64)
Wy weight shape: (1582, 165, 64)
Wa weight shape: (1582, 165, 64)
Wv weight shape: (1582, 165, 240)
Beta shape: (1582, 240)
(150, 165)
(150, 82) (150, 82)
num of positive data in testing: 1078.0
(150, 82)


Top 1
TP: 146
prec: 0.9733333333333334
recall: 0.13543599257884972
F1_score: 0.23778501628664497
*****
Top 3
TP: 383
prec: 0.8511111111111112
recall: 0.35528756957328383
F1_score: 0.5013089005235603
*****


NDCG@ 10
len(total_ndcg): 150
NDCG score: 0.7330042327811274
*****
len(total_ndcg): 150


MRM_E240_33
User latent shape:  (1582, 64)
photo latent shape:  (165, 64)
Auxilary latent shape:  (165, 64)
Embedding shape: (240, 2372)
Wu weight shape: (1582, 165, 64)
Wy weight shape: (1582, 165, 64)
Wa weight shape: (1582, 165, 64)
Wv weight shape: (1582, 165, 240)
Beta shape: (1582, 240)
(150, 165)
(150, 82) (150, 82)
n