In [None]:
import os
import pandas as pd
import numpy as np
import random
import time
import tensorflow as tf
import math
from IPython.display import clear_output

In [None]:
def relu(x):
    """Element-wise rectified linear unit: values below 0 become 0, others pass through."""
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified

def softmax(x):
    """Return the softmax of ``x`` taken over all of its elements.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps ``np.exp`` from overflowing to ``inf``
    (and the ratio from becoming ``nan``) when the scores are large.

    Parameters
    ----------
    x : array-like or scalar
        Raw scores.

    Returns
    -------
    numpy.ndarray or numpy scalar
        Non-negative weights with the same shape as ``x`` that sum to 1.
        An empty input yields an empty result.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(x)
    exp_x = np.exp(x - np.max(x))
    return exp_x / np.sum(exp_x)

def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), applied element-wise."""
    decay = np.exp(-x)
    return 1 / (decay + 1)

### Load numpy arrays

In [None]:
# Load the four pre-computed arrays used throughout the notebook.
all_npy = np.load('./npy/all_4876.npy')
movie_genre = np.load('./npy/movie_genre.npy')
usr_following = np.load('./npy/user_followings.npy')
usr_genre = np.load('./npy/user_genre.npy')

# Report the shape of each array as a quick sanity check.
for label, loaded in (
    ('All features:', all_npy),
    ('Movie genre:', movie_genre),
    ('User following:', usr_following),
    ('User genre:', usr_genre),
):
    print(label, loaded.shape)

### Normalize usr_genre

In [None]:
# Scale every user's genre vector by its own maximum.
# A row whose maximum is 0 is left as all zeros: dividing by 0 would put
# NaN/inf into usr_genre_norm, and those values are later multiplied into the
# attention scores.
usr_genre_norm = np.zeros(usr_genre.shape)
for row_idx, genre_row in enumerate(usr_genre):
    peak = np.max(genre_row)
    if peak != 0:
        usr_genre_norm[row_idx] = genre_row / peak
print(usr_genre_norm.shape)

In [None]:
# Show the raw genre matrix next to its max-normalised version.
for label, matrix in (('Before:', usr_genre), ('After:', usr_genre_norm)):
    print(label, matrix)

# Training & testing split

### Setup 

In [None]:
# usr_nb: number of users (rows of usr_following)
# movie_nb: number of movies (rows of movie_genre)
usr_nb, movie_nb = len(usr_following), len(movie_genre)

print(usr_nb, movie_nb)

In [None]:
# usr_test_amount: how many users are sampled into the test set.
# movie_test_amount: a per-user test size constant.
usr_test_amount, movie_test_amount = 150, 16

print(usr_test_amount, movie_test_amount)

In [None]:
# Seed Python's RNG so the train/test split (and every later random.sample
# call on a fresh Run All) is reproducible.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

usr_idx = list(range(len(usr_following)))
print(len(usr_idx))

# Users held out for evaluation; the order of this list defines the row order
# of the test structures built later.
test_idx = random.sample(usr_idx, usr_test_amount)
print(len(test_idx))

In [None]:
# Training: train_t[u] / train_f[u] are the movie indices user u follows /
# does not follow (minus anything held out for testing).
train_t = [0] * usr_nb
train_f = [0] * usr_nb
# Testing: test_t[k] / test_f[k] are the held-out positives / negatives of
# user test_idx[k].
test_t = [0] * usr_test_amount
test_f = [0] * usr_test_amount

# Position of every test user inside test_idx. test_idx is in random-sample
# order, so filling the test lists in ascending user order would pair
# test_t[k] with the wrong user when evaluation reads test_idx[k].
test_pos_of = {usr: pos for pos, usr in enumerate(test_idx)}

for usr in range(len(usr_following)):
    followed = []
    not_followed = []
    for movie in range(movie_nb):
        if usr_following[usr][movie] == 1:
            followed.append(movie)
        else:
            not_followed.append(movie)

    if usr in test_pos_of:
        # Hold out 2 positives and 8 negatives for this test user ...
        t_for_test = random.sample(followed, 2)
        f_for_test = random.sample(not_followed, 8)
        test_t[test_pos_of[usr]] = t_for_test
        test_f[test_pos_of[usr]] = f_for_test
        # ... and keep everything else for training.
        followed = [m for m in followed if m not in t_for_test]
        not_followed = [m for m in not_followed if m not in f_for_test]

    train_t[usr] = followed
    train_f[usr] = not_followed

In [None]:
# train_t[i] holds the positive feedback of user i
for label, split in (
    ('The length of train_t:', train_t),
    ('The length of train_f:', train_f),
    ('The length of test_t:', test_t),
    ('The length of test_f:', test_f),
):
    print(label, len(split))

# Recommendation model

In [None]:
# Model sizes
latent_dim, embedding_dims = 128, 150   # latent factor width, embedding width
ft_dim = all_npy.shape[1]               # width of one row of all_npy

print(latent_dim, ft_dim, embedding_dims)

In [None]:
# Placeholders for one training step: a user id, an item id i and an item id j
# (xuij below is the score of i minus the score of j).
user = tf.placeholder(tf.int32,shape=(1,))
i = tf.placeholder(tf.int32, shape=(1,))
j = tf.placeholder(tf.int32, shape=(1,))

# Auxiliary items; their number is variable (leading None dimension)
xf = tf.placeholder(tf.float32, shape=(None,ft_dim))  # feature row per auxiliary item
l_id = tf.placeholder(tf.int32, shape=(None,))        # ids of the auxiliary items
l_id_len = tf.placeholder(tf.int32,shape=(1,))        # how many auxiliary items are fed
r = tf.placeholder(tf.float32,shape=(None,))          # per-auxiliary-item multiplier applied to the attention score

# Feature rows of item i and item j
image_i = tf.placeholder(tf.float32, [1, ft_dim])
image_j = tf.placeholder(tf.float32, [1, ft_dim])

# Trainable latent factors and attention weights
with tf.variable_scope("item_level"):
    user_latent = tf.get_variable("user_latent", [usr_nb, latent_dim],
                                  initializer=tf.random_normal_initializer(0,0.1,seed=3))
    item_latent = tf.get_variable("item_latent", [movie_nb, latent_dim],
                                  initializer=tf.random_normal_initializer(0,0.1,seed=3)) 
    aux_item = tf.get_variable("aux_item", [movie_nb, latent_dim],
                               initializer=tf.random_normal_initializer(0,0.1,seed=3))
    # W1 is looked up per user, Wy per item; Wu, Wa and Wv are shared matrices.
    W1 = tf.get_variable("W1", [usr_nb, latent_dim],
                         initializer=tf.contrib.layers.xavier_initializer())
    Wu = tf.get_variable("Wu", [latent_dim,latent_dim], 
                         initializer=tf.contrib.layers.xavier_initializer())
    Wy = tf.get_variable("Wy", [movie_nb, latent_dim, latent_dim],
                         initializer=tf.contrib.layers.xavier_initializer())
    Wa = tf.get_variable("Wa", [latent_dim, latent_dim],
                         initializer=tf.contrib.layers.xavier_initializer())
    Wv = tf.get_variable("Wv", [latent_dim, ft_dim],
                         initializer=tf.contrib.layers.xavier_initializer())
    
    # Zero-initialised (1, latent_dim) variable
    aux_new = tf.get_variable("aux_new", [1, latent_dim], initializer=tf.constant_initializer(0.0))
    ########## Error part, how to get auxisize dynamically
    ####aux_size= tf.get_variable(name='aux_size', initializer=l_id.get_shape().as_list()[-1])
    
with tf.variable_scope('feature_level'):
    # One ft_dim-wide weight row per user
    Beta = tf.get_variable("beta", [usr_nb, ft_dim],
                           initializer=tf.random_normal_initializer(0.00001,0.000001,seed=10))
    
#     embedding = tf.get_variable("embedding", [embedding_dims, ft_dim],
#                                 initializer=tf.contrib.layers.xavier_initializer())
#     Beta = tf.get_variable("beta", [usr_nb, embedding_dims],
#                            initializer=tf.random_normal_initializer(0.00001,0.000001,seed=10))

# Look up the latent factors by user id and item id
u = tf.nn.embedding_lookup(user_latent, user)
vi = tf.nn.embedding_lookup(item_latent, i)
vj = tf.nn.embedding_lookup(item_latent, j)

w1 = tf.nn.embedding_lookup(W1, user) #(1*k)
wu = Wu
#wu = tf.squeeze(tf.nn.embedding_lookup(Wu, user)) #(k*k)
wy = tf.squeeze(tf.nn.embedding_lookup(Wy, i)) #(k*k)
wa = Wa
#wa = tf.squeeze(tf.nn.embedding_lookup(Wa, user)) #(k*k)
wv = Wv
#wv = tf.squeeze(tf.nn.embedding_lookup(Wv, user)) #(k,l)

beta = tf.nn.embedding_lookup(Beta, user) # this user's row of Beta

In [None]:
# Static shapes of the attention weights, in the order w1, wu, wa, wy, wv.
for weight in (w1, wu, wa, wy, wv):
    print(weight.shape)

In [None]:
# --- Attention scores over the auxiliary items ------------------------------
# a_list starts empty and gains one score per auxiliary item in the loop below.
a_list = tf.Variable([])
q = tf.constant(0)

def att_cond(q,a_list):
    """Loop while q is smaller than the number of auxiliary items fed in l_id_len."""
    return tf.less(q,l_id_len[0])

def att_body(q,a_list):
    """Append the score of auxiliary item q, multiplied by r[q], to a_list."""
    xfi = tf.expand_dims(xf[q],0) #(1,ft_dim)
    
    a_list = tf.concat([a_list,[(tf.matmul( w1, tf.nn.relu( tf.matmul(wu, u, transpose_b=True) +
        tf.matmul(wy, tf.expand_dims(tf.nn.embedding_lookup(item_latent,l_id[q]),0), transpose_b=True) +
        tf.matmul(wa, tf.expand_dims(tf.nn.embedding_lookup(aux_item, l_id[q]),0), transpose_b=True) +
        tf.matmul(wv, xfi, transpose_b=True)))[0][0])*r[q]]],0)
    q += 1
    return q,  a_list

# a_list changes length every iteration, hence the [None] shape invariant.
_, a_list = tf.while_loop(att_cond,att_body,[q,a_list],shape_invariants=[q.get_shape(),tf.TensorShape([None])])

a_list_soft=tf.nn.softmax(a_list)


# --- Attention-weighted sum of the auxiliary item factors -------------------
# Sized from latent_dim (not a literal 128) so it always matches aux_item rows.
aux_np = tf.expand_dims(tf.zeros(latent_dim),0)
q = tf.constant(0)
def sum_att_cond(q,aux_np):
    """Loop while q is smaller than the number of auxiliary items fed in l_id_len."""
    return tf.less(q,l_id_len[0])

def sum_att_body(q,aux_np):
    """Add a_list_soft[q] times the aux_item row of auxiliary item q to aux_np."""
    aux_np = tf.math.add_n([aux_np,a_list_soft[q]*tf.expand_dims(tf.nn.embedding_lookup(aux_item, l_id[q]),0)]) 
    q += 1
    return q, aux_np

_,aux_np = tf.while_loop(sum_att_cond,sum_att_body,[q,aux_np])


aux_np+=u # user latent factor + sum(alpha * auxiliary factor)
# Store aux_np in the aux_new variable and use the assigned value below.
# NOTE(review): TF1 appears to register the Assign op as non-differentiable, so
# gradients of the loss presumably stop here and do not reach u, aux_item or
# the attention weights through xui/xuj -- confirm before relying on the
# attention being trained.
aux_new=tf.assign(aux_new,aux_np)

# Preference scores for item i and item j: latent part + feature part
xui = tf.matmul(aux_new, vi, transpose_b=True)+ tf.matmul(beta,image_i, transpose_b=True)
xuj = tf.matmul(aux_new, vj, transpose_b=True)+ tf.matmul(beta,image_j, transpose_b=True)

xuij = xui- xuj

# L2 regularisation of every parameter touched by this step
l2_norm = tf.add_n([
            0.001 * tf.reduce_sum(tf.multiply(u, u)),
            0.001 * tf.reduce_sum(tf.multiply(vi, vi)),
            0.001 * tf.reduce_sum(tf.multiply(vj, vj)),
  
            0.001 * tf.reduce_sum(tf.multiply(w1, w1)),
            0.001 * tf.reduce_sum(tf.multiply(wu, wu)),
            0.001 * tf.reduce_sum(tf.multiply(wy, wy)),
            0.001 * tf.reduce_sum(tf.multiply(wa, wa)),
            0.001 * tf.reduce_sum(tf.multiply(wv,wv)),
            
            0.1 * tf.reduce_sum(tf.multiply(beta,beta)),
            
          ])

loss = l2_norm -tf.log(tf.sigmoid(xuij)) # objective function
train_op = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss) # parameter optimisation
auc = tf.reduce_mean(tf.to_float(xuij > 0)) # 1.0 when item i outranks item j, else 0.0

In [None]:
print('Start time:', time.ctime())

init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
loss_acc_list = []   # one [mean loss, mean auc, elapsed seconds] entry per epoch
t0 = time.time()

n_epochs = 5         # passes over all users
neg_per_pos = 20     # negatives sampled for every positive item

train_pair_t=[] # positive item of every sampled positive, in visiting order
train_pair_f=[] # negative item of every training step, in visiting order
train_yes_id=[] # auxiliary item ids used for each (epoch, user)

for epoch in range(n_epochs):
    print('Iteraction:',epoch)
    train_auc=0
    total_loss=0
    length = 0   # number of optimisation steps taken this epoch

    for z in range(usr_nb):
        # Auxiliary items: every training positive of user z, in random order.
        sample=random.sample(train_t[z],len(train_t[z]))
        train_yes_id.append(sample)

        # yes: feature rows of the auxiliary items (fed as xf).
        yes = all_npy[np.asarray(sample, dtype=int)]
        # r_3[k]: largest entry of (genre vector of auxiliary item k * normalised
        # genre vector of user z); fed as r.
        r_3 = np.array([np.max(movie_genre[aux] * usr_genre_norm[z]) for aux in sample],
                       dtype=float)

        train_t_sample = random.sample(train_t[z],len(train_t[z]))
        for ta in train_t_sample:
            # ta is a positive item of user z
            train_pair_t.append(ta)
            image_1=np.expand_dims(all_npy[ta],0) #(1,ft_dim)
            # Never ask for more negatives than the user has; random.sample
            # raises ValueError otherwise.
            train_f_sample = random.sample(train_f[z], min(neg_per_pos, len(train_f[z])))

            for b in train_f_sample:
                # b is an item without positive feedback from user z
                train_pair_f.append(b)
                image_2=np.expand_dims(all_npy[b],0) #(1,ft_dim)

                # Only the scalars needed for reporting are fetched. Pulling the
                # full latent matrices on every step copied them once per step
                # and kept every copy in memory for the whole user.
                _auc, _loss, _ = sess.run([auc, loss, train_op], feed_dict={user: [z],
                                        i: [ta], j: [b], xf: yes , l_id:sample, l_id_len:[len(sample)],r:r_3,
                                        image_i:image_1,image_j:image_2})

                train_auc += float(_auc)
                # loss comes back as a (1, 1) array; reduce it to a plain float.
                total_loss += float(np.squeeze(_loss))
                length += 1

    steps = max(length, 1)   # avoid dividing by zero if no step was taken
    print("total_loss:-----------------", total_loss/steps)
    print("train_auc:-------------------", train_auc/steps)
    loss_acc_list.append([total_loss/steps,train_auc/steps,time.time()-t0])
    print('time:',time.time()-t0,' sec')
    
print('Total cost ',time.time()-t0,' sec')

print('End time:', time.ctime())

In [None]:
# Per-epoch summary. The loop variable is deliberately not called `i`: that
# name is the TF placeholder used as a feed_dict key by the training cell, and
# overwriting it makes the training cell fail when it is re-run.
for epoch_no, (epoch_loss, epoch_acc, epoch_time) in enumerate(loss_acc_list):
    print('Iteration:',epoch_no)
    print('loss=',epoch_loss)
    print('acc=',epoch_acc)
    print('time=',epoch_time)

# Get latent factors and each weight matrix

In [None]:
# Pull the trained variables out of the session as NumPy arrays.
trained_vars = [user_latent, item_latent, aux_item, W1, Wu, Wy, Wa, Wv, Beta]
U, Y, A, A1, Au, Ay, Aa, Av, B = sess.run(trained_vars)

In [None]:
# Shapes of every fetched array.
for label, fetched in (
    ('User latent shape: ', U),
    ('photo latent shape: ', Y),
    ('Auxilary latent shape: ', A),
    ('W1 weight shape: ', A1),
    ('Wu weight shape:', Au),
    ('Wy weight shape:', Ay),
    ('Wa weight shape:', Aa),
    ('Wv weight shape:', Av),
    ('Beta shape:', B),
):
    print(label, fetched.shape)

# Testing Part

In [None]:
# Score every movie for every test user with the trained parameters.
test_amount = usr_test_amount          # one row per entry of test_idx
result = np.zeros((test_amount, movie_nb))   # latent part of the score only
RS = np.zeros((test_amount, movie_nb))       # latent part + feature part
# test_idx --> user ids of the test users

test_yes_id=[]   # auxiliary item ids used for each test user

for s in range(test_amount):
    uid = test_idx[s]
    print(s,uid)

    # Auxiliary items: the user's training positives, in random order.
    sample=random.sample(train_t[uid],len(train_t[uid]))
    test_yes_id.append(sample)
    alpha=np.zeros([len(sample)])

    user_term = np.dot(Au, U[uid])   # same for every auxiliary item, so computed once
    for a in range(len(sample)):
        item = sample[a]
        # Largest entry of (item genre vector * normalised user genre vector).
        genre_weight = np.max(movie_genre[item] * usr_genre_norm[uid])
        hidden = relu(user_term
                      + np.dot(Ay[item], Y[item])
                      + np.dot(Aa, A[item])
                      + np.dot(Av, all_npy[item]))
        alpha[a] = np.dot(A1[uid], hidden) * genre_weight

    weights = softmax(alpha)   # computed once instead of once per auxiliary item
    print('softmax alpha--------------',weights)

    # Attention-weighted sum of the auxiliary factors, then add the user factor.
    mul = np.zeros(U.shape[1])
    for pos in range(len(sample)):
        mul += weights[pos]*A[sample[pos]]
    new_mul = mul + U[uid]

    # Score all movie_nb movies (the previous literal 88 left the rest at 0).
    result[s] = np.dot(Y, new_mul)
    RS[s] = result[s] + np.dot(all_npy[:movie_nb], B[uid])
print(RS[s])

In [None]:
# Collect the scores of the held-out items of every test user.
test_amount = usr_test_amount
# Columns per user = number of held-out items (positives + negatives).
# NOTE(review): `movie_test_amount` (16) looks like the renamed
# `yt_test_amount`, but the split holds out 2 + 8 = 10 items per user; using 16
# would leave zero-score padding columns that compete in the top-N ranking --
# confirm the intended width.
yt_test_amount = max(len(pos) + len(neg) for pos, neg in zip(test_t, test_f))
testRS = np.zeros((test_amount,yt_test_amount))
target = np.zeros((test_amount,yt_test_amount))
# test_t holds the held-out positives, test_f the held-out negatives

for z in range(test_amount):
    held_pos = test_t[z]   # held-out positive items of test user z
    held_neg = test_f[z]   # held-out negative items of test user z

    # The positives fill the leading columns and are flagged in target ...
    for col, movie in enumerate(held_pos):
        testRS[z][col] = RS[z][movie]
        target[z][col] = 1
    # ... and the negatives follow them.
    for col, movie in enumerate(held_neg, start=len(held_pos)):
        testRS[z][col] = RS[z][movie]

In [None]:
# Display the 0/1 target matrix (1 marks a column holding a held-out positive).
target

In [None]:
# Count the positives in the test matrix and report its total size.
# The size is read from `target` itself rather than from separately named
# constants, so the two numbers cannot drift apart.
sumtarget = np.sum(target)
print('num of positive data in testing:',sumtarget)
print('total testing data:',target.size)

In [None]:
def topN(sortlist,n):
    """Return the indices of the ``n`` largest values in ``sortlist``, best first.

    Ties are resolved in favour of the lower index. The input is not modified.
    No sentinel value is used, so arbitrarily small scores are ranked
    correctly and no index is ever returned twice.

    Parameters
    ----------
    sortlist : sequence of numbers
        Scores to rank.
    n : int
        How many indices to return. Values larger than ``len(sortlist)`` return
        every index once; values <= 0 return an empty list.

    Returns
    -------
    list of int
        Positions in ``sortlist`` ordered from highest to lowest score.
    """
    # sorted() is stable even with reverse=True, so equal scores keep index order.
    ranked = sorted(range(len(sortlist)), key=lambda pos: sortlist[pos], reverse=True)
    return ranked[:max(n, 0)]

In [None]:
# For every test user, pick as many top-scored columns as the user has
# held-out positives.
count_0_all = []
for scores, labels in zip(testRS, target):
    top_0 = topN(list(scores),int(np.sum(labels)))
    count_0_all.append(top_0)
    print(top_0)

# A pick is a hit when its column index is below the number of positives,
# because the positives occupy the leading columns of testRS.
acc_0 = 0   # hits
total = 0   # picks made (one per selected column)
for picks, labels in zip(count_0_all, target):
    n_pos = int(np.sum(labels))
    # Count each pick once. Adding n_pos for every pick squared the
    # denominator and capped the reported accuracy at 1 / n_pos.
    total += len(picks)
    acc_0 += sum(1 for col in picks if col < n_pos)
avg_acc = acc_0/total if total else 0.0
print('avg_accuarcy for count_0:',avg_acc)

In [None]:
# Number of top-ranked picks that landed on a held-out positive column.
acc_0

In [None]:
# Denominator that acc_0 is divided by to obtain avg_acc.
total

# Top n
## Top 1

In [None]:
def F1_score(prec,rec):
    """Return the harmonic mean of precision and recall.

    Parameters
    ----------
    prec : float
        Precision.
    rec : float
        Recall.

    Returns
    -------
    float
        ``2 * prec * rec / (prec + rec)``, or ``0.0`` when both inputs are 0
        (no correct prediction), instead of raising ZeroDivisionError.
    """
    denom = prec + rec
    if denom == 0:
        return 0.0
    return (2*prec*rec)/denom

In [None]:
# Top-1: a hit when the best-scored column is one of the leading positive columns.
correct = 0
for scores, labels in zip(testRS, target):
    top_0 = topN(list(scores), 1)  # take the single best column
    count_0_all.append(top_0)
    print(top_0)
    correct += int(top_0[0] < int(np.sum(labels)))

In [None]:
# Precision: hits over one recommendation per test user.
# Recall: hits over all held-out positives.
n_recommended = len(testRS)
top1_prec = correct / n_recommended
top1_recall = correct / sumtarget
print('prec ',top1_prec,'recall ',top1_recall)

In [None]:
# F1 score of the top-1 recommendations
top1_f1 = F1_score(top1_prec, top1_recall)
print('F1_score:', top1_f1)

## Top 3

In [None]:
# Top-3: count how many of the three best-scored columns are positive columns.
correct = 0
for scores, labels in zip(testRS, target):
    top_3 = topN(list(scores), 3)  # take the three best columns
    count_0_all.append(top_3)
    n_pos = int(np.sum(labels))
    correct += sum(1 for col in top_3 if col < n_pos)

In [None]:
# Precision: hits over three recommendations per test user.
# Recall: hits over all held-out positives.
n_recommended = len(testRS) * 3
top3_prec = correct / n_recommended
top3_recall = correct / sumtarget
print('prec ',top3_prec,'recall ',top3_recall)

In [None]:
# F1 score of the top-3 recommendations
top3_f1 = F1_score(top3_prec, top3_recall)
print('F1_score:', top3_f1)

## Top 5

In [None]:
# Top-5: count how many of the five best-scored columns are positive columns.
correct = 0
for scores, labels in zip(testRS, target):
    top_5 = topN(list(scores), 5)  # take the five best columns
    count_0_all.append(top_5)
    n_pos = int(np.sum(labels))
    correct += sum(1 for col in top_5 if col < n_pos)

In [None]:
# Precision: hits over five recommendations per test user.
# Recall: hits over all held-out positives.
n_recommended = len(testRS) * 5
top5_prec = correct / n_recommended
top5_recall = correct / sumtarget
print('prec ',top5_prec,'recall ',top5_recall)

In [None]:
# F1 score of the top-5 recommendations
top5_f1 = F1_score(top5_prec, top5_recall)
print('F1_score:', top5_f1)

In [None]:
# Persist the trained user and item latent factors.
# Create the target directory first: np.savez does not create missing folders,
# and failing here would lose the result of the whole training run.
final_path = './latent_factor/YRM_up10_ALL/Final.npz'
os.makedirs(os.path.dirname(final_path), exist_ok=True)
np.savez(final_path, User=U, YouTuber=Y)