In [2]:
"""
imports
"""
import pandas as pd
import numpy as np
import random
import os
from collections import defaultdict
from time import time
import tensorflow as tf
from tqdm import tqdm
import time
import argparse
import scipy.sparse as sp
import os
import mindspore as ms

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
"""
load data
"""
class Data(object):
    """Implicit-feedback interaction data loaded from a single ``.npz`` archive.

    The archive is read for three entries:

    * ``train_data`` -- array iterated as ``(user, item)`` pairs.
    * ``test_data`` / ``vali_data`` -- pickled dicts; for each user the value is
      indexed as ``[first_item, other_items]`` and flattened into one list with
      ``first_item`` in position 0.

    Attributes set by ``__init__``:
        user_item / item_user: ``defaultdict(set)`` adjacency built from the
            training pairs.
        user_test_item / user_vali_item: ``{user: [first_item, *other_items]}``.
        R: ``(n_users, n_items)`` float32 DOK matrix with 1.0 at every training pair.
        n_users / n_items: column-wise maximum id in ``train_data`` plus one.
        train_number: number of rows in ``train_data``.
        path: directory containing the archive; cached matrices are written there.
    """

    def __init__(self, npzpath="./data/viedo10/video10.npz"):
        self.user_item = defaultdict(set)
        self.item_user = defaultdict(set)

        self.user_vali_item = dict()
        self.user_test_item = dict()

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
        # archives that come from a trusted source.
        _data = np.load(npzpath, allow_pickle=True)
        self.train_data = _data['train_data']
        self.test_data = _data['test_data'].tolist()
        vali_data = _data['vali_data'].tolist()

        # Directory of the archive, whatever the depth of ``npzpath``. The old
        # code joined the first three '/'-separated components, which is only
        # correct for paths shaped exactly like "./data/<name>/<file>".
        self.path = os.path.dirname(npzpath) or '.'

        n_users, n_items = self.train_data.max(axis=0) + 1
        self.n_users, self.n_items = int(n_users), int(n_items)
        self.R = sp.dok_matrix((self.n_users, self.n_items), dtype=np.float32)

        for u, i in self.train_data:
            self.user_item[u].add(i)
            self.item_user[i].add(u)

            self.R[u, i] = 1.

        self.train_number = np.shape(self.train_data)[0]
        print(self.n_users, self.n_items,self.train_number, self.train_number/(self.n_users*self.n_items))

        for u in self.test_data.keys():
            self.user_test_item[u] = [self.test_data[u][0]]
            self.user_test_item[u].extend(self.test_data[u][1])

        for u in vali_data.keys():
            self.user_vali_item[u] = [vali_data[u][0]]
            self.user_vali_item[u].extend(vali_data[u][1])

    def gen_batch_train_data(self, neg_number, batch_size):
        """Yield ``(user, item, negative_item)`` training batches.

        ``self.train_data`` is shuffled in place, then for every training pair
        ``neg_number`` items the user has not interacted with are drawn uniformly.

        Args:
            neg_number: negatives sampled per training pair.
            batch_size: rows per yielded batch; the final batch may be shorter.

        Yields:
            ``uint32`` arrays of shape ``(<=batch_size, 3)``. Each yielded array
            is an independent copy, so callers may keep references to it.
        """
        np.random.shuffle(self.train_data)
        batch = np.zeros((batch_size, 3), dtype=np.uint32)
        idx = 0
        for u, i in self.train_data:
            seen = self.user_item[u]
            if len(seen) >= self.n_items:
                # No unseen item exists for this user; rejection sampling
                # below would never terminate.
                continue
            for _ in range(neg_number):
                neg_item = random.randint(0, self.n_items - 1)
                while neg_item in seen:
                    neg_item = random.randint(0, self.n_items - 1)
                batch[idx, :] = [u, i, neg_item]
                idx += 1

                if idx == batch_size:
                    # Copy: the buffer is overwritten by the next samples.
                    yield batch.copy()
                    idx = 0

        if idx > 0:
            yield batch[:idx].copy()

    def gen_batch_test_data(self, test_neg_number, data='test'):
        """Yield one evaluation batch per user.

        Args:
            test_neg_number: kept for interface compatibility; each batch is
                sized from the user's own candidate list, so a user whose list
                is shorter or longer than ``test_neg_number + 1`` no longer
                produces stale rows or an ``IndexError``.
            data: ``'test'`` or ``'vali'`` -- which held-out split to iterate.

        Yields:
            ``(first_item, batch)`` where ``first_item`` is element 0 of the
            user's candidate list and ``batch`` is a fresh ``uint32`` array of
            ``(user, item)`` rows, one per candidate.

        Raises:
            ValueError: if ``data`` is neither ``'test'`` nor ``'vali'`` (raised
                when the generator is first advanced).
        """
        if data == 'test':
            user_items = self.user_test_item
        elif data == 'vali':
            user_items = self.user_vali_item
        else:
            # Previously print + exit(-1), which terminates the notebook kernel.
            raise ValueError("data type error: expected 'test' or 'vali', got %r" % (data,))

        for user, items in user_items.items():
            batch = np.zeros((len(items), 2), dtype=np.uint32)
            batch[:, 0] = user
            batch[:, 1] = items
            yield items[0], batch

    def get_adj_mat(self):
        """Return the row-normalised adjacency matrix, using an on-disk cache.

        Tries ``<self.path>/s_mean_adj_mat.npz`` first; on any failure the matrix
        is rebuilt with ``create_adj_mat`` and written back to that file.
        """
        # The notebook's import cell runs `from time import time` and then
        # `import time`, leaving the bare name `time` bound to the module, so
        # `time()` raises TypeError. Import the function locally instead.
        from time import time as now

        cache_file = os.path.join(self.path, 's_mean_adj_mat.npz')
        try:
            t1 = now()
            mean_adj_mat = sp.load_npz(cache_file)
            print('already load adj matrix', mean_adj_mat.shape, now() - t1)

        except Exception:
            # Best effort: a missing or unreadable cache just triggers a rebuild.
            mean_adj_mat = self.create_adj_mat()
            sp.save_npz(cache_file, mean_adj_mat)

        return mean_adj_mat

    def _bipartite_adj(self):
        """Build the (n_users+n_items)^2 LIL matrix [[0, R], [R^T, 0]]."""
        n_nodes = self.n_users + self.n_items
        adj_mat = sp.dok_matrix((n_nodes, n_nodes), dtype=np.float32).tolil()
        R = self.R.tolil()
        adj_mat[:self.n_users, self.n_users:] = R
        adj_mat[self.n_users:, :self.n_users] = R.T
        return adj_mat

    def get_adj_mat_nonorm(self):
        """Return the un-normalised adjacency matrix plus its row sums on the diagonal.

        The result is always recomputed (no cache read) and, as a side effect,
        saved to ``<self.path>/adj_mat.npz``.
        """
        adj_mat = self._bipartite_adj()

        rowsum = np.array(adj_mat.sum(1)).flatten()
        degree_diag = sp.diags(rowsum)

        adj_mat = (adj_mat + degree_diag).tocsr()
        sp.save_npz(os.path.join(self.path, 'adj_mat.npz'), adj_mat)

        return adj_mat

    def get_nodesum(self, depth):
        """Return the row sums of ``A ** max(depth, 2)``.

        ``A`` is the matrix from ``get_adj_mat_nonorm``. The power is at least 2
        because the product starts from ``A.dot(A)``; each unit of ``depth``
        above 2 multiplies by ``A`` once more.
        """
        adj_mat = self.get_adj_mat_nonorm()
        edge_mat = adj_mat.dot(adj_mat)
        for _ in range(depth - 2):
            edge_mat = edge_mat.dot(adj_mat)
        return edge_mat.sum(1).flatten()

    def create_adj_mat(self):
        """Build ``D^-1 A`` for the user-item bipartite graph and return it as CSR.

        ``A`` is [[0, R], [R^T, 0]] and ``D`` holds its row sums; rows whose sum
        is zero stay all-zero.
        """
        # See get_adj_mat for why the timer is imported locally.
        from time import time as now

        t1 = now()
        adj_mat = self._bipartite_adj().todok()
        print('already create adjacency matrix', adj_mat.shape, now() - t1)

        t2 = now()
        rowsum = np.array(adj_mat.sum(1))
        # Zero row sums give inf here; they are reset to 0 on the next line.
        with np.errstate(divide='ignore'):
            d_inv = np.power(rowsum, -1).flatten()
        d_inv[np.isinf(d_inv)] = 0.

        norm_adj = sp.diags(d_inv).dot(adj_mat)
        print('generate single-normalized adjacency matrix.')

        print('already normalize adjacency matrix', now() - t2)
        return norm_adj.tocsr()

In [None]:
def leave_one_out(purchased_item, recommend_list, top_k_recommand_number):
    """Score one leave-one-out ranking.

    Only the first ``top_k_recommand_number`` entries of ``recommend_list`` are
    considered.

    Returns:
        ``(hit, ndcg)``: ``(1, log2(2) / log2(position + 2))`` when
        ``purchased_item`` appears in the truncated list at 0-based
        ``position``, otherwise ``(0, 0)``.
    """
    shortlist = recommend_list[:top_k_recommand_number]
    if purchased_item not in shortlist:
        return 0, 0
    position = shortlist.index(purchased_item)
    return 1, np.log2(2.0) / np.log2(position + 2.0)

def NDCG_k(recommend_list, purchased_list):
    """Return the DCG of ``recommend_list`` divided by the ideal DCG.

    The ideal DCG sums ``1 / log2(rank + 2)`` over the first
    ``min(len(recommend_list), len(purchased_list))`` ranks; the actual DCG sums
    the same term over every rank whose item is in ``purchased_list``.
    Returns ``0`` when the ideal DCG is zero (either list is empty).
    """
    ideal_len = min(len(recommend_list), len(purchased_list))
    ideal_dcg = 0
    for rank in range(ideal_len):
        ideal_dcg = ideal_dcg + 1 / np.log2(rank + 2)

    if ideal_dcg == 0:
        return 0

    dcg = 0
    for rank, item in enumerate(recommend_list):
        if item in purchased_list:
            dcg = dcg + 1 / np.log2(rank + 2)
    return dcg / ideal_dcg

def top_k(recommend_list, purchased_list):
    """Compute precision, recall, F1 and hit indicator for one recommendation list.

    Returns:
        ``(p, r, f, HR)`` where ``p`` is hits / len(recommend_list), ``r`` is
        hits / len(purchased_list) (each 0.0 when its denominator is empty),
        ``f`` is the harmonic mean of ``p`` and ``r`` (0.0 when both are zero)
        and ``HR`` is 1 if at least one recommended item was purchased, else 0.
    """
    hit_count = len([item for item in recommend_list if item in purchased_list])
    HR = 1 if hit_count else 0

    n_recommended = len(recommend_list)
    n_purchased = len(purchased_list)

    p = hit_count / float(n_recommended) if n_recommended != 0 else 0.0
    r = hit_count / float(n_purchased) if n_purchased != 0 else 0.0

    if r == 0 and p == 0:
        f = 0.0
    else:
        f = 2.0 * p * r / (p + r)
    return p, r, f, HR

In [None]:
"""
define the model
"""

class LECF():
    """TensorFlow-1 graph model that propagates embeddings over the user-item graph.

    ``args`` is the namespace produced by ``parse_args``; the attributes read
    here are ``dataset``, ``neg_number``, ``global_dimension``, ``edge``, ``y``,
    ``outward``, ``n_fold``, ``depth``, ``node_dropout``, ``mess_dropout``,
    ``learner``, ``learning_rate``, ``l2``, ``batch_size``, ``epochs``,
    ``verbose``, ``test_neg_number``, ``gpu`` and (optionally) ``logdir``.

    Typical use is ``LECF(args).run_model()``, which builds the graph, trains
    and periodically evaluates on the validation and test splits.
    """

    # Edge-combination modes understood by build_model / edge_embed.
    _EDGE_MODES = ('add', 'hadam', 'weight1', 'concat', 'subtract')

    def __init__(self, args):
        self.para = args

        self.filename = './data/' + self.para.dataset + '/' + self.para.dataset + '.npz'
        self.data = Data(self.filename)
        self.test_user_number = len(list(self.data.user_test_item.keys()))
        self.train_number = self.data.train_number * self.para.neg_number

        # save() writes below this directory. It was never assigned before;
        # --logdir is the only output location the argument parser exposes.
        self.savepath = getattr(self.para, 'logdir', '.')

        initializer = tf.contrib.layers.xavier_initializer()
        self.all_weights = dict()
        self.all_weights['user_embedding'] = tf.get_variable('user_embedding_matrix', initializer=initializer,
                                                             shape=[self.data.n_users, self.para.global_dimension])
        self.all_weights['item_embedding'] = tf.get_variable('item_embedding_matrix', initializer=initializer,
                                                             shape=[self.data.n_items, self.para.global_dimension])

        # Projects a concatenated [2d] pair back to [d]; used by edge mode 'weight1'.
        self.all_weights['edge_weight'] = tf.get_variable('edge_weight', initializer=initializer,
                                            shape=[2 * self.para.global_dimension, self.para.global_dimension])

        # 'concat' doubles the width of the propagated embeddings.
        self.dl = 1
        if self.para.edge == 'concat':
            self.dl = 2
        # Snapshot variables: run_model copies the propagated embeddings in
        # here (with dropout disabled) before every evaluation.
        self.test_u_g_embeddings = tf.get_variable('test_u_g_embeddings',
                                                   shape=[self.data.n_users,
                                                          self.para.global_dimension*self.dl])
        self.test_i_g_embeddings = tf.get_variable('test_i_g_embeddings',
                                                   shape=[self.data.n_items,
                                                          self.para.global_dimension*self.dl])

        self.node_dropout = tf.placeholder(tf.float32)
        self.mess_dropout = tf.placeholder(tf.float32)
        self.user_id = tf.placeholder(tf.int32, shape=[None], name='user_id')
        self.item_id = tf.placeholder(tf.int32, shape=[None], name='item_id')
        self.neg_item_id = tf.placeholder(tf.int32, shape=[None], name='neg_item_id')

    def get_fold_hat(self, outward):
        """Return the propagation matrix as ``n_fold`` row-slices of ``tf.SparseTensor``.

        Args:
            outward: ``-1`` selects ``0.5 * A``; any other value selects
                ``outward * A + (1 - outward) * I``, where ``A`` is the
                normalised adjacency from ``self.data.get_adj_mat()``.

        When ``para.node_dropout`` is non-zero each slice has its stored entries
        randomly dropped according to the ``node_dropout`` placeholder and the
        survivors rescaled by ``1 / (1 - node_dropout)``.
        """
        mean_adj_mat = self.data.get_adj_mat()

        if outward == -1:
            mat = 0.5 * mean_adj_mat
        else:
            mat = outward * mean_adj_mat + (1 - outward) * sp.eye(mean_adj_mat.shape[0])
        mat = sp.csr_matrix(mat)  # guarantee cheap row slicing

        n_nodes = self.data.n_users + self.data.n_items
        fold_len = n_nodes // self.para.n_fold

        A_fold_hat = []
        for i_fold in range(self.para.n_fold):
            start = i_fold * fold_len
            # The last fold absorbs the remainder rows.
            end = n_nodes if i_fold == self.para.n_fold - 1 else (i_fold + 1) * fold_len

            coo = mat[start:end].tocoo().astype(np.float32)
            # np.mat was removed in NumPy 2.0; a plain (nnz, 2) int64 array is
            # what tf.SparseTensor expects for its indices.
            indices = np.stack([coo.row, coo.col], axis=1).astype(np.int64)
            temp = tf.SparseTensor(indices, coo.data, coo.shape)

            if self.para.node_dropout != 0:
                random_tensor = 1 - self.node_dropout
                # One draw per stored entry of the tensor, so the mask length
                # always matches even if the slice holds explicit zeros.
                random_tensor += tf.random_uniform([coo.data.shape[0]])
                dropout_mask = tf.cast(tf.floor(random_tensor), dtype=tf.bool)
                temp = tf.sparse_retain(temp, dropout_mask) * tf.div(1., 1 - self.node_dropout)

            A_fold_hat.append(temp)
        return A_fold_hat

    def build_model(self):
        """Construct the training and evaluation graph.

        Sets ``u_g_embeddings`` / ``i_g_embeddings`` (propagated embeddings),
        ``y`` / ``neg_y`` (positive and negative scores), ``all_loss``,
        ``optimizer``, ``test_y``, ``test_top_index`` and ``saver``.

        Raises:
            ValueError: for an unsupported ``para.y``, ``para.edge`` or
                ``para.learner``.
        """
        if self.para.y != 'edge':
            # Only the 'edge' score is implemented; other values used to fail
            # later with an AttributeError on self.test_y.
            raise ValueError("unsupported scoring mode y=%r (only 'edge' is implemented)" % (self.para.y,))

        A_fold_hat_c = self.get_fold_hat(self.para.outward)
        A_fold_hat_e = self.get_fold_hat(-1)

        ego_embeddings = tf.concat([self.all_weights['user_embedding'], self.all_weights['item_embedding']], axis=0)

        for k in range(self.para.depth):
            # The first layer uses the 0.5*A matrix, deeper layers the
            # outward-mixed one.
            A_fold_hat = A_fold_hat_e if k == 0 else A_fold_hat_c

            temp_embed = []
            for f in range(self.para.n_fold):
                temp_embed.append(tf.sparse_tensor_dense_matmul(A_fold_hat[f], ego_embeddings))
            aggregated = tf.concat(temp_embed, 0)

            if k == 0:
                edge = self.para.edge
                if edge == 'hadam':
                    ego_embeddings = tf.multiply(aggregated, ego_embeddings)
                elif edge == 'weight1':
                    # Key is 'edge_weight' (singular), as created in __init__.
                    ego_embeddings = tf.matmul(tf.concat([aggregated, ego_embeddings], 1),
                                               self.all_weights['edge_weight'])
                elif edge == 'concat':
                    ego_embeddings = tf.concat([aggregated, 0.5 * ego_embeddings], 1)
                elif edge == 'add':
                    ego_embeddings = aggregated + ego_embeddings
                elif edge == 'subtract':
                    ego_embeddings = ego_embeddings - aggregated
                else:
                    raise ValueError("unknown edge mode %r; expected one of %s" % (edge, self._EDGE_MODES))
            else:
                ego_embeddings = aggregated

            if self.para.mess_dropout != 0:
                # TF1 signature: the second positional argument is keep_prob.
                ego_embeddings = tf.nn.dropout(ego_embeddings, 1 - self.mess_dropout)

        self.u_g_embeddings, self.i_g_embeddings = tf.split(ego_embeddings, [int(self.data.n_users), int(self.data.n_items)], 0)

        self.first_user_embedding = tf.nn.embedding_lookup(self.all_weights['user_embedding'], self.user_id)
        self.first_item_embedding = tf.nn.embedding_lookup(self.all_weights['item_embedding'], self.item_id)
        self.first_neg_item_embedding = tf.nn.embedding_lookup(self.all_weights['item_embedding'], self.neg_item_id)

        self.last_user_embedding = tf.nn.embedding_lookup(self.u_g_embeddings, self.user_id)
        self.last_item_embedding = tf.nn.embedding_lookup(self.i_g_embeddings, self.item_id)
        self.last_neg_item_embedding = tf.nn.embedding_lookup(self.i_g_embeddings, self.neg_item_id)

        # Edge queries are built from the raw embeddings and scored against the
        # sum of the propagated user and item embeddings.
        self.query_pair = self.edge_embed(self.first_user_embedding, self.first_item_embedding)
        self.query_neg_pair = self.edge_embed(self.first_user_embedding, self.first_neg_item_embedding)

        self.y = tf.reduce_sum(tf.multiply(self.query_pair, self.last_user_embedding+self.last_item_embedding), 1)
        self.neg_y = tf.reduce_sum(tf.multiply(self.query_neg_pair, self.last_user_embedding+self.last_neg_item_embedding), 1)

        self.all_loss = self.creat_loss()

        optimizers = {
            'sgd': tf.train.GradientDescentOptimizer,
            'adag': tf.train.AdagradOptimizer,
            'adam': tf.train.AdamOptimizer,
        }
        if self.para.learner not in optimizers:
            raise ValueError("unknown learner %r; expected one of %s" % (self.para.learner, sorted(optimizers)))
        print('------------------------------' + self.para.learner)
        self.optimizer = optimizers[self.para.learner](
            learning_rate=self.para.learning_rate).minimize(self.all_loss)

        self.test_user_embedding = tf.nn.embedding_lookup(self.test_u_g_embeddings, self.user_id)
        self.test_item_embedding = tf.nn.embedding_lookup(self.test_i_g_embeddings, self.item_id)

        self.test_y = tf.reduce_sum(tf.multiply(self.query_pair, self.test_user_embedding + self.test_item_embedding), 1)

        # The original Data generator emits test_neg_number + 1 candidates per
        # user, so never request more ranks than that.
        top_k_size = min(10, self.para.test_neg_number + 1)
        test_top_value, self.test_top_index = tf.nn.top_k(self.test_y, k=top_k_size, sorted=True)

        self.saver = tf.train.Saver(max_to_keep=1)

    def creat_loss(self):
        """Return the pairwise ranking loss plus batch-averaged L2 regularisation.

        Also stores the two terms as ``self.mf_loss`` and ``self.reg_loss``.
        """
        # 1e-6 keeps the log finite when the sigmoid saturates at 0.
        self.mf_loss = -tf.reduce_sum(tf.log(tf.nn.sigmoid(self.y - self.neg_y)+1e-6))
        self.reg_loss = self.para.l2 * (tf.nn.l2_loss(self.first_user_embedding) +
                                        tf.nn.l2_loss(self.first_item_embedding) +
                                        tf.nn.l2_loss(self.first_neg_item_embedding)) / self.para.batch_size
        return self.mf_loss + self.reg_loss

    def edge_embed(self, user, item):
        """Combine a user and an item embedding according to ``para.edge``.

        Raises:
            ValueError: if ``para.edge`` is not one of ``_EDGE_MODES``
                (previously the method silently returned ``None``).
        """
        edge = self.para.edge
        if edge == 'add':
            return user + item
        if edge == 'hadam':
            return tf.multiply(user, item)
        if edge == 'weight1':
            # The projection lives in all_weights; self.edge_weights never existed.
            return tf.matmul(tf.concat([user, item], 1), self.all_weights['edge_weight'])
        if edge == 'concat':
            return tf.concat([user, item], 1)
        if edge == 'subtract':
            return user - item
        raise ValueError("unknown edge mode %r; expected one of %s" % (edge, self._EDGE_MODES))

    def run_model(self):
        """Build the graph, then train for ``para.epochs`` epochs.

        Every ``para.verbose`` epochs the propagated embeddings are computed
        with dropout disabled, copied into the evaluation variables, and the
        validation and test splits are scored via ``test``.
        """
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = self.para.gpu
        print('user number:', self.data.n_users, '  item number:', self.data.n_items,'  train number:', self.train_number,)

        self.build_model()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        self.sess.run(tf.global_variables_initializer())

        for epoch in range(self.para.epochs):
            batch_loss = 0

            batches = self.data.gen_batch_train_data(self.para.neg_number, self.para.batch_size)
            for e in batches:
                feed = {self.user_id: e[:, 0], self.item_id: e[:, 1], self.neg_item_id: e[:, 2],
                        self.node_dropout: self.para.node_dropout, self.mess_dropout: self.para.mess_dropout}

                _, loss = self.sess.run([self.optimizer, self.all_loss], feed_dict=feed)
                batch_loss += loss

            if (epoch + 1) % self.para.verbose == 0:
                print('epoch', epoch, 'loss', batch_loss)

                feed = {self.node_dropout: 0, self.mess_dropout: 0}
                user_embed, item_embed = self.sess.run([self.u_g_embeddings, self.i_g_embeddings], feed_dict=feed)
                self.sess.run([
                    self.test_u_g_embeddings.assign(user_embed),
                    self.test_i_g_embeddings.assign(item_embed)])

                self.test(epoch, 'vali')
                self.test(epoch, 'test')
                # TODO: best-epoch tracking, checkpointing via save() and early
                # stopping were disabled in the original code and remain so.

    def save(self, epoch):
        """Write the raw user/item embedding matrices to ``<savepath>/embedding.npz``.

        ``epoch`` is accepted for interface compatibility and is not used.
        """
        os.makedirs(self.savepath, exist_ok=True)
        user_embed, item_embed = self.sess.run([self.all_weights['user_embedding'], self.all_weights['item_embedding']])
        np.savez(os.path.join(self.savepath, 'embedding.npz'), user=user_embed, item=item_embed)

    def test(self, epoch, data='test'):
        """Evaluate HR@5, NDCG@5, HR@10 and NDCG@10 on one held-out split.

        Args:
            epoch: current epoch index, used only in the printed report.
            data: ``'test'`` or ``'vali'``.

        Returns:
            ``(avg, result)``: ``result`` has one ``[HR5, NDCG5, HR10, NDCG10]``
            row per evaluated user, ``avg`` is its column mean (zeros if the
            split is empty).
        """
        rows = []
        # Rows are collected per evaluated user: the validation split need not
        # contain the same number of users as the test split.
        for purchased_item, batch in self.data.gen_batch_test_data(self.para.test_neg_number, data):
            batch_user, batch_item = batch[:, 0], batch[:, 1]

            top_index = self.sess.run(self.test_top_index,
                                      feed_dict={self.user_id: batch_user,
                                                 self.item_id: batch_item})
            recommend_list = list(np.array(batch_item)[top_index])

            HR5, NDCG5 = leave_one_out(purchased_item, recommend_list, 5)
            HR10, NDCG10 = leave_one_out(purchased_item, recommend_list, 10)
            rows.append([HR5, NDCG5, HR10, NDCG10])

        result = np.array(rows, dtype=np.float64).reshape(-1, 4)
        avg = np.mean(result, axis=0) if len(result) else np.zeros(4)

        # self.logger was never configured in this notebook; use it when a
        # caller has attached one, otherwise fall back to print.
        logger = getattr(self, 'logger', None)
        if logger is not None:
            logger.info(avg)
        else:
            print(data, 'epoch', epoch, avg)

        return avg, result


In [None]:
def parse_args(argv=None):
    """Parse the model hyper-parameters.

    Args:
        argv: optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, as before.

    Returns:
        ``argparse.Namespace`` with one attribute per option defined below.

    Unrecognised arguments are reported and ignored rather than aborting:
    inside a Jupyter kernel ``sys.argv`` carries the launcher's own
    ``-f <connection-file>`` option, which made ``parser.parse_args()`` exit.
    """
    parser = argparse.ArgumentParser(description="Run CML.")

    parser.add_argument('-g', '--gpu', help='set gpu device number 0-3', type=str, default='0')
    parser.add_argument('--note', help='model-note', type=str, default='Edge-simi')
    parser.add_argument('--logdir', type=str, default='result/edge-edge')
    parser.add_argument('--verbose', help='test fre', type=int, default=1)
    parser.add_argument('-d', '--global_dimension', help='Embedding Size', type=int, default=50)
    parser.add_argument('--epochs', help='Max epoch', type=int, default=500)
    parser.add_argument('-n', '--neg_number', help='Negative Samples Count', type=int, default=1)
    parser.add_argument('--test_neg_number', type=int, default=100)
    parser.add_argument('-lr', '--learning_rate', help='learning_rate', type=float, default=0.001)
    parser.add_argument('--l2', help='l2 Regularization', type=float, default=0.0001)
    parser.add_argument('--dataset', help='path to file', type=str, default='video10')
    parser.add_argument('-b', '--batch_size', help='Batch Size', type=int, default=1024)
    parser.add_argument('--pretrain', help='1:pretrain', type=int, default=0)
    parser.add_argument('--learner', help='[sgd, adag, adam]', type=str, default='adam')
    parser.add_argument('--n_fold', type=int, default=1)
    parser.add_argument('--mess_dropout', type=float, default=0.0)
    parser.add_argument('--node_dropout', type=float, default=0.1)
    parser.add_argument('--depth', type=int, default=10)
    parser.add_argument('--alpha', type=float, default=0.5)
    parser.add_argument('--loss', type=int, default=0)
    parser.add_argument('-y', type=str,help='edge,dot-1,dot-2,u-i-1,u-i-2', default='edge')
    parser.add_argument('--outward', type=float,help='outward', default=0.5)
    parser.add_argument('--edge', type=str,help='add,hadam,weight1-2', default='add')
    parser.add_argument('--save', type=int,help='save the model', default=1)

    args, unknown = parser.parse_known_args(argv)
    if unknown:
        print('ignoring unrecognized arguments:', unknown)
    return args

# Driver: parse the hyper-parameters, construct the model (which loads the
# dataset named by args.dataset) and run training with periodic evaluation.
args = parse_args()
model = LECF(args)
model.run_model()