In [24]:
import numpy as np
import pandas as pd
import time
from collections import deque

import tensorflow as tf
from six import next
from sklearn import preprocessing
import sys
from scipy.sparse import lil_matrix
from scipy.sparse import coo_matrix

import plotly.graph_objs as go
from chart_studio.plotly import plot, iplot
import cufflinks as cf
cf.go_offline(connected=True)

In [25]:
def read_process(filname, sep="\t"):
    """Load a headerless ratings file into a DataFrame with 0-based ids.

    Args:
        filname: Path or file-like object handed to ``pd.read_csv``.
        sep: Field separator (tab by default).

    Returns:
        DataFrame with columns ``user``, ``item``, ``rate`` and ``st``.
        ``user`` and ``item`` are shifted down by one and stored as int32;
        ``rate`` is stored as float32; ``st`` is left as parsed.
    """
    ratings = pd.read_csv(
        filname,
        sep=sep,
        header=None,
        names=["user", "item", "rate", "st"],
        engine='python',
    )
    # The raw ids start at 1; shift them so they can be used as row indices.
    for id_col in ("user", "item"):
        ratings[id_col] = (ratings[id_col] - 1).astype(np.int32)
    ratings["rate"] = ratings["rate"].astype(np.float32)
    return ratings


def get_data100k():
    """Load the ML-100k ratings, shuffle them and split into train/test.

    The split fraction comes from the module-level ``PERC`` constant, which
    must be defined before this function is called. Shuffling uses NumPy's
    global random state, so the split changes between calls unless that
    state is seeded by the caller.

    Returns:
        ``(df_train, df_test)``: the first ``int(rows * PERC)`` shuffled rows
        and the remaining rows (the latter re-indexed from 0).
    """
    ratings = read_process("./data/ml100k/u.data", sep="\t")
    n_rows = len(ratings)
    shuffled = ratings.iloc[np.random.permutation(n_rows)].reset_index(drop=True)
    cut = int(n_rows * PERC)
    return shuffled[0:cut], shuffled[cut:].reset_index(drop=True)

def get_UserData100k():
    """Return one-hot encoded user side information for ML-100k.

    Reads ``./data/ml100k/u.user`` (pipe-separated, no header), discards the
    post code and one-hot encodes ``age``, ``gender`` and ``occupation``.

    The result is later concatenated row-wise with matrices indexed by the
    0-based user id, so rows are explicitly ordered by user id instead of
    relying on the order of the lines in the file.

    Returns:
        2-D NumPy array with one row per user (ascending user id) and one
        column per distinct age, gender and occupation value, in that order.
    """
    col_names = ["user", "age", "gender", "occupation", "PostCode"]
    df = pd.read_csv('./data/ml100k/u.user', sep='|', header=None, names=col_names, engine='python')
    # Stable sort so that row i of the output describes user id i (0-based).
    df = df.sort_values("user", kind="mergesort")
    encoded_cols = ["age", "gender", "occupation"]
    # Only the encoded columns are kept; the id and post code are not features.
    return pd.get_dummies(df[encoded_cols], columns=encoded_cols).values

def get_ItemData100k():
    """Return genre flags plus a normalised release year for ML-100k movies.

    Reads ``./data/ml100k/u.item`` (pipe-separated, no header). The output
    keeps the 19 genre indicator columns (``unknown`` … ``Western``) and adds
    a ``year`` column: the release year shifted to start at 0 and divided by
    its maximum, with missing or undefined values replaced by 0.0.

    The result is later concatenated row-wise with matrices indexed by the
    0-based movie id, so rows are explicitly ordered by movie id.

    Returns:
        2-D NumPy array with one row per movie (ascending movie id) and 20
        columns: the 19 genre flags followed by the normalised year.
    """
    col_names = ["movieid", "movietitle", "releasedate", "videoreleasedate", "IMDbURL"
        , "unknown", "Action", "Adventure", "Animation", "Childrens", "Comedy", "Crime", "Documentary"
        , "Drama", "Fantasy", "FilmNoir", "Horror", "Musical", "Mystery", "Romance", "SciFi", "Thriller"
        , "War", "Western"]
    genre_cols = col_names[5:]

    # Movie titles contain non-UTF-8 bytes (accented characters), so decode the
    # file as latin-1; the default UTF-8 decoding fails on those bytes.
    df = pd.read_csv('./data/ml100k/u.item', sep='|', header=None, names=col_names,
                     engine='python', encoding='latin-1')
    # Stable sort so that row i of the output describes movie id i (0-based).
    df = df.sort_values("movieid", kind="mergesort")

    # Missing release dates become NaT -> NaN and end up as 0.0 below.
    year = pd.to_datetime(df['releasedate']).dt.year.astype(np.float64)
    year = year - year.min()
    # If every year is identical the maximum is 0 and 0/0 gives NaN, which the
    # fillna below turns into 0.0.
    year = year / year.max()
    df['year'] = year.fillna(0.0)

    return df[genre_cols + ['year']].values

In [42]:
############# ML 100k dataset ###########
# Device string passed to the `tf.device(...)` blocks in inferenceDense() and
# optimization().
DEVICE = "/gpu:0"

# Number of ratings drawn per training batch.
BATCH_SIZE = 1000
# Fraction of the shuffled ratings that get_data100k() puts in the train split.
PERC=0.9
# Row counts of the user and item feature matrices built further down.
USER_NUM = 943
ITEM_NUM = 1682
# get_data100k() reads PERC, so PERC has to be assigned before this call.
df_train, df_test = get_data100k()

# `units` of each dense layer created in inferenceDense().
MFSIZE = 50
# Weights of the user-side (UW) and item-side (IW) L2 terms that
# inferenceDense() adds up into its regularizer.
UW = 0.05
IW = 0.02
# Learning rate handed to optimization() by the training code.
LR = 0.00003
# The training loops run EPOCH_MAX * samples_per_batch steps.
EPOCH_MAX = 196
# Reset TensorFlow's default graph before the cells below add their ops to it.
tf.reset_default_graph()

In [43]:
class ShuffleIterator(object):
    """Endless iterator yielding random mini-batches drawn with replacement.

    ``inputs`` is a sequence of equally long 1-D columns. They are stacked
    into a single ``(len, num_cols)`` array, so columns of different dtypes
    are promoted by NumPy to one common dtype (for example integer ids come
    back as floats when stacked together with float ratings).
    """

    def __init__(self, inputs, batch_size=10):
        self.batch_size = batch_size
        self.num_cols = len(inputs)
        self.len = len(inputs[0])
        # One row per column first, then transpose to one row per sample.
        stacked = np.vstack([np.array(inputs[c]) for c in range(self.num_cols)])
        self.inputs = np.transpose(stacked)

    def __len__(self):
        return self.len

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Return a list with one array of ``batch_size`` values per column."""
        # batch_size random row indices in [0, len); repeats are possible.
        picks = np.random.randint(0, self.len, (self.batch_size,))
        # Fancy indexing selects the sampled rows and keeps every column.
        batch = self.inputs[picks, :]
        return list(batch.T)


class OneEpochIterator(ShuffleIterator):
    """Iterate over the data once, in order, in consecutive batches.

    The row indices are split up front into ``ceil(len / batch_size)`` groups
    of near-equal size. A non-positive ``batch_size`` yields everything as a
    single batch. After the last group the iterator raises ``StopIteration``
    and rewinds, so it can be looped over again.
    """

    def __init__(self, inputs, batch_size=10):
        super(OneEpochIterator, self).__init__(inputs, batch_size=batch_size)
        all_rows = np.arange(self.len)
        if batch_size > 0:
            # Number of groups = len / batch_size rounded up.
            n_groups = np.ceil(self.len / batch_size)
            self.idx_group = np.array_split(all_rows, n_groups)
        else:
            self.idx_group = [all_rows]
        self.group_id = 0

    def next(self):
        """Return the next batch as a list of per-column arrays."""
        if self.group_id < len(self.idx_group):
            batch = self.inputs[self.idx_group[self.group_id], :]
            self.group_id += 1
            return [batch[:, c] for c in range(self.num_cols)]
        # Exhausted: rewind so the next loop starts from the first group again.
        self.group_id = 0
        raise StopIteration

## Embedding + dot product

In [44]:
def inferenceDense(phase,user_batch, item_batch,idx_user,idx_item, user_num, item_num,UReg=0.05,IReg=0.1):
    """Build the rating-prediction ops: feature lookup, dense layers, dot product.

    Args:
        phase: Accepted for interface compatibility; not used in the body.
        user_batch: Tensor of user ids used to look up rows of ``idx_user``.
        item_batch: Tensor of item ids used to look up rows of ``idx_item``.
        idx_user: Tensor whose rows are the per-user feature vectors.
        idx_item: Tensor whose rows are the per-item feature vectors.
        user_num, item_num: Accepted but not used in the body.
        UReg, IReg: Accepted but not used; the regularizer weights come from
            the module-level ``UW`` and ``IW`` constants instead.

    Returns:
        ``(infer, regularizer)``: the predictions (sum over axis 1 of the
        element-wise product of the two dense outputs) and the weighted sum of
        the L2 losses of those dense outputs.
    """
    with tf.device(DEVICE):
        # Pick the feature row of every user / item in the batch.
        user_features = tf.nn.embedding_lookup(idx_user, user_batch, name="embedding_user")
        item_features = tf.nn.embedding_lookup(idx_item, item_batch, name="embedding_item")

        # One dense layer per side with MFSIZE units and CReLU activation
        # (NOTE(review): per the TensorFlow docs CReLU doubles the last
        # dimension, so each output would be 2 * MFSIZE wide - confirm).
        user_latent = tf.layers.dense(
            inputs=user_features,
            units=MFSIZE,
            activation=tf.nn.crelu,
            kernel_initializer=tf.random_normal_initializer(stddev=0.01),
        )
        item_latent = tf.layers.dense(
            inputs=item_features,
            units=MFSIZE,
            activation=tf.nn.crelu,
            kernel_initializer=tf.random_normal_initializer(stddev=0.01),
        )

        # Element-wise product summed over axis 1 = per-sample dot product.
        interaction = tf.multiply(user_latent, item_latent)
        infer = tf.reduce_sum(interaction, 1, name="inference")

        # L2 penalty on the dense-layer outputs, weighted by UW and IW.
        user_penalty = UW * tf.nn.l2_loss(user_latent)
        item_penalty = IW * tf.nn.l2_loss(item_latent)
        regularizer = tf.add(user_penalty, item_penalty, name="regularizer")

    return infer, regularizer

## Optimization

def optimization(infer, regularizer, rate_batch, learning_rate=0.0005, reg=0.1):
    """Build the training cost and the Adam update op.

    A global step must already exist in the default graph; the function
    asserts this rather than creating one.

    Args:
        infer: Predicted ratings tensor.
        regularizer: Scalar regularization term added to the cost.
        rate_batch: Tensor of true ratings.
        learning_rate: Learning rate for the Adam optimizer.
        reg: Accepted for interface compatibility; not used in the body.

    Returns:
        ``(cost, train_op)``: L2 loss of ``infer - rate_batch`` plus
        ``regularizer``, and the op that minimizes it while advancing the
        global step.
    """
    with tf.device(DEVICE):
        global_step = tf.train.get_global_step()
        assert global_step is not None
        residual = tf.subtract(infer, rate_batch)
        cost = tf.add(tf.nn.l2_loss(residual), regularizer)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        train_op = optimizer.minimize(cost, global_step=global_step)
    return cost, train_op

## 뜯어보기 (step-by-step walkthrough of GraphRec)

user_num = 943

item_num = 1682

In [45]:
# Walkthrough of the feature construction done inside GraphRec, assuming
# Graph=True, UserData=True and ItemData=True.
train, test = get_data100k()  # load a shuffled train / test split

# Ids and ratings of the training interactions as flat arrays. Ratings are
# divided by 5.0 (the maximum rating) before being stored.
train_user_ids = train['user'].values.astype(np.int64)
train_item_ids = train['item'].values.astype(np.int64)
train_scaled_rates = train['rate'].values.astype(np.float64) / 5.0

# N x M matrix of scaled ratings (0 where a user has not rated an item) and
# its M x N transpose. One vectorised assignment replaces the per-row loop.
AdjacencyUsers = np.zeros((USER_NUM, ITEM_NUM), dtype=np.float32)
AdjacencyUsers[train_user_ids, train_item_ids] = train_scaled_rates
AdjacencyItems = np.ascontiguousarray(AdjacencyUsers.T)

# N x 1 and M x 1 columns counting how many training ratings each user gave /
# each item received.
DegreeUsers = np.bincount(train_user_ids, minlength=USER_NUM).astype(np.float32).reshape(-1, 1)
DegreeItems = np.bincount(train_item_ids, minlength=ITEM_NUM).astype(np.float32).reshape(-1, 1)

# Scale the counts by the largest count on each side.
DUserMax = np.amax(DegreeUsers)
DItemMax = np.amax(DegreeItems)
DegreeUsers = np.true_divide(DegreeUsers, DUserMax)
DegreeItems = np.true_divide(DegreeItems, DItemMax)

# Graph features: one-hot id (identity matrix) + adjacency row + degree,
# giving N x (N + M + 1) for users and M x (M + N + 1) for items.
UserFeatures = np.concatenate((np.identity(USER_NUM, dtype=np.bool_), AdjacencyUsers, DegreeUsers), axis=1)
ItemFeatures = np.concatenate((np.identity(ITEM_NUM, dtype=np.bool_), AdjacencyItems, DegreeItems), axis=1)

# Side information: one-hot age / gender / occupation per user, and genre
# flags plus normalised release year per item.
UsrDat = get_UserData100k()
ItmDat = get_ItemData100k()

UserFeatures = np.concatenate((UserFeatures, UsrDat), axis=1)  # N x (N + M + 1 + L)
ItemFeatures = np.concatenate((ItemFeatures, ItmDat), axis=1)  # M x (M + N + 1 + J)

UserFeatureslength = UserFeatures.shape[1]  # N + M + 1 + L
ItemFeatureslength = ItemFeatures.shape[1]  # M + N + 1 + J

print(UserFeatures.shape)
print(ItemFeatures.shape)

(943, 2710)
(1682, 2646)


In [48]:
# Number of full training batches in the train split; the training loop treats
# this many steps as one epoch.
samples_per_batch = len(train) // BATCH_SIZE #batch_size = 1000

# Training batches are sampled at random; the test iterator walks the test
# split once per evaluation in batches of up to 10000 rows.
iter_train = ShuffleIterator([train['user'], train['item'],train['rate']], batch_size = BATCH_SIZE)
iter_test = OneEpochIterator([test["user"], test["item"], test["rate"]], batch_size=10000)

# Inputs fed at every step: user ids, item ids and the true ratings.
user_batch = tf.placeholder(tf.int32, shape = [None], name = 'id_user')
item_batch = tf.placeholder(tf.int32, shape = [None], name = 'id_item')
rate_batch = tf.placeholder(tf.float64, shape = [None])
# Fed as True while training and False while evaluating; inferenceDense()
# accepts it but does not use it.
phase = tf.placeholder(tf.bool, name = 'phase')

# The feature matrices are embedded in the graph as float64 constants;
# inferenceDense() looks rows up in them by id.
w_user = tf.constant(UserFeatures, name="userids", shape=[USER_NUM, UserFeatures.shape[1]], dtype=tf.float64)
w_item = tf.constant(ItemFeatures, name="itemids", shape=[ITEM_NUM, ItemFeatures.shape[1]], dtype=tf.float64)

infer,regularizer = inferenceDense(phase, user_batch, item_batch, w_user, w_item, user_num=USER_NUM,item_num=ITEM_NUM) 
# inferenceDense returns (infer, regularizer)
 
# The global step has to exist before optimization() is called: that function
# fetches it with tf.train.get_global_step() and asserts it is not None.
global_step = tf.contrib.framework.get_or_create_global_step()
# optimization returns (cost, train_op); the cost tensor is discarded here.
# `reg` is accepted by optimization() but not used in its body.
_, train_op = optimization(infer, regularizer, rate_batch, learning_rate=LR, reg=0.09)



In [21]:
init_op = tf.global_variables_initializer()
config = tf.ConfigProto()
# Let this process claim at most half of the GPU memory.
config.gpu_options.per_process_gpu_memory_fraction = 0.5
finalerror = -1  # stays -1 if no evaluation ever runs

with tf.Session(config=config) as sess:
    sess.run(init_op)
    print("{} {} {} {}".format("epoch", "train_error", "val_error", "elapsed_time"))
    # Rolling window holding the squared errors of the most recent
    # samples_per_batch training batches (roughly one epoch).
    errors = deque(maxlen=samples_per_batch)
    start = time.time()
    for i in range(EPOCH_MAX * samples_per_batch):
        users, items, rates = next(iter_train)
        _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                               item_batch: items,
                                                               rate_batch: rates,
                                                               phase: True})

        # Keep predictions inside the valid rating range 1..5. np.clip is used
        # directly because no `clip` helper is defined in this notebook.
        pred_batch = np.clip(pred_batch, 1.0, 5.0)
        errors.append(np.power(pred_batch - rates, 2))
        # Evaluate on the test split once per epoch (including step 0).
        if i % samples_per_batch == 0:
            train_err = np.sqrt(np.mean(errors))
            test_err2 = np.array([])
            for users, items, rates in iter_test:
                pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                        item_batch: items,
                                                        phase: False})
                pred_batch = np.clip(pred_batch, 1.0, 5.0)
                test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
            end = time.time()
            test_err = np.sqrt(np.mean(test_err2))  # RMSE over the whole test split
            finalerror = test_err
            print("{:3d},{:f},{:f},{:f}(s)".format(i // samples_per_batch, train_err, test_err, end - start))
            start = end

IndexError: tuple index out of range

## GraphRec

In [None]:
def _rating_graph_features(train, user_num, item_num):
    """Build scaled adjacency matrices and normalised degree columns from ratings.

    Returns ``(adjacency_users, adjacency_items, degree_users, degree_items)``:
    a ``user_num x item_num`` float32 matrix of ``rate / 5.0``, its transpose,
    and per-user / per-item rating counts divided by the largest count, shaped
    ``(user_num, 1)`` and ``(item_num, 1)``.
    """
    user_ids = train["user"].values.astype(np.int64)
    item_ids = train["item"].values.astype(np.int64)
    scaled_rates = train["rate"].values.astype(np.float64) / 5.0

    adjacency_users = np.zeros((user_num, item_num), dtype=np.float32)
    adjacency_users[user_ids, item_ids] = scaled_rates
    adjacency_items = np.ascontiguousarray(adjacency_users.T)

    degree_users = np.bincount(user_ids, minlength=user_num).astype(np.float32).reshape(-1, 1)
    degree_items = np.bincount(item_ids, minlength=item_num).astype(np.float32).reshape(-1, 1)
    degree_users = np.true_divide(degree_users, np.amax(degree_users))
    degree_items = np.true_divide(degree_items, np.amax(degree_items))
    return adjacency_users, adjacency_items, degree_users, degree_items


def GraphRec(train, test,ItemData=False,UserData=False,Graph=False,Dataset='100k'):
    """Assemble user/item features, build the model and train it.

    Every user and item starts from a one-hot id vector. Optional blocks are
    appended to it: graph features (scaled rating adjacency row and normalised
    degree) and side information loaded from the dataset files. The features
    feed ``inferenceDense``; training runs ``EPOCH_MAX * samples_per_batch``
    steps and prints train / test RMSE once per epoch.

    Relies on the module-level ``USER_NUM``, ``ITEM_NUM``, ``BATCH_SIZE``,
    ``LR`` and ``EPOCH_MAX`` constants and adds its ops to the current default
    TensorFlow graph.

    Args:
        train: DataFrame with ``user``, ``item`` and ``rate`` columns (0-based ids).
        test: DataFrame with the same columns, used for evaluation.
        ItemData: Append item side information when true.
        UserData: Append user side information when true.
        Graph: Append graph (adjacency + degree) features when true.
        Dataset: ``'100k'`` or ``'1m'``; selects the side-information loader.
            The ``'1m'`` loaders (``get_UserData1M`` / ``get_ItemData1M``) must
            be defined elsewhere.

    Returns:
        The test RMSE of the last evaluation, or -1 if no evaluation ran.
        (Earlier versions tracked this value but returned nothing.)

    Raises:
        ValueError: If side information is requested for an unknown ``Dataset``.
    """
    if Graph:
        # Only pay for the N x M matrices when they are actually used.
        AdjacencyUsers, AdjacencyItems, DegreeUsers, DegreeItems = _rating_graph_features(
            train, USER_NUM, ITEM_NUM)
        UserFeatures = np.concatenate(
            (np.identity(USER_NUM, dtype=np.bool_), AdjacencyUsers, DegreeUsers), axis=1)
        ItemFeatures = np.concatenate(
            (np.identity(ITEM_NUM, dtype=np.bool_), AdjacencyItems, DegreeItems), axis=1)
    else:
        UserFeatures = np.identity(USER_NUM, dtype=np.bool_)
        ItemFeatures = np.identity(ITEM_NUM, dtype=np.bool_)

    if UserData:
        if Dataset == '1m':
            UsrDat = get_UserData1M()
        elif Dataset == '100k':
            UsrDat = get_UserData100k()
        else:
            raise ValueError("Unsupported Dataset {!r}; expected '100k' or '1m'".format(Dataset))
        UserFeatures = np.concatenate((UserFeatures, UsrDat), axis=1)

    if ItemData:
        if Dataset == '1m':
            ItmDat = get_ItemData1M()
        elif Dataset == '100k':
            ItmDat = get_ItemData100k()
        else:
            raise ValueError("Unsupported Dataset {!r}; expected '100k' or '1m'".format(Dataset))
        ItemFeatures = np.concatenate((ItemFeatures, ItmDat), axis=1)

    print(UserFeatures.shape)
    print(ItemFeatures.shape)

    # Number of full training batches; treated as one epoch by the loop below.
    samples_per_batch = len(train) // BATCH_SIZE

    iter_train = ShuffleIterator([train["user"], train["item"], train["rate"]], batch_size=BATCH_SIZE)
    iter_test = OneEpochIterator([test["user"], test["item"], test["rate"]], batch_size=10000)

    user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
    item_batch = tf.placeholder(tf.int32, shape=[None], name="id_item")
    rate_batch = tf.placeholder(tf.float64, shape=[None])
    phase = tf.placeholder(tf.bool, name='phase')

    # Feature matrices are stored in the graph as float64 constants and looked
    # up by id inside inferenceDense().
    w_user = tf.constant(UserFeatures, name="userids", shape=[USER_NUM, UserFeatures.shape[1]], dtype=tf.float64)
    w_item = tf.constant(ItemFeatures, name="itemids", shape=[ITEM_NUM, ItemFeatures.shape[1]], dtype=tf.float64)

    infer, regularizer = inferenceDense(phase, user_batch, item_batch, w_user, w_item,
                                        user_num=USER_NUM, item_num=ITEM_NUM)
    # optimization() asserts that a global step exists, so create it first.
    global_step = tf.contrib.framework.get_or_create_global_step()
    _, train_op = optimization(infer, regularizer, rate_batch, learning_rate=LR, reg=0.09)

    init_op = tf.global_variables_initializer()
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    finalerror = -1
    with tf.Session(config=config) as sess:
        sess.run(init_op)
        print("{} {} {} {}".format("epoch", "train_error", "val_error", "elapsed_time"))
        # Rolling window of squared errors of the latest samples_per_batch batches.
        errors = deque(maxlen=samples_per_batch)
        start = time.time()
        for i in range(EPOCH_MAX * samples_per_batch):
            users, items, rates = next(iter_train)
            _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                                   item_batch: items,
                                                                   rate_batch: rates,
                                                                   phase: True})
            # Keep predictions inside the valid rating range 1..5. np.clip is
            # used directly because no `clip` helper is defined in this notebook.
            pred_batch = np.clip(pred_batch, 1.0, 5.0)
            errors.append(np.power(pred_batch - rates, 2))
            # Evaluate on the test split once per epoch (including step 0).
            if i % samples_per_batch == 0:
                train_err = np.sqrt(np.mean(errors))
                test_err2 = np.array([])
                for users, items, rates in iter_test:
                    pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                            item_batch: items,
                                                            phase: False})
                    pred_batch = np.clip(pred_batch, 1.0, 5.0)
                    test_err2 = np.append(test_err2, np.power(pred_batch - rates, 2))
                end = time.time()
                test_err = np.sqrt(np.mean(test_err2))
                finalerror = test_err
                print("{:3d},{:f},{:f},{:f}(s)".format(i // samples_per_batch, train_err, test_err, end - start))
                start = end
    return finalerror

In [None]:
#  GraphRec(df_train,df_test,ItemData=True,UserData=True,Graph=True,Dataset='100k')