In [1]:
import pickle
import json
import time, datetime
import numpy as np

def read_pickle(file):
    """Load and return the object pickled in ``file``.

    ``file`` is a path opened in binary mode. Unpickling can execute
    arbitrary code, so only use this on files you trust.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle)

def read_json(file):
    """Read a JSON-lines file and return its records as a list.

    Every non-blank line of ``file`` is parsed as one JSON document; the
    parsed values are returned in file order.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding, and blank lines are skipped instead of
    raising ``json.JSONDecodeError``.
    """
    with open(file, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

def write_pickle(file, data):
    """Pickle ``data`` into ``file``, overwriting whatever the file held before."""
    with open(file, 'wb') as sink:
        pickle.dump(data, sink)

In [2]:
# Directories holding the filtered pickles and the raw Yelp JSON dumps, respectively.
filepath, jsonpath = '../yelp_dataset/filtered/', '../yelp_dataset/json/'

In [3]:
# Load the three filtered collections from the pickle directory, in a fixed order.
reviews, users, businesses = (
    read_pickle(filepath + pickle_name)
    for pickle_name in ('reviews.pickle', 'users.pickle', 'businesses.pickle')
)

In [4]:
# Raw (unfiltered) business records: one parsed JSON object per line of the dump.
business_json_file = jsonpath + 'yelp_academic_dataset_business.json'
busi_unfil = read_json(business_json_file)

In [5]:
# Quick look at the reviews: container type, fields of the first record, record count.
print(type(reviews), reviews[0].keys(), len(reviews), sep='\n')

<class 'list'>
dict_keys(['business_id', 'cool', 'useful', 'review_id', 'stars', 'funny', 'text', 'date', 'user_id'])
742969


In [6]:
# Raw (unfiltered) user records; show which fields a record carries.
user_json_file = jsonpath + 'yelp_academic_dataset_user.json'
user_unfil = read_json(user_json_file)
print(user_unfil[0].keys())

dict_keys(['compliment_list', 'compliment_note', 'compliment_photos', 'compliment_more', 'name', 'elite', 'useful', 'average_stars', 'compliment_cute', 'review_count', 'cool', 'user_id', 'yelping_since', 'funny', 'fans', 'friends', 'compliment_writer', 'compliment_profile', 'compliment_hot', 'compliment_plain', 'compliment_funny', 'compliment_cool'])


In [7]:
# Inspect the first raw business record: type, fields, total record count, categories value.
first_business = busi_unfil[0]
print(
    type(first_business),
    first_business.keys(),
    len(busi_unfil),
    first_business['categories'],
    sep='\n',
)

<class 'dict'>
dict_keys(['review_count', 'city', 'longitude', 'hours', 'categories', 'name', 'is_open', 'postal_code', 'business_id', 'attributes', 'stars', 'latitude', 'state', 'address'])
192609
Golf, Active Life


In [None]:
# Keep only the raw records whose id appears in the filtered id collections.
# Membership is tested against a set, so each pass is linear rather than
# scanning every filtered id for every raw record. Record order follows the
# raw dumps, and each raw record is kept at most once.
business_id_set = set(businesses)
busi_filtered = [busi for busi in busi_unfil if busi['business_id'] in business_id_set]

user_id_set = set(users)
user_filtered = [user for user in user_unfil if user['user_id'] in user_id_set]

In [None]:
# Compare how many raw business records were kept with how many filtered ids exist.
print(len(busi_filtered), len(businesses), sep='\n')

In [None]:
# Number the users 0..N-1 in iteration order and build both lookup directions.
uinds = list(range(len(users)))
uid2ind = {user: ind for ind, user in enumerate(users)}
ind2uid = dict(enumerate(users))

In [None]:
# Number the businesses 0..N-1 in iteration order and build both lookup directions.
b_inds = list(range(len(businesses)))
bid2ind = {business: ind for ind, business in enumerate(businesses)}
ind2bid = dict(enumerate(businesses))

In [None]:
# Distinct city names among the filtered businesses.
cities = set(busi['city'] for busi in busi_filtered)
c_inds = list(range(len(cities)))
# Assign indices in sorted order: iterating a set of strings can yield a
# different order after a kernel restart, which would silently change the
# mapping that is later pickled and reloaded.
ct_id2ind = {city: ind for ind, city in enumerate(sorted(cities))}
ind2ct_id = {ind: city for city, ind in ct_id2ind.items()}

In [None]:
# Number of distinct cities, followed by the city -> index mapping.
print(len(cities), ct_id2ind, sep='\n')

In [None]:
# Distinct category names among the filtered businesses. 'categories' is a
# comma-separated string; records where it is None are skipped rather than
# crashing on ``.split``.
categories = set(
    category.strip()
    for busi in busi_filtered
    if busi['categories'] is not None
    for category in busi['categories'].split(',')
)
ca_inds = list(range(len(categories)))
# Assign indices in sorted order: iterating a set of strings can yield a
# different order after a kernel restart, which would silently change the
# mapping that is later pickled and reloaded.
ca_id2ind = {category: ind for ind, category in enumerate(sorted(categories))}
ind2ca_id = {ind: category for category, ind in ca_id2ind.items()}

In [None]:
# Number of distinct categories, followed by the category -> index mapping.
print(len(categories), ca_id2ind, sep='\n')

In [None]:
import numpy as np

# Zero-filled array with one row per (user, business) combination and three columns.
n_user_business_pairs = len(users) * len(businesses)
Y = np.zeros((n_user_business_pairs, 3))
print(Y)

In [None]:
def dataset_split(reviews, userid_to_num, businessid_to_num, train_ratio, valid_ratio, test_ratio, n_neg_sample, seed=None):
    """Split reviews chronologically and sample negatives for evaluation.

    Each review is reduced to a dict with the integer ``user_id`` and
    ``business_id`` (looked up in the two mappings), a constant ``rate`` of
    1.0 and a ``timestamp`` parsed from ``review['date']`` (format
    ``'%Y-%m-%d %H:%M:%S'``, converted with ``time.mktime``). The records are
    sorted by timestamp so the earliest reviews train the model and the
    latest ones evaluate it.

    Parameters
    ----------
    reviews : iterable of dict
        Records with 'user_id', 'business_id' and 'date' keys.
    userid_to_num, businessid_to_num : dict
        Map raw ids to integer indices; every id in ``reviews`` must be present.
    train_ratio, valid_ratio : float
        Fractions of the sorted reviews used for training and validation.
    test_ratio : float
        Accepted for interface compatibility only; the test split is whatever
        remains after the training and validation slices.
    n_neg_sample : int
        Number of negative businesses to draw per evaluated user. If a user
        has fewer unseen businesses than this, all of them are used instead
        of raising.
    seed : int, optional
        When given, negatives are drawn from a dedicated
        ``np.random.RandomState(seed)`` so the result is reproducible. When
        omitted, the global ``np.random`` state is used.

    Returns
    -------
    tuple
        ``(train_data, valid_data, test_data, valid_with_neg, test_with_neg)``.
        The valid/test lists only contain reviews whose user and business both
        occur in the training slice. Each ``*_with_neg`` entry is a dict with
        'user_id', 'pos_business_id' (that user's businesses in the split) and
        'neg_business_id' (sampled businesses from the training slice that the
        user has not interacted with in training or in that split).
    """
    interactions = [
        {
            'user_id': userid_to_num[review['user_id']],
            'business_id': businessid_to_num[review['business_id']],
            'rate': 1.0,
            'timestamp': time.mktime(
                datetime.datetime.strptime(review['date'], '%Y-%m-%d %H:%M:%S').timetuple()
            ),
        }
        for review in reviews
    ]
    # Use the earlier data to train and the later data to evaluate.
    interactions.sort(key=lambda record: record['timestamp'])

    n_reviews = len(interactions)
    train_size = int(n_reviews * train_ratio)
    valid_size = int(n_reviews * valid_ratio)
    train_data = interactions[:train_size]
    valid_data = interactions[train_size:train_size + valid_size]
    test_data = interactions[train_size + valid_size:]

    # Only users and businesses seen during training can be evaluated.
    selected_users = {record['user_id'] for record in train_data}
    selected_businesses = {record['business_id'] for record in train_data}

    def seen_in_training(record):
        return record['user_id'] in selected_users and record['business_id'] in selected_businesses

    selected_valid_data = [record for record in valid_data if seen_in_training(record)]
    selected_test_data = [record for record in test_data if seen_in_training(record)]

    train_data_for_user = _group_businesses_by_user(train_data)
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Validation negatives are drawn before test negatives, user by user.
    valid_with_neg = _attach_negative_samples(
        _group_businesses_by_user(selected_valid_data),
        train_data_for_user, selected_businesses, n_neg_sample, rng)
    test_with_neg = _attach_negative_samples(
        _group_businesses_by_user(selected_test_data),
        train_data_for_user, selected_businesses, n_neg_sample, rng)

    return train_data, selected_valid_data, selected_test_data, valid_with_neg, test_with_neg


def _group_businesses_by_user(records):
    """Return {user_id: [business_id, ...]} preserving the order of ``records``."""
    grouped = {}
    for record in records:
        grouped.setdefault(record['user_id'], []).append(record['business_id'])
    return grouped


def _attach_negative_samples(eval_for_user, train_for_user, candidate_businesses, n_neg_sample, rng):
    """Build one positives/negatives dict per user in ``eval_for_user``."""
    with_negatives = []
    for user, positives in eval_for_user.items():
        # Businesses this user never interacted with, in training or in this split.
        unseen = list(candidate_businesses - set(train_for_user[user]) - set(positives))
        # Never ask for more samples than exist; sampling without replacement would raise.
        n_take = min(n_neg_sample, len(unseen))
        if n_take > 0:
            negatives = list(rng.choice(unseen, size=n_take, replace=False))
        else:
            negatives = []
        with_negatives.append({
            'user_id': user,
            'pos_business_id': positives,
            'neg_business_id': negatives,
        })
    return with_negatives

In [None]:
# get adjs
def get_adj_matrix(uid2ind, bid2ind, city_id2ind, cat_id2ind, users, businesses, reviews):
    """Build dense metapath adjacency matrices: UB, UUB, UBUB, UBCaB, UBCiB.

    Parameters
    ----------
    uid2ind, bid2ind, city_id2ind, cat_id2ind : dict
        Map raw user ids, business ids, city names and category names to row
        or column indices.
    users : iterable of dict
        Records with 'user_id' and a comma-separated 'friends' string. Users
        and friends missing from ``uid2ind`` are ignored.
    businesses : iterable of dict
        Records with 'business_id', 'city' and a comma-separated 'categories'
        string. Businesses missing from ``bid2ind`` are ignored; a
        'categories' value of None contributes no category links.
    reviews : iterable of dict
        Records with 'user_id' and 'business_id'; both ids must be present in
        the mappings.

    Returns
    -------
    tuple of numpy.ndarray
        ``(adj_UB, adj_UUB, adj_UBUB, adj_UBCaB, adj_UBCiB)``. ``adj_UB`` is
        the 0/1 user-business matrix; the others are products of the 0/1
        base matrices along the named metapath, so their entries are counts.
        Every matrix has shape (number of users, number of businesses).
    """
    tot_users = len(uid2ind)  # tot for total
    tot_business = len(bid2ind)
    tot_city = len(city_id2ind)
    tot_category = len(cat_id2ind)
    print(tot_users, tot_business, tot_city, tot_category)

    adj_UU = np.zeros([tot_users, tot_users])
    adj_UB = np.zeros([tot_users, tot_business])
    adj_BCa = np.zeros([tot_business, tot_category])
    adj_BCi = np.zeros([tot_business, tot_city])
    print(adj_BCi.shape)

    # Relation U-U: friendships, stored symmetrically.
    for user in users:
        if user['user_id'] not in uid2ind:
            continue
        user_id = uid2ind[user['user_id']]
        for friend in user['friends'].split(','):
            friend = friend.strip()
            if friend in uid2ind:
                friend_id = uid2ind[friend]
                adj_UU[user_id, friend_id] = 1
                adj_UU[friend_id, user_id] = 1

    # Relation U-B: a user reviewed a business.
    for review in reviews:
        adj_UB[uid2ind[review['user_id']], bid2ind[review['business_id']]] = 1

    # Relations B-Ci and B-Ca: a business's city and its categories.
    for business in businesses:
        if business['business_id'] not in bid2ind:
            continue
        business_id = bid2ind[business['business_id']]
        adj_BCi[business_id, city_id2ind[business['city']]] = 1

        # A business can list several categories, or none at all (None).
        if business['categories'] is None:
            continue
        for category in business['categories'].split(','):
            adj_BCa[business_id, cat_id2ind[category.strip()]] = 1

    # Metapaths are obtained by chaining the base relations.
    adj_UUB = adj_UU.dot(adj_UB)

    adj_UBU = adj_UB.dot(adj_UB.T)
    adj_UBUB = adj_UBU.dot(adj_UB)

    adj_UBCa = adj_UB.dot(adj_BCa)
    adj_UBCaB = adj_UBCa.dot(adj_BCa.T)

    adj_UBCi = adj_UB.dot(adj_BCi)
    adj_UBCiB = adj_UBCi.dot(adj_BCi.T)

    return adj_UB, adj_UUB, adj_UBUB, adj_UBCaB, adj_UBCiB


In [None]:
# Chronological 80/10/10 split with 50 sampled negatives per evaluated user.
split_outputs = dataset_split(
    reviews, uid2ind, bid2ind,
    train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1, n_neg_sample=50,
)
train_data, valid_data, test_data, valid_with_neg_sample, test_with_neg_sample = split_outputs

In [None]:
# Persist every split under its own name in the rates directory.
path = '../yelp_dataset/rates/'
filenames = ['train_data', 'valid_data', 'test_data', 'valid_with_neg_sample', 'test_with_neg_sample']
objs = [train_data, valid_data, test_data, valid_with_neg_sample, test_with_neg_sample]
for file, obj in zip(filenames, objs):
    target = path + file + '.pickle'
    write_pickle(target, obj)

In [None]:
# get adj matrices
# Build the five metapath adjacency matrices from the filtered records.
adjacency_outputs = get_adj_matrix(
    uid2ind, bid2ind, ct_id2ind, ca_id2ind, user_filtered, busi_filtered, reviews
)
adj_UB, adj_UUB, adj_UBUB, adj_UBCaB, adj_UBCiB = adjacency_outputs

In [None]:
print(adj_UB.shape)
print(adj_UBCaB.shape)
# Preview the top-left 100x100 corner. Chained slices ([0:100][0:100]) would
# slice the rows twice and leave every column in place.
print(adj_UBCaB[0:100, 0:100])

In [None]:
# Persist each adjacency matrix under its own name in the adjs directory.
path = '../yelp_dataset/adjs/'
filenames = ['adj_UB', 'adj_UUB', 'adj_UBUB', 'adj_UBCaB', 'adj_UBCiB']
adjs = [adj_UB, adj_UUB, adj_UBUB, adj_UBCaB, adj_UBCiB]
for file, adj in zip(filenames, adjs):
    target = path + file + '.pickle'
    write_pickle(target, adj)

In [None]:
# Persist the id <-> index lookup tables next to the adjacency matrices.
path = '../yelp_dataset/adjs/'
filenames = ['uid2ind', 'bid2ind', 'ct_id2ind', 'ca_id2ind', 'ind2uid', 'ind2bid', 'ind2ct_id', 'ind2ca_id']
maps = [uid2ind, bid2ind, ct_id2ind, ca_id2ind, ind2uid, ind2bid, ind2ct_id, ind2ca_id]
for file, mapping in zip(filenames, maps):
    target = path + file + '.pickle'
    write_pickle(target, mapping)

In [8]:
# Reload the id <-> index lookup tables that were pickled into the adjs directory.
path = '../yelp_dataset/adjs/'
(uid2ind, bid2ind, ct_id2ind, ca_id2ind,
 ind2uid, ind2bid, ind2ct_id, ind2ca_id) = (
    read_pickle(path + pickle_name)
    for pickle_name in (
        'uid2ind.pickle', 'bid2ind.pickle', 'ct_id2ind.pickle', 'ca_id2ind.pickle',
        'ind2uid.pickle', 'ind2bid.pickle', 'ind2ct_id.pickle', 'ind2ca_id.pickle',
    )
)