In [1]:
# Basic import
import os
import sys
import json
import math
import shutil
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Helper functions
def writeProgress(msg, count, total):
    """Write ``msg`` followed by ``count/total`` as a percentage to stdout.

    The line ends with a carriage return instead of a newline, so the next
    call overwrites it. The ratio is formatted with two decimals (``.2%``).
    """
    ratio_text = format(count / total, ".2%")
    sys.stdout.write(msg + ratio_text + "\r")
    sys.stdout.flush()
    
def newPath(path):
    """Make sure the directory ``path`` exists.

    Missing parent directories are created as well, and an already existing
    directory is left untouched, so the call is safe to repeat.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) replaces the isdir-then-mkdir pair, which failed
    # on nested paths and could race with another process creating the directory.
    os.makedirs(path, exist_ok=True)

def read_json(src_path):
    """Load and return the JSON document stored at ``src_path``.

    The file is decoded as UTF-8 (the encoding JSON interchange uses) rather
    than the platform's locale default, so non-ASCII content reads the same
    on every machine.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(src_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)
    return data

def _json_default(obj):
    """Fallback for ``json.dump``: turn NumPy arrays/scalars into plain Python objects."""
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    raise TypeError("Object of type {} is not JSON serializable".format(type(obj).__name__))


def write_json(data,dst_path):
    """Serialize ``data`` as JSON into ``dst_path`` (the file is overwritten).

    NumPy arrays and NumPy scalar values anywhere inside ``data`` are converted
    to lists / Python scalars, because ``json.dump`` cannot serialize them itself.

    Raises:
        TypeError: if ``data`` contains an object that is neither JSON-native
            nor a NumPy array/scalar.
    """
    with open(dst_path, 'w') as outfile:
        json.dump(data, outfile, default=_json_default)

def writeLog(row):
    """Append ``row`` plus a trailing newline to ``log.txt`` in the working directory."""
    with open('log.txt', mode='a') as log_file:
        log_line = row + '\n'
        log_file.write(log_line)

def getErrMsg(e):
    """Format an exception as ``"[<ClassName>] <detail>"``.

    ``<detail>`` is the exception's first positional argument, or an empty
    string when the exception was created without arguments (previously that
    case raised IndexError inside the error handler itself).
    """
    error_class = e.__class__.__name__  # exception type name
    detail = e.args[0] if e.args else ''  # first argument carries the detail text
    errMsg = "[{}] {}".format(error_class, detail)
    return errMsg

In [3]:
# Output locations: pro_sg receives the CSV splits, DMF_data receives the DMF files.
DATA_DIR = '../Baseline/'
pro_dir = os.path.join(DATA_DIR, 'pro_sg')
dmf_dir = os.path.join(DATA_DIR, 'DMF_data')
# exist_ok=True keeps the cell idempotent without a separate existence check
# and fails loudly if a regular file is in the way.
for out_dir in (pro_dir, dmf_dir):
    os.makedirs(out_dir, exist_ok=True)

# Load NumPy arrays

In [4]:
# Read the precomputed arrays from ./npy and print each shape as a sanity check.
all_npy = np.load('./npy/all_2372.npy')
movie_genre = np.load('./npy/movie_genre.npy')
usr_following = np.load('./npy/user_followings.npy')
usr_genre = np.load('./npy/user_genre.npy')

for label, loaded in (
    ('All features:', all_npy),
    ('Movie genre:', movie_genre),
    ('User following:', usr_following),
    ('User genre:', usr_genre),
):
    print(label, loaded.shape)

All features: (165, 2372)
Movie genre: (165, 20)
User following: (1582, 165)
User genre: (1582, 20)


In [5]:
# Dataset size: rows of the follow matrix are users, rows of the genre matrix are movies.
usr_nb = usr_following.shape[0]
movie_nb = movie_genre.shape[0]
print(usr_nb, movie_nb)

# usr_test_amount: how many users are sampled for validation further down;
# movie_test_amount: how many items each test user gets in the DMF split.
usr_test_amount, movie_test_amount = 150, 32
print(usr_test_amount, movie_test_amount)

# Model-side dimensions; ft_dim is the column count of the feature matrix.
latent_dim, embedding_dims = 64, 240
ft_dim = all_npy.shape[1]
print(latent_dim, ft_dim, embedding_dims)

1582 165
150 32
64 2372 240


# UserFollowingRecord

In [6]:
# Collect every (user, movie) index pair whose follow flag equals 1.
# np.nonzero walks the matrix in row-major order, i.e. exactly the order a
# nested "for user / for movie" loop would visit, without the Python-level loop.
followed_rows, followed_cols = np.nonzero(usr_following == 1)
user = followed_rows.tolist()
movie = followed_cols.tolist()

In [7]:
# One row per (user, followed movie) pair.
follow_columns = {'userId': user, 'movieId': movie}
df = pd.DataFrame(follow_columns)
df

Unnamed: 0,userId,movieId
0,0,2
1,0,31
2,0,36
3,0,38
4,0,55
...,...,...
23441,1581,86
23442,1581,91
23443,1581,129
23444,1581,142


In [8]:
# Persist the interaction table without the row index.
record_path = os.path.join(pro_dir, 'UserFollowingRecord.csv')
df.to_csv(record_path, index=False)

In [9]:
# Distinct user ids, in order of first appearance.
unique_uid = df['userId'].unique()
unique_uid.shape

(1582,)

In [10]:
# Distinct movie ids, in order of first appearance.
unique_sid = df['movieId'].unique()
unique_sid.shape

(165,)

# Training & testing split

## Prepare

In [11]:
# Number of movies each user follows (row sums of the user x movie follow matrix).
each_user = np.sum(usr_following, axis=1)

print('Min number of followings:', np.min(each_user))
print('Max number of followings:', np.max(each_user))
# Label corrected: this is the mean number of followings per user, not a follower count.
print('Avg number of followings:', np.mean(each_user))

Min number of followings: 10
Max number of followings: 133
Avg of followers: 14.820480404551201


In [12]:
# How many users have at least `threshold` followings, for thresholds 10, 12, ..., 30.
for threshold in range(10, 31, 2):
    print('Over {}:'.format(threshold), np.sum(each_user >= threshold))

Over 10: 1582
Over 12: 937
Over 14: 613
Over 16: 440
Over 18: 315
Over 20: 229
Over 22: 178
Over 24: 137
Over 26: 106
Over 28: 80
Over 30: 73


In [13]:
random.seed(42)
usr_idx = list(range(len(usr_following)))
print(len(usr_idx))

# Test users: the usr_test_amount users with the most followings
# (uses the configured constant instead of a second hard-coded 150).
# NOTE(review): users tied at the cut-off are ordered by argsort's default sort;
# which of them land in the test set may depend on the NumPy version - confirm if that matters.
test_idx = each_user.argsort()[::-1][:usr_test_amount]
print(len(test_idx), test_idx[:10])

# Sets give constant-time membership tests; list order is still taken from usr_idx.
test_lookup = set(test_idx.tolist())
train_idx = [item for item in usr_idx if item not in test_lookup]
print(len(train_idx), train_idx[:10])

# Validation users: a seeded random sample of the remaining users.
vad_idx = sorted(random.sample(train_idx, usr_test_amount))
print(len(vad_idx), vad_idx[:10])

vad_lookup = set(vad_idx)
train_idx = [item for item in train_idx if item not in vad_lookup]
print(len(train_idx), train_idx[:10])

1582
150 [1036  670  165  417  720  539 1417  731  590  328]
1432 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
150 [6, 13, 23, 54, 57, 64, 68, 92, 97, 100]
1282 [0, 1, 2, 3, 4, 5, 7, 8, 9, 10]


# For VAE & RaCT
## Setup

In [14]:
# (uid, sid) pairs of the training users and of the validation users.
train_uid, train_sid = [], []
vad_uid, vad_sid = [], []

for uid in range(usr_nb):
    # Choose the destination lists for this user; test users are skipped here.
    if uid in train_idx:
        uid_bucket, sid_bucket = train_uid, train_sid
    elif uid in vad_idx:
        uid_bucket, sid_bucket = vad_uid, vad_sid
    else:
        continue

    for sid in range(movie_nb):
        if usr_following[uid][sid] == 1:
            uid_bucket.append(uid)
            sid_bucket.append(sid)

In [15]:
# The uid and sid lists are filled in lockstep, so each pair of lengths should match.
for split_name, uids, sids in (('train', train_uid, train_sid), ('vad', vad_uid, vad_sid)):
    print('The length of {}:'.format(split_name), len(uids), len(sids))

The length of train: 16601 16601
The length of vad: 1933 1933


In [16]:
# Split every test user's followed movies into a held-out part (test_te)
# and a remaining part (test_tr).
random.seed(42)
test_te_uid, test_te_sid = [], []
test_tr_uid, test_tr_sid = [], []

for i in range(usr_nb):
    # reset the per-user buffers
    t_for_train, t_for_test = [], []

    if i not in test_idx:
        continue

    # movies this user follows
    temp_t = [j for j in range(movie_nb) if usr_following[i][j] == 1]

    # hold out ceil(50%) of them, chosen at random
    t_for_test = random.sample(temp_t, math.ceil(0.5*len(temp_t)))
    test_te_uid.extend([i] * len(t_for_test))
    test_te_sid.extend(t_for_test)

    # everything not held out stays on the "tr" side
    t_for_train = [item for item in temp_t if item not in t_for_test]
    test_tr_uid.extend([i] * len(t_for_train))
    test_tr_sid.extend(t_for_train)

In [17]:
# Each uid list must be as long as its matching sid list.
print('The length of test_te:',len(test_te_uid), len(test_te_sid))
# The second value used to repeat len(test_tr_uid), which could never reveal a uid/sid mismatch.
print('The length of test_tr:',len(test_tr_uid), len(test_tr_sid))

The length of test_te: 2495 2495
The length of test_tr: 2417 2417


In [18]:
# Training interactions as a two-column frame.
train_columns = {'uid': train_uid, 'sid': train_sid}
train = pd.DataFrame(train_columns, columns=['uid', 'sid'])
train

Unnamed: 0,uid,sid
0,0,2
1,0,31
2,0,36
3,0,38
4,0,55
...,...,...
16596,1581,86
16597,1581,91
16598,1581,129
16599,1581,142


In [19]:
# Validation interactions as a two-column frame.
vad_columns = {'uid': vad_uid, 'sid': vad_sid}
vad = pd.DataFrame(vad_columns, columns=['uid', 'sid'])
vad

Unnamed: 0,uid,sid
0,6,30
1,6,33
2,6,66
3,6,78
4,6,108
...,...,...
1928,1579,107
1929,1579,118
1930,1579,134
1931,1579,142


In [28]:
# Held-out part of every test user's followings; `test` and `test_plays_te`
# are two names for the same frame.
test_plays_te = pd.DataFrame({'uid': test_te_uid, 'sid': test_te_sid}, columns=['uid', 'sid'])
test = test_plays_te
test

Unnamed: 0,uid,sid
0,30,132
1,30,20
2,30,0
3,30,64
4,30,59
...,...,...
2490,1567,28
2491,1567,66
2492,1567,37
2493,1567,118


In [21]:
# Non-held-out part of every test user's followings.
test_tr_columns = {'uid': test_tr_uid, 'sid': test_tr_sid}
test_plays_tr = pd.DataFrame(test_tr_columns, columns=['uid', 'sid'])
test_plays_tr

Unnamed: 0,uid,sid
0,30,31
1,30,45
2,30,69
3,30,72
4,30,85
...,...,...
2412,1567,104
2413,1567,119
2414,1567,121
2415,1567,127


In [22]:
# All user ids ordered train -> validation -> test; the position in this array
# is what profile2id later uses as the new user id.
per_split_uids = [frame['uid'].unique() for frame in (train, vad, test)]
unique_uid = np.concatenate(per_split_uids, axis=0)
print(unique_uid.shape)
unique_uid

(1582,)


array([   0,    1,    2, ..., 1548, 1561, 1567])

In [23]:
# Distinct movie ids in order of first appearance, written one per line to unique_sid.txt.
unique_sid = df['movieId'].unique()
sid_path = os.path.join(pro_dir, 'unique_sid.txt')
with open(sid_path, 'w') as f:
    f.writelines('%s\n' % sid for sid in unique_sid)
print(unique_sid.shape)
unique_sid

(165,)


array([  2,  31,  36,  38,  55,  63,  96, 111, 120, 136,   3,  30,  41,
        48,  75,  77,  84,  89,  90, 106, 123, 129, 144, 163,  12,  44,
        85, 107, 116, 143, 153,   5,  24,  29,  35,  42, 100, 108, 140,
       152, 161,   9,  19,  28,  68,  80,  99, 112, 119, 137, 141,  25,
        45,  72,  93,  95, 109, 117, 138,  33,  66,  78, 126, 127, 164,
        49, 121, 133,  34,  40,  79,  83, 101,  32,  67,  94, 122, 135,
       150, 156,   0,   4,  10,  86, 104,  73,  98,  21,  74, 102,   1,
        13,  22,  37,  50,  64,  87, 147, 158,   6,  70,  88,  52,  60,
       134,  26,  18, 159, 154,  53,  57,  91, 148,  58, 103,  69, 125,
       132, 145, 131, 124, 146, 118, 157,  39,  81,  92,  14,  20,  59,
       105,  54,  56, 114, 128, 155, 162,  11,  43,  62, 130, 149,  23,
       142,  46, 110,   8,  76,  51,  27,  17, 160,  97, 113,  16,  47,
       115,  15,  61, 151, 139,  71,  82,  65,   7])

In [24]:
# Raw id -> contiguous id, numbered by position in unique_sid / unique_uid.
show2id = {sid: i for i, sid in enumerate(unique_sid)}
profile2id = {pid: i for i, pid in enumerate(unique_uid)}

In [25]:
# The id maps should have exactly as many entries as there are distinct ids.
for item_side, user_side in ((unique_sid, unique_uid), (show2id, profile2id)):
    print(len(item_side), len(user_side))

165 1582
165 1582


In [26]:
def split_train_test_proportion(data, test_prop=0.2):
    """Split each user's rows into a training part and a held-out part.

    Rows are grouped by the 'uid' column. For a user with at least 5 rows,
    ``int(test_prop * n_rows)`` randomly chosen rows go to the held-out frame
    and the rest to the training frame; users with fewer rows are kept
    entirely in the training frame.

    Args:
        data: DataFrame with a 'uid' column.
        test_prop: fraction of each eligible user's rows to hold out.

    Returns:
        ``(data_tr, data_te)``. Either frame may be empty (same columns as
        ``data``) when no rows qualify; previously that case raised
        ValueError inside ``pd.concat``.

    Side effects: reseeds NumPy's global RNG with a fixed seed so the split is
    reproducible, and prints a progress line every 100 users.
    """
    data_grouped_by_user = data.groupby('uid')
    tr_list, te_list = [], []

    np.random.seed(98765)

    for i, (_, group) in enumerate(data_grouped_by_user):
        n_items_u = len(group)

        if n_items_u >= 5:
            # Boolean mask over this user's rows: True marks a held-out row.
            held_out = np.zeros(n_items_u, dtype='bool')
            picked = np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False)
            held_out[picked.astype('int64')] = True

            tr_list.append(group[~held_out])
            te_list.append(group[held_out])
        else:
            tr_list.append(group)

        if i % 100 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # pd.concat rejects an empty list, so fall back to an empty slice of the input.
    data_tr = pd.concat(tr_list) if tr_list else data.iloc[0:0]
    data_te = pd.concat(te_list) if te_list else data.iloc[0:0]

    return data_tr, data_te

In [27]:
# Per-user split of the validation interactions (default 20% held out).
vad_split = split_train_test_proportion(vad)
vad_plays_tr, vad_plays_te = vad_split
print(vad_plays_tr.shape, vad_plays_te.shape)

0 users sampled
100 users sampled
(1590, 2) (343, 2)


In [29]:
# test_plays_tr, test_plays_te = split_train_test_proportion(test)
# print(test_plays_tr.shape, test_plays_te.shape)

In [30]:
def numerize(tp):
    """Translate the 'uid' and 'sid' columns of ``tp`` into contiguous ids.

    Every value is looked up in the module-level ``profile2id`` / ``show2id``
    dictionaries (an unknown value raises KeyError). Returns a new DataFrame
    with the columns 'uid' and 'sid'.
    """
    mapped_uid = [profile2id[raw_uid] for raw_uid in tp['uid']]
    mapped_sid = [show2id[raw_sid] for raw_sid in tp['sid']]
    return pd.DataFrame({'uid': mapped_uid, 'sid': mapped_sid}, columns=['uid', 'sid'])

In [31]:
# Re-index every split with the contiguous ids, then write each one to pro_dir.
train_data = numerize(train)
vad_data_tr = numerize(vad_plays_tr)
vad_data_te = numerize(vad_plays_te)
test_data_tr = numerize(test_plays_tr)
test_data_te = numerize(test_plays_te)

for csv_name, frame in (
    ('train.csv', train_data),
    ('validation_tr.csv', vad_data_tr),
    ('validation_te.csv', vad_data_te),
    ('test_tr.csv', test_data_tr),
    ('test_te.csv', test_data_te),
):
    frame.to_csv(os.path.join(pro_dir, csv_name), index=False)

# For DMF
## Build the full ratings DataFrame

In [32]:
# Interaction lists for DMF: 1-based user/movie ids, a constant score of 1 and
# a 9-character pseudo-timestamp (a random ordering of the digits 0-8) per pair.
user, movie, score, time = [], [], [], []

for uid in range(usr_nb):
    for mid in range(movie_nb):
        if usr_following[uid][mid] != 1:
            continue
        shuffled_digits = random.sample(range(0, 9), 9)
        timestamp = ''.join(map(str, shuffled_digits))
        user.append(uid + 1)
        movie.append(mid + 1)
        score.append(1)
        time.append(timestamp)

In [33]:
# DMF rating table; the generated `time` list is not included.
# Note: this rebinds `df`, which until here held the userId/movieId frame.
rating_columns = {'user': user, 'items': movie, 'ratings': score}
df = pd.DataFrame(rating_columns)
df

Unnamed: 0,user,items,ratings
0,1,3,1
1,1,32,1
2,1,37,1
3,1,39,1
4,1,56,1
...,...,...,...
23441,1582,87,1
23442,1582,92,1
23443,1582,130,1
23444,1582,143,1


In [34]:
# Written with pandas defaults: header row, index column, comma separator.
# NOTE(review): confirm the DMF loader expects this layout - a header-less,
# ':'-separated variant was left commented out here.
ratings_path = os.path.join(dmf_dir, 'myratings.dat')
df.to_csv(ratings_path)

In [31]:
# df = pd.DataFrame(data={'user': user, 'movie': movie, 'score': score, 'time': time})
# #                   columns=['user', 'movie', 'score', 'time'])
# df

Unnamed: 0,user,movie,score,time
0,1,3,1,685713024
1,1,32,1,170436258
2,1,37,1,213856407
3,1,39,1,862530741
4,1,56,1,435076281
...,...,...,...,...
23441,1582,87,1,857341206
23442,1582,92,1,745810236
23443,1582,130,1,748061235
23444,1582,143,1,832015476


In [32]:
# df.to_csv(os.path.join(dmf_dir, 'DMF_data.dat'), header=None, index=None, sep=':')

# Original split

In [33]:
# json cannot serialize a NumPy array or NumPy integers, so store plain Python ints.
# NOTE(review): the recorded output of this cell lists a different, sorted id set than
# the cell that builds test_idx produces - it looks like stale kernel state; re-run to confirm.
test_idx_plain = [int(uid) for uid in test_idx]
write_json(test_idx_plain, os.path.join(dmf_dir, 'test_idx.json'))
print(test_idx)

[13, 51, 54, 61, 65, 88, 93, 96, 114, 130, 135, 142, 146, 161, 163, 178, 186, 189, 191, 198, 206, 209, 224, 228, 255, 283, 285, 292, 313, 318, 326, 327, 333, 334, 350, 393, 407, 429, 432, 435, 440, 447, 449, 451, 457, 466, 469, 476, 501, 505, 514, 538, 541, 542, 546, 548, 552, 563, 569, 592, 600, 644, 646, 664, 689, 696, 704, 727, 735, 740, 741, 747, 758, 775, 777, 778, 781, 788, 810, 817, 821, 859, 864, 865, 877, 919, 928, 939, 940, 946, 958, 1010, 1022, 1034, 1043, 1083, 1093, 1098, 1103, 1116, 1130, 1133, 1140, 1149, 1161, 1182, 1195, 1197, 1206, 1209, 1220, 1221, 1232, 1236, 1247, 1266, 1285, 1287, 1300, 1301, 1309, 1310, 1316, 1327, 1330, 1342, 1354, 1372, 1385, 1393, 1399, 1402, 1409, 1429, 1436, 1437, 1442, 1466, 1470, 1493, 1494, 1508, 1516, 1518, 1525, 1529, 1547, 1554, 1563, 1573]


In [34]:
# Build (user, movie, label) triples for DMF: label 1 = followed, 0 = not followed.
# Non-test users contribute all their triples to training. For test users, half of the
# positives plus enough negatives to reach movie_test_amount items are held out.
random.seed(42)
train_t = []
train_f = []
test_t = []
test_f = []
target = []

# Plain-int set: constant-time lookups whether test_idx is a list or a NumPy array.
test_users = {int(uid) for uid in test_idx}

for i in range(usr_nb):
    # NOTE(review): target_item is filled for test users but never stored in `target`.
    target_item = []
    # per-user buffers
    t_for_train = []
    f_for_train = []
    t_for_test = []
    f_for_test = []

    if i not in test_users:  # non-test user: everything goes to training
        for j in range(movie_nb):
            if usr_following[i][j] == 1:
                t_for_train.append((i, j, 1))
            else:
                f_for_train.append((i, j, 0))

    else:  # test user: hold out part of the positives and some negatives
        temp_t = []
        temp_f = []

        for j in range(movie_nb):
            if usr_following[i][j] == 1:
                temp_t.append((i, j, 1))
                target_item.append(1)
            else:
                temp_f.append((i, j, 0))
                target_item.append(0)

        # hold out ceil(50%) of the positives
        t_for_test = random.sample(temp_t, math.ceil(0.5*len(temp_t)))
        # Fill up with negatives to movie_test_amount items. The count is clamped because a
        # user with more than 2*movie_test_amount followings would otherwise request a
        # negative sample size, which makes random.sample raise ValueError.
        n_neg_test = min(len(temp_f), max(0, movie_test_amount - len(t_for_test)))
        f_for_test = random.sample(temp_f, n_neg_test)

        test_t.extend(t_for_test)
        test_f.extend(f_for_test)

        # the others go to training
        held_out_t = set(t_for_test)
        held_out_f = set(f_for_test)
        t_for_train = [item for item in temp_t if item not in held_out_t]
        f_for_train = [item for item in temp_f if item not in held_out_f]

    train_t.extend(t_for_train)
    train_f.extend(f_for_train)

    # Sanity check: every movie must land in exactly one of the four lists.
    if (len(t_for_train) + len(f_for_train) + len(t_for_test) + len(f_for_test)) != movie_nb:
        print('Error!!!')
        break

In [35]:
# Sizes of the four triple lists.
for list_name, triples in (
    ('train_t', train_t),
    ('train_f', train_f),
    ('test_t', test_t),
    ('test_f', test_f),
):
    print('The length of {}:'.format(list_name), len(triples))

The length of train_t: 22368
The length of train_f: 233862
The length of test_t: 1078
The length of test_f: 3722


In [36]:
# Only the positive triples are exported; train_f / test_f are not written.
# Note: this rebinds `train` and `test`, which earlier held DataFrames.
train, test = train_t, test_t
test_json_path = os.path.join(dmf_dir, 'test.json')
train_json_path = os.path.join(dmf_dir, 'train.json')
write_json(test, test_json_path)
write_json(train, train_json_path)
print(len(test), len(train))

1078 22368
