In [1]:
# Basic import
import os
import sys
import json
import math
import shutil
import random
import pandas as pd
import numpy as np

In [2]:
# Helper function
def writeProgress(msg, count, total):
    """Write an in-place progress line to stdout.

    Emits ``msg`` followed by ``count/total`` formatted as a percentage with
    two decimals, terminated by a carriage return so the next call overwrites
    the same console line.

    Args:
        msg: Text prefix printed before the percentage.
        count: Number of items processed so far.
        total: Total number of items. When it is 0 the ratio is reported as
            0.00% instead of raising ZeroDivisionError.
    """
    ratio = count / total if total else 0.0
    sys.stdout.write(msg + "{:.2%}\r".format(ratio))
    sys.stdout.flush()
    
def newPath(path):
    """Ensure that directory ``path`` exists.

    Missing parent directories are created as well, and an already existing
    directory is left untouched. Using ``exist_ok=True`` avoids the race
    between checking for the directory and creating it.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

def read_json(src_path):
    """Load and return the JSON document stored at ``src_path``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.

    Args:
        src_path: Path of the JSON file to read.

    Returns:
        The decoded Python object (dict, list, ...).
    """
    with open(src_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def write_json(data,dst_path):
    """Serialize ``data`` as JSON into ``dst_path``, replacing any existing file.

    Args:
        data: JSON-serializable object.
        dst_path: Destination file path (opened in write mode).
    """
    with open(dst_path, mode='w') as sink:
        json.dump(obj=data, fp=sink)

def writeLog(row):
    """Append ``row`` followed by a newline to ``log.txt`` in the working directory.

    Args:
        row: Text of the log entry (a string).
    """
    with open('log.txt', mode='a') as log_file:
        log_file.writelines((row + '\n',))

def getErrMsg(e):
    """Format an exception as ``"[ClassName] detail"``.

    Args:
        e: The exception instance to describe.

    Returns:
        A string made of the exception class name in brackets followed by the
        first element of ``e.args``. Exceptions created without arguments fall
        back to ``str(e)`` instead of raising IndexError.
    """
    error_class = e.__class__.__name__  # exception type name
    detail = e.args[0] if e.args else str(e)  # detailed content
    errMsg = "[{}] {}".format(error_class, detail)
    return errMsg

In [3]:
# Output locations: one folder for the VAE/RaCT files, one for the DMF files.
DATA_DIR = '../Baseline/'
pro_dir = os.path.join(DATA_DIR, '_pro_sg_colduser')
dmf_dir = os.path.join(DATA_DIR, '_DMF_data_colduser')

# Create whichever output folder is not there yet.
for out_dir in (pro_dir, dmf_dir):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

# Load NumPy arrays

In [4]:
# Read the four precomputed matrices from ./npy, then report their shapes.
all_npy = np.load('./npy/all_2372.npy')
movie_genre = np.load('./npy/movie_genre.npy')
usr_following = np.load('./npy/user_followings.npy')
usr_genre = np.load('./npy/user_genre.npy')

for label, matrix in (
    ('All features:', all_npy),
    ('Movie genre:', movie_genre),
    ('User following:', usr_following),
    ('User genre:', usr_genre),
):
    print(label, matrix.shape)

All features: (165, 2372)
Movie genre: (165, 20)
User following: (1582, 165)
User genre: (1582, 20)


In [5]:
# Dataset sizes are read off the first axis of the loaded matrices.
usr_nb = usr_following.shape[0]  # the number of users
movie_nb = movie_genre.shape[0]  # the number of movies
print(usr_nb, movie_nb)

# usr_test_amount = 150
# movie_test_amount = 32
# print(usr_test_amount, movie_test_amount)

# Model dimensions; ft_dim follows the column count of all_npy.
latent_dim, embedding_dims = 64, 240  # latent dims, embedding dims
ft_dim = all_npy.shape[1]  # feature dims
print(latent_dim, ft_dim, embedding_dims)

1582 165
64 2372 240


# Training & testing split

## Prepare

In [6]:
# Number of followed movies per user (row sums of usr_following).
each_user = usr_following.sum(axis=1)
# print(each_user)

print('Min number of followings:', each_user.min())
print('Max number of followings:', each_user.max())
print('Avg of followers:', each_user.mean())

Min number of followings: 10
Max number of followings: 133
Avg of followers: 14.820480404551201


In [7]:
# Users whose follow count is at most 15: how many there are, and their indices.
few_followings = each_user <= 15
print('<= 15:', np.sum(few_followings))
less_idx = np.nonzero(few_followings)[0]
print(less_idx.shape, less_idx)

<= 15: 1142
(1142,) [   0    1    2 ... 1578 1580 1581]


In [8]:
# Zero the count of every user that is NOT listed in less_idx, so those users
# drop to the bottom of the ranking computed below. A boolean mask replaces
# the per-user Python loop with its linear `i in less_idx` scan.
# NOTE: this mutates each_user in place; re-running the cell that derives
# less_idx afterwards will give a different less_idx.
keep_mask = np.zeros(usr_nb, dtype=bool)
keep_mask[less_idx] = True
each_user[~keep_mask] = 0

In [9]:
# Same summary as before, now computed on the masked counts.
print('Min number of followings:', each_user.min())
print('Max number of followings:', each_user.max())
print('Avg of followers:', each_user.mean())

Min number of followings: 0
Max number of followings: 15
Avg of followers: 8.36283185840708


In [10]:
# Preview: indices of the 100 largest entries of each_user, largest first.
descending_order = each_user.argsort()[::-1]
descending_order[:100]

array([ 326, 1448,  927,  926,  139,  895,  529,  117, 1403,  848,  545,
         99,  825,  824, 1436,  804,  953,  800,  570, 1472, 1473, 1478,
        606, 1486,   53, 1495, 1498, 1503,   43,   39, 1345, 1326, 1514,
       1043, 1155,  307, 1145,  314,  268, 1125, 1105,  357, 1223,  231,
       1235,  400, 1240, 1034,  464,  211,  408,  195,  428,  431, 1291,
        439,  981, 1303, 1311,  967,  460,  959,  730, 1156,  688,  676,
         13,  645,  679,  654,  633, 1571,  524,  675,   92,  822, 1225,
        873, 1290,  235, 1327,  561,  815, 1537,  188,  312,   81,  156,
         74, 1333,  423, 1451,  708, 1174, 1005,   93, 1382,  183, 1181,
       1553])

In [11]:
# Select the 100 users with the largest remaining counts.
# NOTE(review): the order among equal counts comes from the default argsort
# and is not guaranteed to be stable - confirm if exact membership matters.
ranking = each_user.argsort()[::-1]
usr_idx = list(ranking[:100])
print(usr_idx)

[326, 1448, 927, 926, 139, 895, 529, 117, 1403, 848, 545, 99, 825, 824, 1436, 804, 953, 800, 570, 1472, 1473, 1478, 606, 1486, 53, 1495, 1498, 1503, 43, 39, 1345, 1326, 1514, 1043, 1155, 307, 1145, 314, 268, 1125, 1105, 357, 1223, 231, 1235, 400, 1240, 1034, 464, 211, 408, 195, 428, 431, 1291, 439, 981, 1303, 1311, 967, 460, 959, 730, 1156, 688, 676, 13, 645, 679, 654, 633, 1571, 524, 675, 92, 822, 1225, 873, 1290, 235, 1327, 561, 815, 1537, 188, 312, 81, 156, 74, 1333, 423, 1451, 708, 1174, 1005, 93, 1382, 183, 1181, 1553]


In [12]:
# The first 80 selected users form the training set, the rest validation.
train_idx, vad_idx = sorted(usr_idx[:80]), sorted(usr_idx[80:])

for subset in (train_idx, vad_idx):
    print(len(subset), subset)

80 [13, 39, 43, 53, 92, 99, 117, 139, 195, 211, 231, 235, 268, 307, 314, 326, 357, 400, 408, 428, 431, 439, 460, 464, 524, 529, 545, 570, 606, 633, 645, 654, 675, 676, 679, 688, 730, 800, 804, 822, 824, 825, 848, 873, 895, 926, 927, 953, 959, 967, 981, 1034, 1043, 1105, 1125, 1145, 1155, 1156, 1223, 1225, 1235, 1240, 1290, 1291, 1303, 1311, 1326, 1345, 1403, 1436, 1448, 1472, 1473, 1478, 1486, 1495, 1498, 1503, 1514, 1571]
20 [74, 81, 93, 156, 183, 188, 312, 423, 561, 708, 815, 1005, 1174, 1181, 1327, 1333, 1382, 1451, 1537, 1553]


In [13]:
# The test set covers every selected user, in ascending index order.
test_idx = list(usr_idx)
test_idx.sort()
print(len(test_idx), test_idx)

100 [13, 39, 43, 53, 74, 81, 92, 93, 99, 117, 139, 156, 183, 188, 195, 211, 231, 235, 268, 307, 312, 314, 326, 357, 400, 408, 423, 428, 431, 439, 460, 464, 524, 529, 545, 561, 570, 606, 633, 645, 654, 675, 676, 679, 688, 708, 730, 800, 804, 815, 822, 824, 825, 848, 873, 895, 926, 927, 953, 959, 967, 981, 1005, 1034, 1043, 1105, 1125, 1145, 1155, 1156, 1174, 1181, 1223, 1225, 1235, 1240, 1290, 1291, 1303, 1311, 1326, 1327, 1333, 1345, 1382, 1403, 1436, 1448, 1451, 1472, 1473, 1478, 1486, 1495, 1498, 1503, 1514, 1537, 1553, 1571]


In [14]:
# From here on usr_nb counts only the selected users (it is rebound here).
usr_test_amount = len(test_idx)
usr_nb = usr_test_amount
movie_test_amount = 32 #math.floor(len(less_idx)*0.5)
print(usr_nb, usr_test_amount)
print(movie_nb, movie_test_amount)

100 100
165 32


# For VAE & RaCT
## Build UserFollowingRecord.csv

In [None]:
# Collect one (userId, movieId) pair per followed movie. userId is the user's
# position inside test_idx, movieId is the column index in usr_following.
# enumerate() yields that position directly instead of searching test_idx with
# .index() for every pair (test_idx holds distinct indices, so both agree).
user = []
movie = []

for new_uid, orig_uid in enumerate(test_idx):
    followed = [j for j in range(movie_nb) if usr_following[orig_uid][j] == 1]
    user.extend([new_uid] * len(followed))
    movie.extend(followed)

In [None]:
# Long-format table of the collected (userId, movieId) pairs.
record_columns = {'userId': user, 'movieId': movie}
df = pd.DataFrame(record_columns)
df

In [None]:
# Persist the pairs without the row index.
record_path = os.path.join(pro_dir, 'UserFollowingRecord.csv')
df.to_csv(record_path, index=False)

In [None]:
# Distinct user ids present in the record table.
unique_uid = df['userId'].unique()
unique_uid.shape

In [None]:
# Distinct movie ids present in the record table (reused further down).
unique_sid = df['movieId'].unique()
unique_sid.shape

## Setup 

In [None]:
# Per-user split of followed movies: ceil(50%) are held out as test targets,
# the remainder goes to the train or validation pool depending on the user.
random.seed(42)
test_te_uid, test_te_sid = [], []
train_uid, train_sid = [], []
vad_uid, vad_sid = [], []

for i in test_idx:
    # Column indices of the movies user i follows.
    temp_t = [j for j in range(movie_nb) if usr_following[i][j] == 1]

    # Held-out half for this user.
    t_for_test = random.sample(temp_t, math.ceil(0.5*len(temp_t)))
    test_te_uid.extend([i] * len(t_for_test))
    test_te_sid.extend(t_for_test)

    # Everything not held out is routed by the user's group.
    t_for_train = [item for item in temp_t if item not in t_for_test]
    if i in train_idx:
        train_uid.extend([i] * len(t_for_train))
        train_sid.extend(t_for_train)
    elif i in vad_idx:
        vad_uid.extend([i] * len(t_for_train))
        vad_sid.extend(t_for_train)
    else:
        # User belongs to neither group: report once per remaining item.
        for _ in t_for_train:
            print('Error!!!')

In [None]:
# Sanity check: uid and sid lists of each split must have equal lengths.
for label, uids, sids in (
    ('The length of test_te:', test_te_uid, test_te_sid),
    ('The length of train:', train_uid, train_sid),
    ('The length of vad:', vad_uid, vad_sid),
):
    print(label, len(uids), len(sids))

# Build one-to-one (uid, sid) DataFrames

In [None]:
# One row per (user, movie) pair kept for training.
train_pairs = {'uid': train_uid, 'sid': train_sid}
train = pd.DataFrame(train_pairs, columns=['uid', 'sid'])
train

In [None]:
# One row per (user, movie) pair kept for validation.
vad_pairs = {'uid': vad_uid, 'sid': vad_sid}
vad = pd.DataFrame(vad_pairs, columns=['uid', 'sid'])
vad

In [None]:
# Draw 20% of the training rows (fixed seed) to serve as the test fold-in set.
test_plays_tr = train.sample(random_state=42, frac=0.2)
test_plays_tr

In [None]:
# Remove the sampled rows from train.
# NOTE: train is rebound here, so this cell cannot be re-run on its own
# without rebuilding train first.
train = train.drop(index=test_plays_tr.index)
train

In [None]:
# Held-out (user, movie) pairs; test and test_plays_te name the same frame.
test_plays_te = pd.DataFrame({'uid': test_te_uid, 'sid': test_te_sid}, columns=['uid', 'sid'])
test = test_plays_te
test_plays_te

In [None]:
# All user ids that must receive a numeric profile id.
# train has already lost the rows sampled into test_plays_tr, so a user whose
# remaining rows were all sampled out would be absent from train/vad alone and
# later fail the profile2id lookup. Scanning the test frames as well covers
# that case; pd.unique keeps first-appearance order, so the train users still
# come first, then the validation users, then any user seen only in test.
unique_uid = pd.unique(np.concatenate((
    train['uid'].to_numpy(),
    vad['uid'].to_numpy(),
    test_plays_tr['uid'].to_numpy(),
    test_plays_te['uid'].to_numpy(),
), axis=0))
print(unique_uid.shape)
unique_uid

In [None]:
# unique_sid comes from the record table built earlier; write one id per line.
sid_path = os.path.join(pro_dir, 'unique_sid.txt')
with open(sid_path, 'w') as f:
    f.writelines('%s\n' % sid for sid in unique_sid)
print(unique_sid.shape)
unique_sid

In [None]:
# Map raw movie / user ids to consecutive integers in order of appearance.
show2id = {sid: position for position, sid in enumerate(unique_sid)}
profile2id = {pid: position for position, pid in enumerate(unique_uid)}

In [None]:
# Each mapping should be as large as the id list it was built from.
print(*(len(ids) for ids in (unique_sid, unique_uid)))
print(*(len(mapping) for mapping in (show2id, profile2id)))

In [None]:
def split_train_test_proportion(data, test_prop=0.2, lower_bound=5):
    """Split each user's rows into a train part and a test part.

    Rows are grouped by the ``uid`` column. For every user with at least
    ``lower_bound`` rows, ``int(test_prop * n_rows)`` rows are picked at random
    (without replacement) for the test part and the rest go to the train part.
    Users with fewer rows go entirely to the train part.

    The global NumPy random generator is re-seeded with 98765 on every call,
    so the split is reproducible.

    Args:
        data: DataFrame with a ``uid`` column.
        test_prop: Fraction of an eligible user's rows placed in the test part.
        lower_bound: Minimum number of rows a user needs to be split at all.

    Returns:
        ``(data_tr, data_te)`` DataFrames. When no rows land in a part (for
        example no user reaches ``lower_bound``, or ``data`` is empty) that
        part is an empty frame with the columns of ``data`` instead of
        ``pd.concat`` raising ValueError on an empty list.
    """
    data_grouped_by_user = data.groupby('uid')
    tr_list, te_list = [], []

    np.random.seed(98765)

    for i, (_, group) in enumerate(data_grouped_by_user):
        n_items_u = len(group)

        if n_items_u >= lower_bound:
            # Boolean mask marking the rows drawn for the test part.
            idx = np.zeros(n_items_u, dtype='bool')
            picked = np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False)
            idx[picked] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if i % 100 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # Zero-row frame with the input's columns, used when a part has no pieces.
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

In [None]:
# Split each validation user's rows in half (users need at least 2 rows).
vad_split = split_train_test_proportion(vad, lower_bound=2, test_prop=0.5)
vad_plays_tr, vad_plays_te = vad_split
print(vad_plays_tr.shape, vad_plays_te.shape)

In [None]:
# test_plays_tr, test_plays_te = split_train_test_proportion(test)
# print(test_plays_tr.shape, test_plays_te.shape)

In [None]:
def numerize(tp):
    """Translate raw ids in ``tp`` to the consecutive ids of the lookup tables.

    Args:
        tp: DataFrame with ``uid`` and ``sid`` columns holding raw ids.

    Returns:
        A new DataFrame with columns ``uid`` and ``sid`` where every value has
        been replaced through the notebook-level ``profile2id`` and ``show2id``
        dictionaries. An id missing from those dictionaries raises KeyError.
    """
    uid = [profile2id[raw_uid] for raw_uid in tp['uid']]
    sid = [show2id[raw_sid] for raw_sid in tp['sid']]
    return pd.DataFrame({'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

In [None]:
def _numerize_and_save(tp, filename):
    """Numerize ``tp``, write it to ``pro_dir/filename`` without the index, and return it."""
    numbered = numerize(tp)
    numbered.to_csv(os.path.join(pro_dir, filename), index=False)
    return numbered


# Export every split in turn.
train_data = _numerize_and_save(train, 'train.csv')
vad_data_tr = _numerize_and_save(vad_plays_tr, 'validation_tr.csv')
vad_data_te = _numerize_and_save(vad_plays_te, 'validation_te.csv')
test_data_tr = _numerize_and_save(test_plays_tr, 'test_tr.csv')
test_data_te = _numerize_and_save(test_plays_te, 'test_te.csv')

# For DMF
## Chilli for all data

In [15]:
# Keep only the rows of the selected users.
# NOTE: usr_following is rebound to the smaller matrix, so this cell must run
# exactly once after loading; a second run would index the reduced matrix
# with the original row numbers.
usr_following = usr_following[test_idx]
print(usr_following.shape)
print(usr_nb, usr_test_amount)
print(movie_nb, movie_test_amount)

(100, 165)
100 100
165 32


In [16]:
# 1-based (user, movie) pairs for every followed movie, each with rating 1.
followed_pairs = [
    (i + 1, j + 1)
    for i in range(usr_nb)
    for j in range(movie_nb)
    if usr_following[i][j] == 1
]
user = [uid for uid, _ in followed_pairs]
movie = [mid for _, mid in followed_pairs]
score = [1] * len(followed_pairs)
time = []  # never filled in this cell
# less_idx = list(less_idx)

In [17]:
# Ratings table with columns user / items / ratings.
rating_columns = {'user': user, 'items': movie, 'ratings': score}
df = pd.DataFrame(rating_columns)
df

Unnamed: 0,user,items,ratings
0,1,2,1
1,1,13,1
2,1,14,1
3,1,22,1
4,1,23,1
...,...,...,...
1467,100,105,1
1468,100,122,1
1469,100,129,1
1470,100,155,1


In [18]:
# Number of distinct users and distinct items in the ratings table.
print(df['user'].unique().shape, df['items'].unique().shape)

(100,) (159,)


In [19]:
# Write the ratings with neither a header row nor an index column.
ratings_path = os.path.join(dmf_dir, 'myratings.dat')
df.to_csv(ratings_path, header=None, index=None)

# Original split
* train: <class 'list'> 21864 [(0, 2, 1.0), (0, 31, 1.0), (0, 36, 1.0), (0, 38, 1.0), (0, 55, 1.0), (0, 63, 1.0), (0, 96, 1.0), (0, 111, 1.0), (0, 120, 1.0), (1, 3, 1.0)]
* test: <class 'list'> 1582 [(0, 136, 1.0), (1, 163, 1.0), (2, 153, 1.0), (3, 161, 1.0), (4, 141, 1.0), (5, 161, 1.0), (6, 164, 1.0), (7, 164, 1.0), (8, 161, 1.0), (9, 156, 1.0)]

In [27]:
# 0-based column indices of the movies that occur in the ratings table, in
# order of first appearance.
# NOTE: less_idx and movie_nb are rebound here; earlier in the notebook
# less_idx held user indices.
less_idx = [item_id - 1 for item_id in df['items'].unique()]
print(less_idx)
movie_nb = len(less_idx)
print(movie_nb)

[1, 12, 13, 21, 22, 37, 50, 64, 66, 87, 93, 99, 141, 147, 158, 0, 11, 30, 40, 42, 49, 57, 101, 102, 106, 123, 126, 128, 138, 144, 19, 23, 45, 59, 69, 72, 81, 88, 111, 120, 133, 137, 146, 8, 25, 39, 70, 80, 95, 98, 156, 53, 55, 76, 103, 105, 124, 125, 154, 161, 28, 97, 112, 140, 34, 44, 58, 129, 150, 79, 90, 132, 152, 163, 4, 9, 31, 60, 62, 148, 10, 77, 96, 131, 149, 68, 122, 134, 164, 20, 24, 116, 119, 41, 52, 78, 92, 127, 2, 14, 91, 135, 29, 67, 86, 145, 84, 75, 159, 63, 136, 89, 3, 26, 139, 32, 33, 35, 71, 153, 108, 118, 142, 47, 117, 36, 114, 46, 61, 82, 104, 107, 83, 100, 18, 143, 74, 5, 17, 109, 155, 48, 85, 157, 15, 43, 151, 121, 51, 94, 16, 6, 38, 73, 110, 130, 115, 54, 56]
159


In [25]:
# Keep only the movie columns listed in less_idx (shape shown before/after).
# NOTE: usr_following is rebound, so this cell must run once, and only after
# less_idx has been rebuilt from the ratings table.
print(usr_following.shape)
kept_columns = less_idx
usr_following = usr_following[:, kept_columns]
print(usr_following.shape)

(100, 165)
(100, 159)


In [28]:
# Per-user DMF split into (user, movie, label) triples.
# Test: ceil(50%) of the followed movies plus enough non-followed ones to
# reach movie_test_amount entries per user. Train: everything else.
random.seed(42)
train_t, train_f = [], []
test_t, test_f = [], []

for i in range(usr_nb):
    # Partition this user's movies into followed (label 1) and not (label 0).
    temp_t, temp_f = [], []
    for j in range(movie_nb):
        if usr_following[i][j] == 1:
            temp_t.append((i, j, 1))
        else:
            temp_f.append((i, j, 0))

    # Positives are sampled first, then negatives, to keep the random stream.
    t_for_test = random.sample(temp_t, math.ceil(0.5*len(temp_t)))
    f_for_test = random.sample(temp_f, movie_test_amount-len(t_for_test))
    test_t += t_for_test
    test_f += f_for_test

    # Whatever was not sampled stays in train, in the original movie order.
    held_out_t, held_out_f = set(t_for_test), set(f_for_test)
    t_for_train = [item for item in temp_t if item not in held_out_t]
    f_for_train = [item for item in temp_f if item not in held_out_f]
    train_t += t_for_train
    train_f += f_for_train

    # Every movie of this user must land in exactly one of the four lists.
    assigned = len(t_for_train) + len(f_for_train) + len(t_for_test) + len(f_for_test)
    if assigned != len(less_idx):
        print('Error!!!')
        break

In [29]:
# Report the size of each of the four triple lists.
for label, triples in (
    ('The length of train_t:', train_t),
    ('The length of train_f:', train_f),
    ('The length of test_t:', test_t),
    ('The length of test_f:', test_f),
):
    print(label, len(triples))

The length of train_t: 700
The length of train_f: 12000
The length of test_t: 772
The length of test_f: 2428


In [30]:
# Store each triple list as its own JSON file inside dmf_dir.
for triples, filename in (
    (train_t, 'train_t.json'),
    (train_f, 'train_f.json'),
    (test_t, 'test_t.json'),
    (test_f, 'test_f.json'),
):
    write_json(triples, os.path.join(dmf_dir, filename))