In [1]:
# Basic import
import os
import sys
import json
import math
import shutil
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Helper function
def writeProgress(msg, count, total):
    """Write an in-place progress line to stdout.

    The line is ``msg`` followed by ``count/total`` as a percentage with two
    decimals and a trailing carriage return, so the next call overwrites it.
    """
    percent = "{:.2%}\r".format(count / total)
    sys.stdout.write(msg + percent)
    sys.stdout.flush()
    
def newPath(path):
    """Create directory ``path`` if it does not already exist.

    Missing parent directories are created as well, and calling this on an
    existing directory is a no-op. ``FileExistsError`` is still raised when
    ``path`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(path, exist_ok=True)

def read_json(src_path):
    """Parse the JSON file at ``src_path`` and return the decoded object."""
    with open(src_path, 'r') as handle:
        return json.load(handle)

def write_json(data, dst_path):
    """Serialize ``data`` as JSON into ``dst_path``, overwriting any existing file."""
    with open(dst_path, 'w') as handle:
        json.dump(data, fp=handle)

def writeLog(row):
    """Append ``row`` plus a newline to ``log.txt`` in the current working directory."""
    line = row + '\n'
    with open('log.txt', 'a') as log_file:
        log_file.write(line)

def getErrMsg(e):
    """Format an exception as ``"[ClassName] detail"``.

    ``detail`` is the first positional argument of the exception, or an empty
    string when the exception was raised without arguments (previously that
    case raised ``IndexError`` from inside the error handler).
    """
    error_class = e.__class__.__name__  # exception type name
    detail = e.args[0] if e.args else ''  # first argument carries the message
    return "[{}] {}".format(error_class, detail)

In [3]:
# Output locations: pro_dir receives the VAE/RaCT csv files, dmf_dir the DMF files.
DATA_DIR = './'
pro_dir = os.path.join(DATA_DIR, 'pro_sg_colduser')
dmf_dir = os.path.join(DATA_DIR, 'DMF_data_colduser')

# Create each output directory only when nothing exists at that path yet.
for out_dir in (pro_dir, dmf_dir):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

# Load numpy arrays

In [4]:
# Load the pre-computed arrays from ./npy.
all_npy = np.load('./npy/all_2372.npy')
movie_genre = np.load('./npy/movie_genre.npy')
usr_following = np.load('./npy/user_followings.npy')  # indexed [user][movie] in later cells
usr_genre = np.load('./npy/user_genre.npy')

# Report the shape of every array that was loaded.
for label, loaded in (
    ('All features:', all_npy),
    ('Movie genre:', movie_genre),
    ('User following:', usr_following),
    ('User genre:', usr_genre),
):
    print(label, loaded.shape)

All features: (165, 2372)
Movie genre: (165, 20)
User following: (1582, 165)
User genre: (1582, 20)


In [5]:
# Dataset sizes: one row of usr_following per user, one row of movie_genre per movie.
usr_nb, movie_nb = len(usr_following), len(movie_genre)
print(usr_nb, movie_nb)

# Model dimensions.
latent_dim = 64                # latent dims
ft_dim = all_npy.shape[1]      # feature dims (second axis of all_npy)
embedding_dims = 240
print(latent_dim, ft_dim, embedding_dims)

1582 165
64 2372 240


# Training & testing split

## Prepare

In [6]:
# Number of movies each user follows: row sums of the user x movie matrix.
each_user = np.sum(usr_following, axis=1)

print('Min number of followings:', np.min(each_user))
print('Max number of followings:', np.max(each_user))
# each_user counts followings per user, so the mean is labelled as followings
# (not followers) to agree with the min/max lines.
print('Avg number of followings:', np.mean(each_user))

# Per-user following counts sorted ascending and descending.
asc = np.sort(each_user)
desc = np.flip(asc)
# print(desc)

Min number of followings: 10
Max number of followings: 133
Avg of followers: 14.820480404551201


### User

In [7]:
# Row indices into usr_following of the 100 selected users, in ascending order.
# NOTE(review): presumably the "cold" users the output directory names refer to — confirm.
test_idx = [
    13, 61, 64, 72, 77, 109, 115, 118, 142, 162,
    173, 180, 185, 201, 203, 226, 235, 238, 242, 256,
    267, 270, 296, 301, 342, 386, 389, 402, 428, 434,
    445, 446, 454, 456, 479, 543, 558, 593, 597, 603,
    609, 621, 625, 627, 634, 646, 649, 658, 693, 700,
    750, 753, 754, 760, 762, 768, 787, 798, 825, 835,
    900, 902, 923, 958, 966, 979, 1014, 1023, 1029, 1030,
    1037, 1048, 1072, 1074, 1076, 1079, 1091, 1123, 1133, 1137,
    1196, 1202, 1203, 1219, 1277, 1288, 1302, 1303, 1311, 1327,
    1399, 1413, 1437, 1449, 1510, 1520, 1526, 1544, 1565, 1580,
]
print(len(test_idx), test_idx[:10])

100 [13, 61, 64, 72, 77, 109, 115, 118, 142, 162]


In [8]:
# Number of selected users, and the per-user test-set size used by the "Original split" section.
usr_test_amount, movie_test_amount = len(test_idx), 32
print(usr_test_amount, movie_test_amount)

100 32


In [12]:
# Flatten the selected users' followings into parallel (user, movie) lists.
# Users are re-numbered 0..len(test_idx)-1 by their position in test_idx.
user = []
movie = []

# enumerate() yields the position directly instead of re-scanning test_idx
# with .index() for every followed movie.
for new_uid, i in enumerate(test_idx):
    for j in range(movie_nb):
        if usr_following[i][j] == 1:
            user.append(new_uid)
            movie.append(j)

In [13]:
# One row per (re-numbered user, followed movie) pair.
df = pd.DataFrame({'userId': user, 'movieId': movie})
df

Unnamed: 0,userId,movieId
0,0,1
1,0,12
2,0,13
3,0,21
4,0,22
...,...,...
1133,99,84
1134,99,85
1135,99,86
1136,99,120


In [14]:
# Save the following records without the DataFrame index column.
df.to_csv(os.path.join(pro_dir, 'UserFollowingRecord.csv'), index=False)

In [15]:
# Distinct movie ids followed by at least one selected user, in order of first appearance.
unique_sid = df['movieId'].unique()
unique_sid.shape

(152,)

In [10]:
# Seed Python's `random` module so the random.sample() draws in the following cells are repeatable.
random.seed(42)

In [11]:
# `usr_idx` was never defined in this notebook (NameError on a fresh kernel).
# The recorded outputs show the train/validation users are all drawn from
# test_idx, so the pool is rebuilt from it here.
# NOTE(review): confirm usr_idx had the same ordering as test_idx, since
# random.sample depends on the order of the population.
usr_idx = list(test_idx)

# Half of the selected users form the training pool; the validation users are
# taken out of this pool in the next cells.
train_idx = sorted(random.sample(usr_idx, int(usr_test_amount*0.5)))
print(len(train_idx), train_idx[:10])

50 [13, 64, 72, 77, 109, 115, 180, 201, 203, 235]


In [12]:
# Hold out 20% of the training pool as validation users.
n_vad_users = int(len(train_idx)*0.2)
vad_idx = random.sample(train_idx, n_vad_users)
vad_idx.sort()
print(len(vad_idx), vad_idx[:10])

10 [115, 201, 434, 649, 753, 1048, 1079, 1123, 1203, 1277]


In [13]:
# Remove the validation users from the training pool (order is preserved).
held_out_users = set(vad_idx)
train_idx = [uid for uid in train_idx if uid not in held_out_users]
print(len(train_idx), train_idx[:10])

40 [13, 64, 72, 77, 109, 180, 203, 235, 238, 256]


### Splitting

# For VAE & RaCT
## Setup

In [17]:
# init
train_uid = []
train_sid = []
vad_uid = []
vad_sid = []
test_uid = []
test_sid = []

for i in range(usr_nb):
    if i in train_idx:
        for j in range(movie_nb):
            if usr_following[i][j] == 1:
                train_uid.append(i)
                train_sid.append(j)
                
    elif i in vad_idx:
        for j in range(movie_nb):
            if usr_following[i][j] == 1:
                vad_uid.append(i)
                vad_sid.append(j)
                
    elif i in test_idx:
        for j in range(movie_nb):
            if usr_following[i][j] == 1:
                test_uid.append(i)
                test_sid.append(j)

In [18]:
# Each split's uid and sid lists should have equal length.
for uids, sids in ((train_uid, train_sid), (vad_uid, vad_sid), (test_uid, test_sid)):
    print(len(uids), len(sids))

468 468
115 115
555 555


In [19]:
# Training interactions as a two-column frame of raw user/movie indices.
train = pd.DataFrame({'uid': train_uid, 'sid': train_sid}, columns=['uid', 'sid'])
train

Unnamed: 0,uid,sid
0,13,1
1,13,12
2,13,13
3,13,21
4,13,22
...,...,...
463,1580,84
464,1580,85
465,1580,86
466,1580,120


In [20]:
# Validation interactions as a two-column frame of raw user/movie indices.
vad = pd.DataFrame({'uid': vad_uid, 'sid': vad_sid}, columns=['uid', 'sid'])
vad

Unnamed: 0,uid,sid
0,115,0
1,115,21
2,115,28
3,115,45
4,115,58
...,...,...
110,1277,102
111,1277,118
112,1277,129
113,1277,134


In [21]:
# Test interactions as a two-column frame of raw user/movie indices.
test = pd.DataFrame({'uid': test_uid, 'sid': test_sid}, columns=['uid', 'sid'])
test

Unnamed: 0,uid,sid
0,61,9
1,61,24
2,61,25
3,61,55
4,61,98
...,...,...
550,1565,46
551,1565,57
552,1565,85
553,1565,102


In [35]:
# Distinct raw user ids, ordered train -> validation -> test; this order
# defines the contiguous ids assigned by profile2id.
unique_uid = np.concatenate([split['uid'].unique() for split in (train, vad, test)])
print(unique_uid.shape)
unique_uid

(100,)


array([  13,   64,   72,   77,  109,  180,  203,  235,  238,  256,  267,
        296,  342,  386,  402,  428,  446,  543,  597,  627,  760,  762,
        798,  825,  923,  966, 1023, 1029, 1030, 1137, 1202, 1302, 1311,
       1327, 1399, 1449, 1510, 1520, 1544, 1580,  115,  201,  434,  649,
        753, 1048, 1079, 1123, 1203, 1277,   61,  118,  142,  162,  173,
        185,  226,  242,  270,  301,  389,  445,  454,  456,  479,  558,
        593,  603,  609,  621,  625,  634,  646,  658,  693,  700,  750,
        754,  768,  787,  835,  900,  902,  958,  979, 1014, 1037, 1072,
       1074, 1076, 1091, 1133, 1196, 1219, 1288, 1303, 1413, 1437, 1526,
       1565])

In [41]:
# Stack the movie ids of all three splits (train, validation, test) into one
# Series with a fresh 0..n-1 index.
# Series.append was removed in pandas 2.0; pd.concat gives the same result.
allsid = pd.concat([train['sid'], vad['sid'], test['sid']], ignore_index=True)
allsid

0         1
1        12
2        13
3        21
4        22
       ... 
1133     46
1134     57
1135     85
1136    102
1137    125
Name: sid, Length: 1138, dtype: int64

In [47]:
# Distinct movie ids across all splits, in order of first appearance; persisted
# one id per line.
unique_sid = pd.unique(allsid)
with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as sid_file:
    sid_file.writelines('%s\n' % sid for sid in unique_sid)
print(unique_sid.shape)
unique_sid

(152,)


array([  1,  12,  13,  21,  22,  37,  50,  64,  66,  87,  93,  99, 141,
       147, 158,   2,  58,  74,  80,  81,  86, 127, 129, 157, 161, 164,
        24,  25,  29,  48,  77,  83,  90, 104, 143,   0,  26,  27,  45,
        53,  57,  72, 144, 149, 152,   4,   8,  70,  98, 103, 137, 146,
        34,  76, 119, 122,  44,  68,  91, 108, 121, 132, 151,   9,  28,
        79,  84, 154, 156, 101, 138, 125, 126, 134, 150,  15,  62, 107,
        49,  75,  89,  95, 133, 159,  10,  88, 131, 142, 162, 102,   3,
        36, 114, 118, 120, 128,  82, 111,  40,  19,  23,  56, 106, 145,
        52,  18,  30,  55, 163,   5,  60, 112,  31,  17,  54,  92,  33,
        46,  78,  42,  47,  11, 123,  32,  96,  38, 115,  63,  67, 110,
        85,  14,  51,  59, 124, 160, 105, 117, 100, 153,  20,  39, 135,
        71,  94, 148, 155,  43,  73, 109, 139,  41])

In [48]:
# Map raw movie/user ids to contiguous 0-based ids by position in the unique arrays.
show2id = {sid: idx for idx, sid in enumerate(unique_sid)}
profile2id = {pid: idx for idx, pid in enumerate(unique_uid)}

In [49]:
# The id maps should have exactly one entry per unique id.
for movies, users in ((unique_sid, unique_uid), (show2id, profile2id)):
    print(len(movies), len(users))

152 100
152 100


In [50]:
def split_train_test_proportion(data, test_prop=0.2):
    """Hold out a random fraction of every user's interactions.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction records with at least a ``uid`` column.
    test_prop : float, default 0.2
        Fraction of each user's rows moved to the held-out part; the count is
        truncated with ``int()``.

    Returns
    -------
    (data_tr, data_te) : tuple of pandas.DataFrame
        Kept rows and held-out rows. Users with fewer than 5 rows go entirely
        into ``data_tr``. Either frame may be empty, with the columns of ``data``.

    Notes
    -----
    Re-seeds the global NumPy RNG with 98765 on every call so the split is
    reproducible.
    """
    tr_list, te_list = [], []

    np.random.seed(98765)

    for i, (_, group) in enumerate(data.groupby('uid')):
        n_items_u = len(group)

        if n_items_u >= 5:
            # Boolean mask marking the rows drawn for the held-out part.
            held_out = np.zeros(n_items_u, dtype='bool')
            picked = np.random.choice(
                n_items_u, size=int(test_prop * n_items_u), replace=False
            ).astype('int64')
            held_out[picked] = True

            tr_list.append(group[np.logical_not(held_out)])
            te_list.append(group[held_out])
        else:
            tr_list.append(group)

        if i % 100 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # pd.concat raises ValueError on an empty list, which happens when no user
    # has at least 5 rows (or data is empty); return an empty frame instead.
    data_tr = pd.concat(tr_list) if tr_list else data.iloc[0:0]
    data_te = pd.concat(te_list) if te_list else data.iloc[0:0]

    return data_tr, data_te

In [51]:
# Split each validation user's interactions into a kept part and a 20% held-out part.
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad, test_prop=0.2)
print(vad_plays_tr.shape, vad_plays_te.shape)

0 users sampled
(95, 2) (20, 2)


In [52]:
# Split each test user's interactions into a kept part and a 20% held-out part.
test_plays_tr, test_plays_te = split_train_test_proportion(test, test_prop=0.2)
print(test_plays_tr.shape, test_plays_te.shape)

0 users sampled
(454, 2) (101, 2)


In [53]:
def numerize(tp):
    """Translate raw ids in ``tp`` to contiguous ids.

    ``tp`` must have ``uid`` and ``sid`` columns; values are looked up in the
    module-level ``profile2id`` and ``show2id`` dicts (``KeyError`` on unknown
    ids). Returns a new DataFrame with columns ``uid`` and ``sid`` and a fresh index.
    """
    uid = [profile2id[raw_uid] for raw_uid in tp['uid']]
    sid = [show2id[raw_sid] for raw_sid in tp['sid']]
    return pd.DataFrame({'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

In [54]:
# Re-index every split with the contiguous ids ...
train_data = numerize(train)
vad_data_tr = numerize(vad_plays_tr)
vad_data_te = numerize(vad_plays_te)
test_data_tr = numerize(test_plays_tr)
test_data_te = numerize(test_plays_te)

# ... then write each one to pro_dir without the index column.
for frame, filename in (
    (train_data, 'train.csv'),
    (vad_data_tr, 'validation_tr.csv'),
    (vad_data_te, 'validation_te.csv'),
    (test_data_tr, 'test_tr.csv'),
    (test_data_te, 'test_te.csv'),
):
    frame.to_csv(os.path.join(pro_dir, filename), index=False)

# For DMF

## Chilli

In [14]:
# Parallel lists describing every following of the selected users, with an
# implicit rating of 1. Users are re-numbered by their position in test_idx.
user = []
movie = []
score = []
time = []  # left empty here; no timestamps are generated for this file

# enumerate() yields the position directly instead of re-scanning test_idx
# with .index() for every followed movie.
for new_uid, i in enumerate(test_idx):
    for j in range(movie_nb):
        if usr_following[i][j] == 1:
            user.append(new_uid)
            movie.append(j)
            score.append(1)

13 0 1
13 0 12
13 0 13
13 0 21
13 0 22
13 0 37
13 0 50
13 0 64
13 0 66
13 0 87
13 0 93
13 0 99
13 0 141
13 0 147
13 0 158
61 1 9
61 1 24
61 1 25
61 1 55
61 1 98
61 1 100
61 1 108
61 1 117
61 1 118
61 1 134
61 1 142
64 2 2
64 2 58
64 2 74
64 2 80
64 2 81
64 2 86
64 2 127
64 2 129
64 2 157
64 2 161
64 2 164
72 3 2
72 3 24
72 3 25
72 3 29
72 3 48
72 3 77
72 3 83
72 3 90
72 3 104
72 3 143
77 4 0
77 4 26
77 4 27
77 4 45
77 4 53
77 4 57
77 4 72
77 4 129
77 4 144
77 4 149
77 4 152
109 5 4
109 5 8
109 5 24
109 5 64
109 5 70
109 5 72
109 5 86
109 5 98
109 5 103
109 5 137
109 5 146
115 6 0
115 6 21
115 6 28
115 6 45
115 6 58
115 6 67
115 6 68
115 6 86
115 6 112
115 6 125
118 7 0
118 7 3
118 7 45
118 7 50
118 7 85
118 7 114
118 7 117
118 7 124
118 7 143
118 7 153
118 7 158
142 8 10
142 8 70
142 8 72
142 8 85
142 8 88
142 8 95
142 8 99
142 8 101
142 8 111
142 8 117
142 8 141
142 8 161
162 9 28
162 9 66
162 9 87
162 9 99
162 9 103
162 9 112
162 9 119
162 9 122
162 9 137
162 9 147
173 10 2
173 10 4


In [12]:
# Ratings table for the selected users: re-numbered user, movie index, constant rating 1.
df = pd.DataFrame({'user': user, 'items': movie, 'ratings': score})
df

Unnamed: 0,user,items,ratings
0,0,1,1
1,0,12,1
2,0,13,1
3,0,21,1
4,0,22,1
...,...,...,...
1133,99,84,1
1134,99,85,1
1135,99,86,1
1136,99,120,1


In [15]:
# Written with pandas defaults: comma separator, header row and index column included.
ratings_path = os.path.join(dmf_dir, 'myratings_cuser.dat')
df.to_csv(ratings_path)

## All users

In [None]:
# Column lists for the full DMF data file (1-based ids) ...
user, movie, score, time = [], [], [], []
# ... and (user, movie, 1) triples with 0-based ids, split by test_idx membership.
# Note: this rebinds `train` and `test`, which held DataFrames in earlier cells.
train, test = [], []

for i in range(usr_nb):
    for j in range(movie_nb):
        if usr_following[i][j] != 1:
            continue

        split = test if i in test_idx else train
        split.append((i, j, 1))

        # build df
        # Synthetic timestamp: a random permutation of the digits 0-8 joined into a string.
        timestamp = ''.join(str(n) for n in random.sample(range(0,9),9))
        user.append(i+1)
        movie.append(j+1)
        score.append(1)
        time.append(timestamp)

In [None]:
# Persist the DMF triples; the test file is written before the train file.
for triples, filename in ((test, 'test.json'), (train, 'train.json')):
    write_json(triples, os.path.join(dmf_dir, filename))
print(len(test), len(train))

In [None]:
# Full DMF table: 1-based user and movie ids, constant score 1, synthetic timestamp.
df = pd.DataFrame({'user': user, 'movie': movie, 'score': score, 'time': time})
df

In [None]:
# Colon-separated file with no header row and no index column.
dmf_data_path = os.path.join(dmf_dir, 'DMF_data.dat')
df.to_csv(dmf_data_path, header=None, index=None, sep=':')

# Original split

In [None]:
# init
train_t = []
train_f = []
test_t = []
test_f = []

for i in range(usr_nb):
    # init
    t_for_train = []
    f_for_train = []
    t_for_test = []
    f_for_test = []
    
    if i not in test_idx: #if not in test id, just append it to true or false list
        for j in range(movie_nb):
            if usr_following[i][j] == 1:
                t_for_train.append((i, j, 1))
            else:
                f_for_train.append((i, j, 0))
                
        train_t.extend(t_for_train)
        train_f.extend(f_for_train)
#         print(len(t_for_train) + len(f_for_train))
        
    else: #if in test id, choose half of true and other 
        temp_t = []
        temp_f = []
        
        for j in range(movie_nb):
            if usr_following[i][j] == 1:
                temp_t.append((i, j, 1))
            else:
                temp_f.append((i, j, 0))
        
        # random choose half true and half false for test 
        t_for_test = random.sample(temp_t, math.ceil(0.5*len(temp_t)))
        f_for_test  = random.sample(temp_f, movie_test_amount-len(t_for_test))
        
        test_t.extend(t_for_test)
        test_f.extend(f_for_test)
        
        #the others for training
        t_for_train = [item for item in temp_t if not item in t_for_test]
        f_for_train = [item for item in temp_f if not item in f_for_test]
        train_t.extend(t_for_train)
        train_f.extend(f_for_train)
        
    if not (len(t_for_train) + len(f_for_train) + len(t_for_test) + len(f_for_test)) == movie_nb:
        print('Error!!!')
        break

In [None]:
# Report how many triples ended up in each bucket.
for label, triples in (
    ('The length of train_t:', train_t),
    ('The length of train_f:', train_f),
    ('The length of test_t:', test_t),
    ('The length of test_f:', test_f),
):
    print(label, len(triples))

In [None]:
# Only the positive (followed) triples are exported; train_f / test_f are not written.
train = train_t
test = test_t

# './DMF_data' is not created anywhere else in this notebook, so make sure it
# exists before opening files inside it.
os.makedirs('./DMF_data', exist_ok=True)
write_json(test, './DMF_data/test.json')
write_json(train, './DMF_data/train.json')
print(len(test), len(train))