In [50]:
# Basic import
import os
import sys

import random
import numpy as np
import pandas as pd

from helper import write_json

In [7]:
# Root folder under which the preprocessed baseline inputs are written.
DATA_DIR = '../Baseline/'
# Receives the CSV splits written in the "For VAE & RaCT" section.
pro_dir = os.path.join(DATA_DIR, '_pro_sg')
# Receives the rating/JSON files written in the "For DMF" section.
dmf_dir = os.path.join(DATA_DIR, '_DMF_data')

# exist_ok=True keeps the cell idempotent without a separate, racy
# "exists?" check before creating the directory.
for out_dir in (pro_dir, dmf_dir):
    os.makedirs(out_dir, exist_ok=True)

# Load numpy array

In [8]:
# User x movie matrix; later cells treat an entry equal to 1 as
# "this user follows this movie".
usr_following = np.load('./npy/user_followings.npy')
print('User following:', usr_following.shape)

# Axis 0 indexes users, axis 1 indexes movies.
usr_nb, movie_nb = usr_following.shape[0], usr_following.shape[1]
print(usr_nb, movie_nb)

# usr_test_amount: how many users are drawn for each held-out user set.
# movie_test_amount: how many movie columns are drawn per test user.
usr_test_amount, movie_test_amount = 150, 32
print(usr_test_amount, movie_test_amount)

User following: (1582, 165)
1582 165
150 32


# UserFollowingRecord

In [9]:
# Collect the (user, movie) index of every entry equal to 1.
# np.nonzero reports hits in row-major order, i.e. the same order a nested
# "for user / for movie" loop would visit them, without the Python-level loop.
follow_rows, follow_cols = np.nonzero(usr_following == 1)

# .tolist() yields plain Python ints, matching what a range() loop would append.
user = follow_rows.tolist()
movie = follow_cols.tolist()

In [10]:
# Long-format table: one row per (userId, movieId) following pair.
df = pd.DataFrame({'userId': user, 'movieId': movie})
df # 23446 records

Unnamed: 0,userId,movieId
0,0,2
1,0,31
2,0,36
3,0,38
4,0,55
...,...,...
23441,1581,86
23442,1581,91
23443,1581,129
23444,1581,142


In [11]:
# Persist the full following table without the positional index column.
record_path = os.path.join(pro_dir, 'UserFollowingRecord.csv')
df.to_csv(record_path, index=False)

In [12]:
# Distinct user ids in order of first appearance.
unique_uid = df['userId'].unique()
unique_uid.shape

(1582,)

In [13]:
# Distinct movie ids in order of first appearance.
unique_sid = df['movieId'].unique()
unique_sid.shape

(165,)

# Training & testing split

In [18]:
# Row sums of the user x movie matrix: how many movies each user follows.
each_user = usr_following.sum(axis=1)

print('Min number of followings:', each_user.min())
print('Max number of followings:', each_user.max())
# NOTE: despite the label, this is the mean number of followings per user.
print('Avg of followers:', each_user.mean())

Min number of followings: 10
Max number of followings: 133
Avg of followers: 14.820480404551201


In [21]:
# Partition user row indices into disjoint test / validation / train lists.
usr_idx = list(range(len(usr_following)))
print(len(usr_idx))

# Fixed seed so the same users are held out on every run.
random.seed(42)
test_idx = sorted(random.sample(usr_idx, usr_test_amount))
print(len(test_idx), test_idx[:10]) # 150 [13, 51, 54, 61, 65, 88, 93, 96, 114, 130]

# Set lookups keep the filtering linear instead of scanning the list per user.
test_lookup = set(test_idx)
train_idx = [item for item in usr_idx if item not in test_lookup]
print(len(train_idx), train_idx[:10])

# Validation users are drawn from the users that remain after removing test users.
vad_idx = sorted(random.sample(train_idx, usr_test_amount))
print(len(vad_idx), vad_idx[:10])

vad_lookup = set(vad_idx)
train_idx = [item for item in train_idx if item not in vad_lookup]
print(len(train_idx), train_idx[:10])

# The three lists must cover every user exactly once.
assert len(train_idx) + len(vad_idx) + len(test_idx) == len(usr_idx)

1582
150 [13, 51, 54, 61, 65, 88, 93, 96, 114, 130]
1432 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
150 [1, 4, 6, 15, 24, 31, 40, 44, 69, 111]
1282 [0, 2, 3, 5, 7, 8, 9, 10, 11, 12]


# For VAE & RaCT
## Setup

In [23]:
# init
# Same seed as the DMF section so both draw identical held-out movies per test user.
random.seed(42)

train_uid = []
train_sid = []

vad_uid = []
vad_sid = []

test_te_uid = []
test_te_sid = []
test_tr_uid = []
test_tr_sid = []

all_movie = list(range(movie_nb))

# Set views of the index lists so each per-user membership test is O(1).
train_users = set(train_idx)
vad_users = set(vad_idx)
test_users = set(test_idx)

for i in range(usr_nb):
    # Movies followed by user i, in ascending column order.
    followed = [j for j in range(movie_nb) if usr_following[i][j] == 1]

    if i in train_users:
        train_uid.extend([i] * len(followed))
        train_sid.extend(followed)

    elif i in vad_users:
        vad_uid.extend([i] * len(followed))
        vad_sid.extend(followed)

    elif i in test_users:
        # Held-out columns are sampled from ALL movies, not only followed ones,
        # so the number of held-out positives varies per user (and may be zero).
        test = random.sample(all_movie, movie_test_amount)
        held_out = set(test)
        for j in followed:
            if j in held_out:
                test_te_uid.append(i)
                test_te_sid.append(j)
            else:
                test_tr_uid.append(i)
                test_tr_sid.append(j)

In [24]:
# Each line prints the uid-list length and the sid-list length; the two must match.
print('The length of train:', len(train_uid), len(train_sid))
print('The length of vad:', len(vad_uid), len(vad_sid))
# Second value is the sid list (it previously repeated the uid list, hiding any mismatch).
print('The length of test_tr:', len(test_tr_uid), len(test_tr_sid))
print('The length of test_te:', len(test_te_uid), len(test_te_sid))

The length of train: 19009 19009
The length of vad: 2342 2342
The length of test_te: 433 433
The length of test_tr: 1662 1662


In [25]:
# Training interactions: one (uid, sid) row per followed movie of a training user.
train = pd.DataFrame(dict(uid=train_uid, sid=train_sid), columns=['uid', 'sid'])
train

Unnamed: 0,uid,sid
0,0,2
1,0,31
2,0,36
3,0,38
4,0,55
...,...,...
19004,1581,86
19005,1581,91
19006,1581,129
19007,1581,142


In [26]:
# Validation interactions: one (uid, sid) row per followed movie of a validation user.
vad = pd.DataFrame(dict(uid=vad_uid, sid=vad_sid), columns=['uid', 'sid'])
vad

Unnamed: 0,uid,sid
0,1,3
1,1,30
2,1,41
3,1,48
4,1,75
...,...,...
2337,1576,104
2338,1576,118
2339,1576,119
2340,1576,134


In [28]:
# Test users' followings that fall outside the sampled held-out movie columns.
test_plays_tr = pd.DataFrame(dict(uid=test_tr_uid, sid=test_tr_sid), columns=['uid', 'sid'])
test_plays_tr

Unnamed: 0,uid,sid
0,13,12
1,13,13
2,13,21
3,13,37
4,13,64
...,...,...
1657,1573,91
1658,1573,100
1659,1573,120
1660,1573,142


In [27]:
# Test users' followings that fall inside the sampled held-out movie columns.
test_plays_te = pd.DataFrame(dict(uid=test_te_uid, sid=test_te_sid), columns=['uid', 'sid'])
test_plays_te

Unnamed: 0,uid,sid
0,13,1
1,13,22
2,13,50
3,13,87
4,54,117
...,...,...
428,1563,87
429,1563,129
430,1563,143
431,1573,2


In [32]:
# Take user ids from the full following table `df`: `train` only has the columns
# 'uid'/'sid' (so train['userId'] raises KeyError), and the id map built from
# this array must also cover validation and test users.
unique_uid = pd.unique(df['userId'])
print(unique_uid.shape)
unique_uid

(1582,)


array([   0,    1,    2, ..., 1579, 1580, 1581])

In [33]:
# Distinct movie ids in first-appearance order; their position in this array
# becomes the numeric item id, so the order is saved to disk one id per line.
unique_sid = df['movieId'].unique()
sid_path = os.path.join(pro_dir, 'unique_sid.txt')
with open(sid_path, 'w') as f:
    f.writelines('%s\n' % sid for sid in unique_sid)
print(unique_sid.shape)
unique_sid

(165,)


array([  2,  31,  36,  38,  55,  63,  96, 111, 120, 136,   3,  30,  41,
        48,  75,  77,  84,  89,  90, 106, 123, 129, 144, 163,  12,  44,
        85, 107, 116, 143, 153,   5,  24,  29,  35,  42, 100, 108, 140,
       152, 161,   9,  19,  28,  68,  80,  99, 112, 119, 137, 141,  25,
        45,  72,  93,  95, 109, 117, 138,  33,  66,  78, 126, 127, 164,
        49, 121, 133,  34,  40,  79,  83, 101,  32,  67,  94, 122, 135,
       150, 156,   0,   4,  10,  86, 104,  73,  98,  21,  74, 102,   1,
        13,  22,  37,  50,  64,  87, 147, 158,   6,  70,  88,  52,  60,
       134,  26,  18, 159, 154,  53,  57,  91, 148,  58, 103,  69, 125,
       132, 145, 131, 124, 146, 118, 157,  39,  81,  92,  14,  20,  59,
       105,  54,  56, 114, 128, 155, 162,  11,  43,  62, 130, 149,  23,
       142,  46, 110,   8,  76,  51,  27,  17, 160,  97, 113,  16,  47,
       115,  15,  61, 151, 139,  71,  82,  65,   7])

In [36]:
# Map each raw movie / user id to its position in the corresponding unique-id array.
show2id = {sid: position for position, sid in enumerate(unique_sid)}
profile2id = {pid: position for position, pid in enumerate(unique_uid)}

In [37]:
# Each map should have exactly one entry per unique id.
n_items, n_users = len(unique_sid), len(unique_uid)
print(n_items, n_users)
n_item_keys, n_user_keys = len(show2id), len(profile2id)
print(n_item_keys, n_user_keys)

165 1582
165 1582


In [38]:
def split_train_test_proportion(data, test_prop=0.2, min_items=5):
    """Split each user's rows into a kept part and a held-out part.

    Rows are grouped by the 'uid' column. For every user with at least
    ``min_items`` rows, ``int(test_prop * n_rows)`` rows are chosen at random
    (without replacement) for the held-out frame and the rest are kept.
    Users with fewer rows are kept entirely.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction table containing a 'uid' column.
    test_prop : float, default 0.2
        Fraction of each eligible user's rows to hold out.
    min_items : int, default 5
        Minimum number of rows a user needs before any are held out.

    Returns
    -------
    (data_tr, data_te) : tuple of pandas.DataFrame
        Kept and held-out rows. Either frame is empty (same columns as
        ``data``) when nothing was assigned to it.

    Notes
    -----
    The global NumPy RNG is re-seeded with a fixed value on every call, so
    repeated calls on the same input give the same split.
    """
    data_grouped_by_user = data.groupby('uid')
    tr_list, te_list = list(), list()

    np.random.seed(98765)

    for i, (_, group) in enumerate(data_grouped_by_user):
        n_items_u = len(group)

        if n_items_u >= min_items:
            # Boolean mask over this user's rows; True marks a held-out row.
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if i % 100 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # pd.concat raises ValueError on an empty list (e.g. no user reaches
    # min_items), so fall back to an empty frame with the input's columns.
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

In [39]:
# Split every validation user's followings into a kept part and a held-out part.
vad_split = split_train_test_proportion(vad)
vad_plays_tr, vad_plays_te = vad_split
print(vad_plays_tr.shape, vad_plays_te.shape)

0 users sampled
100 users sampled
(1919, 2) (423, 2)


In [None]:
# test_plays_tr, test_plays_te = split_train_test_proportion(test)
# print(test_plays_tr.shape, test_plays_te.shape)

In [40]:
def numerize(tp):
    """Translate raw ids in `tp` to contiguous numeric ids.

    Looks up every value of tp['uid'] in the module-level `profile2id` map and
    every value of tp['sid'] in `show2id`, and returns a new two-column
    DataFrame ('uid', 'sid') holding the mapped ids. An id missing from either
    map raises KeyError.
    """
    mapped_users = [profile2id[raw_uid] for raw_uid in tp['uid']]
    mapped_items = [show2id[raw_sid] for raw_sid in tp['sid']]
    return pd.DataFrame({'uid': mapped_users, 'sid': mapped_items}, columns=['uid', 'sid'])

In [41]:
# Re-index every split with the numeric id maps ...
train_data = numerize(train)
vad_data_tr = numerize(vad_plays_tr)
vad_data_te = numerize(vad_plays_te)
test_data_tr = numerize(test_plays_tr)
test_data_te = numerize(test_plays_te)

# ... then write each one to pro_dir without the index column.
for csv_name, frame in (
    ('train.csv', train_data),
    ('validation_tr.csv', vad_data_tr),
    ('validation_te.csv', vad_data_te),
    ('test_tr.csv', test_data_tr),
    ('test_te.csv', test_data_te),
):
    frame.to_csv(os.path.join(pro_dir, csv_name), index=False)

# For DMF
## Chilli for all data

In [None]:
# Build 1-based (user, movie, score) columns for every entry equal to 1.
# np.nonzero reports hits in row-major order, the same order a nested
# "for user / for movie" loop would visit them.
follow_rows, follow_cols = np.nonzero(usr_following == 1)

# +1 shifts the zero-based matrix indices to 1-based ids; .tolist() gives plain ints.
user = (follow_rows + 1).tolist()
movie = (follow_cols + 1).tolist()
# Every listed pair gets the constant score 1.
score = [1] * len(user)
# Never filled in this cell; kept so the name stays defined as before.
time = []

In [None]:
# DMF rating table with columns user / items / ratings.
# NOTE: this rebinds `df`, replacing the userId/movieId table built earlier.
df = pd.DataFrame({'user': user, 'items': movie, 'ratings': score})
df

In [None]:
# Number of distinct users and items in the DMF rating table.
print(df['user'].unique().shape, df['items'].unique().shape)

In [None]:
# Write the rating triples as bare comma-separated rows (no header, no index).
ratings_path = os.path.join(dmf_dir, 'myratings.dat')
df.to_csv(ratings_path, header=False, index=False)

# Original split
* train: <class 'list'> 21864 [(0, 2, 1.0), (0, 31, 1.0), (0, 36, 1.0), (0, 38, 1.0), (0, 55, 1.0), (0, 63, 1.0), (0, 96, 1.0), (0, 111, 1.0), (0, 120, 1.0), (1, 3, 1.0)]
* test: <class 'list'> 1582 [(0, 136, 1.0), (1, 163, 1.0), (2, 153, 1.0), (3, 161, 1.0), (4, 141, 1.0), (5, 161, 1.0), (6, 164, 1.0), (7, 164, 1.0), (8, 161, 1.0), (9, 156, 1.0)]

In [43]:
# Redraw the test users with the same seed and sample size used for the
# VAE/RaCT split, so both pipelines hold out the same users.
usr_idx = list(range(len(usr_following)))
print(len(usr_idx))

random.seed(42)
sampled_users = random.sample(usr_idx, usr_test_amount)
test_idx = sorted(sampled_users)
print(len(test_idx), test_idx[:10]) # 150 [13, 51, 54, 61, 65, 88, 93, 96, 114, 130]

1582
150 [13, 51, 54, 61, 65, 88, 93, 96, 114, 130]


In [47]:
# init
# Same seed as the VAE/RaCT section so each test user gets the same held-out columns.
random.seed(42)
train_t = []  # (user, movie, 1) triples used for training
train_f = []  # (user, movie, 0) triples used for training
test_t = []   # (user, movie, 1) triples held out for testing
test_f = []   # (user, movie, 0) triples held out for testing
all_movie = list(range(movie_nb))

# Set lookup keeps the per-user test check O(1).
test_users = set(test_idx)

for i in range(usr_nb):
    # init
    t_for_train = []
    f_for_train = []
    t_for_test = []
    f_for_test = []

    if i not in test_users:
        # Non-test user: every movie column goes to training, split by label.
        for j in range(movie_nb):
            if usr_following[i][j] == 1:
                t_for_train.append((i, j, 1))
            else:
                f_for_train.append((i, j, 0))

        train_t.extend(t_for_train)
        train_f.extend(f_for_train)

    else:
        # Test user: movie_test_amount columns are sampled from ALL movies
        # (regardless of label) and held out; the other columns go to training.
        test = random.sample(all_movie, movie_test_amount)
        held_out = set(test)

        for j in range(movie_nb):
            if j in held_out:
                if usr_following[i][j] == 1:
                    t_for_test.append((i, j, 1))
                else:
                    f_for_test.append((i, j, 0))
            else:
                if usr_following[i][j] == 1:
                    t_for_train.append((i, j, 1))
                else:
                    f_for_train.append((i, j, 0))

        test_t.extend(t_for_test)
        test_f.extend(f_for_test)

        train_t.extend(t_for_train)
        train_f.extend(f_for_train)

    # Sanity check: every movie column must land in exactly one of the four lists.
    if not (len(t_for_train) + len(f_for_train) + len(t_for_test) + len(f_for_test)) == movie_nb:
        print('Error!!!')
        break

In [48]:
# Report how many triples ended up in each DMF list.
for message, triples in (
    ('The length of train_t:', train_t),
    ('The length of train_f:', train_f),
    ('The length of test_t:', test_t),
    ('The length of test_f:', test_f),
):
    print(message, len(triples))

The length of train_t: 23013
The length of train_f: 233217
The length of test_t: 433
The length of test_f: 4367


In [51]:
# Dump each DMF triple list to its own JSON file under dmf_dir.
for triples, json_name in (
    (train_t, 'train_t.json'),
    (train_f, 'train_f.json'),
    (test_t, 'test_t.json'),
    (test_f, 'test_f.json'),
):
    write_json(triples, os.path.join(dmf_dir, json_name))