In [1]:
# Basic import
import os
import sys
import json
import math
import shutil
import random
import pandas as pd
import numpy as np

In [2]:
# Helper function
def writeProgress(msg, count, total):
    """Overwrite the current console line with a percentage progress report.

    Args:
        msg: Text written in front of the percentage.
        count: Number of items processed so far.
        total: Total number of items. When it is 0 the progress is reported
            as 0.00% instead of raising ZeroDivisionError.
    """
    ratio = count / total if total else 0.0
    # The trailing carriage return puts the cursor back at the line start so
    # that the next call overwrites this report.
    sys.stdout.write(msg + "{:.2%}\r".format(ratio))
    sys.stdout.flush()
    
def newPath(path):
    """Make sure the directory ``path`` exists.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of isdir() followed by mkdir()
    os.makedirs(path, exist_ok=True)

def read_json(src_path):
    """Load and return the JSON document stored at ``src_path``.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default text encoding.

    Args:
        src_path: Path of the JSON file to read.

    Returns:
        The decoded Python object.
    """
    with open(src_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def write_json(data,dst_path):
    """Serialise ``data`` as JSON into ``dst_path``, replacing any existing file.

    Args:
        data: JSON-serialisable object.
        dst_path: Path of the file to write.
    """
    with open(dst_path, 'w') as json_out:
        json.dump(data, json_out)

def writeLog(row):
    """Append ``row`` as one line to ``log.txt`` in the current working directory.

    Args:
        row: Text of the log entry; a newline is appended to it.
    """
    line = row + '\n'
    with open('log.txt', 'a') as log_file:
        log_file.write(line)

def getErrMsg(e):
    """Format an exception as ``"[ClassName] detail"``.

    Args:
        e: The exception instance to describe.

    Returns:
        A string with the exception class name in brackets followed by the
        exception's first argument. Exceptions raised without arguments yield
        an empty detail instead of failing with IndexError.
    """
    error_class = e.__class__.__name__  # exception type name
    detail = e.args[0] if e.args else ''  # first argument carries the detail
    return "[{}] {}".format(error_class, detail)

In [3]:
# Root folder under which both output directories live
DATA_DIR = '../Baseline/'
pro_dir, dmf_dir = (
    os.path.join(DATA_DIR, sub_dir)
    for sub_dir in ('pro_sg_coldmovie', 'DMF_data_coldmovie')
)

# Create each output folder only when it is not there yet
for out_dir in (pro_dir, dmf_dir):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

# Load numpy array

In [4]:
# Read the four input arrays from ./npy
all_npy = np.load('./npy/all_2372.npy')
movie_genre = np.load('./npy/movie_genre.npy')
usr_following = np.load('./npy/user_followings.npy')
usr_genre = np.load('./npy/user_genre.npy')

# Report the shape of every loaded array
for label, array in (
    ('All features:', all_npy),
    ('Movie genre:', movie_genre),
    ('User following:', usr_following),
    ('User genre:', usr_genre),
):
    print(label, array.shape)

All features: (165, 2372)
Movie genre: (165, 20)
User following: (1582, 165)
User genre: (1582, 20)


In [5]:
# First axis of each array gives the number of users / movies
usr_nb, movie_nb = usr_following.shape[0], movie_genre.shape[0]
print(usr_nb, movie_nb)

# usr_test_amount = 150
# movie_test_amount = 32
# print(usr_test_amount, movie_test_amount)

latent_dim = 64  # latent dims
ft_dim = all_npy.shape[1]  # feature dims
embedding_dims = 240
print(latent_dim, ft_dim, embedding_dims)

1582 165
64 2372 240


# Training & testing split

## Prepare

In [6]:
# Column sums of the user x movie follow matrix: followers per movie
moive_followers = usr_following.sum(axis=0)

print('Min number of followers:', moive_followers.min())
print('Max number of followers:', moive_followers.max())
print('Avg of followers:', moive_followers.mean())

Min number of followers: 1
Max number of followers: 520
Avg of followers: 142.0969696969697


In [7]:
# Movies with at most 100 followers form the low-follower selection
is_cold = moive_followers <= 100
print('<= 100:', np.sum(is_cold))
less_idx = np.flatnonzero(is_cold)
print(less_idx.shape, less_idx)

<= 100: 76
(76,) [  6   7   8  11  13  14  15  16  17  19  20  23  26  27  29  31  32  33
  35  36  38  39  41  43  45  46  47  48  51  54  56  59  61  63  65  67
  69  70  71  73  76  82  83  88  90  92  94  95  97  98 105 107 109 110
 113 115 116 117 124 130 132 133 135 136 138 139 140 145 146 148 150 155
 157 158 160 162]


## User statistics

In [8]:
# Restrict the follow matrix to the low-follower movie columns (all users kept)
coldarea = usr_following[:, less_idx]
coldarea.shape

(1582, 76)

In [9]:
# Row sums: how many of the low-follower movies each user follows
each_user = coldarea.sum(axis=1)

print('Min number of followings:', each_user.min())
print('Max number of followings:', each_user.max())
# NOTE: the label says "followers" but the value is the mean number of followings per user
print('Avg of followers:', each_user.mean())

Min number of followings: 0
Max number of followings: 52
Avg of followers: 2.4121365360303413


In [10]:
# Count the users who follow at least i of the low-follower movies, for i = 1..10
for i in range(1, 11):
    print('>=', i, ':', np.sum(each_user >= i))
# print('>', i, ':', np.sum(each_user > 10))

>= 1 : 1185
>= 2 : 831
>= 3 : 554
>= 4 : 377
>= 5 : 228
>= 6 : 161
>= 7 : 108
>= 8 : 76
>= 9 : 53
>= 10 : 39


In [11]:
# User indices ordered from the most to the fewest followed low-follower movies
each_user.argsort()[::-1]
# each_user.argsort()[::-1].shape

array([1036,  165,  670, ...,  480,  482, 1348])

In [12]:
# Spot check: 1036 is the first index of the descending ranking displayed by the previous cell
each_user[1036]

52

In [13]:
# Spot check: 1348 is the last index of the descending ranking displayed two cells earlier
each_user[1348]

0

In [14]:
# Keep the 100 highest-ranked users and store their indices in ascending order
test_idx = sorted(each_user.argsort()[::-1][:100])
print(len(test_idx), test_idx)

100 [11, 14, 24, 30, 31, 43, 53, 57, 80, 82, 103, 128, 131, 134, 143, 153, 160, 165, 173, 201, 205, 211, 212, 218, 231, 232, 245, 248, 266, 290, 315, 326, 328, 330, 336, 341, 347, 360, 367, 368, 385, 387, 395, 417, 418, 440, 451, 455, 459, 468, 471, 493, 494, 515, 524, 539, 568, 613, 670, 691, 697, 718, 720, 731, 759, 784, 814, 821, 828, 854, 867, 870, 899, 969, 990, 991, 1036, 1067, 1102, 1104, 1114, 1125, 1172, 1178, 1239, 1249, 1329, 1332, 1336, 1354, 1363, 1417, 1418, 1423, 1432, 1457, 1501, 1547, 1548, 1579]


In [15]:
# Sub-matrix of the selected users (rows) and the low-follower movies (columns)
coldarea = usr_following[test_idx, :][:, less_idx]
coldarea.shape

(100, 76)

In [16]:
# Column sums: how many of the selected users follow each low-follower movie
each_movie = coldarea.sum(axis=0)

# NOTE: the labels say "followings"/"followers"; every value is a per-movie count
print('Min number of followings:', each_movie.min())
print('Max number of followings:', each_movie.max())
print('Avg of followers:', each_movie.mean())

Min number of followings: 0
Max number of followings: 36
Avg of followers: 14.105263157894736


In [17]:
# Display the per-movie counts to spot movies without any follower among the selected users
each_movie

array([18,  0, 17,  4, 25, 14,  2, 11, 14, 28, 24, 25, 21,  5, 12, 22, 11,
        9,  2, 21, 11, 14, 11,  6, 15, 16, 10, 13, 11,  4,  4, 13,  7, 24,
        2, 23, 17, 15,  2, 15, 16,  6, 31, 17, 21,  7, 18, 17,  1, 29, 16,
       36, 25, 13,  6,  7, 12, 17, 19,  6, 13, 12, 21, 17, 30,  5, 11, 12,
       12, 13, 20, 16, 24, 23,  3,  2])

In [18]:
# Drop the movies that none of the selected users follows.
# NOTE: this rebinds less_idx, so each_movie must come from the cell that used the previous less_idx.
has_follower = each_movie > 0
less_idx = less_idx[has_follower]
print(less_idx.shape, less_idx)

(75,) [  6   8  11  13  14  15  16  17  19  20  23  26  27  29  31  32  33  35
  36  38  39  41  43  45  46  47  48  51  54  56  59  61  63  65  67  69
  70  71  73  76  82  83  88  90  92  94  95  97  98 105 107 109 110 113
 115 116 117 124 130 132 133 135 136 138 139 140 145 146 148 150 155 157
 158 160 162]


In [19]:
# Rebuild the selected-users x movies sub-matrix with the filtered movie list
coldarea = usr_following[test_idx, :][:, less_idx]
coldarea.shape

(100, 75)

In [20]:
# Column sums again after filtering: followers per remaining movie among the selected users
each_movie = coldarea.sum(axis=0)

# NOTE: the labels say "followings"/"followers"; every value is a per-movie count
print('Min number of followings:', each_movie.min())
print('Max number of followings:', each_movie.max())
print('Avg of followers:', each_movie.mean())

Min number of followings: 1
Max number of followings: 36
Avg of followers: 14.293333333333333


In [21]:
# Turn the movie selection into a list (list.index() is used on it further down)
less_idx = list(less_idx)
movie_nb = len(less_idx)  # the number of movies
# Both user counts are the number of selected users
usr_nb = usr_test_amount = len(test_idx)
movie_test_amount = 32  # math.floor(len(less_idx)*0.5)
print(usr_nb, usr_test_amount, movie_nb, movie_test_amount)

100 100 75 32


# Build UserFollowingRecord.csv

In [22]:
# Collect one (user position, movie position) pair per follow relation.
user = []
movie = []

# enumerate() provides the position inside test_idx / less_idx directly, which
# replaces a linear list.index() scan for every matching pair.
for user_pos, i in enumerate(test_idx):
    for movie_pos, j in enumerate(less_idx):
        if usr_following[i][j] == 1:
            # build df
            user.append(user_pos)
            movie.append(movie_pos)

In [23]:
# One row per follow relation, using the re-indexed user and movie positions
df = pd.DataFrame(data={'userId': user, 'movieId': movie})
df

Unnamed: 0,userId,movieId
0,0,26
1,0,38
2,0,41
3,0,48
4,0,50
...,...,...
1067,99,39
1068,99,41
1069,99,45
1070,99,48


In [24]:
# Store the follow records without the row index column
record_path = os.path.join(pro_dir, 'UserFollowingRecord.csv')
df.to_csv(record_path, index=False)

In [25]:
# Number of distinct users that appear in the follow records
unique_uid = pd.unique(df['userId'])
unique_uid.shape

(100,)

In [26]:
# Number of distinct movies that appear in the follow records
unique_sid = pd.unique(df['movieId'])
unique_sid.shape

(75,)

## Setup 

In [27]:
# Per-user split of the selected movies into train/test positives (followed)
# and train/test negatives (not followed). The seed is fixed so that the
# sampled hold-out sets are reproducible.
random.seed(42)
train_t = []
train_f = []
test_t = []
test_f = []

for i in test_idx:
    # Partition the candidate movies of user i by follow status
    temp_t = []
    temp_f = []
    for j in less_idx:
        if usr_following[i][j] == 1:
            temp_t.append(j)
        else:
            temp_f.append(j)

    # Hold out half of the positives (rounded up), then add sampled negatives
    # until the user has movie_test_amount test movies in total.
    t_for_test = random.sample(temp_t, math.ceil(0.5*len(temp_t)))
    f_for_test = random.sample(temp_f, movie_test_amount-len(t_for_test))

    test_t.append(t_for_test)
    test_f.append(f_for_test)

    # Everything that was not held out stays in training; set lookups replace
    # the linear list scans while keeping the original element order.
    held_out_t = set(t_for_test)
    held_out_f = set(f_for_test)
    t_for_train = [item for item in temp_t if item not in held_out_t]
    f_for_train = [item for item in temp_f if item not in held_out_f]
    train_t.append(t_for_train)
    train_f.append(f_for_train)

    # Sanity check: the four parts must cover every selected movie exactly once
    if not (len(t_for_train) + len(f_for_train) + len(t_for_test) + len(f_for_test)) == len(less_idx):
        print('Error!!!')
        break

In [28]:
# Each of the four lists should hold one entry per selected user
print('The length of train_t:',len(train_t))
print('The length of train_f:',len(train_f))
print('The length of test_t:',len(test_t))
print('The length of test_f:',len(test_f))

The length of train_t: 100
The length of train_f: 100
The length of test_t: 100
The length of test_f: 100


In [29]:
# Total and per-user average number of positives kept for training
total_train = sum(len(t) for t in train_t)
avg = total_train / usr_test_amount
print('Training:', total_train, avg)

# Total and per-user average number of positives held out for testing
total_test = sum(len(t) for t in test_t)
avg = total_test / usr_test_amount
print('Testing:', total_test, avg)

Training: 511 5.11
Testing: 561 5.61


# Build one-to-one (user, movie) DataFrames

In [30]:
# Flatten the per-user positive lists into parallel (user position, movie index) lists
train_uid = [i for i, followed in enumerate(train_t) for _ in followed]
train_sid = [j for followed in train_t for j in followed]

test_uid = [i for i, followed in enumerate(test_t) for _ in followed]
test_sid = [j for followed in test_t for j in followed]

In [31]:
# The uid and sid lists are parallel, so each pair of lengths must match
print(len(train_uid), len(train_sid))
print(len(test_uid), len(test_sid))

511 511
561 561


In [32]:
# Training interactions: uid is the user's position in test_idx, sid the original movie index
train = pd.DataFrame(data={'uid': train_uid, 'sid': train_sid}, columns=['uid', 'sid'])
train

Unnamed: 0,uid,sid
0,0,73
1,0,98
2,0,107
3,1,6
4,1,95
...,...,...
506,98,146
507,99,63
508,99,76
509,99,83


In [33]:
# Randomly take 20% of the training rows as validation interactions (fixed random_state)
vad = train.sample(frac=0.2, random_state=42)
vad

Unnamed: 0,uid,sid
124,26,135
84,17,138
433,82,148
255,52,19
68,16,29
...,...,...
506,98,146
494,96,63
483,92,158
275,55,31


In [34]:
# Remove the validation rows from the training frame.
# NOTE(review): `train` is rebound here, so this cell and the sampling cell before it
# are order-dependent — re-running either one operates on the already reduced frame.
train = train.drop(vad.index)
train

Unnamed: 0,uid,sid
1,0,98
3,1,6
4,1,95
5,1,98
6,1,107
...,...,...
503,98,19
504,98,31
507,99,63
508,99,76


In [35]:
# Held-out interactions: uid is the user's position in test_idx, sid the original movie index
test = pd.DataFrame(data={'uid': test_uid, 'sid': test_sid}, columns=['uid', 'sid'])
test

Unnamed: 0,uid,sid
0,0,109
1,0,48
2,0,150
3,0,83
4,1,136
...,...,...
556,98,145
557,99,94
558,99,67
559,99,48


In [36]:
# Sorted distinct user ids across the train, validation and test frames
uid_parts = [pd.unique(frame['uid']) for frame in (train, vad, test)]
unique_uid = np.unique(np.concatenate(uid_parts, axis=0))
print(unique_uid.shape)
unique_uid

(100,)


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [37]:
# Movie ids are the original movie indices; write them one per line
unique_sid = np.asarray(less_idx)
with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    f.writelines('%s\n' % sid for sid in unique_sid)
print(unique_sid.shape)
unique_sid

(75,)


array([  6,   8,  11,  13,  14,  15,  16,  17,  19,  20,  23,  26,  27,
        29,  31,  32,  33,  35,  36,  38,  39,  41,  43,  45,  46,  47,
        48,  51,  54,  56,  59,  61,  63,  65,  67,  69,  70,  71,  73,
        76,  82,  83,  88,  90,  92,  94,  95,  97,  98, 105, 107, 109,
       110, 113, 115, 116, 117, 124, 130, 132, 133, 135, 136, 138, 139,
       140, 145, 146, 148, 150, 155, 157, 158, 160, 162])

In [38]:
# Lookup tables from a raw id to its position in unique_sid / unique_uid
show2id = {sid: pos for pos, sid in enumerate(unique_sid)}
profile2id = {pid: pos for pos, pid in enumerate(unique_uid)}

In [39]:
# The lookup tables must have as many entries as there are distinct ids
print(len(unique_sid), len(unique_uid))
print(len(show2id), len(profile2id))

75 100
75 100


In [40]:
def split_train_test_proportion(data, test_prop=0.2, lower_bound=5):
    """Split every user's rows into a kept part and a held-out part.

    Rows are grouped by the ``uid`` column. For each user with at least
    ``lower_bound`` rows, ``int(test_prop * n_rows)`` randomly chosen rows go
    to the held-out frame and the rest to the kept frame. Users with fewer
    rows are kept entirely.

    Args:
        data: DataFrame with a ``uid`` column.
        test_prop: Fraction of a user's rows to hold out.
        lower_bound: Minimum number of rows a user needs to be split.

    Returns:
        Tuple ``(data_tr, data_te)`` of DataFrames. A part that receives no
        rows at all is returned as an empty frame with the columns of
        ``data`` instead of failing inside ``pd.concat``.
    """
    # Re-seed NumPy's global generator so that repeated calls pick the same rows
    np.random.seed(98765)

    tr_list, te_list = [], []
    for i, (_, group) in enumerate(data.groupby('uid')):
        n_items_u = len(group)

        if n_items_u >= lower_bound:
            # Boolean mask over the user's rows; True marks a held-out row
            held_out = np.zeros(n_items_u, dtype='bool')
            picked = np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False)
            held_out[picked] = True

            tr_list.append(group[np.logical_not(held_out)])
            te_list.append(group[held_out])
        else:
            tr_list.append(group)

        if i % 100 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # pd.concat raises ValueError on an empty list, which happens when no user
    # reaches lower_bound (te_list) or when data itself has no rows (tr_list).
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

In [41]:
# Split the validation interactions per user: half held out, for users with at least 2 rows
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad, test_prop=0.5, lower_bound=2)
print(vad_plays_tr.shape, vad_plays_te.shape)

0 users sampled
(74, 2) (28, 2)


In [42]:
# Split the test interactions per user with the function defaults (test_prop=0.2, lower_bound=5).
# NOTE(review): these differ from the values used for the validation split — confirm this is intended.
test_plays_tr, test_plays_te = split_train_test_proportion(test)
print(test_plays_tr.shape, test_plays_te.shape)

0 users sampled
(499, 2) (62, 2)


In [43]:
def numerize(tp):
    """Translate the raw ids of ``tp`` through the notebook-level lookup tables.

    Args:
        tp: DataFrame with ``uid`` and ``sid`` columns whose values are keys
            of ``profile2id`` and ``show2id`` respectively.

    Returns:
        A new DataFrame with columns ``uid`` and ``sid`` holding the mapped
        ids, indexed from 0.

    Raises:
        KeyError: If an id is missing from its lookup table.
    """
    mapped_uid = [profile2id[raw_uid] for raw_uid in tp['uid']]
    mapped_sid = [show2id[raw_sid] for raw_sid in tp['sid']]
    return pd.DataFrame(data={'uid': mapped_uid, 'sid': mapped_sid}, columns=['uid', 'sid'])

In [44]:
# Map every split to contiguous ids first ...
train_data = numerize(train)
vad_data_tr = numerize(vad_plays_tr)
vad_data_te = numerize(vad_plays_te)
test_data_tr = numerize(test_plays_tr)
test_data_te = numerize(test_plays_te)

# ... then write each of them to its CSV file without the row index
for file_name, frame in (
    ('train.csv', train_data),
    ('validation_tr.csv', vad_data_tr),
    ('validation_te.csv', vad_data_te),
    ('test_tr.csv', test_data_tr),
    ('test_te.csv', test_data_te),
):
    frame.to_csv(os.path.join(pro_dir, file_name), index=False)

# For DMF

## Chilli

In [45]:
# Collect one (user position, movie position, rating) triple per follow relation.
user = []
movie = []
score = []
time = []  # never filled in this cell
# less_idx = list(less_idx)

# enumerate() provides the position inside test_idx / less_idx directly, which
# replaces a linear list.index() scan for every matching pair.
for user_pos, i in enumerate(test_idx):
    for movie_pos, j in enumerate(less_idx):
        if usr_following[i][j] == 1:
            # build df
            user.append(user_pos)
            movie.append(movie_pos)
            score.append(1)

In [46]:
# Rating table: one row per follow relation, every rating is 1
df = pd.DataFrame(data={'user': user, 'items': movie, 'ratings': score})
#                   columns=['user', 'movie', 'score', 'time'])
df

Unnamed: 0,user,items,ratings
0,0,26,1
1,0,38,1
2,0,41,1
3,0,48,1
4,0,50,1
...,...,...,...
1067,99,39,1
1068,99,41,1
1069,99,45,1
1070,99,48,1


In [47]:
# Number of distinct users and items in the rating table
print(pd.unique(df['user']).shape, pd.unique(df['items']).shape)

(100,) (75,)


In [48]:
# Write the rating table with to_csv defaults.
# NOTE(review): unlike the other exports in this notebook no index/header options are passed,
# and the commented-out arguments hint at a header-less ':'-separated layout — confirm which
# format the consumer of this file expects.
df.to_csv(os.path.join(dmf_dir, 'myratings_cmovie.dat')) #, header=None, index=None, sep=':')