In [1]:
# Basic import
import os
import sys
import json
import math
import shutil
import random
import pandas as pd
import numpy as np

In [2]:
# Helper function
def writeProgress(msg, count, total):
    """Rewrite the current console line with ``msg`` followed by count/total as a percentage.

    The trailing carriage return (no newline) lets the next call overwrite the line.
    """
    progress = "{:.2%}\r".format(count / total)
    print(msg + progress, end="", flush=True)
    
def newPath(path):
    """Create directory ``path`` if it does not exist yet.

    Missing parent directories are created as well, and an already existing
    directory is left untouched. A FileExistsError is still raised when
    ``path`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(path, exist_ok=True)

def read_json(src_path):
    """Load and return the JSON document stored at ``src_path``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.
    """
    with open(src_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def write_json(data,dst_path):
    """Serialize ``data`` as JSON into ``dst_path``, replacing any existing file."""
    with open(dst_path, mode='w') as handle:
        json.dump(data, fp=handle)

def writeLog(row):
    """Append ``row`` as one line to 'log.txt' in the current working directory."""
    line = row + '\n'
    with open('log.txt', mode='a') as log_file:
        log_file.write(line)

def getErrMsg(e):
    """Format an exception as ``"[ClassName] detail"``.

    ``detail`` is the first positional argument of the exception, or an
    empty string when the exception was raised without arguments.
    """
    error_class = e.__class__.__name__  # exception type name
    # Exceptions raised without arguments have an empty ``args`` tuple;
    # indexing it directly would raise IndexError while reporting an error.
    detail = e.args[0] if e.args else ''  # detailed message
    errMsg = "[{}] {}".format(error_class, detail)
    return errMsg

In [3]:
# Root folder and the two output folders derived from it.
DATA_DIR = './'
pro_dir = os.path.join(DATA_DIR, 'pro_sg_coldmovie')
dmf_dir = os.path.join(DATA_DIR, 'DMF_data_coldmovie')

# Create whichever output folder is still missing.
for out_dir in (pro_dir, dmf_dir):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

# Load numpy arrays

In [4]:
# Load the pre-computed arrays from ./npy.
all_npy = np.load('./npy/all_2372.npy')
movie_genre = np.load('./npy/movie_genre.npy')
usr_following = np.load('./npy/user_followings.npy')
usr_genre = np.load('./npy/user_genre.npy')

# Report the shape of each array next to its label.
for label, arr in (
    ('All features:', all_npy),
    ('Movie genre:', movie_genre),
    ('User following:', usr_following),
    ('User genre:', usr_genre),
):
    print(label, arr.shape)

All features: (165, 2372)
Movie genre: (165, 20)
User following: (1582, 165)
User genre: (1582, 20)


In [5]:
# Dataset sizes, taken from the first axis of each array.
usr_nb = usr_following.shape[0]  # the number of users
movie_nb = movie_genre.shape[0]  # the number of movies
print(usr_nb, movie_nb)

# usr_test_amount = 150
# movie_test_amount = 32
# print(usr_test_amount, movie_test_amount)

# Model dimensions.
ft_dim = all_npy.shape[1]  # feature dims
latent_dim, embedding_dims = 64, 240  # latent dims, embedding dims
print(latent_dim, ft_dim, embedding_dims)

1582 165
64 2372 240


# Training & testing split

## Prepare

In [6]:
#The number of followers for each movie
# Follower count per movie: sum the following matrix over its first (user) axis.
moive_followers = usr_following.sum(axis=0)

print('Min number of followers:', moive_followers.min())
print('Max number of followers:', moive_followers.max())
print('Avg of followers:', moive_followers.mean())

# Sorted copies, ascending and descending.
asc = np.sort(moive_followers)
desc = asc[::-1]

# How many movies have at least five followers.
over5 = sum(1 for num in moive_followers if num >= 5)
print('The num of followers over 5:', over5)

Min number of followers: 1
Max number of followers: 520
Avg of followers: 142.0969696969697
The num of followers over 5: 163


In [7]:
# Number of movies whose follower count is at or below each threshold.
for limit in (10, 20, 30, 40, 50, 100):
    print('<= {}:'.format(limit), np.sum(moive_followers <= limit))

# Indices of the movies with at most 100 followers.
cold_mask = moive_followers <= 100
less_idx = np.nonzero(cold_mask)[0]
print(less_idx.shape, less_idx)

<= 10: 3
<= 20: 13
<= 30: 21
<= 40: 32
<= 50: 40
<= 100: 76
(76,) [  6   7   8  11  13  14  15  16  17  19  20  23  26  27  29  31  32  33
  35  36  38  39  41  43  45  46  47  48  51  54  56  59  61  63  65  67
  69  70  71  73  76  82  83  88  90  92  94  95  97  98 105 107 109 110
 113 115 116 117 124 130 132 133 135 136 138 139 140 145 146 148 150 155
 157 158 160 162]


In [8]:
# Transpose the following matrix so that its first axis indexes movies
# (the printed shapes show the swap).
# NOTE: this rebinds `usr_following`; running the cell a second time flips the
# orientation back, so it must be executed exactly once before the next cell.
print(usr_following.shape)
usr_following = usr_following.T
print(usr_following.shape)

(1582, 165)
(165, 1582)


In [9]:
# Collect every user index that follows at least one of the movies in less_idx.
# Relies on usr_following being indexed by movie on its first axis here.
st = set()
for idx in less_idx:
    followers_row = usr_following[idx]
    print('Index:', idx)
    print('Sum:', followers_row.sum())
    # Positions equal to 1 are the users following this movie.
    st.update(np.where(followers_row == 1)[0])
    print(len(st))
    print('==================================================')

Index: 6
Sum: 37
37
Index: 7
Sum: 1
38
Index: 8
Sum: 37
69
Index: 11
Sum: 75
141
Index: 13
Sum: 97
221
Index: 14
Sum: 23
235
Index: 15
Sum: 18
248
Index: 16
Sum: 31
269
Index: 17
Sum: 61
314
Index: 19
Sum: 78
357
Index: 20
Sum: 61
394
Index: 23
Sum: 72
433
Index: 26
Sum: 90
484
Index: 27
Sum: 23
497
Index: 29
Sum: 32
510
Index: 31
Sum: 58
532
Index: 32
Sum: 61
572
Index: 33
Sum: 80
624
Index: 35
Sum: 11
628
Index: 36
Sum: 78
652
Index: 38
Sum: 28
659
Index: 39
Sum: 49
677
Index: 41
Sum: 82
717
Index: 43
Sum: 70
743
Index: 45
Sum: 81
778
Index: 46
Sum: 77
810
Index: 47
Sum: 89
838
Index: 48
Sum: 49
849
Index: 51
Sum: 19
851
Index: 54
Sum: 18
854
Index: 56
Sum: 22
861
Index: 59
Sum: 24
864
Index: 61
Sum: 41
879
Index: 63
Sum: 58
893
Index: 65
Sum: 4
894
Index: 67
Sum: 76
903
Index: 69
Sum: 40
909
Index: 70
Sum: 41
922
Index: 71
Sum: 20
926
Index: 73
Sum: 43
931
Index: 76
Sum: 93
956
Index: 82
Sum: 78
987
Index: 83
Sum: 82
1003
Index: 88
Sum: 44
1007
Index: 90
Sum: 68
1015
Index: 92
Sum: 

In [10]:
# Candidate test users in ascending index order.
test_idx = sorted(st)
print(len(test_idx), test_idx[:10])

1185 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [11]:
# Sizes of the candidate user set and of the low-follower movie set.
usr_test_amount, movie_test_amount = len(test_idx), len(less_idx)
print(usr_test_amount, movie_test_amount)

1185 76


In [12]:
# Transpose again to undo the earlier transpose; the printed shape confirms the
# first axis indexes users again, which the indexing below (usr_following[i][j]
# with i a user and j a movie) relies on.
# NOTE: like the earlier transpose, this cell must run exactly once.
usr_following = usr_following.T
print(usr_following.shape)

(1582, 165)


## User statistics

In [13]:
# Sub-matrix restricted to the candidate users (rows) and low-follower movies (columns).
cold_rows = usr_following[test_idx]
coldarea = cold_rows[:, less_idx]
coldarea.shape

(1185, 76)

In [14]:
#The number of following movie for each user
# Number of followed movies (within coldarea) for each user: row sums.
each_user = np.sum(coldarea, axis=1)
# print(each_user)

print('Min number of followings:', np.min(each_user))
print('Max number of followings:', np.max(each_user))
# Label fixed: this is the mean of per-user followings, not of followers.
print('Avg of followings:', np.mean(each_user))

# Sorted copies, ascending and descending.
asc = np.sort(each_user)
# print(each_user)
# print(asc)
desc = np.flip(asc)
# print(desc)
# print(desc)

Min number of followings: 1
Max number of followings: 52
Avg of followers: 3.220253164556962


In [15]:
# Number of users with at least `threshold` followings, for thresholds 1..10.
for threshold in range(1, 11):
    n_users = (each_user >= threshold).sum()
    print('>=', threshold, ':', n_users)
# print('>', i, ':', np.sum(each_user > 10))

>= 1 : 1185
>= 2 : 831
>= 3 : 554
>= 4 : 377
>= 5 : 228
>= 6 : 161
>= 7 : 108
>= 8 : 76
>= 9 : 53
>= 10 : 39


In [16]:
# Keep only the candidate users who follow at least 7 of the movies in less_idx.
new_test_idx = [
    uid for uid in test_idx
    if (usr_following[uid][less_idx] == 1).sum() >= 7
]

In [17]:
# Number of candidate users that passed the >= 7 followings filter.
len(new_test_idx)

108

In [18]:
# Draw 100 test users reproducibly (fixed seed) and keep them in ascending order.
random.seed(42)
test_idx = sorted(random.sample(new_test_idx, 100))
print(len(test_idx), test_idx[:10])

100 [11, 21, 24, 30, 31, 43, 53, 57, 76, 80]


In [19]:
# Number of sampled test users.
usr_test_amount = len(test_idx)
# Hard-coded to 32; the split loops below use it as the per-user test-set size
# (sampled positives plus sampled negatives). The commented-out expression is
# an alternative (half of less_idx) that is not in use.
movie_test_amount = 32 #math.floor(len(less_idx)*0.5)
print(usr_test_amount, movie_test_amount)

100 32


## Setup 

In [None]:
# Per-user split of the low-follower movies into train/test positives and negatives.
random.seed(42)
train_t, train_f = [], []
test_t, test_f = [], []

for i in test_idx:
    followed_row = usr_following[i]
    # Movies in less_idx this user follows / does not follow.
    temp_t = [j for j in less_idx if followed_row[j] == 1]
    temp_f = [j for j in less_idx if not followed_row[j] == 1]

    # Half of the positives (rounded up) go to test; negatives fill the
    # test set up to movie_test_amount entries.
    n_pos_test = math.ceil(0.5 * len(temp_t))
    t_for_test = random.sample(temp_t, n_pos_test)
    f_for_test = random.sample(temp_f, movie_test_amount - n_pos_test)

    # Everything not drawn for test stays in train.
    t_for_train = [item for item in temp_t if item not in t_for_test]
    f_for_train = [item for item in temp_f if item not in f_for_test]

    test_t.append(t_for_test)
    test_f.append(f_for_test)
    train_t.append(t_for_train)
    train_f.append(f_for_train)

    # Sanity check: the four parts must partition less_idx.
    total = len(t_for_train) + len(f_for_train) + len(t_for_test) + len(f_for_test)
    if total != len(less_idx):
        print('Error!!!')
        break

In [None]:
# One entry per processed user in each of the four lists.
for label, bucket in (
    ('The length of train_t:', train_t),
    ('The length of train_f:', train_f),
    ('The length of test_t:', test_t),
    ('The length of test_f:', test_f),
):
    print(label, len(bucket))

# Build one-to-one (user, item) dataframe

In [None]:
# init
train_uid = []
train_sid = []
for i in range(len(train_t)):
    for j in train_t[i]:
        train_uid.append(i)
        train_sid.append(j)
    
test_uid = []
test_sid = []
for i in range(len(test_t)):
    for j in test_t[i]:
        test_uid.append(i)
        test_sid.append(j)

In [None]:
# Each pair of lists must have equal length (one uid per sid).
print(len(train_uid), len(train_sid))
print(len(test_uid), len(test_sid))

In [None]:
# Two-column interaction table for the training positives.
train_columns = dict(uid=train_uid, sid=train_sid)
train = pd.DataFrame(data=train_columns, columns=['uid', 'sid'])
train

In [None]:
# Hold out a random 20% of the training rows (fixed random_state) as validation rows.
vad = train.sample(frac=0.2, random_state=42)
vad

In [None]:
# Remove the validation rows from train by index label.
# NOTE: this rebinds `train`; re-running the sampling cell afterwards would
# sample from the already reduced frame.
train = train.drop(vad.index)
train

In [None]:
# Two-column interaction table for the test positives.
test_columns = dict(uid=test_uid, sid=test_sid)
test = pd.DataFrame(data=test_columns, columns=['uid', 'sid'])
test

In [None]:
# Sorted set of every uid that appears in the train, validation or test frames.
uid_parts = [frame['uid'].unique() for frame in (train, vad, test)]
unique_uid = np.unique(np.concatenate(uid_parts, axis=0))
print(unique_uid.shape)
unique_uid

In [None]:
# The item vocabulary is exactly the low-follower movie index set.
# np.asarray keeps `.shape` working even if less_idx has been rebound to a
# plain list by a cell executed earlier in the same kernel session.
unique_sid = np.asarray(less_idx)

# Persist one movie index per line.
with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    for sid in unique_sid:
        f.write('%s\n' % sid)
print(unique_sid.shape)
unique_sid

In [None]:
# Map each raw movie index / user id to its position in the unique arrays.
show2id = {sid: pos for pos, sid in enumerate(unique_sid)}
profile2id = {pid: pos for pos, pid in enumerate(unique_uid)}

In [None]:
# The lookup tables should have as many entries as there are unique ids.
print(len(unique_sid), len(unique_uid))
print(len(show2id), len(profile2id))

In [None]:
def split_train_test_proportion(data, test_prop=0.2, lower_bound=5):
    """Split each user's rows into a kept part and a held-out part.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction table containing a 'uid' column; rows are grouped by it.
    test_prop : float
        Fraction of a user's rows to hold out. ``int(test_prop * n_rows)``
        rows are drawn, so the count is rounded down.
    lower_bound : int
        Users with fewer rows than this keep all of their rows in the first
        returned frame and contribute nothing to the second.

    Returns
    -------
    (data_tr, data_te) : tuple of pandas.DataFrame
        The kept rows and the held-out rows. Either frame is empty (with the
        columns of ``data``) when no rows fall into it, instead of
        ``pd.concat`` raising on an empty list.
    """
    # Fixed seed keeps the split reproducible. Note that this reseeds
    # NumPy's global random state as a side effect.
    np.random.seed(98765)

    tr_list, te_list = [], []
    for i, (_, group) in enumerate(data.groupby('uid')):
        n_items_u = len(group)

        if n_items_u >= lower_bound:
            # Boolean mask over this user's rows; True marks held-out rows.
            held_out = np.zeros(n_items_u, dtype='bool')
            picked = np.random.choice(
                n_items_u, size=int(test_prop * n_items_u), replace=False
            )
            held_out[picked.astype('int64')] = True

            tr_list.append(group[np.logical_not(held_out)])
            te_list.append(group[held_out])
        else:
            tr_list.append(group)

        if i % 100 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # pd.concat([]) raises ValueError; fall back to an empty frame that
    # keeps the input's columns.
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

In [None]:
# Per-user split of the validation rows: users with at least 2 rows have
# int(0.5 * n_rows) of them held out; users with a single row are kept whole.
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad, test_prop=0.5, lower_bound=2)
print(vad_plays_tr.shape, vad_plays_te.shape)

In [None]:
# Per-user split of the test rows using the function defaults
# (test_prop=0.2, lower_bound=5).
test_plays_tr, test_plays_te = split_train_test_proportion(test)
print(test_plays_tr.shape, test_plays_te.shape)

In [None]:
def numerize(tp):
    """Return a ('uid', 'sid') frame with the ids of ``tp`` mapped through profile2id / show2id.

    Raises KeyError when an id in ``tp`` is missing from the lookup tables.
    """
    uid = [profile2id[raw_uid] for raw_uid in tp['uid']]
    sid = [show2id[raw_sid] for raw_sid in tp['sid']]
    return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

In [None]:
def _export(frame, filename):
    """Numerize ``frame``, write it to ``pro_dir/filename`` without the index, and return it."""
    numbered = numerize(frame)
    numbered.to_csv(os.path.join(pro_dir, filename), index=False)
    return numbered

# Each split is converted and written in turn, in the same order as before.
train_data = _export(train, 'train.csv')
vad_data_tr = _export(vad_plays_tr, 'validation_tr.csv')
vad_data_te = _export(vad_plays_te, 'validation_te.csv')
test_data_tr = _export(test_plays_tr, 'test_tr.csv')
test_data_te = _export(test_plays_te, 'test_te.csv')

# For DMF

## Chilli

In [20]:
# Positive (user, item, rating) triples for the sampled test users over the
# low-follower movies, re-indexed to positions within test_idx / less_idx.
user = []
movie = []
score = []
time = []  # not filled in this cell; kept so the name stays defined as before

# enumerate() yields each position directly, replacing the list.index()
# lookups that rescanned the lists for every pair. It also makes it
# unnecessary to rebind less_idx to a list, so cells that expect it to be an
# array keep working when re-run.
for user_pos, i in enumerate(test_idx):
    for movie_pos, j in enumerate(less_idx):
        if usr_following[i][j] == 1:
            # build df
            user.append(user_pos)
            movie.append(movie_pos)
            score.append(1)

In [21]:
# Assemble the collected triples into a frame with columns user / items / ratings.
df = pd.DataFrame(data={'user': user, 'items': movie, 'ratings': score})
#                   columns=['user', 'movie', 'score', 'time'])
df

Unnamed: 0,user,items,ratings
0,0,27,1
1,0,39,1
2,0,42,1
3,0,49,1
4,0,51,1
...,...,...,...
1062,99,40,1
1063,99,42,1
1064,99,46,1
1065,99,49,1


In [22]:
# Written with to_csv's default options; the header/index/sep overrides are
# commented out and therefore not applied.
df.to_csv(os.path.join(dmf_dir, 'myratings_cmovie.dat')) #, header=None, index=None, sep=':')

## all

In [None]:
# Export every positive (user, movie) pair of the full matrix.
user = []
movie = []
score = []
time = []
train = []
test = []

# test_idx is a list; a set makes the membership test below constant-time
# instead of a scan for every positive pair.
test_users = set(test_idx)

for i in range(usr_nb):
    for j in range(movie_nb):
        if usr_following[i][j] != 1:
            continue  # only followed pairs are exported

        # Pairs of sampled test users go to test, all others to train.
        if i in test_users:
            test.append((i, j, 1))
        else:
            train.append((i, j, 1))

        # build df
        # Synthetic "timestamp": a random permutation of the digits 0-8.
        timestamp = ''.join(str(n) for n in random.sample(range(0,9),9))
        # ids in the frame are 1-based
        user.append(i+1)
        movie.append(j+1)
        score.append(1)
        time.append(timestamp)
#         else:
#             user.append(i+1)
#             movie.append(j+1)
#             score.append(0)
#             time.append(timestamp)

In [None]:
# Persist the (user, movie, 1) triples of both splits as JSON under dmf_dir.
write_json(test, os.path.join(dmf_dir, 'test.json'))
write_json(train, os.path.join(dmf_dir, 'train.json'))
print(len(test), len(train))

In [None]:
# Frame of all positive pairs with 1-based ids and the synthetic timestamp strings.
df = pd.DataFrame(data={'user': user, 'movie': movie, 'score': score, 'time': time})
#                   columns=['user', 'movie', 'score', 'time'])
df

In [None]:
# Written without header or index, using ':' as the field separator.
df.to_csv(os.path.join(dmf_dir, 'DMF_data.dat'), header=None, index=None, sep=':')

# Original split

In [None]:
# init
train_t = []
train_f = []
test_t = []
test_f = []

for i in range(usr_nb):
    # init
    t_for_train = []
    f_for_train = []
    t_for_test = []
    f_for_test = []
    
    if i not in test_idx: #if not in test id, just append it to true or false list
        for j in range(movie_nb):
            if usr_following[i][j] == 1:
                t_for_train.append((i, j, 1))
            else:
                f_for_train.append((i, j, 0))
                
        train_t.extend(t_for_train)
        train_f.extend(f_for_train)
#         print(len(t_for_train) + len(f_for_train))
        
    else: #if in test id, choose half of true and other 
        temp_t = []
        temp_f = []
        
        for j in range(movie_nb):
            if usr_following[i][j] == 1:
                temp_t.append((i, j, 1))
            else:
                temp_f.append((i, j, 0))
        
        # random choose half true and half false for test 
        t_for_test = random.sample(temp_t, math.ceil(0.5*len(temp_t)))
        f_for_test  = random.sample(temp_f, movie_test_amount-len(t_for_test))
        
        test_t.extend(t_for_test)
        test_f.extend(f_for_test)
        
        #the others for training
        t_for_train = [item for item in temp_t if not item in t_for_test]
        f_for_train = [item for item in temp_f if not item in f_for_test]
        train_t.extend(t_for_train)
        train_f.extend(f_for_train)
        
    if not (len(t_for_train) + len(f_for_train) + len(t_for_test) + len(f_for_test)) == movie_nb:
        print('Error!!!')
        break

In [None]:
# Total number of triples collected in each of the four lists.
for label, bucket in (
    ('The length of train_t:', train_t),
    ('The length of train_f:', train_f),
    ('The length of test_t:', test_t),
    ('The length of test_f:', test_f),
):
    print(label, len(bucket))

In [None]:
# Only the positive triples are exported; train_f / test_f are not written.
train = train_t
test = test_t

# './DMF_data' is not created by any earlier cell (only pro_dir and dmf_dir
# are), so make sure it exists before writing into it.
os.makedirs('./DMF_data', exist_ok=True)
write_json(test, './DMF_data/test.json')
write_json(train, './DMF_data/train.json')
print(len(test), len(train))