In [None]:
# Basic import
import os
import sys
import json
import math
import shutil
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Helper function
def writeProgress(msg, count, total):
    """Write an in-place progress line such as ``msg42.00%`` to stdout.

    Args:
        msg: Text printed in front of the percentage.
        count: Number of items processed so far.
        total: Total number of items. When it is 0 the ratio is reported
            as 0% instead of raising ZeroDivisionError.
    """
    ratio = count / total if total else 0.0
    # The trailing carriage return moves the cursor back to the start of the
    # line so the next call overwrites this one.
    sys.stdout.write(msg + "{:.2%}\r".format(ratio))
    sys.stdout.flush()
    
def newPath(path):
    """Create the directory ``path`` if it does not exist yet.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(path, exist_ok=True)

def read_json(src_path):
    """Load and return the JSON document stored at ``src_path``."""
    with open(src_path, 'r') as handle:
        return json.load(handle)

def write_json(data, dst_path):
    """Serialize ``data`` as JSON into ``dst_path``, overwriting the file."""
    with open(dst_path, 'w') as sink:
        json.dump(data, sink)

def writeLog(row):
    """Append ``row`` followed by a newline to ``log.txt`` in the working directory."""
    with open('log.txt', 'a') as log_file:
        line = row + '\n'
        log_file.write(line)

def getErrMsg(e):
    """Format an exception as ``"[ClassName] detail"``.

    Args:
        e: The exception instance to describe.

    Returns:
        A string with the exception class name in brackets followed by the
        first positional argument of the exception. Exceptions raised
        without arguments produce an empty detail instead of an IndexError.
    """
    error_class = e.__class__.__name__  # exception type name
    detail = e.args[0] if e.args else ''  # first argument holds the detail
    return "[{}] {}".format(error_class, detail)

In [None]:
# Output locations: `pro_sg` receives the VAE/RaCT csv files and
# `DMF_data` receives the DMF files written further down.
DATA_DIR = './'
pro_dir = os.path.join(DATA_DIR, 'pro_sg')
dmf_dir = os.path.join(DATA_DIR, 'DMF_data')

# Create whichever output directory is still missing.
for _out_dir in (pro_dir, dmf_dir):
    if not os.path.exists(_out_dir):
        os.makedirs(_out_dir)

# Load numpy arrays

In [None]:
# Read the four input arrays from ./npy and report their shapes.
all_npy = np.load('./npy/all_2372.npy')
movie_genre = np.load('./npy/movie_genre.npy')
usr_following = np.load('./npy/user_followings.npy')
usr_genre = np.load('./npy/user_genre.npy')

for _label, _array in (
    ('All features:', all_npy),
    ('Movie genre:', movie_genre),
    ('User following:', usr_following),
    ('User genre:', usr_genre),
):
    print(_label, _array.shape)

In [None]:
# Dataset sizes: one row of usr_following per user, one row of movie_genre per movie.
usr_nb, movie_nb = len(usr_following), len(movie_genre)
print(usr_nb, movie_nb)

# Held-out sizes: number of test users and number of test items per test user.
usr_test_amount, movie_test_amount = 150, 32
print(usr_test_amount, movie_test_amount)

# Model dimensions.
latent_dim = 64  # latent dims
ft_dim = all_npy.shape[1]  # feature dims
embedding_dims = 240
print(latent_dim, ft_dim, embedding_dims)

# Training & testing split

## Prepare

In [None]:
#The number of following movie for each user
# Number of followed movies per user (row sums of usr_following).
each_user = np.sum(usr_following, axis=1)

print('Min number of followings:', np.min(each_user))
print('Max number of followings:', np.max(each_user))
# This is the mean number of followings per user, so label it the same way
# as the min/max lines.
print('Avg number of followings:', np.mean(each_user))

# Per-user counts sorted ascending and descending.
asc = np.sort(each_user)
desc = np.flip(asc)

In [None]:
# How many users follow at least 10, 12, ..., 20 movies.
for _threshold in range(10, 21, 2):
    print('Over {}:'.format(_threshold), np.sum(each_user >= _threshold))

In [None]:
# Row index of every user in usr_following.
usr_idx = list(range(len(usr_following)))
print(len(usr_idx))

In [None]:
# Draw the test users with a fixed seed and keep them in ascending order.
random.seed(42)
test_idx = random.sample(usr_idx, usr_test_amount)
test_idx.sort()
# Output recorded by the original author: 150 [13, 51, 54, 61, 65, 88, 93, 96, 114, 130]
print(len(test_idx), test_idx[:10])

In [None]:
# Every user that is not a test user. A set makes each membership check
# O(1) instead of rescanning the test_idx list for every user.
_test_idx_set = set(test_idx)
train_idx = [item for item in usr_idx if item not in _test_idx_set]
print(len(train_idx), train_idx[:10])

In [None]:
# Draw the validation users from the remaining (non-test) users. The RNG
# state continues from the seed set when the test users were sampled.
vad_idx = random.sample(train_idx, usr_test_amount)
vad_idx.sort()
print(len(vad_idx), vad_idx[:10])

In [None]:
# Drop the validation users from the training users. A set makes each
# membership check O(1) instead of rescanning the vad_idx list.
_vad_idx_set = set(vad_idx)
train_idx = [item for item in train_idx if item not in _vad_idx_set]
print(len(train_idx), train_idx[:10])

# For VAE & RaCT
## Setup

In [None]:
# Flatten the followed (user, movie) pairs of the training and validation
# users into parallel uid/sid lists.
train_uid, train_sid = [], []
vad_uid, vad_sid = [], []

# Set lookups instead of scanning the index lists once per user.
_train_users = set(train_idx)
_vad_users = set(vad_idx)

for i in range(usr_nb):
    if i in _train_users:
        _uid_out, _sid_out = train_uid, train_sid
    elif i in _vad_users:
        _uid_out, _sid_out = vad_uid, vad_sid
    else:
        continue  # test users are handled separately

    # Column indices where this user's row equals 1, in ascending order;
    # tolist() keeps the stored ids plain Python ints.
    _followed = np.flatnonzero(usr_following[i] == 1).tolist()
    _uid_out.extend([i] * len(_followed))
    _sid_out.extend(_followed)

In [None]:
# The uid and sid lists of each split must have the same length.
for _label, _uids, _sids in (
    ('The length of train:', train_uid, train_sid),
    ('The length of vad:', vad_uid, vad_sid),
):
    print(_label, len(_uids), len(_sids))

In [None]:
# For every test user, hold out a random half (rounded up) of the followed
# movies for evaluation (test_te_*) and keep the rest as input (test_tr_*).
random.seed(42)
test_te_uid, test_te_sid = [], []
test_tr_uid, test_tr_sid = [], []

# Set lookup instead of scanning the test_idx list once per user.
_test_users = set(test_idx)

for i in range(usr_nb):
    if i not in _test_users:
        continue

    # Followed movie indices in ascending order, as plain Python ints.
    temp_t = np.flatnonzero(usr_following[i] == 1).tolist()

    # Randomly choose half of the followed movies (rounded up) for testing.
    t_for_test = random.sample(temp_t, math.ceil(0.5 * len(temp_t)))
    test_te_uid.extend([i] * len(t_for_test))
    test_te_sid.extend(t_for_test)

    # The remaining followed movies, in their original order.
    _held_out = set(t_for_test)
    t_for_train = [item for item in temp_t if item not in _held_out]
    test_tr_uid.extend([i] * len(t_for_train))
    test_tr_sid.extend(t_for_train)

In [None]:
# Report uid and sid lengths for both halves; each pair must match.
print('The length of test_te:', len(test_te_uid), len(test_te_sid))
print('The length of test_tr:', len(test_tr_uid), len(test_tr_sid))

In [None]:
# Training interactions as a two-column (uid, sid) frame.
train = pd.DataFrame(
    {'uid': train_uid, 'sid': train_sid},
    columns=['uid', 'sid'],
)
train

In [None]:
# Validation interactions as a two-column (uid, sid) frame.
vad = pd.DataFrame(
    {'uid': vad_uid, 'sid': vad_sid},
    columns=['uid', 'sid'],
)
vad

In [None]:
# Held-out half of the test users' interactions as a (uid, sid) frame.
test = pd.DataFrame(
    {'uid': test_te_uid, 'sid': test_te_sid},
    columns=['uid', 'sid'],
)
test

In [None]:
# Kept (input) half of the test users' interactions as a (uid, sid) frame.
test_plays_tr = pd.DataFrame(
    {'uid': test_tr_uid, 'sid': test_tr_sid},
    columns=['uid', 'sid'],
)
test_plays_tr

In [None]:
# Distinct user ids in train, validation and test order, concatenated.
unique_uid = np.concatenate(
    [train['uid'].unique(), vad['uid'].unique(), test['uid'].unique()],
    axis=0,
)
print(unique_uid.shape)
unique_uid

In [None]:
# Distinct movie ids seen in the training frame, saved one per line.
unique_sid = train['sid'].unique()
with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    f.writelines('%s\n' % sid for sid in unique_sid)
print(unique_sid.shape)
unique_sid

In [None]:
# Map each original movie / user id to its position in the unique-id arrays.
show2id = {sid: i for i, sid in enumerate(unique_sid)}
profile2id = {pid: i for i, pid in enumerate(unique_uid)}

In [None]:
# Array sizes on the first line, mapping sizes on the second.
print(*map(len, (unique_sid, unique_uid)))
print(*map(len, (show2id, profile2id)))

In [None]:
def split_train_test_proportion(data, test_prop=0.2):
    """Split each user's rows into a kept part and a held-out part.

    Users with at least 5 rows have ``int(test_prop * n_rows)`` randomly
    chosen rows moved to the held-out frame. Users with fewer rows are kept
    entirely in the first frame.

    Args:
        data: DataFrame with a ``uid`` column to group by.
        test_prop: Fraction of a user's rows to hold out.

    Returns:
        ``(data_tr, data_te)``: the kept rows and the held-out rows. Either
        frame is empty, with the columns of ``data``, when no rows fall into
        it, instead of ``pd.concat`` raising on an empty list.
    """
    data_grouped_by_user = data.groupby('uid')
    tr_list, te_list = list(), list()

    # Fixed seed so the split is the same on every run. Note that this
    # reseeds numpy's global random state.
    np.random.seed(98765)

    for i, (_, group) in enumerate(data_grouped_by_user):
        n_items_u = len(group)

        if n_items_u >= 5:
            # Boolean mask over this user's rows: True marks a held-out row.
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if i % 100 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # slice of the input that keeps its columns.
    data_tr = pd.concat(tr_list) if tr_list else data.iloc[0:0]
    data_te = pd.concat(te_list) if te_list else data.iloc[0:0]

    return data_tr, data_te

In [None]:
# Split the validation users' interactions into kept and held-out frames.
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad)
print(*(part.shape for part in (vad_plays_tr, vad_plays_te)))

In [None]:
# The held-out half of the test interactions is already in `test`, and its
# counterpart `test_plays_tr` was built directly from the test_tr lists, so
# split_train_test_proportion is not applied to the test users.
test_plays_te = test

In [None]:
def numerize(tp):
    """Re-index a (uid, sid) frame through ``profile2id`` and ``show2id``.

    Raises KeyError if an id is missing from the corresponding mapping.
    """
    mapped_uid = [profile2id[raw_uid] for raw_uid in tp['uid']]
    mapped_sid = [show2id[raw_sid] for raw_sid in tp['sid']]
    return pd.DataFrame({'uid': mapped_uid, 'sid': mapped_sid}, columns=['uid', 'sid'])

In [None]:
def _numerize_to_csv(frame, filename):
    """Numerize ``frame``, save it under ``pro_dir`` and return the result."""
    numerized = numerize(frame)
    numerized.to_csv(os.path.join(pro_dir, filename), index=False)
    return numerized


# Each split is numerized and written before the next one is processed.
train_data = _numerize_to_csv(train, 'train.csv')
vad_data_tr = _numerize_to_csv(vad_plays_tr, 'validation_tr.csv')
vad_data_te = _numerize_to_csv(vad_plays_te, 'validation_te.csv')
test_data_tr = _numerize_to_csv(test_plays_tr, 'test_tr.csv')
test_data_te = _numerize_to_csv(test_plays_te, 'test_te.csv')

# For DMF
## Build the full interaction dataframe

In [None]:
# One row per followed (user, movie) pair, with 1-based ids, a constant
# score of 1 and a random 9-digit string (a shuffle of the digits 0-8) as
# the timestamp.
user, movie, score, time = [], [], [], []

for i in range(usr_nb):
    # Followed movie indices in ascending order, as plain Python ints.
    for j in np.flatnonzero(usr_following[i] == 1).tolist():
        timestamp = ''.join(map(str, random.sample(range(0, 9), 9)))
        user.append(i + 1)
        movie.append(j + 1)
        score.append(1)
        time.append(timestamp)

In [None]:
# Assemble the DMF rows; column order follows the dict's insertion order.
df = pd.DataFrame({
    'user': user,
    'movie': movie,
    'score': score,
    'time': time,
})
df

In [None]:
# Write the rows as ':'-separated values without header or index column.
df.to_csv(
    os.path.join(dmf_dir, 'DMF_data.dat'),
    sep=':',
    header=False,
    index=False,
)

# Original split

In [None]:
# Save the test-user indices next to the DMF data.
_test_idx_path = os.path.join(dmf_dir, 'test_idx.json')
write_json(test_idx, _test_idx_path)
print(test_idx)

In [None]:
# Build (user, movie, label) triples. Non-test users contribute all of
# their triples to training. For each test user, half of the positives
# (rounded up) plus enough negatives to reach movie_test_amount items are
# held out, and the rest goes to training.
random.seed(42)
train_t, train_f = [], []
test_t, test_f = [], []

# Set lookup instead of scanning the test_idx list once per user.
_test_users = set(test_idx)

for i in range(usr_nb):
    # Partition this user's movies into positive and negative triples.
    temp_t, temp_f = [], []
    for j in range(movie_nb):
        if usr_following[i][j] == 1:
            temp_t.append((i, j, 1))
        else:
            temp_f.append((i, j, 0))

    if i in _test_users:
        # Randomly choose half of the positives (rounded up) for testing.
        t_for_test = random.sample(temp_t, math.ceil(0.5 * len(temp_t)))
        # Fill up with negatives. The size is clamped so that a user with
        # more than movie_test_amount held-out positives, or with too few
        # negatives, no longer makes random.sample raise ValueError.
        _n_neg = min(len(temp_f), max(0, movie_test_amount - len(t_for_test)))
        f_for_test = random.sample(temp_f, _n_neg)

        # Everything not held out is used for training (order preserved).
        _held_t, _held_f = set(t_for_test), set(f_for_test)
        t_for_train = [item for item in temp_t if item not in _held_t]
        f_for_train = [item for item in temp_f if item not in _held_f]
    else:
        t_for_test, f_for_test = [], []
        t_for_train, f_for_train = temp_t, temp_f

    test_t.extend(t_for_test)
    test_f.extend(f_for_test)
    train_t.extend(t_for_train)
    train_f.extend(f_for_train)

    # Sanity check: every movie of this user lands in exactly one list.
    if len(t_for_train) + len(f_for_train) + len(t_for_test) + len(f_for_test) != movie_nb:
        print('Error!!!')
        break

In [None]:
# Report the size of each triple list.
for _name, _triples in (
    ('train_t', train_t),
    ('train_f', train_f),
    ('test_t', test_t),
    ('test_f', test_f),
):
    print('The length of {}:'.format(_name), len(_triples))

In [None]:
# Only the positive triples are exported; train_f / test_f are not saved.
# NOTE(review): this rebinds `train` and `test`, which held DataFrames
# earlier in the notebook, to plain lists of triples.
train, test = train_t, test_t
for _payload, _filename in ((test, 'test.json'), (train, 'train.json')):
    write_json(_payload, os.path.join(dmf_dir, _filename))
print(len(test), len(train))