## 1. 초기 세팅

In [3]:
## 전처리 과정에서 pandas 버전에 따라 다르게 동작하는 경향이 있어, 이 미션에서는 아래 버전을 사용하도록 하겠습니다.
#!pip install pandas==1.0.1

In [2]:
import argparse
import time
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from scipy import sparse


In [3]:
## Hyper-parameters and run configuration
parser = argparse.ArgumentParser(description='PyTorch Variational Autoencoders for Collaborative Filtering')

# (flag, add_argument keyword arguments) pairs, registered in this order.
_CLI_OPTIONS = [
    ('--data', dict(type=str, default='/opt/ml/input/data/train/',
                    help='Movielens dataset location')),
    ('--lr', dict(type=float, default=1e-4,
                  help='initial learning rate')),
    ('--wd', dict(type=float, default=0.00,
                  help='weight decay coefficient')),
    ('--batch_size', dict(type=int, default=500,
                          help='batch size')),
    ('--epochs', dict(type=int, default=20,
                      help='upper epoch limit')),
    ('--total_anneal_steps', dict(type=int, default=200000,
                                  help='the total number of gradient updates for annealing')),
    ('--anneal_cap', dict(type=float, default=0.2,
                          help='largest annealing parameter')),
    ('--seed', dict(type=int, default=1111,
                    help='random seed')),
    ('--cuda', dict(action='store_true',
                    help='use CUDA')),
    ('--log_interval', dict(type=int, default=100, metavar='N',
                            help='report interval')),
    ('--save', dict(type=str, default='model.pt',
                    help='path to save the final model')),
]
for _flag, _spec in _CLI_OPTIONS:
    parser.add_argument(_flag, **_spec)

# An explicit empty argv is parsed, so every option takes its default.
args = parser.parse_args([])

# Seed torch's RNG manually for reproducibility.
torch.manual_seed(args.seed)

# Use the GPU whenever one is available, regardless of the --cuda flag.
args.cuda = args.cuda or torch.cuda.is_available()

device = torch.device("cuda" if args.cuda else "cpu")
device

device(type='cuda')

## 2. 데이터 전처리

이 부분에서는 일반적으로 알려진 MovieLens (user, item, timestamp) 데이터를 전처리합니다. 전처리 과정의 다양한 옵션을 구성하느라 코드가 다소 복잡해졌지만,
결과적으로는 유저별로 일부 아이템을 따로 분리해 두고, 모델이 그 분리된 아이템을 예측할 수 있는지를 확인하기 위한 전처리 과정이라고 보시면 됩니다.
실제로 생성되는 데이터셋을 확인하면 더 빠르게 이해할 수 있을 것입니다.

In [8]:
import os
import pandas as pd
from scipy import sparse
import numpy as np

def get_count(tp, id):
    """Count the rows of ``tp`` for each distinct value of column ``id``.

    Returns the result of ``groupby(id).size()``: group sizes indexed by the
    distinct values of that column.
    """
    return tp[[id]].groupby(id).size()

# Keeps only the rows whose item occurs at least ``min_sc`` times and whose
# user occurs at least ``min_uc`` times.
# Per the original author's note, with the current dataset this ends up
# using the data unchanged.
def filter_triplets(tp, min_uc=5, min_sc=0):
    """Filter interactions by item and user frequency.

    Args:
        tp: DataFrame with 'user' and 'item' columns.
        min_uc: minimum number of rows a user needs to be kept (<= 0 disables).
        min_sc: minimum number of rows an item needs to be kept (<= 0 disables).

    Returns:
        (filtered frame, per-user counts, per-item counts), the counts being
        computed on the filtered frame.
    """
    # Items are filtered first, so user counts are taken after the item filter.
    for column, threshold in (('item', min_sc), ('user', min_uc)):
        if threshold > 0:
            counts = get_count(tp, column)
            frequent = counts.index[counts >= threshold]
            tp = tp[tp[column].isin(frequent)]

    return tp, get_count(tp, 'user'), get_count(tp, 'item')

# Separates the interactions that a trained model will be evaluated on:
# for each user, a ``test_prop`` share of the rows is held out so that we can
# check whether the model predicts them from the remaining rows.
def split_train_test_proportion(data, test_prop=0.2):
    """Split every user's rows into a kept part and a held-out part.

    Users with fewer than 5 rows are kept entirely in the first part.

    Args:
        data: DataFrame with a 'user' column.
        test_prop: fraction of each user's rows to hold out
            (``int(test_prop * n_rows)`` rows are sampled without replacement).

    Returns:
        (data_tr, data_te). Either frame is empty (same columns as ``data``)
        when no rows fall into it, instead of ``pd.concat`` raising on an
        empty list.
    """
    tr_list, te_list = [], []

    # Fixed seed so the split is reproducible. Note that this reseeds the
    # global NumPy RNG; kept as-is so previously generated splits match.
    np.random.seed(98765)

    for _, group in data.groupby('user'):
        n_items_u = len(group)

        if n_items_u >= 5:
            held_out = np.zeros(n_items_u, dtype='bool')
            picked = np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')
            held_out[picked] = True

            tr_list.append(group[np.logical_not(held_out)])
            te_list.append(group[held_out])
        else:
            tr_list.append(group)

    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # slice of the input (preserves columns and dtypes).
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

def numerize(tp, profile2id, show2id):
    """Translate raw user/item ids into the indices given by the two mappings.

    Args:
        tp: DataFrame with 'user' and 'item' columns.
        profile2id: mapping raw user id -> index.
        show2id: mapping raw item id -> index.

    Returns:
        DataFrame with columns 'uid' and 'sid', aligned to ``tp``'s index.

    Raises:
        KeyError: if an id is missing from its mapping.
    """
    def to_uid(raw_user):
        return profile2id[raw_user]

    def to_sid(raw_item):
        return show2id[raw_item]

    columns = ['uid', 'sid']
    mapped = {'uid': tp['user'].apply(to_uid), 'sid': tp['item'].apply(to_sid)}
    return pd.DataFrame(data=mapped, columns=columns)

In [5]:
print("Load and Preprocess Movielens dataset")
# Load the raw interaction log from the directory given by --data.

DATA_DIR = args.data
raw_data = pd.read_csv(os.path.join(DATA_DIR, 'train_ratings.csv'), header=0)
print("원본 데이터\n", raw_data)

# Filter: keep users with at least 5 rows; no item filter (min_sc=0).
raw_data, user_activity, item_popularity = filter_triplets(raw_data, min_uc=5, min_sc=0)
# Per the original author's note, every user in the provided training data
# already has at least 5 reviews.
print("5번 이상의 리뷰가 있는 유저들로만 구성된 데이터\n",raw_data)

# Per-user and per-item row counts returned by filter_triplets.
print("유저별 리뷰수\n",user_activity)
print("아이템별 리뷰수\n",item_popularity)

Load and Preprocess Movielens dataset
원본 데이터
            user   item        time
0            11   4643  1230782529
1            11    170  1230782534
2            11    531  1230782539
3            11    616  1230782542
4            11   2140  1230782563
...         ...    ...         ...
5154466  138493  44022  1260209449
5154467  138493   4958  1260209482
5154468  138493  68319  1260209720
5154469  138493  40819  1260209726
5154470  138493  27311  1260209807

[5154471 rows x 3 columns]
5번 이상의 리뷰가 있는 유저들로만 구성된 데이터
            user   item        time
0            11   4643  1230782529
1            11    170  1230782534
2            11    531  1230782539
3            11    616  1230782542
4            11   2140  1230782563
...         ...    ...         ...
5154466  138493  44022  1260209449
5154467  138493   4958  1260209482
5154468  138493  68319  1260209720
5154469  138493  40819  1260209726
5154470  138493  27311  1260209807

[5154471 rows x 3 columns]
유저별 리뷰수
 user
11        376
1

In [9]:
# Shuffle the user ids with a fixed seed so the split below is reproducible.
unique_uid = user_activity.index
print("(BEFORE) unique_uid:",unique_uid)
np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]
print("(AFTER) unique_uid:",unique_uid)

n_users = unique_uid.size  # 31360 in the recorded run
n_heldout_users = 3136  # hard-coded; an earlier value was 3000


# Split the shuffled ids: everything but the last 2 * n_heldout_users goes to
# train, the next n_heldout_users to validation, the final ones to test.
tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

# Note: these are numbers of users, not numbers of interactions.
print("훈련 데이터에 사용될 사용자 수:", len(tr_users))
print("검증 데이터에 사용될 사용자 수:", len(vd_users))
print("테스트 데이터에 사용될 사용자 수:", len(te_users))




(BEFORE) unique_uid: Int64Index([    11,     14,     18,     25,     31,     35,     43,     50,
                58,     60,
            ...
            138459, 138461, 138470, 138471, 138472, 138473, 138475, 138486,
            138492, 138493],
           dtype='int64', name='user', length=31360)
(AFTER) unique_uid: Int64Index([ 27968,  67764,   2581,  82969, 137831,  48639,  97870,  40424,
             46835,  79570,
            ...
            114284,   9009,  21165,  33920,  22054, 135379, 125855,  41891,
             15720,  17029],
           dtype='int64', name='user', length=31360)
훈련 데이터에 사용될 사용자 수: 25088
검증 데이터에 사용될 사용자 수: 3136
테스트 데이터에 사용될 사용자 수: 3136


In [10]:
## Interactions of the training users; training uses all of their rows.
train_plays = raw_data.loc[raw_data['user'].isin(tr_users)]

## Item vocabulary: the items that occur in the training interactions.
unique_sid = pd.unique(train_plays['item'])

# Raw id -> position in unique_sid / unique_uid.
show2id = {sid: i for i, sid in enumerate(unique_sid)}
profile2id = {pid: i for i, pid in enumerate(unique_uid)}

pro_dir = os.path.join(DATA_DIR, 'pro_sg')

if not os.path.exists(pro_dir):
    os.makedirs(pro_dir)

# One raw item id per line; its line count is read back as the item count.
with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    for sid in unique_sid:
        f.write('%s\n' % sid)

# Validation and test users: keep only items known from training, then split
# each user's rows into a "tr" part (model input) and a "te" part (answers).
vad_plays = raw_data.loc[raw_data['user'].isin(vd_users)]
vad_plays = vad_plays.loc[vad_plays['item'].isin(unique_sid)]
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)

test_plays = raw_data.loc[raw_data['user'].isin(te_users)]
test_plays = test_plays.loc[test_plays['item'].isin(unique_sid)]
test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)

# Convert every split to (uid, sid) index pairs ...
train_data = numerize(train_plays, profile2id, show2id)
vad_data_tr = numerize(vad_plays_tr, profile2id, show2id)
vad_data_te = numerize(vad_plays_te, profile2id, show2id)
test_data_tr = numerize(test_plays_tr, profile2id, show2id)
test_data_te = numerize(test_plays_te, profile2id, show2id)

# ... and write each one next to unique_sid.txt.
for _frame, _csv_name in (
    (train_data, 'train.csv'),
    (vad_data_tr, 'validation_tr.csv'),
    (vad_data_te, 'validation_te.csv'),
    (test_data_tr, 'test_tr.csv'),
    (test_data_te, 'test_te.csv'),
):
    _frame.to_csv(os.path.join(pro_dir, _csv_name), index=False)

print("Done!")

Done!


In [11]:
# Inspect the numerized datasets (uid/sid index pairs).
print(train_data)
print(vad_data_tr)
print(vad_data_te)
# print(test_data_tr)
# print(test_data_te)

           uid   sid
0        11825     0
1        11825     1
2        11825     2
3        11825     3
4        11825     4
...        ...   ...
5154466  10783   477
5154467  10783  1325
5154468  10783   331
5154469  10783   558
5154470  10783  1922

[4125303 rows x 2 columns]
           uid   sid
376      26554   440
377      26554   741
378      26554  1407
379      26554   193
380      26554  1041
...        ...   ...
5153247  26934   760
5153248  26934   697
5153249  26934  3232
5153250  26934  1369
5153251  26934  3679

[415395 rows x 2 columns]
           uid   sid
382      26554  3012
383      26554  1681
384      26554   201
399      26554  3177
401      26554  3289
...        ...   ...
5153229  26934   737
5153233  26934   228
5153236  26934   235
5153240  26934  3962
5153243  26934  1086

[102295 rows x 2 columns]


## 3. 데이터 로더 설정

In [12]:

class DataLoader():
    '''
    Reads the preprocessed Movielens files stored under ``<path>/pro_sg``
    (unique_sid.txt, train.csv, {validation,test}_{tr,te}.csv) and returns
    them as float64 scipy CSR matrices with one column per item.
    '''
    def __init__(self, path):
        self.pro_dir = os.path.join(path, 'pro_sg')
        assert os.path.exists(self.pro_dir), "Preprocessed files do not exist. Run data.py"

        self.n_items = self.load_n_items()

    def load_data(self, datatype='train'):
        '''Return one matrix for 'train', or a (tr, te) pair for 'validation'/'test'.'''
        if datatype == 'train':
            return self._load_train_data()
        if datatype in ('validation', 'test'):
            return self._load_tr_te_data(datatype)
        raise ValueError("datatype should be in [train, validation, test]")

    def load_n_items(self):
        '''Number of items, i.e. the number of lines in unique_sid.txt.'''
        with open(os.path.join(self.pro_dir, 'unique_sid.txt'), 'r') as f:
            return sum(1 for _ in f)

    def _load_train_data(self):
        '''Build the (max uid + 1) x n_items matrix from train.csv.'''
        interactions = pd.read_csv(os.path.join(self.pro_dir, 'train.csv'))
        user_idx, item_idx = interactions['uid'], interactions['sid']
        n_users = user_idx.max() + 1

        return sparse.csr_matrix((np.ones_like(user_idx),
                                 (user_idx, item_idx)), dtype='float64',
                                 shape=(n_users, self.n_items))

    def _load_tr_te_data(self, datatype='test'):
        '''Build the row-aligned (tr, te) matrices of one evaluation split.'''
        tp_tr = pd.read_csv(os.path.join(self.pro_dir, '{}_tr.csv'.format(datatype)))
        tp_te = pd.read_csv(os.path.join(self.pro_dir, '{}_te.csv'.format(datatype)))

        # Rows are uids shifted so that the smallest uid of the split is row 0.
        first_uid = min(tp_tr['uid'].min(), tp_te['uid'].min())
        last_uid = max(tp_tr['uid'].max(), tp_te['uid'].max())
        shape = (last_uid - first_uid + 1, self.n_items)

        def as_csr(frame):
            rows, cols = frame['uid'] - first_uid, frame['sid']
            return sparse.csr_matrix((np.ones_like(rows),
                                     (rows, cols)), dtype='float64', shape=shape)

        return as_csr(tp_tr), as_csr(tp_te)

## side information 준비

In [13]:
import gensim
from urllib.request import urlretrieve, urlopen

# One-time download of the pretrained vectors (left disabled):
#urlretrieve("https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz", \
#                           filename="GoogleNews-vectors-negative300.bin.gz")
# Load the word2vec vectors from the local binary-format archive.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

print('모델의 크기(shape) :',word2vec_model.vectors.shape) # check the shape of the embedding matrix

모델의 크기(shape) : (3000000, 300)


In [14]:
import pandas as pd
# Item/genre table from the tab-separated genres file.
gen = pd.read_csv("/opt/ml/input/data/train/genres.tsv", delimiter='\t')
gen.head()

Unnamed: 0,item,genre
0,318,Crime
1,318,Drama
2,2571,Action
3,2571,Sci-Fi
4,2571,Thriller


In [15]:
def gen_numerize(tp, show2id):
    """Map the 'item' column of ``tp`` through ``show2id``.

    Returns the mapped Series (aligned to ``tp``'s index); raises KeyError
    for an item id that is not a key of ``show2id``.
    """
    def to_sid(raw_item):
        return show2id[raw_item]

    return tp['item'].apply(to_sid)
# Overwrite the raw item ids in place with their show2id indices.
gen['item'] = gen_numerize(gen, show2id)
gen.head()

Unnamed: 0,item,genre
0,198,Crime
1,198,Drama
2,82,Action
3,82,Sci-Fi
4,82,Thriller


In [16]:
# Display the genre table.
gen

Unnamed: 0,item,genre
0,198,Crime
1,198,Drama
2,82,Action
3,82,Sci-Fi
4,82,Thriller
...,...,...
15928,6763,Drama
15929,5046,Action
15930,5046,Comedy
15931,5508,Comedy


In [17]:
# One row per distinct genre label, taken from the index of value_counts().
gen_emb = pd.DataFrame(gen.genre.value_counts().index.values, columns=['genre'])
gen_emb.head()

Unnamed: 0,genre
0,Drama
1,Comedy
2,Thriller
3,Romance
4,Action


In [30]:
# Genre labels that are looked up in word2vec under a different key.
_W2V_KEY_FOR_GENRE = {'Sci-Fi': 'science_fiction', 'Film-Noir': 'Film_Noir'}

# One word2vec vector per genre, in the row order of gen_emb.
emb_list = []
for x in gen_emb.genre:
    emb_list.append(word2vec_model[_W2V_KEY_FOR_GENRE.get(x, x)])

In [31]:
# Put each genre label next to its vector (one column per vector component),
# then index the table by genre.
x = pd.concat([gen_emb, pd.DataFrame(emb_list)], axis=1)
a = x.set_index('genre', drop=True)
a.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Drama,-0.144531,-0.05542,0.013855,-0.111816,0.1875,0.022095,-0.277344,-0.12793,-0.320312,0.032227,...,-0.222656,-0.100098,-0.589844,-0.18457,0.189453,0.195312,-0.113281,-0.055908,0.05249,0.24707
Comedy,-0.032959,-0.077637,-0.065918,0.291016,0.041016,0.043213,0.151367,0.273438,0.097656,-0.054443,...,0.443359,0.078125,-0.443359,-0.024048,-0.036621,0.253906,-0.046631,-0.045898,0.038818,0.074219
Thriller,0.217773,-0.064941,0.1875,0.314453,-0.047363,-0.192383,-0.390625,0.135742,0.168945,0.019165,...,-0.058594,0.003937,-0.267578,0.235352,0.271484,-0.000277,-0.03418,-0.396484,-0.028076,0.072266
Romance,0.041992,-0.075195,-0.341797,0.119629,0.185547,-0.005249,-0.057617,-0.179688,-0.090332,0.179688,...,0.116699,-0.078613,-0.322266,-0.013245,0.353516,-0.083008,-0.121094,0.022095,-0.035156,0.291016
Action,0.053955,-0.03125,0.242188,0.049316,0.023315,-0.080566,-0.05957,0.033203,-0.310547,0.108887,...,0.009277,-0.189453,-0.242188,0.067383,0.02417,0.016968,0.049072,0.011475,-0.025513,-0.099121


In [32]:
# Genre label -> embedding vector (row of ``a`` as a NumPy array).
gen2emb = {genre: a.loc[genre].values for genre in a.index}
# Attach the vector of its genre to every (item, genre) row.
gen['emb'] = gen['genre'].apply(lambda genre: gen2emb[genre])

In [33]:
# Preview the first rows with the attached 'emb' column.
gen.head(10)

Unnamed: 0,item,genre,emb
0,198,Crime,"[0.028076172, 0.0048828125, -0.09667969, -0.01..."
1,198,Drama,"[-0.14453125, -0.055419922, 0.0138549805, -0.1..."
2,82,Action,"[0.053955078, -0.03125, 0.2421875, 0.049316406..."
3,82,Sci-Fi,"[0.12988281, -0.12597656, 0.1796875, 0.2519531..."
4,82,Thriller,"[0.21777344, -0.064941406, 0.1875, 0.31445312,..."
5,260,Action,"[0.053955078, -0.03125, 0.2421875, 0.049316406..."
6,260,Crime,"[0.028076172, 0.0048828125, -0.09667969, -0.01..."
7,260,Drama,"[-0.14453125, -0.055419922, 0.0138549805, -0.1..."
8,260,Thriller,"[0.21777344, -0.064941406, 0.1875, 0.31445312,..."
9,264,Comedy,"[-0.032958984, -0.07763672, -0.06591797, 0.291..."


In [34]:
def item_genre_emb_mean(i):
    """Append the mean of item ``i``'s genre embeddings to the global ``total``.

    Reads the module-level ``gen`` table and returns nothing.
    """
    genre_vectors = gen.loc[gen['item'] == i, 'emb']
    total.append(np.mean(genre_vectors))

In [35]:
# Filled by item_genre_emb_mean: one mean vector per item index, in order.
total = []

# Every item index from 0 up to the largest one present in the genre table.
item_genre_emb_idx = pd.DataFrame(list(range(max(gen.item) + 1)), columns=['item'])
for _item_idx in item_genre_emb_idx.item:
    item_genre_emb_mean(_item_idx)
item_genre_emb = pd.DataFrame(total)
item_genre_emb.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.01651,-0.062561,0.094589,0.042084,0.043182,0.02652,-0.06842,0.137451,-0.155029,-0.025269,...,-0.132629,-0.041382,-0.305176,-0.006836,0.197449,0.032135,-0.027195,0.064209,0.043518,0.061951
1,0.081635,-0.032227,0.068909,0.082504,0.009857,-0.05542,-0.080139,0.152634,-0.122925,0.063751,...,-0.075012,-0.030418,-0.158997,0.145264,0.174988,0.007255,0.03949,-0.155701,0.000488,-0.040222
2,-0.103394,0.007202,-0.018585,0.072998,0.104065,0.027039,-0.024414,0.055664,-0.175781,-0.042969,...,-0.228516,-0.050545,-0.402344,-0.223145,0.194824,0.049561,-0.035889,-0.060669,0.027115,0.212891
3,-0.081909,-0.063721,0.080933,0.164795,0.060608,0.061646,0.236816,0.079346,-0.057861,-0.184082,...,-0.273438,-0.071785,-0.208496,-0.039551,0.143799,0.001709,-0.082275,0.070801,-0.091904,0.053711
4,0.054626,-0.195557,-0.029028,-0.056213,0.179199,-0.013428,0.020264,0.259277,-0.032227,-0.060394,...,0.173462,0.118408,-0.044434,-0.103149,0.124237,-0.050903,-0.112061,-0.040283,-0.031189,0.062378


In [36]:
# Shape check: one row per item index, one column per embedding component.
item_genre_emb.shape

(6807, 300)

In [37]:
# Transpose so that items are the columns; the models later concatenate this
# table with the user batch along axis 0.
item_genre_emb = item_genre_emb.T
item_genre_emb.shape

(300, 6807)

# User별 장르 관점에서의 아이템 선호도

In [None]:
from collections import Counter
from tqdm import tqdm

# Work out which genres each user has watched.
# NOTE(review): this rebinds raw_data to the unfiltered CSV, replacing the
# frame produced by filter_triplets earlier in the notebook.
raw_data = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv')
users = raw_data.user.unique()
user_genre = pd.DataFrame([users]).T
user_genre.columns = ['users']

# Prepare one zero-initialised count column per genre, indexed by user id.
for x in gen.genre.unique():
    user_genre[x] = 0
user_genre.set_index('users', inplace=True)


users = raw_data.user.unique()

# Count, per user, how often each genre occurs among the user's items.
# NOTE(review): raw_data.item holds ids as stored in the CSV, while gen.item
# is overwritten with show2id indices by an earlier cell; if this cell runs
# after that one, `gen.item == x` compares two different id spaces - confirm
# the intended execution order.
for i in tqdm(range(len(users))):
    user_item = raw_data[raw_data.user == users[i]].item.values
    user_fav_genre = []
    for x in user_item:
        user_fav_genre.extend(gen[gen.item == x].genre.values)

    counter_user = Counter(user_fav_genre).most_common()
    for x, y in counter_user:
        user_genre.loc[users[i], x] = y

# Turn counts into per-user proportions by dividing every column (including
# 'total' itself) by the row total.
user_genre['total'] = user_genre.sum(axis=1).values
user_genre_prefer = user_genre.iloc[:,0:].div(user_genre.total, axis=0)
# Add one zero-initialised column per item id found in gen.
items = sorted(gen.item.unique())
for x in items:
    user_genre_prefer[x] = 0

# Checkpoint of the genre proportions before the per-item pass.
user_genre_prefer.to_csv('user_genre_prefer.csv')

items = gen.item.unique()

# For every (user, item) pair, store the mean of the user's proportions over
# the item's genres. This is a Python-level loop over items x users.
for i in tqdm(range(len(items))):
    i_genres = gen[gen.item == items[i]].genre.values
    for u in users:
        user_item = 0
        cnt = 0
        for x in i_genres:
            user_item += user_genre_prefer.loc[u, x]
            cnt += 1
        user_genre_prefer.loc[u, items[i]] = user_item / cnt

# Save only the columns from position 19 onward.
# NOTE(review): 19 presumably skips the genre columns plus 'total' - confirm
# against the number of distinct genres.
user_genre_prefer.iloc[:,19:].to_csv('8h_user_genre_prefer.csv')

In [38]:
# Load the precomputed user x item genre-preference table.
# NOTE(review): this reads from 'later_ana/', whereas the cell that builds
# the table writes '8h_user_genre_prefer.csv' to the working directory -
# presumably the file was moved by hand; confirm.
user_genre_prefer = pd.read_csv('later_ana/8h_user_genre_prefer.csv')
# Replace raw user ids by their profile2id index and order the rows by it.
user_genre_prefer['users'] = user_genre_prefer['users'].apply(lambda x: profile2id[x])
user_genre_prefer = user_genre_prefer.sort_values('users')
user_genre_prefer.reset_index(inplace=True, drop=True)

In [39]:
# Preview the re-indexed preference table.
user_genre_prefer.head()

Unnamed: 0,users,0,1,2,3,4,5,6,7,8,...,6797,6798,6799,6800,6801,6802,6803,6804,6805,6806
0,0,0.095404,0.076602,0.108635,0.02507,0.047354,0.062674,0.192201,0.094011,0.033426,...,0.080223,0.063138,0.192201,0.169916,0.104921,0.140669,0.130919,0.130919,0.192201,0.169916
1,1,0.106796,0.089806,0.101942,0.048544,0.048544,0.084951,0.15534,0.080097,0.053398,...,0.085437,0.090615,0.15534,0.106796,0.080906,0.126214,0.087379,0.087379,0.15534,0.106796
2,2,0.09322,0.064972,0.110169,0.016949,0.048023,0.066384,0.20339,0.094633,0.048023,...,0.087006,0.056497,0.20339,0.186441,0.103578,0.149718,0.135593,0.135593,0.20339,0.186441
3,3,0.096667,0.053333,0.126667,0.006667,0.04,0.07,0.24,0.11,0.066667,...,0.101333,0.048889,0.24,0.146667,0.071111,0.173333,0.093333,0.093333,0.24,0.146667
4,4,0.084375,0.065625,0.13125,0.075,0.075,0.04375,0.175,0.078125,0.01875,...,0.0675,0.054167,0.175,0.2,0.104167,0.11875,0.125,0.125,0.175,0.2


In [40]:
# Shape check of the preference table.
user_genre_prefer.shape

(31360, 6808)

# Multi-VAE + Multi-DAE

In [41]:
# Load the preprocessed datasets as sparse matrices.
loader = DataLoader(args.data)

n_items = loader.load_n_items()
train_data = loader.load_data('train')
# Validation/test come as (input part, held-out part) pairs.
vad_data_tr, vad_data_te = loader.load_data('validation')
test_data_tr, test_data_te = loader.load_data('test')

In [42]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np


class MultiDAE(nn.Module):
    """
    Container module for Multi-DAE.

    Multi-DAE : Denoising Autoencoder with Multinomial Likelihood
    See Variational Autoencoders for Collaborative Filtering
    https://arxiv.org/abs/1802.05814

    This variant additionally feeds the rows of the module-level
    ``item_genre_emb`` table through the network together with every batch
    and drops the corresponding output rows again.

    Args:
        p_dims: layer sizes of the decoder, from latent size to item count.
        q_dims: layer sizes of the encoder; defaults to ``p_dims`` reversed.
        dropout: dropout probability applied to the normalised input.
    """

    def __init__(self, p_dims, q_dims=None, dropout=0.5):
        super(MultiDAE, self).__init__()
        # Side information: genre-embedding rows, one column per item.
        # Kept as a plain attribute, so it is not part of the state_dict.
        self.item_genre = torch.Tensor(item_genre_emb.values)

        self.p_dims = p_dims
        if q_dims:
            assert q_dims[0] == p_dims[-1], "In and Out dimensions must equal to each other"
            assert q_dims[-1] == p_dims[0], "Latent dimension for p- and q- network mismatches."
            self.q_dims = q_dims
        else:
            self.q_dims = p_dims[::-1]

        # Encoder and decoder sizes chained into one stack of linear layers.
        self.dims = self.q_dims + self.p_dims[1:]
        self.layers = nn.ModuleList([nn.Linear(d_in, d_out) for
            d_in, d_out in zip(self.dims[:-1], self.dims[1:])])
        self.drop = nn.Dropout(dropout)

        self.init_weights()

    def forward(self, input):
        """Return the reconstruction logits for the rows of ``input``."""
        h = F.normalize(input)
        h = self.drop(h)
        # Prepend the genre rows. They follow the input's device, so the
        # module does not depend on a global ``device`` variable.
        h = torch.cat((self.item_genre.to(input.device), h), 0)

        last = len(self.layers) - 1
        for i, layer in enumerate(self.layers):
            h = layer(h)
            if i != last:
                # torch.tanh replaces the deprecated F.tanh (same values).
                h = torch.tanh(h)

        # Drop the output rows that belong to the genre rows.
        _, reconstructed_h = h.split([self.item_genre.shape[0], input.shape[0]], 0)
        return reconstructed_h

    def init_weights(self):
        for layer in self.layers:
            # Xavier Initialization for weights
            size = layer.weight.size()
            fan_out = size[0]
            fan_in = size[1]
            std = np.sqrt(2.0/(fan_in + fan_out))
            layer.weight.data.normal_(0.0, std)

            # Normal Initialization for Biases
            layer.bias.data.normal_(0.0, 0.001)



class MultiVAE(nn.Module):
    """
    Container module for Multi-VAE.

    Multi-VAE : Variational Autoencoder with Multinomial Likelihood
    See Variational Autoencoders for Collaborative Filtering
    https://arxiv.org/abs/1802.05814

    This variant additionally encodes the rows of the module-level
    ``item_genre_emb`` table together with every batch; ``decode`` drops the
    corresponding output rows again.

    Args:
        p_dims: layer sizes of the decoder, from latent size to item count.
        q_dims: layer sizes of the encoder; defaults to ``p_dims`` reversed.
        dropout: dropout probability applied to the normalised input.
    """

    def __init__(self, p_dims, q_dims=None, dropout=0.5):
        super(MultiVAE, self).__init__()
        self.p_dims = p_dims
        # Side information: genre-embedding rows, one column per item.
        # Kept as a plain attribute, so it is not part of the state_dict.
        self.item_genre = torch.Tensor(item_genre_emb.values)
        if q_dims:
            assert q_dims[0] == p_dims[-1], "In and Out dimensions must equal to each other"
            assert q_dims[-1] == p_dims[0], "Latent dimension for p- and q- network mismatches."
            self.q_dims = q_dims
        else:
            self.q_dims = p_dims[::-1]

        # Last dimension of q- network is for mean and variance
        temp_q_dims = self.q_dims[:-1] + [self.q_dims[-1] * 2]
        self.q_layers = nn.ModuleList([nn.Linear(d_in, d_out) for
            d_in, d_out in zip(temp_q_dims[:-1], temp_q_dims[1:])])
        self.p_layers = nn.ModuleList([nn.Linear(d_in, d_out) for
            d_in, d_out in zip(self.p_dims[:-1], self.p_dims[1:])])

        self.drop = nn.Dropout(dropout)
        self.init_weights()

    def forward(self, input):
        """Return (reconstruction logits, mu, logvar).

        NOTE(review): mu and logvar are returned for the genre rows as well
        as for the rows of ``input``, so a KL term computed from them also
        averages over the genre rows - confirm this is intended.
        """
        # decode() needs the batch size to drop the genre rows again.
        self.input = input

        mu, logvar = self.encode(input)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

    def encode(self, input):
        h = F.normalize(input)
        h = self.drop(h)

        # Prepend the genre rows. They follow the input's device, so the
        # module does not depend on a global ``device`` variable.
        h = torch.cat((self.item_genre.to(input.device), h), 0)
        last = len(self.q_layers) - 1
        for i, layer in enumerate(self.q_layers):
            h = layer(h)
            if i != last:
                # torch.tanh replaces the deprecated F.tanh (same values).
                h = torch.tanh(h)

        # The last encoder layer emits [mu | logvar] side by side.
        latent = self.q_dims[-1]
        mu = h[:, :latent]
        logvar = h[:, latent:]
        return mu, logvar

    def reparameterize(self, mu, logvar):
        # Sample only in training mode; evaluation uses the mean directly.
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return eps.mul(std).add_(mu)
        else:
            return mu

    def decode(self, z):
        """Decode ``z``; relies on ``self.input`` set by the latest forward()."""
        h = z
        last = len(self.p_layers) - 1
        for i, layer in enumerate(self.p_layers):
            h = layer(h)
            if i != last:
                h = torch.tanh(h)
        # Drop the output rows that belong to the genre rows.
        _, reconstructed_h = h.split([self.item_genre.shape[0], self.input.shape[0]], 0)
        return reconstructed_h

    def init_weights(self):
        # Encoder layers first, then decoder layers (same order as before, so
        # the random draws are unchanged).
        for layer in list(self.q_layers) + list(self.p_layers):
            # Xavier Initialization for weights
            size = layer.weight.size()
            fan_out = size[0]
            fan_in = size[1]
            std = np.sqrt(2.0/(fan_in + fan_out))
            layer.weight.data.normal_(0.0, std)

            # Normal Initialization for Biases
            layer.bias.data.normal_(0.0, 0.001)




def loss_function_vae(recon_x, x, mu, logvar, anneal=1.0):
    """Multinomial log-likelihood loss plus an annealed KL term.

    Args:
        recon_x: logits, log-softmaxed over dim 1.
        x: target matrix multiplied element-wise with the log-probabilities.
        mu, logvar: parameters entering the KL term.
        anneal: weight of the KL term.
    """
    log_probs = F.log_softmax(recon_x, 1)
    neg_ll = -(log_probs * x).sum(-1).mean()

    kl_per_row = (1 + logvar - mu.pow(2) - logvar.exp()).sum(dim=1)
    kld = -0.5 * kl_per_row.mean()

    return neg_ll + anneal * kld

def loss_function_dae(recon_x, x):
    """Mean over rows of the negative sum of ``log_softmax(recon_x) * x``."""
    log_probs = F.log_softmax(recon_x, 1)
    return -(log_probs * x).sum(-1).mean()





In [43]:

def sparse2torch_sparse(data):
    """
    Convert scipy sparse matrix to torch sparse tensor with L2 Normalization
    This is much faster than naive use of torch.FloatTensor(data.toarray())
    https://discuss.pytorch.org/t/sparse-tensor-use-cases/22047/2

    Every stored entry of row ``r`` is replaced by ``1 / sqrt(sum of row r)``
    (which is the inverse L2 norm when the stored values are all ones).

    Args:
        data: scipy sparse matrix of shape (samples, features).

    Returns:
        float32 torch sparse COO tensor of the same shape.
    """
    samples, features = data.shape
    coo_data = data.tocoo()

    # Stack into one (2, nnz) int64 array before handing it to torch; building
    # a tensor from a Python list of ndarrays is slow.
    indices = torch.from_numpy(
        np.vstack((coo_data.row, coo_data.col)).astype(np.int64))

    # Row sums as a flat array (data.sum(1) may be an (n, 1) matrix).
    row_sums = np.asarray(data.sum(1)).ravel()
    # Rows without entries divide by zero, but they are never indexed below.
    with np.errstate(divide='ignore'):
        row_norms_inv = 1.0 / np.sqrt(row_sums)

    values = torch.from_numpy(row_norms_inv[coo_data.row]).float()
    # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor.
    return torch.sparse_coo_tensor(indices, values, (samples, features))

def naive_sparse2tensor(data):
    """Densify a scipy sparse matrix into a float32 torch tensor."""
    dense = data.toarray()
    return torch.tensor(dense, dtype=torch.float32)


def train(model, criterion, optimizer, is_VAE = False):
    """Run one training epoch over the global ``train_data``.

    Reads the module-level ``train_data``, ``idxlist``, ``N``, ``args``,
    ``device`` and ``epoch`` (the latter only for logging) and increments the
    global ``update_count`` once per batch.

    Args:
        model: network to train; returns ``recon`` or, when ``is_VAE``,
            ``(recon, mu, logvar)``.
        criterion: loss function matching the model's outputs.
        optimizer: optimizer stepping the model's parameters.
        is_VAE: whether to use the annealed VAE loss signature.

    Returns:
        Mean batch loss over the whole epoch.
    """
    # Turn on training mode
    model.train()
    # Two accumulators: the interval one is reset after every log line, the
    # epoch one is not. Previously a single accumulator was reset at each log
    # and then used for the returned epoch mean, which under-reported the
    # loss whenever a log line had been printed.
    epoch_loss = 0.0
    interval_loss = 0.0
    start_time = time.time()
    global update_count

    np.random.shuffle(idxlist)
    n_batches = len(range(0, N, args.batch_size))

    for batch_idx, start_idx in enumerate(range(0, N, args.batch_size)):
        end_idx = min(start_idx + args.batch_size, N)
        data = train_data[idxlist[start_idx:end_idx]]
        data = naive_sparse2tensor(data).to(device)
        optimizer.zero_grad()

        if is_VAE:
            # KL weight grows linearly with the number of updates up to the cap.
            if args.total_anneal_steps > 0:
                anneal = min(args.anneal_cap,
                             1. * update_count / args.total_anneal_steps)
            else:
                anneal = args.anneal_cap

            recon_batch, mu, logvar = model(data)
            loss = criterion(recon_batch, data, mu, logvar, anneal)
        else:
            recon_batch = model(data)
            loss = criterion(recon_batch, data)

        loss.backward()
        batch_loss = loss.item()
        epoch_loss += batch_loss
        interval_loss += batch_loss
        optimizer.step()

        update_count += 1

        if batch_idx % args.log_interval == 0 and batch_idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:4d}/{:4d} batches | ms/batch {:4.2f} | '
                    'loss {:4.2f}'.format(
                        epoch, batch_idx, n_batches,
                        elapsed * 1000 / args.log_interval,
                        interval_loss / args.log_interval))

            start_time = time.time()
            interval_loss = 0.0

    return epoch_loss / n_batches



def evaluate(model, criterion, data_tr, data_te, is_VAE=False):
    """Evaluate ``model`` on an evaluation split.

    The model sees the rows of ``data_tr``; items already present there are
    masked out of the predictions, and the remaining scores are compared with
    the held-out rows of ``data_te``. Reads the module-level ``args``,
    ``device`` and ``update_count``.

    Args:
        model: network under evaluation.
        criterion: loss function matching the model's outputs.
        data_tr: sparse matrix of input interactions, one row per user.
        data_te: sparse matrix of held-out interactions, row-aligned with
            ``data_tr``.
        is_VAE: whether the model returns ``(recon, mu, logvar)``.

    Returns:
        (mean batch loss, mean NDCG@100, mean Recall@10, mean Recall@20,
        mean Recall@50).
    """
    # Turn on evaluation mode
    model.eval()
    total_loss = 0.0
    e_N = data_tr.shape[0]
    e_idxlist = list(range(e_N))
    n100_list = []
    r10_list = []
    r20_list = []
    r50_list = []

    # update_count does not change during evaluation, so the KL weight is the
    # same for every batch.
    anneal = None
    if is_VAE:
        if args.total_anneal_steps > 0:
            anneal = min(args.anneal_cap,
                         1. * update_count / args.total_anneal_steps)
        else:
            anneal = args.anneal_cap

    with torch.no_grad():
        for start_idx in range(0, e_N, args.batch_size):
            # Clamp with the size of THIS split; the training-set size N was
            # used here before, which yields wrong batches whenever the
            # evaluation split is larger than the training set.
            end_idx = min(start_idx + args.batch_size, e_N)
            data = data_tr[e_idxlist[start_idx:end_idx]]
            heldout_data = data_te[e_idxlist[start_idx:end_idx]]

            data_tensor = naive_sparse2tensor(data).to(device)
            if is_VAE:
                recon_batch, mu, logvar = model(data_tensor)
                loss = criterion(recon_batch, data_tensor, mu, logvar, anneal)
            else:
                recon_batch = model(data_tensor)
                loss = criterion(recon_batch, data_tensor)

            total_loss += loss.item()

            # Exclude examples from training set
            recon_batch = recon_batch.cpu().numpy()
            recon_batch[data.nonzero()] = -np.inf

            n100_list.append(NDCG_binary_at_k_batch(recon_batch, heldout_data, 100))
            r20_list.append(Recall_at_k_batch(recon_batch, heldout_data, 20))
            r10_list.append(Recall_at_k_batch(recon_batch, heldout_data, 10))
            r50_list.append(Recall_at_k_batch(recon_batch, heldout_data, 50))

    total_loss /= len(range(0, e_N, args.batch_size))
    n100_list = np.concatenate(n100_list)
    r20_list = np.concatenate(r20_list)
    r10_list = np.concatenate(r10_list)
    r50_list = np.concatenate(r50_list)

    return total_loss, np.mean(n100_list), np.mean(r10_list), np.mean(r20_list), np.mean(r50_list)


In [44]:
import bottleneck as bn
import numpy as np

def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    Normalized Discounted Cumulative Gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance

    X_pred: dense score array, one row per user.
    heldout_batch: sparse matrix of held-out items, row-aligned with X_pred.
    Returns one NDCG value per row.
    NOTE(review): a row without held-out items has IDCG 0, so its value comes
    from a division by zero - confirm callers never pass such rows.
    '''
    batch_users = X_pred.shape[0]
    # Partition so that the k highest scores occupy the first k positions
    # (unordered), then sort just those k.
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)

    # Item indices of the top k, ordered by descending score.
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]

    # Positional discounts 1 / log2(rank + 1) for ranks 1..k.
    tp = 1. / np.log2(np.arange(2, k + 2))

    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    # Ideal DCG: all of a user's held-out items (at most k) ranked first.
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG


def Recall_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    Recall@k per row: hits among the k highest-scored items divided by
    min(k, number of held-out items).

    X_pred: dense score array, one row per user.
    heldout_batch: sparse matrix of held-out items, row-aligned with X_pred.
    '''
    batch_users = X_pred.shape[0]
    
    # NOTE(review): this adds one scalar (mean of the global item_genre_emb
    # table times 20) to every score; a uniform offset should not change
    # which items are in the top k - confirm whether a per-item term was
    # intended.
    X_pred = X_pred + np.mean(item_genre_emb.values)*20
    idx = bn.argpartition(-X_pred, k, axis=1)
    
    # Boolean mask of the k selected items per row.
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    X_true_binary = (heldout_batch > 0).toarray()
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(
        np.float32)
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    return recall

In [45]:
#!pip install torch torchvision
#import torch
#from torch.utils.tensorboard import SummaryWriter
#writer = SummaryWriter()

#!pip install adabound
import adabound

In [46]:
import wandb

# Start a Weights & Biases run for the experiment that follows and store the
# parsed command-line arguments (`args`) in the run configuration.
wandb.init(project="recmovie", entity="esk1")
wandb.config.update(args)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mesk1[0m (use `wandb login --relogin` to force relogin)


In [47]:

###############################################################################
# Load data
###############################################################################

# `DataLoader` is constructed from the dataset directory given by --data.
loader = DataLoader(args.data)

n_items = loader.load_n_items()
train_data = loader.load_data('train')
# The validation and test splits each come back as a pair of matrices
# (suffixes _tr / _te); both halves are passed to `evaluate` further below.
vad_data_tr, vad_data_te = loader.load_data('validation')
test_data_tr, test_data_te = loader.load_data('test')

# N is the number of rows of the training matrix (presumably one row per
# user -- confirm against the loader); idxlist enumerates those rows.
N = train_data.shape[0]
idxlist = list(range(N))

###############################################################################
# Build the model
###############################################################################
#p_dims = [200, 600, 1600, 3200, n_items]
p_dims = [200, 3000, n_items]
model = MultiDAE(p_dims).to(device)

# NOTE(review): the learning rate is hard-coded to 1e-3 here; --lr and --wd
# from `args` are not used by the active optimizer line.
#optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=args.wd)
optimizer = adabound.AdaBound(model.parameters(), lr=1e-3, final_lr=0.1)
#https://github.com/Luolc/AdaBound
#optimizer = optim.SGD(model.parameters(), lr=1e-3, weight_decay=args.wd)
criterion = loss_function_dae

###############################################################################
# Training code
###############################################################################

# Best validation recall@10 seen so far; the training cell compares against it
# to decide when to checkpoint.
best_r10 = -np.inf
update_count = 0

In [48]:
# Train the DAE (`model`) for `new_epochs` epochs, checkpoint on the best
# validation recall@10, then reload that checkpoint and score the test split.
# NOTE(review): the original header of this cell read "batch 1600"; the only
# batch size visible in this cell is args.batch_size -- confirm which applies.
train_loss_list = []
val_loss_list = []
r10_fin_list = []
new_epochs = 100

for epoch in range(1, new_epochs + 1):
    epoch_start_time = time.time()
    train_loss = train(model, criterion, optimizer, is_VAE=False)
    val_loss, n100, r10, r20, r50 = evaluate(model, criterion, vad_data_tr, vad_data_te, is_VAE=False)
    
    # Per-epoch history kept in plain lists.
    train_loss_list.append(train_loss)
    val_loss_list.append(val_loss)
    r10_fin_list.append(r10)

    print('-' * 89)
    print('| end of epoch {:3d} | time: {:4.2f}s | valid loss {:4.2f} | '
            'n100 {:5.3f} | r10 {:5.3f} | r20 {:5.3f} | r50 {:5.3f}'.format(
                epoch, time.time() - epoch_start_time, val_loss,
                n100, r10, r20, r50))
    print('-' * 89)

    # Number of batches processed so far (epochs * batches per epoch); it is
    # not read again within this cell.
    n_iter = epoch * len(range(0, N, args.batch_size))


    # Save the model if the validation r10 is the best we've seen so far.
    if r10 > best_r10:
        with open('DAE_'+args.save, 'wb') as f:
            torch.save(model, f)
        best_r10 = r10
        print("Better performance! save best model...")

    wandb.log({
        "train_loss": train_loss,
        "val_loss": val_loss,
        "n100": n100,
        "r10": r10, 
        "r20": r20,
        "r50": r50
    })

# Load the best saved model.
with open('DAE_'+args.save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss, n100, r10, r20, r50 = evaluate(model, criterion, test_data_tr, test_data_te, is_VAE=False)
print('=' * 89)
print('| End of training | test loss {:4.2f} | n100 {:4.2f} | r10 {:4.2f} | r20 {:4.2f} | '
        'r50 {:4.2f}'.format(test_loss, n100, r10, r20, r50))
print('=' * 89)
# NOTE(review): wandb.watch is called only after training has finished and on
# the reloaded checkpoint, so it was not active during the loop above.
wandb.watch(model)

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 2.95s | valid loss 968.81 | n100 0.344 | r10 0.281 | r20 0.257 | r50 0.312
-----------------------------------------------------------------------------------------




Better performance! save best model...
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 2.83s | valid loss 946.18 | n100 0.385 | r10 0.315 | r20 0.290 | r50 0.350
-----------------------------------------------------------------------------------------
Better performance! save best model...
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 2.94s | valid loss 936.94 | n100 0.402 | r10 0.330 | r20 0.305 | r50 0.367
-----------------------------------------------------------------------------------------
Better performance! save best model...
-----------------------------------------------------------------------------------------
| end of epoch   4 | time: 2.90s | valid loss 930.41 | n100 0.415 | r10 0.347 | r20 0.319 | r50 0.381
-----------------------------------------------------------------------------------------
Better performance! save best model..

[]

## MultiVAE

In [49]:
import wandb

# Start another Weights & Biases run (same project/entity) for the VAE
# experiment and store the parsed arguments (`args`) in its configuration.
wandb.init(project="recmovie", entity="esk1")
wandb.config.update(args)




0,1
n100,▁▅▇▇▇█▇█▇██████████▇█████████████████▇▇▇
r10,▁▅▇▇▇█▇█▅▇███▇████▇▅█▇▇██▇▇▇▇▇▇▇█▇▇▇▇▇▇▇
r20,▁▅▇▇▇█▇█▆▇▇██▇████▇▆██████▇▇▇▇▇▇█▇▇▇▇▇▇▇
r50,▁▅▇▇████▇██████████▇████████████████████
train_loss,█▅▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁▁▁▁▁
val_loss,█▆▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
n100,0.43905
r10,0.35739
r20,0.33532
r50,0.40545
train_loss,1075.3526
val_loss,877.73273


In [50]:

###############################################################################
# Load data
###############################################################################

# `DataLoader` is constructed from the dataset directory given by --data.
loader = DataLoader(args.data)

n_items = loader.load_n_items()
train_data = loader.load_data('train')
# The validation and test splits each come back as a pair of matrices
# (suffixes _tr / _te); both halves are passed to `evaluate` further below.
vad_data_tr, vad_data_te = loader.load_data('validation')
test_data_tr, test_data_te = loader.load_data('test')

# N is the number of rows of the training matrix; idxlist enumerates them.
N = train_data.shape[0]
idxlist = list(range(N))

###############################################################################
# Build the model
###############################################################################

# Same layer sizes as the DAE; the VAE and its optimizer/criterion get the
# suffix "2" so the DAE objects (`model`, `optimizer`, `criterion`) survive.
p_dims = [200, 3000, n_items]
model2 = MultiVAE(p_dims).to(device)

# NOTE(review): lr is hard-coded to 1e-3; --lr / --wd from `args` are unused.
optimizer2 = adabound.AdaBound(model2.parameters(), lr=1e-3, final_lr=0.1)
#optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=args.wd)
criterion2 = loss_function_vae

###############################################################################
# Training code
###############################################################################

# NOTE(review): the VAE training cell selects checkpoints with `best_r10`,
# not with `best_n100` -- this variable looks unused; confirm before removing.
best_n100 = -np.inf
update_count = 0

In [51]:
# Train the VAE (`model2`) for 100 epochs, checkpoint on the best validation
# recall@10, then reload that checkpoint and score the test split.
train_loss_list = []
val_loss_list = []
r10_fin_list = []

best_r10 = -np.inf
# The best model is selected by validation r10.
for epoch in range(1, 100 + 1):
    epoch_start_time = time.time()
    train_loss = train(model2, criterion2, optimizer2, is_VAE=True)
    val_loss, n100, r10, r20, r50 = evaluate(model2, criterion2, vad_data_tr, vad_data_te, is_VAE=True)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:4.2f}s | valid loss {:4.2f} | '
            'n100 {:5.3f} | r10 {:5.3f} | r20 {:5.3f} | r50 {:5.3f}'.format(
                epoch, time.time() - epoch_start_time, val_loss,
                n100, r10, r20, r50))
    print('-' * 89)

    # Number of batches processed so far (epochs * batches per epoch); it is
    # not read again within this cell.
    n_iter = epoch * len(range(0, N, args.batch_size))
    train_loss_list.append(train_loss)
    val_loss_list.append(val_loss)
    r10_fin_list.append(r10)

    # Save the model if the r10 is the best we've seen so far.
    if r10 > best_r10:
        with open('VAE_'+args.save, 'wb') as f:
            torch.save(model2, f)
        best_r10 = r10
        print("Better performance! save best model...")

    wandb.log({
        "train_loss": train_loss,
        "val_loss": val_loss,
        "n100": n100,
        "r10": r10,
        "r20": r20,
        "r50": r50
    })


# Load the best saved model.
with open('VAE_'+args.save, 'rb') as f:
    model2 = torch.load(f)

# Run on test data.
test_loss, n100, r10, r20, r50 = evaluate(model2, criterion2, test_data_tr, test_data_te, is_VAE=True)
print('=' * 89)
print('| End of training | test loss {:4.2f} | n100 {:4.2f} | r10 {:4.2f} | r20 {:4.2f} | '
        'r50 {:4.2f}'.format(test_loss, n100, r10, r20, r50))
print('=' * 89)
# Watch the VAE that this cell trained. The original passed `model` (the DAE),
# a copy-paste slip from the DAE cell.
# NOTE(review): this still runs only after training has finished.
wandb.watch(model2)



-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 3.01s | valid loss 972.23 | n100 0.345 | r10 0.281 | r20 0.258 | r50 0.312
-----------------------------------------------------------------------------------------




Better performance! save best model...
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 2.83s | valid loss 952.94 | n100 0.383 | r10 0.323 | r20 0.291 | r50 0.344
-----------------------------------------------------------------------------------------
Better performance! save best model...
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 2.83s | valid loss 941.27 | n100 0.405 | r10 0.340 | r20 0.311 | r50 0.367
-----------------------------------------------------------------------------------------
Better performance! save best model...
-----------------------------------------------------------------------------------------
| end of epoch   4 | time: 2.77s | valid loss 934.64 | n100 0.415 | r10 0.349 | r20 0.318 | r50 0.378
-----------------------------------------------------------------------------------------
Better performance! save best model..

[]

In [52]:
## 배치사이즈 포함
def numerize_for_infer(tp, profile2id, show2id):
    """Translate raw user/item ids into their internal integer indices.

    Args:
        tp: DataFrame with a 'user' and an 'item' column holding raw ids.
        profile2id: mapping raw user id -> internal user index.
        show2id: mapping raw item id -> internal item index.

    Returns:
        DataFrame with the columns 'uid' and 'sid' (in that order), aligned
        with the index of ``tp``.

    Raises:
        KeyError: if an id of ``tp`` is missing from its mapping.
    """
    def user_index(raw_user):
        return profile2id[raw_user]

    def item_index(raw_item):
        return show2id[raw_item]

    numerized = {
        'uid': tp['user'].apply(user_index),
        'sid': tp['item'].apply(item_index),
    }
    return pd.DataFrame(data=numerized, columns=['uid', 'sid'])

### Prepare the data for inference
# Re-index the complete interaction log with the internal user/item indices.
infer_df = numerize_for_infer(raw_data, profile2id, show2id)

loader = DataLoader(args.data)
n_items = loader.load_n_items()

n_users = infer_df['uid'].max() + 1

# Build a (n_users x n_items) sparse matrix from the (uid, sid) pairs, with a
# one contributed by every pair of the log.
rows, cols = infer_df['uid'], infer_df['sid']
data = sparse.csr_matrix((np.ones_like(rows),
                                 (rows, cols)), dtype='float64',
                                 shape=(n_users, n_items))

# NOTE: this rebinds the notebook-wide N / idxlist that the setup cells
# derived from the training matrix.
N = data.shape[0]
idxlist = list(range(N))

# Both models are only used for scoring from here on.
model.eval()
model2.eval()
total_loss = 0.0
e_idxlist = list(range(data.shape[0]))
e_N = data.shape[0]
pred_list = None
# Drop the id column so only the preference values remain. errors='ignore'
# keeps the cell re-runnable: the drop is in place, so a second execution
# would otherwise raise KeyError because 'users' is already gone.
user_genre_prefer.drop('users', axis=1, inplace=True, errors='ignore')

In [90]:
# Score every user with both trained models (`model` and `model2`), sum the
# two score matrices, add the genre-based terms, and keep 10 not-yet-seen
# items per user in `pred_list`.
total_loss = 0
total_loss2 = 0

# The annealing weight only depends on values that stay fixed during
# inference, so it is computed once instead of once per batch.
if args.total_anneal_steps > 0:
    anneal = min(args.anneal_cap, 1. * update_count / args.total_anneal_steps)
else:
    anneal = args.anneal_cap

# Per-batch top-10 index arrays. They are joined once after the loop; growing
# an array with np.append on every batch re-copies all earlier rows each time.
topk_batches = []

with torch.no_grad():
    for start_idx in range(0, e_N, args.batch_size):
        # Clamp with e_N, the bound this loop iterates over. The notebook-wide
        # N is rebound by the setup cells and may describe another matrix.
        end_idx = min(start_idx + args.batch_size, e_N)
        batch_rows = e_idxlist[start_idx:end_idx]
        data_batch = data[batch_rows]

        data_tensor = naive_sparse2tensor(data_batch).to(device)
        data_tensor2 = naive_sparse2tensor(data_batch).to(device)

        recon_batch = model(data_tensor)
        loss = criterion(recon_batch, data_tensor)
        total_loss += loss.item()

        recon_batch2, mu, logvar = model2(data_tensor2)
        loss2 = criterion2(recon_batch2, data_tensor2, mu, logvar, anneal)
        total_loss2 += loss2.item()

        recon_batch2 = recon_batch2.cpu().numpy()
        recon_batch = recon_batch.cpu().numpy()

        # 1:1 ensemble: plain sum of both models' scores.
        recon_batch = np.add(recon_batch, recon_batch2)
        # Add the genre terms: a global scalar plus this batch's rows of
        # user_genre_prefer (assumes one row per user in uid order and a
        # shape broadcastable to the scores -- TODO confirm).
        recon_batch = recon_batch + np.mean(item_genre_emb.values)*20 + user_genre_prefer.values[batch_rows]*20
        # Exclude items the user has already interacted with.
        recon_batch[data_batch.nonzero()] = -np.inf

        # Indices of the 10 highest-scored items per user (order within the
        # 10 is not sorted).
        batch_users = recon_batch.shape[0]
        idx = bn.argpartition(-recon_batch, 10, axis=1)[:, :10]
        topk_batches.append(idx)

# One (n_users, 10) array of item indices; None when there was nothing to score.
pred_list = np.concatenate(topk_batches, axis=0) if topk_batches else None

print(pred_list.shape)
## Reshape the predictions to match the sample_submission layout:
## one (user, item) row per recommendation.
user2 = []
item2 = []
for i_idx, arr_10 in enumerate(pred_list):
    # Row position i_idx is the internal user index; repeat it once for each
    # of the 10 recommended item indices of that row.
    user2.extend([i_idx]*10)
    item2.extend(arr_10)

u2 = pd.DataFrame(user2, columns=['user'])
i2 = pd.DataFrame(item2, columns=['item'])
all2 = pd.concat([u2, i2], axis=1)

# Inverted lookup tables: internal index -> original id.
re_p2id = dict((v, k) for k, v in profile2id.items())
re_s2id = dict((v, k) for k, v in show2id.items())

def de_numerize(tp, re_p2id, re_s2id):
    """Translate internal user/item indices back into the original ids.

    Args:
        tp: DataFrame with a 'user' and an 'item' column of internal indices.
        re_p2id: mapping internal user index -> original user id.
        re_s2id: mapping internal item index -> original item id.

    Returns:
        DataFrame with the columns 'uid' and 'sid' (in that order), aligned
        with the index of ``tp``.

    Raises:
        KeyError: if an index of ``tp`` is missing from its mapping.
    """
    def original_user(internal_idx):
        return re_p2id[internal_idx]

    def original_item(internal_idx):
        return re_s2id[internal_idx]

    restored = {
        'uid': tp['user'].apply(original_user),
        'sid': tp['item'].apply(original_item),
    }
    return pd.DataFrame(data=restored, columns=['uid', 'sid'])

# Map the internal indices back to original ids and order the rows by user.
ans2 = de_numerize(all2, re_p2id, re_s2id)
ans2.columns = ['user', 'item']
new_ans2 = ans2.sort_values('user')

### Sanity check: the user column should line up with the sample submission.
submit_data = pd.read_csv('/opt/ml/input/data/eval/sample_submission.csv', sep=',')
# The original computed this comparison and discarded the result, so the
# check never reported anything; print the outcome instead.
pred_users2 = new_ans2.user.values
expected_users2 = submit_data.user.values
if len(pred_users2) != len(expected_users2):
    print('WARNING: submission has {} rows, sample_submission has {}'.format(
        len(pred_users2), len(expected_users2)))
else:
    n_user_match2 = int((pred_users2 == expected_users2).sum())
    print('user rows matching sample_submission: {} / {}'.format(
        n_user_match2, len(expected_users2)))
new_ans2.reset_index(drop=True, inplace=True)
new_ans2.to_csv('0405_user_item_prefer.csv', index=False)



(31360, 10)


In [59]:
## Variant without batching -> error (original author's note; presumably the
## batched cell is the working path -- TODO confirm).
### Predict
total_loss = 0
total_loss2 = 0

with torch.no_grad():
    # The complete interaction matrix is converted and moved to the device in
    # one piece, once per model.
    data_tensor = naive_sparse2tensor(data).to(device)
    data_tensor2 = naive_sparse2tensor(data).to(device)

    if args.total_anneal_steps > 0:
        anneal = min(args.anneal_cap, 1. * update_count / args.total_anneal_steps)
    else:
        anneal = args.anneal_cap
        
    recon_batch = model(data_tensor)
    loss = criterion(recon_batch, data_tensor)
    total_loss += loss.item()

    recon_batch2, mu, logvar = model2(data_tensor2)
    loss2 = criterion2(recon_batch2, data_tensor2, mu, logvar, anneal)
    total_loss2 += loss2.item()

# Bring both score matrices back to NumPy on the host.
recon_batch2 = recon_batch2.cpu().numpy()
recon_batch = recon_batch.cpu().numpy()
        
recon_batch = np.add(recon_batch, recon_batch2) # 1:1 ensemble (plain sum)
print(recon_batch)
# Add the genre terms: a global scalar plus user_genre_prefer (assumes its
# shape is broadcastable to the score matrix -- TODO confirm).
recon_batch = recon_batch + np.mean(item_genre_emb.values)*20 + user_genre_prefer.values*20
# Exclude items the user has already interacted with.
recon_batch[data.nonzero()] = -np.inf

# Indices of the 10 highest-scored items per user (not sorted among the 10).
idx = bn.argpartition(-recon_batch, 10, axis=1)[:, :10]


### Convert to the sample_submission layout and map the ids back
user = []
item = []
for i_idx, arr_10 in enumerate(idx):
    # Row position i_idx is the internal user index; repeat it once for each
    # of the 10 recommended item indices of that row.
    user.extend([i_idx]*10)
    item.extend(arr_10)

u = pd.DataFrame(user, columns=['user'])
i = pd.DataFrame(item, columns=['item'])
# Named all_df rather than `all`: binding a DataFrame to `all` would shadow
# the builtin for every cell executed afterwards.
all_df = pd.concat([u, i], axis=1)

# Inverted lookup tables: internal index -> original id.
re_p2id = dict((v, k) for k, v in profile2id.items())
re_s2id = dict((v, k) for k, v in show2id.items())

def de_numerize(tp, re_p2id, re_s2id):
    """Translate internal user/item indices back into the original ids.

    Args:
        tp: DataFrame with a 'user' and an 'item' column of internal indices.
        re_p2id: mapping internal user index -> original user id.
        re_s2id: mapping internal item index -> original item id.

    Returns:
        DataFrame with the columns 'uid' and 'sid' (in that order).

    Raises:
        KeyError: if an index of ``tp`` is missing from its mapping.
    """
    uid = tp['user'].apply(lambda x: re_p2id[x])
    sid = tp['item'].apply(lambda x: re_s2id[x])
    return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

ans = de_numerize(all_df, re_p2id, re_s2id)
ans.columns = ['user', 'item']
new_ans = ans.sort_values('user')

### Sanity check: the user column should line up with the sample submission.
submit_data = pd.read_csv('/opt/ml/input/data/eval/sample_submission.csv', sep=',')
# The original computed this comparison and discarded the result, so the
# check never reported anything; print the outcome instead.
pred_users = new_ans.user.values
expected_users = submit_data.user.values
if len(pred_users) != len(expected_users):
    print('WARNING: submission has {} rows, sample_submission has {}'.format(
        len(pred_users), len(expected_users)))
else:
    n_user_match = int((pred_users == expected_users).sum())
    print('user rows matching sample_submission: {} / {}'.format(
        n_user_match, len(expected_users)))
new_ans.reset_index(drop=True, inplace=True)
new_ans.to_csv('0405_second.csv', index=False)



[[ -1.7009397   -2.4967341   -1.6319823  ... -16.350296    -8.83639
  -10.603864  ]
 [  5.370331     6.2508354    0.9975498  ... -11.320208   -11.89768
   -4.7585497 ]
 [  2.1687698    3.367939     0.51937413 ...  -6.740138   -13.162594
   -4.848978  ]
 ...
 [ -1.8735077   -0.25055498  -3.3098485  ... -10.564044    -2.3500547
   -8.045005  ]
 [ -3.3119338   -2.2483325   -0.8973126  ...  -8.552555    -1.1070881
   -8.202363  ]
 [ -3.106039    -0.7413424    6.3965263  ...  -7.470235    -8.992666
   -8.2287855 ]]
