In [None]:
!

In [147]:
import os
import bottleneck as bn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [1]:
import argparse
import time
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from scipy import sparse
import os
import pandas as pd
## 각종 파라미터 세팅
parser = argparse.ArgumentParser(description='PyTorch Variational Autoencoders for Collaborative Filtering')


parser.add_argument('--data', type=str, default='data/train/', help='Movielens dataset location')
parser.add_argument('--lr', type=float, default=1e-4, help='initial learning rate')
parser.add_argument('--wd', type=float, default=0.00, help='weight decay coefficient')
parser.add_argument('--batch_size', type=int, default=500, help='batch size')
parser.add_argument('--epochs', type=int, default=20, help='upper epoch limit')
parser.add_argument('--total_anneal_steps', type=int, default=200000, help='the total number of gradient updates for annealing')
parser.add_argument('--anneal_cap', type=float, default=0.2, help='largest annealing parameter')
parser.add_argument('--seed', type=int, default=1111, help='random seed')
parser.add_argument('--cuda', action='store_true', help='use CUDA')
parser.add_argument('--log_interval', type=int, default=100, metavar='N', help='report interval')
parser.add_argument('--save', type=str, default='model.pt', help='path to save the final model')
args = parser.parse_args([])

# Set the random seed manually for reproductibility.
torch.manual_seed(args.seed)

#만약 GPU가 사용가능한 환경이라면 GPU를 사용
if torch.cuda.is_available():
    args.cuda = True

device = torch.device("cuda" if args.cuda else "cpu")
device

device(type='cuda')

In [112]:
import os
import pandas as pd
from scipy import sparse
import numpy as np

def get_count(tp, id):
    '''
    tp -> DataFrame
    id -> Feature of DataFrame
    '''
    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
    count = playcount_groupbyid.size()

    return count

# 특정한 횟수 이상의 리뷰가 존재하는(사용자의 경우 min_uc 이상, 아이템의 경우 min_sc이상) 
# 데이터만을 추출할 때 사용하는 함수입니다.
# 현재 데이터셋에서는 결과적으로 원본그대로 사용하게 됩니다.
def filter_triplets(tp, min_uc=5, min_sc=0):
    if min_sc > 0:
        itemcount = get_count(tp, 'item') # tp DF의 'item'에 대한 count
        tp = tp[tp['item'].isin(itemcount.index[itemcount >= min_sc])]

    if min_uc > 0:
        usercount = get_count(tp, 'user')
        tp = tp[tp['user'].isin(usercount.index[usercount >= min_uc])]

    usercount, itemcount = get_count(tp, 'user'), get_count(tp, 'item')
    return tp, usercount, itemcount

#훈련된 모델을 이용해 검증할 데이터를 분리하는 함수입니다.
#100개의 액션이 있다면, 그중에 test_prop 비율 만큼을 비워두고, 그것을 모델이 예측할 수 있는지를
#확인하기 위함입니다.
def split_train_test_proportion(data, test_prop=0.2):
    '''
    data -> DataFrame
    
    train과 test를 8:2 비율로 나눠주는 함수.
    '''
    data_grouped_by_user = data.groupby('user')
    tr_list, te_list = list(), list()

    np.random.seed(98765)
    
    for _, group in data_grouped_by_user:
        n_items_u = len(group)
        
        if n_items_u >= 5:
            idx = np.zeros(n_items_u, dtype='bool') # 'False'가 n_items_u개 만큼 채워진 array
            
            # n_items_u개 중에서 20%의 인덱스를 랜덤으로 뽑아서 해당 인덱스를 'True'로 바꿈
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True
                    
            tr_list.append(group[np.logical_not(idx)]) # 'False'인 것을 tr_list에 추가
            te_list.append(group[idx]) # 'True'인 것을 te_list에 추가
        
        else:
            tr_list.append(group)
    
    data_tr = pd.concat(tr_list)
    data_te = pd.concat(te_list)

    return data_tr, data_te

def numerize(tp, profile2id, show2id):
    '''
    tp -> DataFrame
    profile2id, show2id -> dict()
    
    user, item을 reindexing한 df 반환.
    '''
    uid = tp['user'].apply(lambda x: profile2id[x])
    sid = tp['item'].apply(lambda x: show2id[x])
    return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

In [182]:
print("Load and Preprocess Movielens dataset")
# Load Data
DATA_DIR = args.data
raw_data = pd.read_csv(os.path.join(DATA_DIR, 'train_ratings.csv'), header=0)
raw_data = raw_data.drop(columns=['time'])
raw_data['rating'] = 1
print("원본 데이터\n", raw_data)

Load and Preprocess Movielens dataset
원본 데이터
            user   item  rating
0            11   4643       1
1            11    170       1
2            11    531       1
3            11    616       1
4            11   2140       1
...         ...    ...     ...
5154466  138493  44022       1
5154467  138493   4958       1
5154468  138493  68319       1
5154469  138493  40819       1
5154470  138493  27311       1

[5154471 rows x 3 columns]


In [183]:
movie, user_activity, item_popularity = filter_triplets(raw_data, min_uc=5, min_sc=0)

# Shuffle User Indices
unique_uid = user_activity.index
print("(BEFORE) unique_uid:", unique_uid)
np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)  # 해당 숫자까지의 인덱스를 무작위로 섞은 것을 arr로 반환
unique_uid = unique_uid[idx_perm]
print("(AFTER) unique_uid:", unique_uid)  # 무작위로 item을 섞음

(BEFORE) unique_uid: Int64Index([    11,     14,     18,     25,     31,     35,     43,     50,
                58,     60,
            ...
            138459, 138461, 138470, 138471, 138472, 138473, 138475, 138486,
            138492, 138493],
           dtype='int64', name='user', length=31360)
(AFTER) unique_uid: Int64Index([ 27968,  67764,   2581,  82969, 137831,  48639,  97870,  40424,
             46835,  79570,
            ...
            114284,   9009,  21165,  33920,  22054, 135379, 125855,  41891,
             15720,  17029],
           dtype='int64', name='user', length=31360)


In [184]:
unique_sid = pd.unique(movie['item'])
len(unique_sid)

6807

In [185]:
show2id = dict((int(sid), int(i)) for (i, sid) in enumerate(unique_sid))  # item2idx dict
profile2id = dict((int(pid), int(i)) for (i, pid) in enumerate(unique_uid))  # user2idx dict

In [186]:
movie

Unnamed: 0,user,item,rating
0,11,4643,1
1,11,170,1
2,11,531,1
3,11,616,1
4,11,2140,1
...,...,...,...
5154466,138493,44022,1
5154467,138493,4958,1
5154468,138493,68319,1
5154469,138493,40819,1


In [187]:
def numerize(tp, profile2id, show2id):
    """
    tp -> DataFrame
    profile2id, show2id -> dict()

    user, item을 reindexing한 df 반환.
    """
    uid = tp['user'].apply(lambda x: profile2id[x])
    sid = tp['item'].apply(lambda x: show2id[x])
    return pd.DataFrame(data={'uid': uid, 'sid': sid, 'rating': tp['rating']}, columns=['uid', 'sid', 'rating'])

In [188]:
movie = numerize(movie, profile2id, show2id)
movie

Unnamed: 0,uid,sid,rating
0,11825,0,1
1,11825,1,1
2,11825,2,1
3,11825,3,1
4,11825,4,1
...,...,...,...
5154466,10783,423,1
5154467,10783,1491,1
5154468,10783,331,1
5154469,10783,733,1


In [189]:
pivot_table = movie.pivot_table(index = ["uid"], columns = ["sid"],values = "rating")
pivot_table

sid,0,1,2,3,4,5,6,7,8,9,...,6797,6798,6799,6800,6801,6802,6803,6804,6805,6806
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31355,,,,,,,,,,,...,,,,,,,,,,
31356,,,,,,,,,,,...,,,,,,,,,,
31357,,,,,,,1.0,,,,...,,,,,,,,,,
31358,,,,1.0,,,,,,,...,,,,,,,,,,


In [190]:
X = pivot_table.to_numpy()
X

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [191]:
X = np.nan_to_num(X)
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [192]:
class EASE:
    """
    Embarrassingly Shallow Autoencoders model class
    """
    
    def __init__(self, lambda_):
        self.B = None
        self.lambda_ = lambda_
    
    def train(self, rating_matrix):
        """
        train pass
        :param rating_matrix: rating matrix
        """
        G = rating_matrix.T @ rating_matrix
        diag = list(range(G.shape[0]))
        G[diag, diag] += self.lambda_
        P = np.linalg.inv(G)

        # B = P * (X^T * X − diagMat(γ))
        self.B = P / -np.diag(P)
        min_dim = min(*self.B.shape)
        self.B[range(min_dim), range(min_dim)] = 0
    
    def forward(self, user_row):
        """
        forward pass
        """
        return user_row @ self.B

In [209]:
ease = EASE(500)

In [210]:
ease.train(X)

In [211]:
num_k = 10

result = ease.forward(X[:, :])
result[X.nonzero()] = -np.inf
pred_list = bn.argpartition(-result, num_k, axis=1)[:, :num_k]

In [212]:
pd.DataFrame(result)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6797,6798,6799,6800,6801,6802,6803,6804,6805,6806
0,0.011731,-0.035673,-0.015981,-0.047282,-0.024147,-0.002194,0.274899,-0.021591,-0.020003,0.042056,...,0.004111,-0.003226,0.014818,0.001546,0.016300,-0.012090,0.004897,-0.002854,0.000832,0.006400
1,0.028961,0.054722,-0.011079,0.015485,0.053344,-0.012203,0.009506,-0.035038,0.020011,0.030836,...,-0.001435,-0.000778,0.005790,0.020769,-0.000095,-0.004505,-0.006420,0.015964,0.004832,-0.004245
2,0.034148,0.032678,-0.006214,-0.055194,-0.029402,0.017823,0.000420,0.003860,0.036687,-0.013020,...,-0.003991,0.006797,-0.006167,-0.004917,0.003496,0.000311,0.010890,-0.003493,-0.006652,0.009338
3,-0.048228,-0.018545,-0.002125,-0.007178,0.015331,0.022188,-0.017350,0.052638,0.008847,0.061715,...,-0.000399,0.007201,0.003958,-0.008591,0.001508,-0.007081,0.003635,0.015445,0.000992,0.000830
4,0.049076,0.020851,0.024238,-0.012179,0.017176,0.033463,0.012917,0.001824,0.009960,0.021081,...,-0.008223,-0.003875,-0.004487,-0.001991,0.003346,0.001011,-0.009274,-0.002192,-0.000363,-0.009642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31355,0.003707,0.031630,0.012206,0.017352,0.005590,-0.010473,-0.000901,0.027524,-0.041609,-0.007252,...,0.000713,0.000412,0.011423,0.022222,0.007883,0.004993,-0.007255,-0.003051,-0.001425,-0.004404
31356,-0.013426,0.022571,0.019493,0.027699,0.007411,-0.006143,0.666106,0.002495,-0.020419,0.004112,...,-0.003953,-0.008163,-0.009372,-0.007753,-0.004473,0.014978,-0.018267,0.007105,-0.001977,0.014636
31357,0.032782,-0.009010,0.023447,0.071709,0.195722,-0.017960,-inf,-0.028239,0.141336,0.014548,...,-0.021472,0.003875,-0.009756,0.001943,-0.009948,0.005193,0.029362,0.002945,-0.011655,-0.008748
31358,-0.023865,-0.024192,0.190202,-inf,0.267330,-0.030685,0.220291,-0.007516,0.186605,-0.063117,...,0.018202,0.071676,0.022489,0.001929,0.034934,-0.024364,0.010463,-0.022868,-0.017616,-0.038654


In [213]:
print(pred_list)

[[ 599  604  592 ...  299  858  844]
 [ 656  157   40 ... 1469  652  306]
 [ 571   82  375 ...  264   80  625]
 ...
 [  80   84  376 ... 1399  217 1451]
 [  46 1285 1068 ...  208  450 1754]
 [ 383  374  209 ... 1287  474  796]]


In [151]:
def denumerize_for_infer(tp, re_p2id, re_s2id):
    uid2 = tp['user'].apply(lambda x: re_p2id[x])
    sid2 = tp['item'].apply(lambda x: re_s2id[x])
    return pd.DataFrame(data={'uid': uid2, 'sid': sid2}, columns=['uid', 'sid'])

In [214]:
user2 = []
item2 = []
for i_idx, arr_10 in tqdm(enumerate(pred_list)):
    user2.extend([i_idx]*10)
    item2.extend(arr_10)

output = pd.DataFrame({"user":user2, "item":item2})
output

31360it [00:00, 323033.37it/s]


Unnamed: 0,user,item
0,0,599
1,0,604
2,0,592
3,0,1253
4,0,504
...,...,...
313595,31359,202
313596,31359,564
313597,31359,1287
313598,31359,474


In [215]:
re_p2id = dict((int(v), int(k)) for k, v in profile2id.items())
re_s2id = dict((int(v), int(k)) for k, v in show2id.items())

output = denumerize_for_infer(output, re_p2id, re_s2id)
output.columns = ['user', 'item']
output = output.sort_values('user')
output

Unnamed: 0,user,item
118257,11,33004
118250,11,4370
118251,11,4886
118252,11,40815
118253,11,47
...,...,...
107837,138493,32587
107838,138493,4022
107839,138493,53125
107830,138493,551


In [216]:
output.to_csv("output8.csv", index=False)

In [95]:
rating = raw_data.drop(columns=['time'])
rating['rating'] = 1
# rating = rating.sort_values(["user", 'item'])
# rating.reset_index(drop=True, inplace = True)
rating

Unnamed: 0,user,item,rating
0,11,4643,1
1,11,170,1
2,11,531,1
3,11,616,1
4,11,2140,1
...,...,...,...
5154466,138493,44022,1
5154467,138493,4958,1
5154468,138493,68319,1
5154469,138493,40819,1


In [96]:
data = pd.merge(movie, rating)
data

Unnamed: 0,item,title,user,rating
0,318,"Shawshank Redemption, The (1994)",11,1
1,318,"Shawshank Redemption, The (1994)",18,1
2,318,"Shawshank Redemption, The (1994)",25,1
3,318,"Shawshank Redemption, The (1994)",35,1
4,318,"Shawshank Redemption, The (1994)",43,1
...,...,...,...,...
5154466,8130,"Girl Next Door, The (1999)",119506,1
5154467,8130,"Girl Next Door, The (1999)",130762,1
5154468,8130,"Girl Next Door, The (1999)",135885,1
5154469,8130,"Girl Next Door, The (1999)",137805,1


In [97]:
# data = data.iloc[:1000000, :]
pivot_table = data.pivot_table(index = ['user'], columns = ['item'], values = 'rating')
pivot_table

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,1.0,,,,,,,,,,...,,,,,,,,,,
14,1.0,,,,,,1.0,,,,...,,,,,,,,,,
18,,,,,,,,,,,...,,,,,,,,,,
25,,,,,,,,,,,...,,,,,,,,,,
31,,,,,,,,,,,...,,,,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138473,1.0,,,,,,,,,,...,,,,,,,,,,
138475,,,,,,,,,,,...,,,,,,,,,,
138486,1.0,,,,,,,,,,...,,,,,,,,,,
138492,,,,,,,,,,,...,,,,,,,,,,


In [98]:
X = pivot_table.to_numpy()
X

array([[ 1., nan, nan, ..., nan, nan, nan],
       [ 1., nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [ 1., nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [ 1.,  1., nan, ..., nan, nan, nan]])

In [99]:
X = np.nan_to_num(X)
X

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.]])

In [101]:
ease = EASE(300)

In [102]:
ease.train(X)

In [103]:
K = 10
top_k_indexes = ease.forward(X[0, :])

# top_k_indexes
top_k_indexes[X[0, :].nonzero()] = -np.inf
top_k_indexes = top_k_indexes.argsort()[-K:][::-1]

In [104]:
for i in movie['item'][top_k_indexes].to_numpy():
    print(i)

43869
25833
4886
361
102125
2052
1976
1588
1378
2571


In [88]:
for i in movie['item'][top_k_indexes].to_numpy():
    print(i)

43869
1378
314
4886
25833
69995
102125
25963
1419
1588


In [63]:
list(movie['item'][top_k_indexes].values)

[4370, 33004, 2997, 47, 4886, 30707, 32587, 3996, 3081, 7438]

In [105]:
users = []
items = []

u = sorted(rating['user'].unique())
for i in u:
    users.extend([i]*10)

In [65]:
from tqdm import tqdm

In [106]:
for i in tqdm(range(len(X))):
    top_k_indexes = ease.forward(X[i, :])

    # top_k_indexes
    top_k_indexes[X[0, :].nonzero()] = -np.inf
    top_k_indexes = top_k_indexes.argsort()[-K:][::-1]
    
    items.extend(list(movie['item'][top_k_indexes].values))

100%|██████████| 31360/31360 [04:59<00:00, 104.74it/s]


In [109]:
output = pd.DataFrame({'user':users, 'item' : items})

In [110]:
output

Unnamed: 0,user,item
0,11,43869
1,11,25833
2,11,4886
3,11,361
4,11,102125
...,...,...
313595,138493,1588
313596,138493,3175
313597,138493,5294
313598,138493,6297


In [111]:
output.to_csv("output2.csv", index=False)