In [1]:
import math
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

import implicit
from implicit.evaluation import ranking_metrics_at_k, train_test_split
from scipy import sparse

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
############# 중요 #############
# data_path는 사용자의 디렉토리에 맞게 설정해야 합니다.
data_path = '../../data/train/'
df = pd.read_csv(data_path+'train_ratings.csv')

item_ids = np.sort(df['item'].unique())
user_ids = np.sort(df['user'].unique())
num_item, num_user = len(item_ids), len(user_ids)

# user, item indexing
# item은 cloze task 수행을 위해 0값은 mask로 사용되기 때문에 +1을 해준다.
item2idx = pd.Series(data=np.arange(len(item_ids)), index=item_ids) # item re-indexing (1~num_item), num_item+1: mask idx
user2idx = pd.Series(data=np.arange(len(user_ids)), index=user_ids) # user re-indexing (0~num_user-1)

# dataframe indexing
df = pd.merge(df, pd.DataFrame({'item': item_ids, 'item_idx': item2idx[item_ids].values}), on='item', how='inner')
df = pd.merge(df, pd.DataFrame({'user': user_ids, 'user_idx': user2idx[user_ids].values}), on='user', how='inner')
df.sort_values(['user_idx', 'time'], inplace=True)
del df['item'], df['user'] 

# train set, valid set 생성
users = defaultdict(list) # defaultdict은 dictionary의 key가 없을때 default 값을 value로 반환
user_train = {}
user_valid = {}
for u, i, t in zip(df['user_idx'], df['item_idx'], df['time']):
    users[u].append(i)

# valid set은 각 유저 별 마지막 interaction을 사용
for user in users:
    user_train[user] = users[user][:-5]
    user_valid[user] = users[user][-5:]

print(f'num users: {num_user}, num items: {num_item}')

num users: 31360, num items: 6807


In [3]:
df.columns = ['time','sid','uid']

In [4]:
valid_df = df.groupby('uid').tail(5)

In [5]:
train_df = df.drop(valid_df.index)

In [6]:
valid_df.head(10)

Unnamed: 0,time,sid,uid
371,1294796106,5055,0
372,1294796113,3165,0
373,1294796119,4965,0
374,1294796132,3567,0
375,1294796159,2304,0
4948380,1225320379,729,1
4948381,1225320408,157,1
4948382,1225320490,1197,1
4948383,1225320523,718,1
4948384,1225320550,279,1


In [7]:
rows_tr, cols_tr = train_df['uid'], train_df['sid']
rows_vd, cols_vd = valid_df['uid'], valid_df['sid']
rows, cols = df['uid'], df['sid']

In [8]:
train_data = sparse.csr_matrix(
                              (np.ones_like(rows_tr),(rows_tr,cols_tr)),
                              dtype='float64',
                              shape=(31360,6807)
                              )

In [9]:
valid_data = sparse.csr_matrix(
                              (np.ones_like(rows_vd),(rows_vd,cols_vd)),
                              dtype='float64',
                              shape=(31360,6807)
                              )

In [10]:
full_data = sparse.csr_matrix(
                              (np.ones_like(rows),(rows,cols)),
                              dtype='float64',
                              shape=(31360,6807)
                              )

In [11]:
tr, te = train_test_split(train_data,train_percentage=0.968)

In [12]:
te.sum()

160170.0

In [13]:
model = implicit.als.AlternatingLeastSquares(factors=60,regularization=0.01)

In [14]:
model.fit(full_data)

100%|██████████| 15/15 [00:01<00:00, 11.32it/s]


In [15]:
ranking_metrics_at_k(model, tr, valid_data)

100%|██████████| 31360/31360 [00:00<00:00, 220704.32it/s]


{'precision': 0.15522321428571428,
 'map': 0.07750505142532985,
 'ndcg': 0.13885590593743413,
 'auc': 0.576956766167273}

In [16]:
ranking_metrics_at_k(model, tr, te)

100%|██████████| 29245/29245 [00:00<00:00, 233885.50it/s]


{'precision': 0.29665467204961676,
 'map': 0.1954980395110777,
 'ndcg': 0.2965409987583518,
 'auc': 0.6504993551606533}

In [17]:
predictions = model.recommend([i for i in range(31360)],full_data,N=20)[0]

In [18]:
result = pd.DataFrame()
for uid in range(31360):
    tmp = pd.DataFrame({'uid':uid,'sid':predictions[uid]})
    result = pd.concat((result,tmp))

In [19]:
result

Unnamed: 0,uid,sid
0,0,2381
1,0,354
2,0,1458
3,0,1
4,0,4679
...,...,...
15,31359,4886
16,31359,328
17,31359,4101
18,31359,729


In [20]:
idx2item = pd.Series(item2idx.index)
idx2user = pd.Series(user2idx.index)

In [21]:
result['user'] = result['uid'].apply(lambda x: idx2user.loc[x])
result['item'] = result['sid'].apply(lambda x: idx2item.loc[x])
result

Unnamed: 0,uid,sid,user,item
0,0,2381,11,4370
1,0,354,11,590
2,0,1458,11,2617
3,0,1,11,2
4,0,4679,11,34405
...,...,...,...,...
15,31359,4886,138493,44191
16,31359,328,138493,541
17,31359,4101,138493,8961
18,31359,729,138493,1282


In [22]:
result[['user','item']].to_csv('output/als_submission.csv',index=False)