In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

import os
import json
import argparse
import pandas as pd
import numpy as np
import time, datetime
from tqdm import tqdm
from logging import getLogger
import torch

from recbole.model.general_recommender.ease import EASE
from recbole.model.context_aware_recommender.ffm import FFM

from recbole.config import Config
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color


SEED=13

In [2]:
!ls ../../data/train
!readlink -ef ../../data/train/train_ratings.csv

Ml_item2attributes.json  genres.tsv  train_ratings.csv	years.tsv
directors.tsv		 titles.tsv  writers.tsv
/opt/ml/input/data/train/train_ratings.csv


## 데이터 로드

In [148]:
train = pd.read_csv("/opt/ml/input/data/train/train_ratings.csv")

In [78]:
user2idx = {v:k for k,v in enumerate(sorted(set(train.user)))}
item2idx = {v:k for k,v in enumerate(sorted(set(train.item)))}
uidx2user = {k:v for k,v in enumerate(sorted(set(train.user)))}
iidx2item = {k:v for k,v in enumerate(sorted(set(train.item)))}

## make inter file

In [79]:
train.user = train.user.map(user2idx)
train.item = train.item.map(item2idx)

In [80]:
train.columns=['user_id:token','item_id:token','timestamp:float']

In [81]:
train[:2]

Unnamed: 0,user_id:token,item_id:token,timestamp:float
0,0,2505,1230782529
1,0,109,1230782534


In [82]:
outpath = f"dataset/train_data"
os.makedirs(outpath, exist_ok=True)
# sub_train=train.groupby("user").sample(n=10, random_state=SEED)
# sub_train.shape
train.to_csv(os.path.join(outpath,"train_data.inter"),sep='\t',index=False)

In [83]:
train.shape

(5154471, 3)

## make yaml file

In [84]:
yamldata="""
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp

load_col:
    inter: [user_id, item_id, timestamp]
"""
with open("train_data.yaml", "w") as f:
    f.write(yamldata)

121

## make config, logger

In [247]:
logger = getLogger()

# configurations initialization
config = Config(model='EASE', dataset="train_data", config_file_list=[f'train_data.yaml'])
# config['epochs'] = 100
config['show_progress'] = False
config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config['valid_metric'] = "Recall@10"
config['eval_args'] = {'split': {'RS': [1, 0, 0]},
                         'group_by': 'user',
                         'order': 'RO',
                         'mode': 'full'}
config['topk']=[20]
init_seed(config['seed'], config['reproducibility'])
# logger initialization
init_logger(config)
logger.info(config)

21 Dec 16:27    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/train_data
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 100
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [1, 0, 0]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [20]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_sep

## make dataset

In [248]:
# dataset filtering
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

21 Dec 16:28    INFO  train_data
The number of users: 31361
Average actions of users: 164.36450892857144
The number of items: 6808
Average actions of items: 757.2309387395328
The number of inters: 5154471
The sparsity of the dataset: 97.58579218741939%
Remain Fields: ['user_id', 'item_id', 'timestamp']
21 Dec 16:28    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
21 Dec 16:28    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [1, 0, 0]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]


In [250]:
train_data.dataset
valid_data.dataset
test_data.dataset

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 164.36450892857144
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 757.2309387395328
[1;34mThe number of inters[0m: 5154471
[1;34mThe sparsity of the dataset[0m: 97.58579218741939%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: nan
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: nan
[1;34mThe number of inters[0m: 0
[1;34mThe sparsity of the dataset[0m: 100.0%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: nan
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: nan
[1;34mThe number of inters[0m: 0
[1;34mThe sparsity of the dataset[0m: 100.0%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'label']

## make model

In [251]:
# model loading and initialization
init_seed(config['seed'], config['reproducibility'])
model = EASE(config, train_data.dataset).to(config['device'])
logger.info(model)

21 Dec 16:28    INFO  EASE()
Trainable parameters: 1


## train

In [253]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, show_progress=config['show_progress']
)

21 Dec 16:29    INFO  epoch 0 training [time: 14.01s, train loss: 0.0000]
21 Dec 16:29    INFO  Saving current: saved/EASE-Dec-21-2022_16-28-47.pth


### before train

In [128]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, verbose=1, show_progress=config['show_progress']
)

21 Dec 15:02    INFO  epoch 0 training [time: 14.43s, train loss: 0.0000]
21 Dec 15:02    INFO  Saving current: saved/EASE-Dec-21-2022_15-02-10.pth


In [115]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, verbose=1, show_progress=config['show_progress']
)

21 Dec 14:56    INFO  epoch 0 training [time: 13.74s, train loss: 0.0000]
21 Dec 14:57    INFO  epoch 0 evaluating [time: 65.67s, valid_score: 0.235000]
21 Dec 14:57    INFO  valid result: 
recall@10 : 0.235    mrr@10 : 0.1408    ndcg@10 : 0.1473    hit@10 : 0.2995    precision@10 : 0.0329
21 Dec 14:57    INFO  Saving current: saved/EASE-Dec-21-2022_14-56-03.pth


In [63]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, verbose=1, show_progress=config['show_progress']
)

21 Dec 14:37    INFO  epoch 0 training [time: 12.47s, train loss: 0.0000]
21 Dec 14:38    INFO  epoch 0 evaluating [time: 66.56s, valid_score: 0.196600]
21 Dec 14:38    INFO  valid result: 
recall@10 : 0.1966    mrr@10 : 0.5928    ndcg@10 : 0.3203    hit@10 : 0.8735    precision@10 : 0.2572
21 Dec 14:38    INFO  Saving current: saved/EASE-Dec-21-2022_14-37-34.pth


In [254]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="last"

## inference

In [255]:
model_path='saved/EASE-Dec-21-2022_15-02-10.pth'
# rank K 설정
K = 20

In [376]:
845918*6

5075508

In [256]:
# config, model, dataset 불러오기
checkpoint = torch.load(model_path)
config = checkpoint['config']
config['dataset'] = 'train_data'

dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

model = get_model(config['model'])(config, test_data.dataset).to(config['device'])
model.load_state_dict(checkpoint['state_dict'])
model.load_other_parameter(checkpoint.get('other_parameter'))

# device 설정
device = config.final_config_dict['device']

# user, item id -> token 변환 array
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

# user id list
all_user_list = torch.arange(1, len(user_id2token)).view(-1,128)

# user, item 길이
user_len = len(user_id2token)
item_len = len(item_id2token)

# user-item sparse matrix
matrix = dataset.inter_matrix(form='csr')

# user id, predict item id 저장 변수
pred_list = None
user_list = None

# model 평가모드 전환
model.eval()

# progress bar 설정
tbar = tqdm(all_user_list, desc=set_color(f"Inference", 'pink'))

for data in tbar:
    # interaction 생성
    interaction = dict()
    interaction = Interaction(interaction)
    interaction[user_id] = data
    interaction = interaction.to(device)

    # user item별 score 예측
    score = model.full_sort_predict(interaction)
    score = score.view(-1, item_len)

    rating_pred = score.cpu().data.numpy().copy()

    user_index = data.numpy()

    idx = matrix[user_index].toarray() > 0

    rating_pred[idx] = -np.inf
    rating_pred[:, 0] = -np.inf
    ind = np.argpartition(rating_pred, -K)[:, -K:] #

    arr_ind = rating_pred[np.arange(len(rating_pred))[:, None], ind]

    arr_ind_argsort = np.argsort(arr_ind)[np.arange(len(rating_pred)), ::-1]

    batch_pred_list = ind[
        np.arange(len(rating_pred))[:, None], arr_ind_argsort
    ]

    if pred_list is None:
        pred_list = batch_pred_list
        user_list = user_index
    else:
        pred_list = np.append(pred_list, batch_pred_list, axis=0)
        user_list = np.append(
            user_list, user_index, axis=0
        )

result = []
for user, pred in zip(user_list, pred_list):
    for item in pred:
        result.append((int(user_id2token[user]), int(item_id2token[item])))

# 데이터 저장
sub = pd.DataFrame(result, columns=["user", "item"])
sub.to_csv(
    "submission.csv", index=False
)
print('inference done!')

21 Dec 16:33    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
21 Dec 16:33    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [1, 0, 0]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]


Inference:   0%|                                                           | 0/245 [00:00<?, ?it/s]:   0%|▏                                                  | 1/245 [00:00<00:52,  4.67it/s]:   1%|▍                                                  | 2/245 [00:00<00:48,  5.00it/s]:   1%|▌                                                  | 3/245 [00:00<00:45,  5.33it/s]:   2%|▊                                                  | 4/245 [00:00<00:44,  5.45it/s]:   2%|█                                                  | 5/245 [00:00<00:44,  5.38it/s]:   2%|█▏                                                 | 6/245 [00:01<00:44,  5.37it/s]:   3%|█▍                                                 | 7/245 [00:01<00:44,  5.38it/s]:   3%|█▋                                                 | 8/245 [00:01<00:43,  5.44it/s]:   4%|█▊                                                 | 9/245 [00:01<00:43,  5.40it/s]:   4%|██                                                | 10/245 [00:01<00:42,  5.52it/s]:

inference done!


## 제출파일 생성

In [None]:
sub

In [None]:
sub.user = sub.user.map(uidx2user)
sub.item = sub.item.map(iidx2item)

In [132]:
sub.to_csv('EASE_1_0.csv',index=False)

In [133]:
sub

Unnamed: 0,user,item
0,11,4370
1,11,4886
2,11,47
3,11,32587
4,11,40815
...,...,...
313595,138493,8961
313596,138493,5349
313597,138493,4022
313598,138493,32587


In [137]:
sub.user = sub.user.map(user2idx)
sub.item = sub.item.map(item2idx)

In [144]:
train.columns=['user','item','time']
train[:3]

Unnamed: 0,user,item,time
0,0,2505,1230782529
1,0,109,1230782534
2,0,319,1230782539


In [138]:
afterdf = sub.merge(train[['user','item','time']], on=['user','item'], how='left')

In [139]:
afterdf[afterdf['timestamp:float'].notna()]

Unnamed: 0,user,item,timestamp:float


## 시간 순서 잘 지켜졌는지 확인해보기

In [146]:
train.user = train.user.map(uidx2user)
train.item = train.item.map(iidx2item)

In [150]:
import time
from time import localtime

In [160]:
data_path = '../../data/train'
year_data = pd.read_csv(os.path.join(data_path, 'years.tsv'), sep='\t')
writer_data = pd.read_csv(os.path.join(data_path, 'writers.tsv'), sep='\t')
title_data = pd.read_csv(os.path.join(data_path, 'titles.tsv'), sep='\t')
genre_data = pd.read_csv(os.path.join(data_path, 'genres.tsv'), sep='\t')
director_data = pd.read_csv(os.path.join(data_path, 'directors.tsv'), sep='\t')

### train에 review_year, month, day, year_month 추가

In [153]:
train['review_year'] = train['time'].apply(lambda x : localtime(x).tm_year)

In [155]:
train['month'] = train['time'].apply(lambda x : localtime(x).tm_mon)

In [156]:
train['day'] = train['time'].apply(lambda x : localtime(x).tm_mday)

In [166]:
train['year_month'] = train['time'].apply(lambda x : time.strftime('%Y-%m',localtime(x)))

In [168]:
train.sort_values(['user','time'],inplace=True)

In [172]:
user2lastyear = dict(train.groupby('user').year.max())

In [173]:
user2lastyearmonth = dict(train.groupby('user').year_month.max())

In [174]:
train['lastyear']=train.user.map(user2lastyear)
train['last_yearmonth']=train.user.map(user2lastyearmonth)

In [176]:
train

Unnamed: 0,user,item,time,year,month,day,year_month,lastyear,last_yearmonth
0,11,4643,1230782529,2009,1,1,2009-01,2011,2011-01
1,11,170,1230782534,2009,1,1,2009-01,2011,2011-01
2,11,531,1230782539,2009,1,1,2009-01,2011,2011-01
3,11,616,1230782542,2009,1,1,2009-01,2011,2011-01
4,11,2140,1230782563,2009,1,1,2009-01,2011,2011-01
...,...,...,...,...,...,...,...,...,...
5154466,138493,44022,1260209449,2009,12,7,2009-12,2009,2009-12
5154467,138493,4958,1260209482,2009,12,7,2009-12,2009,2009-12
5154468,138493,68319,1260209720,2009,12,7,2009-12,2009,2009-12
5154469,138493,40819,1260209726,2009,12,7,2009-12,2009,2009-12


### year 채워넣기

In [186]:
merge_df = train.copy()
side_info = [title_data,year_data]
for side in side_info:
    merge_df = merge_df.merge(side,how = 'left',on='item')
merge_df.head()

Unnamed: 0,user,item,time,review_year,month,day,year_month,lastyear,last_yearmonth,title,year
0,11,4643,1230782529,2009,1,1,2009-01,2011,2011-01,Planet of the Apes (2001),2001.0
1,11,170,1230782534,2009,1,1,2009-01,2011,2011-01,Hackers (1995),1995.0
2,11,531,1230782539,2009,1,1,2009-01,2011,2011-01,"Secret Garden, The (1993)",1993.0
3,11,616,1230782542,2009,1,1,2009-01,2011,2011-01,"Aristocats, The (1970)",1970.0
4,11,2140,1230782563,2009,1,1,2009-01,2011,2011-01,"Dark Crystal, The (1982)",1982.0


In [190]:
merge_df['year_from_title'] = merge_df['title'].apply(lambda x : (x[-5:-1]))

In [191]:
merge_df.loc[merge_df['year_from_title']=='007-','year_from_title'] = 2007

In [194]:
merge_df.year_from_title=merge_df.year_from_title.astype(int)

In [196]:
cond = merge_df.year_from_title == merge_df.year

In [217]:
merge_df[~cond & merge_df.year.notna()].title.value_counts()

Fawlty Towers (1975-1979)    163
Name: title, dtype: int64

In [221]:
merge_df.year = merge_df.year.fillna(merge_df.year_from_title)

In [227]:
merge_df.year = merge_df.year.astype(int)

In [246]:
merge_df[merge_df.lastyear < merge_df.year]

Unnamed: 0,user,item,time,review_year,month,day,year_month,lastyear,last_yearmonth,title,year,year_from_title
3064009,81663,91535,1323223207,2011,12,7,2011-12,2011,2011-12,"Bourne Legacy, The (2012)",2012,2012
4617327,123609,89745,1316645274,2011,9,21,2011-09,2011,2011-09,"Avengers, The (2012)",2012,2012


In [236]:
## item별 year이 모두 동일한지 첫번째값과 평균값이 같은 아이템들 확인 -> 모두 일치
sum(merge_df.groupby('item').year.first() == merge_df.groupby('item').year.mean())

6807

In [239]:
item2year = dict(merge_df.groupby('item').year.first())

## sub 보기

In [262]:
sub.user = sub.user.map(uidx2user)
sub.item = sub.item.map(iidx2item)

In [263]:
sub['lastyear']=sub.user.map(user2lastyear)
sub['last_yearmonth']=sub.user.map(user2lastyearmonth)
sub['m_year'] = sub.item.map(item2year)

In [265]:
sub

Unnamed: 0,user,item,lastyear,last_yearmonth,m_year
0,11,4370,2011,2011-01,2001
1,11,4886,2011,2011-01,2001
2,11,47,2011,2011-01,1995
3,11,32587,2011,2011-01,2005
4,11,40815,2011,2011-01,2005
...,...,...,...,...,...
627195,138493,4720,2009,2009-12,2001
627196,138493,293,2009,2009-12,1994
627197,138493,2174,2009,2009-12,1988
627198,138493,4848,2009,2009-12,2001


In [268]:
sub2 = sub[sub.lastyear >= sub.m_year]

In [276]:
df = pd.read_csv('EASE_1_0.csv')

In [282]:
df['item2']= sub2.groupby('user').item.head(10).reset_index(drop=True)

### 이전 제출(1594)대비 현재 제출(1595)에서 달라진 부분 보기

In [324]:
## 이전 제출(1594)대비 현재 제출(1595)에서 달라진 부분 보기
idx_list=[]
user_item2 = dict(df.groupby('user').item2.apply(list))
for i in tqdm(df.itertuples(), total=df.shape[0]):
    if i.item not in user_item2[i.user]:
        idx_list.append(i.Index)

100%|██████████████████████████████████████████████████| 313600/313600 [00:00<00:00, 562116.84it/s]


In [329]:
df['lastyear']=df.user.map(user2lastyear)
df['last_yearmonth']=df.user.map(user2lastyearmonth)
df['m_year'] = df.item.map(item2year)

In [330]:
df.iloc[idx_list,:]

Unnamed: 0,user,item,item2,lastyear,last_yearmonth,m_year
109,61,58559,48780,2007,2007-12,2008
1559,664,63436,8528,2007,2007-11,2008
2144,915,54286,4027,2006,2006-08,2007
3504,1539,79132,3949,2009,2009-07,2010
5019,2160,58559,111,2007,2007-01,2008
...,...,...,...,...,...,...
300758,132448,69844,2542,2007,2007-10,2009
307115,135535,58559,318,2005,2005-12,2008
307702,135798,88125,6539,2010,2010-12,2011
311225,137460,58559,745,2007,2007-09,2008


## 최종 제출 -> 1595

In [288]:
df[['user','item2']].rename(columns={'item2':'item'}).to_csv("EASE_1_0_Top20_remove_review_after_movie.csv",index=False)