In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

import os
import json
import argparse
import pandas as pd
import numpy as np
import time, datetime
from tqdm import tqdm
from logging import getLogger
import torch

from recbole.model.general_recommender.ease import EASE
from recbole.model.context_aware_recommender.ffm import FFM

from recbole.config import Config
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color


SEED=13

In [3]:
!ls ../../data/train
!readlink -ef ../../data/train/train_ratings.csv

Ml_item2attributes.json  genres.tsv  titles.tsv		writers.tsv
directors.tsv		 pro_sg      train_ratings.csv	years.tsv
/opt/ml/input/data/train/train_ratings.csv


## 데이터 로드

In [5]:
!pwd

/opt/ml/input/fighting/Recbole


In [8]:
data_path = '../../data/train'
train = pd.read_csv(os.path.join(data_path, 'train_ratings.csv')) # 전체 학습 데이터
year_data = pd.read_csv(os.path.join(data_path, 'years.tsv'), sep='\t')
writer_data = pd.read_csv(os.path.join(data_path, 'writers.tsv'), sep='\t')
title_data = pd.read_csv(os.path.join(data_path, 'titles.tsv'), sep='\t')
genre_data = pd.read_csv(os.path.join(data_path, 'genres.tsv'), sep='\t')
director_data = pd.read_csv(os.path.join(data_path, 'directors.tsv'), sep='\t')

In [9]:
user2idx = {v:k for k,v in enumerate(sorted(set(train.user)))}
item2idx = {v:k for k,v in enumerate(sorted(set(train.item)))}
uidx2user = {k:v for k,v in enumerate(sorted(set(train.user)))}
iidx2item = {k:v for k,v in enumerate(sorted(set(train.item)))}

## make inter file

In [10]:
train

Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563
...,...,...,...
5154466,138493,44022,1260209449
5154467,138493,4958,1260209482
5154468,138493,68319,1260209720
5154469,138493,40819,1260209726


In [78]:
train.user = train.user.map(user2idx)
train.item = train.item.map(item2idx)

In [79]:
train.columns=['user_id:token','item_id:token','timestamp:float']

In [80]:
outpath = f"dataset/train_data"
os.makedirs(outpath, exist_ok=True)
# sub_train=train.groupby("user").sample(n=10, random_state=SEED)
# sub_train.shape
train.to_csv(os.path.join(outpath,"train_data.inter"),sep='\t',index=False)

In [83]:
user = train[['user_id:token']]
user.user = user['user_id:token'].map(user2idx)
#중복 처리
user.drop_duplicates(
        # subset=["userID", "assessmentItemID"], 
        keep="last", inplace=True)

  user.user = user['user_id:token'].map(user2idx)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user.drop_duplicates(


In [69]:
# user.columns=['user_id:token']

In [84]:
user['tmp:token'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user['tmp:token'] = 1


In [72]:
user.head(3)

Unnamed: 0,user_id:token,tmp:token
375,11,1
555,14,1
632,18,1


In [73]:
outpath = f"dataset/train_data"
os.makedirs(outpath, exist_ok=True)
# sub_train=train.groupby("user").sample(n=10, random_state=SEED)
# sub_train.shape
user.to_csv(os.path.join(outpath,"train_data.user"),sep='\t',index=False)

In [86]:
item = year_data.merge(writer_data, how='left')
item = item.merge(title_data, how='left')
item = item.merge(genre_data, how='left')
item = item.merge(director_data, how='left')
item['item'] = item['item'].map(item2idx)

In [87]:
item.columns=['item_id:token','year:token','write:token_seq','title:token_seq','genre:token','director:token']

In [88]:
print(item.shape)
item.head(3)

(37634, 6)


Unnamed: 0,item_id:token,year:token,write:token_seq,title:token_seq,genre:token,director:token
0,768,1922,nm0831290,"Nosferatu (Nosferatu, eine Symphonie des Graue...",Horror,nm0003638
1,4897,1922,,Nanook of the North (1922),Documentary,
2,4897,1922,,Nanook of the North (1922),Drama,


In [89]:
item = item.dropna()
print(item.shape)
item.head(3)

(32210, 6)


Unnamed: 0,item_id:token,year:token,write:token_seq,title:token_seq,genre:token,director:token
0,768,1922,nm0831290,"Nosferatu (Nosferatu, eine Symphonie des Graue...",Horror,nm0003638
3,2562,1922,nm0000485,"Dr. Mabuse: The Gambler (Dr. Mabuse, der Spiel...",Crime,nm0000485
4,2562,1922,nm0000485,"Dr. Mabuse: The Gambler (Dr. Mabuse, der Spiel...",Mystery,nm0000485


In [90]:
outpath = f"dataset/train_data"
os.makedirs(outpath, exist_ok=True)
# sub_train=train.groupby("user").sample(n=10, random_state=SEED)
# sub_train.shape
item.to_csv(os.path.join(outpath,"train_data.item"),sep='\t',index=False)

## make yaml file

In [95]:
yamldata="""
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp

load_col:
    inter: [user_id, item_id, timestamp]
    user: [user_id]
    item: [item_id, year, write, title, genre, director]
    
train_neg_sample_args:
    uniform: 1
"""
with open("train_data.yaml", "w") as f:
    f.write(yamldata)

241

## make config, logger

In [98]:
logger = getLogger()

# configurations initialization
config = Config(model='FFM', dataset="train_data", config_file_list=[f'train_data.yaml'])
config['epochs'] = 100
config['show_progress'] = False
config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config['valid_metric'] = "Recall@10"
config['eval_args'] = {'split': {'RS': [9, 0.5, 0.5]},
                         'group_by': 'user',
                         'order': 'RO',
                         'mode': 'full'}
config['topk']=[20]
init_seed(config['seed'], config['reproducibility'])
# logger initialization
init_logger(config)
logger.info(config)

22 Dec 07:34    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/train_data
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 100
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [9, 0.5, 0.5]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['AUC', 'LogLoss']
topk = [20]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator = 

## make dataset

In [99]:
dataset

[1;35mtrain_data[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 150.1978542566532
[1;34mThe number of items[0m: 4968
[1;34mAverage actions of items[0m: 217.02637406885444
[1;34mThe number of inters[0m: 1077970
[1;34mThe sparsity of the dataset[0m: 99.30811297722984%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'tmp', 'year', 'write', 'title', 'genre', 'director', 'label']

In [100]:
# dataset filtering
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

22 Dec 07:35    INFO  train_data
The number of users: 31361
Average actions of users: 150.1978542566532
The number of items: 4968
Average actions of items: 217.02637406885444
The number of inters: 1077970
The sparsity of the dataset: 99.30811297722984%
Remain Fields: ['user_id', 'item_id', 'timestamp', 'year', 'write', 'title', 'genre', 'director']


AttributeError: 'NoneType' object has no attribute 'used_ids'

In [None]:
train_data.dataset
valid_data.dataset
test_data.dataset

## make model

In [None]:
# model loading and initialization
init_seed(config['seed'], config['reproducibility'])
model = FFM(config, train_data.dataset).to(config['device'])
logger.info(model)

## train

In [None]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, verbose=1, show_progress=config['show_progress']
)

### before train

In [None]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, verbose=1, show_progress=config['show_progress']
)

In [None]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, verbose=1, show_progress=config['show_progress']
)

In [18]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, verbose=1, show_progress=config['show_progress']
)

22 Dec 02:15    INFO  epoch 0 training [time: 11.61s, train loss: 0.0000]
22 Dec 02:15    INFO  Saving current: saved/EASE-Dec-22-2022_02-14-52.pth


In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="last"

## inference

In [22]:
model_path='saved/EASE-Dec-22-2022_02-14-52.pth'
# rank K 설정
K = 20

In [33]:
# config, model, dataset 불러오기
checkpoint = torch.load(model_path)
config = checkpoint['config']
config['dataset'] = 'train_data'

dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

model = get_model(config['model'])(config, test_data.dataset).to(config['device'])
model.load_state_dict(checkpoint['state_dict'])
model.load_other_parameter(checkpoint.get('other_parameter'))

# device 설정
device = config.final_config_dict['device']

# user, item id -> token 변환 array
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

# user id list
all_user_list = torch.arange(1, len(user_id2token)).view(-1,128)

# user, item 길이
user_len = len(user_id2token)
item_len = len(item_id2token)

# user-item sparse matrix
matrix = dataset.inter_matrix(form='csr')

# user id, predict item id 저장 변수
pred_list = None
user_list = None

# model 평가모드 전환
model.eval()

# progress bar 설정
tbar = tqdm(all_user_list, desc=set_color(f"Inference", 'pink'))

for data in tbar:
    # interaction 생성
    interaction = dict()
    interaction = Interaction(interaction)
    interaction[user_id] = data
    interaction = interaction.to(device)

    # user item별 score 예측
    score = model.full_sort_predict(interaction)
    score = score.view(-1, item_len)

    rating_pred = score.cpu().data.numpy().copy()

    user_index = data.numpy()

    idx = matrix[user_index].toarray() > 0

    rating_pred[idx] = -np.inf
    rating_pred[:, 0] = -np.inf
    ind = np.argpartition(rating_pred, -K)[:, -K:]

    arr_ind = rating_pred[np.arange(len(rating_pred))[:, None], ind]

    arr_ind_argsort = np.argsort(arr_ind)[np.arange(len(rating_pred)), ::-1]

    batch_pred_list = ind[
        np.arange(len(rating_pred))[:, None], arr_ind_argsort
    ]

    if pred_list is None:
        pred_list = batch_pred_list
        user_list = user_index
    else:
        pred_list = np.append(pred_list, batch_pred_list, axis=0)
        user_list = np.append(
            user_list, user_index, axis=0
        )

result = []
for user, pred in zip(user_list, pred_list):
    for item in pred:
        result.append((int(user_id2token[user]), int(item_id2token[item])))

# 데이터 저장
sub = pd.DataFrame(result, columns=["user", "item"])
sub.to_csv(
    "submission.csv", index=False
)
print('inference done!')

22 Dec 02:23    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
22 Dec 02:23    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [1, 0, 0]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]
Inference:   0%|          | 0/245 [00:00<?, ?it/s]:   0%|          | 1/245 [00:00<00:50,  4.86it/s]:   1%|          | 2/245 [00:00<00:47,  5.17it/s]:   1%|          | 3/245 [00:00<00:43,  5.56it/s]:   2%|▏         | 4/245 [00:00<00:41,  5.80it/s]:   2%|▏         | 5/245 [00:00<00:41,  5.80it/s]:   2%|▏         | 6/245 [00:01<00:40,  5.85it/s]:   3%|▎         | 7/245 [00:01<00:39,  5.98it/s]:   3%|▎         | 8/245 [00:01<00:39,  6.06it/s]:   4%|▎         | 9/245 [00:01<00:38,  6.16it/s]:   4%|▍         | 10/245 [00:01<00:37,  6.26it/s]:   4%|▍         | 11/245 [00:01<00:35,  6.59it/s]:   5%|▍         | 12/245 [00:01<00:33,  6.96it/s]:   5%|▌         | 13/2

inference done!


## 제출파일 생성

In [34]:
sub

Unnamed: 0,user,item
0,0,2381
1,0,2619
2,0,41
3,0,4581
4,0,4790
...,...,...
627195,31359,2541
627196,31359,178
627197,31359,1206
627198,31359,2603


## 제출파일에 기존 interaction 중복 확인 및 제거
-> 중복 없었다.

In [35]:
train_check = train[['user_id:token','item_id:token']]

In [37]:
train_check.columns = ['user','item']

In [42]:
sub_nodup = sub.merge(train_check, how='outer', indicator=True)
sub_nodup

Unnamed: 0,user,item,_merge
0,0,2381,left_only
1,0,2619,left_only
2,0,41,left_only
3,0,4581,left_only
4,0,4790,left_only
...,...,...,...
5781666,31359,4882,right_only
5781667,31359,2652,right_only
5781668,31359,5768,right_only
5781669,31359,4791,right_only


In [45]:
sub_nodup = sub_nodup.query('_merge == "left_only"').drop(columns=['_merge'])
sub_nodup

Unnamed: 0,user,item
0,0,2381
1,0,2619
2,0,41
3,0,4581
4,0,4790
...,...,...
627195,31359,2541
627196,31359,178
627197,31359,1206
627198,31359,2603


In [25]:
sub.user = sub.user.map(uidx2user)
sub.item = sub.item.map(iidx2item)

In [26]:
sub.to_csv('EASE_1_0.csv',index=False)

In [28]:
train

Unnamed: 0,user_id:token,item_id:token,timestamp:float
0,0,2505,1230782529
1,0,109,1230782534
2,0,319,1230782539
3,0,368,1230782542
4,0,1183,1230782563
...,...,...,...
5154466,31359,4882,1260209449
5154467,31359,2652,1260209482
5154468,31359,5768,1260209720
5154469,31359,4791,1260209726


In [None]:
sub.user = sub.user.map(user2idx)
sub.item = sub.item.map(item2idx)

In [None]:
train.columns=['user','item','time']
train[:3]

In [None]:
afterdf = sub.merge(train[['user','item','time']], on=['user','item'], how='left')

In [None]:
afterdf[afterdf['timestamp:float'].notna()]

## 시간 순서 잘 지켜졌는지 확인해보기

In [None]:
train.user = train.user.map(uidx2user)
train.item = train.item.map(iidx2item)

In [None]:
import time
from time import localtime

In [None]:
data_path = '../../data/train'
year_data = pd.read_csv(os.path.join(data_path, 'years.tsv'), sep='\t')
writer_data = pd.read_csv(os.path.join(data_path, 'writers.tsv'), sep='\t')
title_data = pd.read_csv(os.path.join(data_path, 'titles.tsv'), sep='\t')
genre_data = pd.read_csv(os.path.join(data_path, 'genres.tsv'), sep='\t')
director_data = pd.read_csv(os.path.join(data_path, 'directors.tsv'), sep='\t')

### train에 review_year, month, day, year_month 추가

In [None]:
train['review_year'] = train['time'].apply(lambda x : localtime(x).tm_year)

In [None]:
train['month'] = train['time'].apply(lambda x : localtime(x).tm_mon)

In [None]:
train['day'] = train['time'].apply(lambda x : localtime(x).tm_mday)

In [None]:
train['year_month'] = train['time'].apply(lambda x : time.strftime('%Y-%m',localtime(x)))

In [None]:
train.sort_values(['user','time'],inplace=True)

In [None]:
user2lastyear = dict(train.groupby('user').year.max())

In [None]:
user2lastyearmonth = dict(train.groupby('user').year_month.max())

In [None]:
train['lastyear']=train.user.map(user2lastyear)
train['last_yearmonth']=train.user.map(user2lastyearmonth)

In [None]:
train

### year 채워넣기

In [None]:
merge_df = train.copy()
side_info = [title_data,year_data]
for side in side_info:
    merge_df = merge_df.merge(side,how = 'left',on='item')
merge_df.head()

In [None]:
merge_df['year_from_title'] = merge_df['title'].apply(lambda x : (x[-5:-1]))

In [None]:
merge_df.loc[merge_df['year_from_title']=='007-','year_from_title'] = 2007

In [None]:
merge_df.year_from_title=merge_df.year_from_title.astype(int)

In [None]:
cond = merge_df.year_from_title == merge_df.year

In [None]:
merge_df[~cond & merge_df.year.notna()].title.value_counts()

In [None]:
merge_df.year = merge_df.year.fillna(merge_df.year_from_title)

In [None]:
merge_df.year = merge_df.year.astype(int)

In [None]:
merge_df[merge_df.lastyear < merge_df.year]

In [None]:
## item별 year이 모두 동일한지 첫번째값과 평균값이 같은 아이템들 확인 -> 모두 일치
sum(merge_df.groupby('item').year.first() == merge_df.groupby('item').year.mean())

In [None]:
item2year = dict(merge_df.groupby('item').year.first())

## sub 보기

In [262]:
sub.user = sub.user.map(uidx2user)
sub.item = sub.item.map(iidx2item)

In [263]:
sub['lastyear']=sub.user.map(user2lastyear)
sub['last_yearmonth']=sub.user.map(user2lastyearmonth)
sub['m_year'] = sub.item.map(item2year)

In [265]:
sub

Unnamed: 0,user,item,lastyear,last_yearmonth,m_year
0,11,4370,2011,2011-01,2001
1,11,4886,2011,2011-01,2001
2,11,47,2011,2011-01,1995
3,11,32587,2011,2011-01,2005
4,11,40815,2011,2011-01,2005
...,...,...,...,...,...
627195,138493,4720,2009,2009-12,2001
627196,138493,293,2009,2009-12,1994
627197,138493,2174,2009,2009-12,1988
627198,138493,4848,2009,2009-12,2001


In [268]:
sub2 = sub[sub.lastyear >= sub.m_year]

In [276]:
df = pd.read_csv('EASE_1_0.csv')

In [282]:
df['item2']= sub2.groupby('user').item.head(10).reset_index(drop=True)

### 이전 제출(1594)대비 현재 제출(1595)에서 달라진 부분 보기

In [324]:
## 이전 제출(1594)대비 현재 제출(1595)에서 달라진 부분 보기
idx_list=[]
user_item2 = dict(df.groupby('user').item2.apply(list))
for i in tqdm(df.itertuples(), total=df.shape[0]):
    if i.item not in user_item2[i.user]:
        idx_list.append(i.Index)

100%|██████████████████████████████████████████████████| 313600/313600 [00:00<00:00, 562116.84it/s]


In [329]:
df['lastyear']=df.user.map(user2lastyear)
df['last_yearmonth']=df.user.map(user2lastyearmonth)
df['m_year'] = df.item.map(item2year)

In [330]:
df.iloc[idx_list,:]

Unnamed: 0,user,item,item2,lastyear,last_yearmonth,m_year
109,61,58559,48780,2007,2007-12,2008
1559,664,63436,8528,2007,2007-11,2008
2144,915,54286,4027,2006,2006-08,2007
3504,1539,79132,3949,2009,2009-07,2010
5019,2160,58559,111,2007,2007-01,2008
...,...,...,...,...,...,...
300758,132448,69844,2542,2007,2007-10,2009
307115,135535,58559,318,2005,2005-12,2008
307702,135798,88125,6539,2010,2010-12,2011
311225,137460,58559,745,2007,2007-09,2008


## 최종 제출 -> 1595

In [288]:
df[['user','item2']].rename(columns={'item2':'item'}).to_csv("EASE_1_0_Top20_remove_review_after_movie.csv",index=False)