### 2 stages (Candidates -> Ranking)

#### Candidates: EASE

In [None]:
from recbole.model.context_aware_recommender.fm import FM
from recbole.quick_start import run_recbole
from recbole.config import Config
from recbole.data import create_dataset
from tqdm.notebook import tqdm

from utils import *

# --- Experiment settings: edit by hand for each run ---
context = False
MODEL = 'EASE'
version = 'v2'
data_path = '/opt/ml/input/data/tr_v2_miss_year'

# Dataset name, atomic-file prefix and YAML location all share the same
# "<version>_base" stem.
base = version + '_base'
atomic_path = f'/opt/ml/input/data/{base}/{base}'
yaml_path = f'{atomic_path}.yaml'

# Full set of training interactions.
inter_df = pd.read_csv(os.path.join(data_path, 'interactions.csv'))

# Item side-information tables, one CSV per attribute.
genre_df = pd.read_csv(join(data_path, 'genres.csv'))
title_df = pd.read_csv(join(data_path, 'titles.csv'))
director_df = pd.read_csv(join(data_path, 'directors.csv'))
writer_df = pd.read_csv(join(data_path, 'writers.csv'))
year_df = pd.read_csv(join(data_path, 'years.csv'))

# RecBole configuration, kept inline as YAML text and written to `yaml_path`.
# To be manually modified for each experiment.
# NOTE(review): `dataset` inside the text is hard-coded to v2_base rather than
# derived from `version`; keep the two in sync by hand.
cfg_str = """
data_path: /opt/ml/input/data/
dataset: v2_base

field_separator: "\\t"
seq_separator: "\\t"

USER_ID_FIELD: user
ITEM_ID_FIELD: item
TIME_FIELD: time
#YEAR_FIELD: year
#TITLE_FIELD: title
#seq_len: { genre: 10, writer: 10, director: 10 }

load_col:
  inter: [user, item, time]
  # item: [item, year, title, genre, writer, director]
  # item: [item, genre, year, writer, director]

# model config
reg_weight: 500
embedding_size: 10 # (int) The embedding size of features.
mlp_hidden_size: [16, 16, 16] # (list of int) The hidden size of MLP layers.
dropout_prob: 0.2 # (float) The dropout rate.

# Training and evaluation config
eval_setting: RO_RS,full
epochs: 10
seed: 42
train_batch_size: 2048 #4096
eval_batch_size: 2048 #4096
eval_args:
  split: { "RS": [0.9, 0.05, 0.05] }
  order: RO
  group_by: "user"
  mode: full
topk: 10
train_neg_sample_args:
  distribution: uniform
  sample_num: 1
  alpha: 1.0
  dynamic: False
  candidate_num: 0
metrics: ["Recall", "MRR", "NDCG", "Hit", "Precision"]
valid_metric: Recall@10

# logging
show_progress: false
"""
# `yaml` holds the config file PATH; it is passed to run_recbole further down.
# NOTE(review): this name would shadow the PyYAML module if that were ever
# imported in this notebook.
yaml = yaml_path
# Overwrite the config file with the text above.
with open(yaml, "w") as f:
    f.write(cfg_str)

# Dataset: map each interaction column to its "name:type" header and save the
# result under `atomic_path` with the 'inter' suffix.
# NOTE(review): convert_to_atomic / save_atomic come from `utils`; their exact
# output format is not visible here.
inter_cols = {
    'user': 'user:token',
    'item': 'item:token',
    'time': 'time:float',
}
atom_inter = convert_to_atomic(inter_df, inter_cols)
save_atomic(atom_inter, atomic_path, 'inter')

# Run: train and evaluate MODEL on the dataset using the YAML config file.
run_recbole(model=MODEL, dataset=base, config_file_list=[yaml])

#### Data

In [None]:
import pandas as pd
# NOTE: `base` is rebound here to the filesystem root used by the cells below.
base = '/opt/ml/'
# Alternative candidate file: base + 'input/recbole/submission_EASE.csv'
path = f'{base}input/recbole/EASE_100.csv'

# Candidate table; later cells read its `user` and `item` columns.
top100 = pd.read_csv(path)

#### Ranking: pick the top 10 per user; when an item's year is outside the user's min_year <= year <= max_year range, move on to the next candidate

In [None]:
# Compute, per user, the minimum and maximum `year` over the items that user
# interacted with (user/item pairs joined with the item-year table).
data_dir = base + 'input/data/tr_v2_miss_year/'
years_df = pd.read_csv(data_dir + 'years.csv')
genres_df = pd.read_csv(data_dir + 'genres.csv')

# The timestamp column is not needed for the year range.
inters_df = pd.read_csv(data_dir + 'interactions.csv')
inters_df = inters_df.drop(columns='time')

iy_df = inters_df.merge(years_df, on='item').sort_values(by=['user', 'item'])

# After to_dict(): {'min': {user: year}, 'max': {user: year}}.
user_consumptions_movie_min_max_year = (
    iy_df.groupby('user')['year'].agg(['min', 'max']).to_dict()
)

# item -> year lookup; first() keeps the first non-null year per item.
item_year_df = years_df.groupby('item')['year'].first()
item_year_dict = item_year_df.to_dict()

In [None]:
# takes 7 mins
from tqdm.notebook import tqdm
from typing import Callable

def Rank(top100: pd.DataFrame, condition: Callable, topk: int = 10):
    """Select up to ``topk`` items per user from an ordered candidate table.

    Rows of ``top100`` are visited in order. For each row:

    1. if the user already has ``topk`` items, the row is ignored;
    2. otherwise the item is appended to the user's list when
       ``condition(user, year, genres)`` is truthy;
    3. otherwise the item is passed over and the user's skip counter is
       incremented.

    Relies on the module-level lookups ``item_year_dict`` (item -> year) and
    ``item2genres`` (item -> genres), which must exist before the call.

    Args:
        top100: Candidate table with ``user`` and ``item`` columns.
        condition: Callable ``(user, year, genres) -> bool`` deciding whether
            an item is kept.
        topk: Maximum number of items kept per user (default 10).

    Returns:
        ``(top10_dict, skipped)``: two defaultdicts keyed by user, holding the
        kept items (list) and the number of rejected candidates (int).
        Every user present in ``top100`` gets a key in ``top10_dict``, even
        when none of their candidates pass.

    Raises:
        KeyError: If an evaluated item is missing from ``item_year_dict`` or
            ``item2genres``.
    """
    from collections import defaultdict

    top10_dict = defaultdict(list)
    skipped = defaultdict(int)

    # Walk the two columns directly instead of iterrows(): it avoids building
    # a Series per row and keeps the ids in their column dtype (iterrows()
    # upcasts to float when the frame also holds float columns).
    pairs = zip(top100['user'], top100['item'])
    for user, item in tqdm(pairs, total=len(top100)):
        # Indexing the defaultdict registers the user even if nothing is kept.
        picked = top10_dict[user]
        if len(picked) >= topk:
            continue

        # Look up year/genres only for rows that are actually evaluated.
        if condition(user, item_year_dict[item], item2genres[item]):
            picked.append(item)
        else:
            skipped[user] += 1

    return top10_dict, skipped

def condition(user, year, genres):
    """Tell whether ``year`` lies inside the user's widened year range.

    The range is ``[min + down, max + up]``, where min/max come from
    ``user_consumptions_movie_min_max_year``. ``genres`` is accepted to match
    the callback signature used by ``Rank`` but is not used.

    NOTE(review): ``down`` and ``up`` are read from module scope and are not
    defined in the visible part of this notebook; confirm they are set before
    this runs.
    """
    year_bounds = user_consumptions_movie_min_max_year
    not_too_old = year_bounds['min'][user] + down <= year
    return not_too_old and year <= year_bounds['max'][user] + up
        
# Filter the candidates: top_items maps user -> kept items, skipped maps
# user -> number of rejected candidates.
top_items, skipped = Rank(top100, condition) # both dict

In [None]:
# NOTE(review): `pivoted_df` and `df` are not defined in the visible part of
# this notebook; presumably a user x genre table and an item/genre table.
# Confirm before running.

# 1. Normalize every row so that its values sum to 1.
import pandas as pd

df_normalized = pivoted_df.div(pivoted_df.sum(axis=1), axis=0)

# 2. For each row, keep the column label that holds the largest share.
from tqdm.notebook import tqdm

pec = {}
for index, row in tqdm(df_normalized.iterrows()):
    largest = row.nlargest(1)
    # .item() raises ValueError when nlargest() returns nothing (all-NaN row).
    pec[index] = largest.index.item()

# 3. item -> set of its genres.
item2genres = {item: set(group['genre']) for item, group in df.groupby('item')}
print(item2genres[1])

# 4. Show the row label -> top column mapping built in step 2.
print(pec)

In [None]:
def get_num_skips(skipped, num_users=31360):
    """Summarize how many candidates were skipped.

    Args:
        skipped: Mapping of user -> number of skipped candidates.
        num_users: Denominator for the per-user average. Defaults to 31360,
            the user count this notebook expects.

    Returns:
        ``(total, total / num_users)``.
    """
    total = sum(skipped.values())
    return total, total / num_users

# Total number of skipped candidates and that total divided by 31360.
get_num_skips(skipped)

# Results recorded from earlier runs, as (total skips, total / 31360):
# exp 1: max_year+3 -> (787, 0.02509566326530612)
# exp 2: min-2 <= item <= max_year+2 -> (409, 0.013042091836734694)
# exp 3: genre 30% >= -> include, or just top 10 from the top -> (5251, 0.16744260204081632)
# exp 4: min-2 <= item <= max_year+2 -> (409, 0.013042091836734694)

In [None]:
# Sanity check: every user in the candidate table must have an entry in
# top_items (31360 users expected).
n_candidate_users = len(top100['user'].unique())
assert n_candidate_users == len(top_items)

In [None]:
from collections import defaultdict

# Keep at most topK items per user. Iterating the items view directly (no
# enumerate) drops the unused counter and lets tqdm see the total length.
topK = 10
top10s = defaultdict(list)
for user, tops in tqdm(top_items.items()):
    top10s[user] = tops[:topK]

In [None]:
# Sanity check: collect the users whose list does not hold exactly 10 items,
# then show how many there are.
a = [user for user, picks in top10s.items() if len(picks) != 10]
len(a)

In [None]:
import random

# Seed the generator used for the random fallback picks.
random.seed(42)

# All distinct users and items found in `df`.
users = df['user'].unique()
total_item_set = set(df['item'].tolist())

def recommend_items(df):
    """Draw one random fallback item for every user listed in ``a``.

    For each user the candidate pool is every item in ``total_item_set``
    except the items the user already has in ``df`` and the items already
    selected for them in ``top10s``.

    Reads the module-level names ``a`` (users needing a fallback),
    ``total_item_set`` and ``top10s``.

    Args:
        df: Table with ``user`` and ``item`` columns.

    Returns:
        Dict mapping user -> one-element list holding the sampled item.

    Raises:
        ValueError: If a user has no remaining candidate item.
    """
    targets = list(a)

    # Group once instead of re-filtering the whole frame for every user.
    relevant = df[df['user'].isin(targets)]
    seen_by_user = {
        user: set(items) for user, items in relevant.groupby('user')['item']
    }

    recommendations = {}
    for user in tqdm(targets):
        excluded = seen_by_user.get(user, set()) | set(top10s[user])
        # random.sample() needs a sequence: sets were deprecated in 3.9 and
        # raise TypeError from 3.11. Sorting gives a stable order for the seed.
        candidates = sorted(total_item_set - excluded)
        recommendations[user] = random.sample(candidates, 1)
    return recommendations


In [None]:
# Draw one random fallback item for each user in `a`, excluding the items the
# user already has in `df` and the items already selected in `top10s`.
# Group once instead of re-filtering the whole frame for every user.
seen_by_user = {
    user: set(items)
    for user, items in df[df['user'].isin(a)].groupby('user')['item']
}

recommendations = {}
for user in tqdm(a):
    s = seen_by_user.get(user, set()) | set(top10s[user])
    # random.sample() needs a sequence: sets were deprecated in 3.9 and raise
    # TypeError from 3.11. Sorting gives a stable order for the seed.
    recommendations[user] = random.sample(sorted(total_item_set - s), 1)

In [None]:
# Build the submission table from top10s: one row per user with the item
# list, exploded to one (user, item) row per item, then saved as CSV.
submission = pd.DataFrame(
    {'user': list(top10s.keys()), 'item': list(top10s.values())}
)
submission = submission.explode('item')
submission.to_csv('EASE_year_up_down_2_check.csv', index=False)