In [None]:
import numpy as np
from scipy import sparse
import pandas as pd

In [None]:
# Load the training interactions and keep only the (user, item) pair columns.
df = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv').loc[:, ['user', 'item']]
df.head()

In [None]:
# Distinct raw ids, in order of first appearance in df.
unique_uid = pd.unique(df['user'])
unique_sid = pd.unique(df['item'])

# Raw id -> contiguous 0-based index (position of first appearance).
user_mapping = {raw_id: pos for pos, raw_id in enumerate(unique_uid)}
item_mapping = {raw_id: pos for pos, raw_id in enumerate(unique_sid)}

# One (row, col) index pair per interaction row of df.
src = list(map(user_mapping.__getitem__, df['user']))
dst = list(map(item_mapping.__getitem__, df['item']))

# Dimensions of the interaction matrix.
n_users = df['user'].nunique()
n_items = df['item'].nunique()

In [None]:
# Sparse user x item interaction matrix: a 1.0 at every (src, dst) pair.
M = sparse.csr_matrix(
    (np.ones(len(src), dtype='float64'), (src, dst)),
    shape=(n_users, n_items),
    dtype='float64',
)

In [None]:
# Degree = number of distinct neighbours of each user / item.
#
# groupby() returns its result sorted by the raw id, but the rows and columns
# of M are numbered by order of first appearance in df (the mappings are built
# from unique()). Reindex onto that first-appearance order so that
# user_degrees[k] / item_degrees[k] really describe row k / column k of M.
user_degrees = (
    df.groupby('user')['item']
    .nunique()
    .reindex(pd.unique(df['user']))
    .values
)
item_degrees = (
    df.groupby('item')['user']
    .nunique()
    .reindex(pd.unique(df['item']))
    .values
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Every exponent shares the same candidate values; each key gets its own list.
_GRID_VALUES = (0.0, 0.17, 0.34, 0.5, 0.67, 0.84, 1.0)
param_grid = {
    name: list(_GRID_VALUES)
    for name in ('alpha', 'beta', 'gamma', 'delta')
}

# Exponents actually used by the cells below.
alpha = 0.17
beta = gamma = delta = 0.34

In [None]:
# Inputs prepared by the earlier cells:
#   user_degrees, item_degrees -- 1-D arrays of per-user / per-item degrees
#   M                          -- scipy sparse CSR user x item interaction matrix
#   alpha, beta, gamma, delta  -- degree-normalisation exponents
#
# Raise the degrees to the negated exponents.
user_alpha = user_degrees ** -alpha
user_gamma = user_degrees ** -gamma
item_beta = item_degrees ** -beta
item_delta = item_degrees ** -delta

# Outer products: entry [u, i] is (user factor of u) * (item factor of i).
# These are dense (n_users, n_items) arrays.
alpha_beta = user_alpha[:, np.newaxis] * item_beta
gamma_delta = user_gamma[:, np.newaxis] * item_delta

In [None]:
# Element-wise (Hadamard) weighting of the interactions by each outer product.
M_alpha_beta = M.multiply(alpha_beta)
M_gamma_delta = M.multiply(gamma_delta)

# Score matrix: (user x item) @ (item x user) @ (user x item) -> user x item.
L = (M_alpha_beta @ M.T) @ M_gamma_delta

In [None]:
# Inverse lookups: contiguous index -> raw id.
item_mapping_reversed = {pos: raw_id for raw_id, pos in item_mapping.items()}
user_mapping_reversed = {pos: raw_id for raw_id, pos in user_mapping.items()}

In [None]:
# Per-user ranking: row u lists item column indices ordered from highest to
# lowest score in L (these are matrix indices, not raw item ids).
idx = (-L.toarray()).argsort(axis=1)

In [None]:
from tqdm import tqdm
import ast

In [None]:
# raw user id -> full ranked list of raw item ids (best first).
pred_dic = {
    user_mapping_reversed[row_pos]: [item_mapping_reversed[col] for col in ranked_cols]
    for row_pos, ranked_cols in enumerate(tqdm(idx))
}

# Full training data.
rating_path = '/opt/ml/input/data/train/train_ratings.csv'
train_df = pd.read_csv(rating_path)

# Empty submission frame: ten rows per user, item column initialised to 0.
user_unique = train_df['user'].unique()
users = np.repeat(user_unique, 10)
test_df = pd.DataFrame({'user': users})
test_df['item'] = 0

# Load the CSV that stores, per user, the movies they have already seen.
seen_path = '/opt/ml/input/melon/phil/EDA/seen_movie.csv'
seen_df = pd.read_csv(seen_path)

# Convert to a dict: user -> parsed Python literal from the 'seen' column
# (the column holds the literal's string representation).
seen_dic = seen_df.set_index('user')['seen'].to_dict()
seen_dic = {
    user: ast.literal_eval(raw_seen)
    for user, raw_seen in tqdm(seen_dic.items())
}

# For each user, drop already-seen items from the ranked predictions and keep
# the first TOP_K that remain.
#
# The results are collected per user and the frame is built once at the end.
# This avoids one scalar `.loc` write per recommended row, and it no longer
# raises IndexError when a user has fewer than TOP_K unseen items (such a user
# simply contributes fewer rows).
TOP_K = 10
recommended = []
for user in tqdm(user_unique):
    ranked_items = np.asarray(pred_dic[user])
    seen_items = np.asarray(seen_dic[user])
    unseen_items = ranked_items[~np.isin(ranked_items, seen_items)]
    recommended.append(unseen_items[:TOP_K])

# Same layout as the placeholder frame: consecutive rows per user, in
# user_unique order, with columns ['user', 'item'].
rec_counts = [len(items) for items in recommended]
test_df = pd.DataFrame({
    'user': np.repeat(user_unique, rec_counts),
    'item': np.concatenate(recommended) if recommended else np.array([], dtype='int64'),
})

In [None]:
from collections import defaultdict

diff_path = '/opt/ml/input/melon/phil/EDA/diff_movie4.csv'

user_unique = test_df['user'].unique()

# user -> list of recommended items, in row order.
# A single groupby pass replaces filtering the whole frame once per user,
# which cost O(n_users * n_rows).
k_dic = defaultdict(list)
for user, user_items in test_df.groupby('user', sort=False)['item']:
    k_dic[user] = user_items.tolist()

# Load the per-user held-out movies (per the original note: 4 movies per user).
diff_df = pd.read_csv(diff_path)

# Convert to a dict: user -> parsed Python literal from the 'diff' column.
diff_dic = diff_df.set_index('user')['diff'].to_dict()
diff_dic = {
    user: ast.literal_eval(raw_diff)
    for user, raw_diff in tqdm(diff_dic.items())
}

# Per user: recommended items that also appear in the ground truth.
# A user missing from diff_dic raises KeyError, as before.
inter_dic = {
    user: set(k_dic[user]).intersection(diff_dic[user])
    for user in user_unique
}

# Per user: number of ground-truth hits.
correct_dic = {user: len(hits) for user, hits in inter_dic.items()}

# Total number of hits over all users.
print(sum(correct_dic.values()))