In [4]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from datetime import datetime

# 실험용

In [5]:
# Load the raw interaction log (one row per user/item interaction).
data_file = "/opt/ml/input/data/train/train_ratings.csv"
rating_df = pd.read_csv(data_file)

# Distinct raw ids, in order of first appearance in the file.
user_list = rating_df['user'].unique()
item_list = rating_df['item'].unique()

# Raw id -> contiguous 0-based index (position inside user_list / item_list).
user_to_index = dict(zip(user_list, range(len(user_list))))
item_to_index = dict(zip(item_list, range(len(item_list))))

num_users = len(user_to_index)
num_items = len(item_to_index)

# Replace the raw ids in the frame with their contiguous indices.
rating_df = rating_df.assign(
    user=rating_df['user'].map(user_to_index),
    item=rating_df['item'].map(item_to_index),
)

# Coordinates of the stored entries of the user x item matrix.
rows = rating_df['user'].to_numpy()
cols = rating_df['item'].to_numpy()

# Every observed interaction gets the same constant score.
rating_score = 0.9
data = np.full(rows.shape[0], rating_score)
rating_matrix = csr_matrix((data, (rows, cols)), shape=(num_users, num_items))


In [6]:
class EASE:
    """Closed-form item-item linear model (EASE-style).

    Training computes the regularised Gram matrix ``G = X^T X + lambda * I``,
    takes its (pseudo-)inverse ``P`` and sets ``B[i, j] = -P[i, j] / P[j, j]``
    with a zero diagonal. Scores are then ``X @ B``.

    Attributes:
        B: item x item weight matrix; ``None`` until ``train`` succeeds.
        _lambda: value added to the diagonal of the Gram matrix.
    """

    def __init__(self, _lambda):
        self.B = None
        self._lambda = _lambda

    def train(self, X):
        """Fit the item-item weight matrix ``B`` from an interaction matrix.

        Args:
            X: user x item matrix, either a scipy sparse matrix or a dense
                array-like.

        Raises:
            Any exception from the linear algebra is propagated to the caller
            rather than printed and swallowed, so a failed fit cannot be
            mistaken for a trained model.
        """
        print("X shape:", X.shape)

        if hasattr(X, "toarray"):
            # Sparse input: form the Gram matrix sparsely and densify only the
            # item x item result, not the whole user x item matrix.
            G = (X.T @ X).toarray()
        else:
            X_dense = np.asarray(X)
            G = X_dense.T @ X_dense
        # Force float so that a fractional lambda is not truncated when X has
        # an integer dtype.
        G = G.astype(np.float64, copy=False)
        print("G shape:", G.shape)

        diag_indices = np.diag_indices_from(G)
        G[diag_indices] += self._lambda
        print("G after adding lambda on diag:", G.shape)

        # Pseudo-inverse keeps working even if G is singular (e.g. lambda = 0).
        P = np.linalg.pinv(G)
        print("P shape:", P.shape)

        # Broadcasting divides column j by -P[j, j].
        B = P / -np.diag(P)
        B[diag_indices] = 0
        self.B = B
        print("Final B shape:", self.B.shape)

    def predict(self, X):
        """Return the dense score matrix ``X @ B``.

        Args:
            X: user x item matrix, either a scipy sparse matrix or a dense
                array.

        Raises:
            RuntimeError: if the model has not been trained yet.
        """
        if self.B is None:
            raise RuntimeError("EASE model is not trained; call train() first.")
        X_dense = X.toarray() if hasattr(X, "toarray") else X
        return X_dense @ self.B

In [7]:
# Fit the EASE model on the full interaction matrix.
_lambda = 750
ease = EASE(_lambda=_lambda)
ease.train(X=rating_matrix)

# Score every (user, item) pair.
predictions = ease.predict(X=rating_matrix)

X shape: (31360, 6807)
G shape: (6807, 6807)
G after adding lambda on diag: (6807, 6807)
P shape: (6807, 6807)
Final B shape: (6807, 6807)


In [9]:
# Extract the top-N items per user and write them as a submission file.

N = 10
top_n_items_per_user = []

# Never ask argpartition for more items than exist.
k = min(N, predictions.shape[1])

# Items a user already interacted with are excluded by masking them with -inf.
# The mask is applied to a per-user copy of the score row so that the shared
# `predictions` array is left untouched for any other cell that reads it.
seen_indptr = rating_matrix.indptr
seen_indices = rating_matrix.indices

for user_idx in range(predictions.shape[0]):
    user_predictions = predictions[user_idx, :].copy()
    # Column indices of the stored entries in this user's CSR row.
    seen_items = seen_indices[seen_indptr[user_idx]:seen_indptr[user_idx + 1]]
    user_predictions[seen_items] = -np.inf

    # argpartition finds the k largest in O(n); only those k are then sorted.
    top_n_indices = np.argpartition(user_predictions, -k)[-k:]
    top_n_indices_sorted = top_n_indices[np.argsort(user_predictions[top_n_indices])[::-1]]
    top_n_items_per_user.append(top_n_indices_sorted)

# Map matrix column indices back to the original item ids.
index_to_item = {index: item for item, index in item_to_index.items()}
top_n_items_per_user_ids = [[index_to_item[idx] for idx in user_items] for user_items in top_n_items_per_user]

# Build (user, item) rows; row i of the matrix corresponds to user_list[i].
result = [
    (user_id, item_id)
    for user_id, items in zip(user_list, top_n_items_per_user_ids)
    for item_id in items
]

# Save with lambda and a timestamp in the file name.
current_time = datetime.now().strftime('%Y%m%d-%H%M%S')
args_str = f"submission_EASE-lam{_lambda}_tm{current_time}"
submission_df = pd.DataFrame(result, columns=['user', 'item'])
submission_df.to_csv(f"{args_str}.csv", index=False)


In [None]:
# Extract the bottom-N (lowest-scored) items per user and write them to a file.

N = 80
bottom_n_items_per_user = []  # per-user arrays of the N lowest-scored item indices

# Never ask argpartition for more items than exist.
k = min(N, predictions.shape[1])

# Items the user already interacted with must not be picked as negatives.
# They are masked with +inf on a per-user copy of the score row. This also
# overrides any -inf that an earlier cell may have written into `predictions`
# at those positions, which would otherwise make the seen items rank lowest.
seen_indptr = rating_matrix.indptr
seen_indices = rating_matrix.indices

for user_idx in range(predictions.shape[0]):
    user_predictions = predictions[user_idx, :].copy()
    # Column indices of the stored entries in this user's CSR row.
    seen_items = seen_indices[seen_indptr[user_idx]:seen_indptr[user_idx + 1]]
    user_predictions[seen_items] = np.inf

    # argpartition finds the k smallest in O(n); only those k are then sorted.
    bottom_n_indices = np.argpartition(user_predictions, k - 1)[:k]
    bottom_n_indices_sorted = bottom_n_indices[np.argsort(user_predictions[bottom_n_indices])]
    bottom_n_items_per_user.append(bottom_n_indices_sorted)


bottom_n_items_per_user = np.array(bottom_n_items_per_user)

# Map matrix column indices back to the original item ids.
index_to_item = {index: item for item, index in item_to_index.items()}
bottom_n_items_per_user_ids = [[index_to_item[idx] for idx in user_items] for user_items in bottom_n_items_per_user]

# Build (user, item) rows; row i of the matrix corresponds to user_list[i].
result = [
    (user_id, item_id)
    for user_id, items in zip(user_list, bottom_n_items_per_user_ids)
    for item_id in items
]

# Save with lambda and a timestamp in the file name.
current_time = datetime.now().strftime('%Y%m%d-%H%M%S')
args_str = f"Negative_EASE-lam{_lambda}_tm{current_time}"
submission_df = pd.DataFrame(result, columns=['user', 'item'])
submission_df.to_csv(f"{args_str}.csv", index=False)
