In [79]:
import os
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
import numpy as np
from scipy.sparse import csr_matrix
from datetime import datetime
import pytz
import csv

## user, item matrix 생성 및 index 저장

In [81]:
train_df = pd.read_csv('/opt/ml/input/junwon/data/train/train_ratings.csv')

# 유니크한 유저와 아이템 목록 추출
users = train_df['user'].unique()
items = train_df['item'].unique()

# 유저와 아이템의 인덱스 매핑 생성
user_index = {user: i for i, user in enumerate(users)}
item_index = {item: i for i, item in enumerate(items)}

# 행렬 생성
row_indices = train_df['user'].map(user_index)
col_indices = train_df['item'].map(item_index)
values = np.ones(len(train_df))
matrix = csr_matrix((values, (row_indices, col_indices)), shape=(len(users), len(items)))

# user와 item 값을 그대로 사용하기 위해 인덱스를 역으로 매핑하여 딕셔너리 생성
index_to_user = {i: user for user, i in user_index.items()}
index_to_item = {i: item for item, i in item_index.items()}

## MF with SVD 

### Train

In [82]:
# Adding a new column 'raing' with values initialized to 1
train_df['raing'] = 1

reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(train_df[['user', 'item', 'raing']], reader)

# Use SVD algorithm with default hyperparameters
algo = SVD()

# Evaluate the performance of the algorithm using cross validation
cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

# Train the model on the entire dataset
trainset = data.build_full_trainset()
algo.fit(trainset)

### Inference

In [None]:
# Get the learned user and item matrices
user_matrix = algo.pu
item_matrix = algo.qi

# Print the shape of the user matrix
print("User matrix shape:", user_matrix.shape)

# Print the shape of the item matrix
print("Item matrix shape:", item_matrix.shape)

User matrix shape: (31360, 100)
Item matrix shape: (6807, 100)


In [71]:
result = user_matrix@item_matrix.T

In [72]:
result[matrix.nonzero()] = -np.inf
result

array([[       -inf,        -inf,        -inf, ..., -0.00720431,
        -0.0503347 , -0.01099133],
       [-0.00461095, -0.00075128, -0.00804032, ..., -0.09922199,
         0.11263872,  0.02426618],
       [ 0.00344952,  0.00143465, -0.00616828, ..., -0.13241257,
         0.02637532, -0.06163633],
       ...,
       [-0.00293851,  0.00591415, -0.00017686, ...,  0.03608433,
        -0.01298933,  0.07074937],
       [-0.00187018,  0.00817371, -0.00275791, ...,  0.1104404 ,
         0.05940141, -0.11123025],
       [       -inf,  0.00161238,  0.00273885, ...,  0.01533187,
        -0.00242231,  0.0190496 ]])

In [73]:
# 유저별 상위 10개의 값을 저장
recommend_list=[]
for i in range(len(result)):
    sorted_indices = np.argsort(-result[i])
    for j in range(10):
        recommend_list.append((index_to_user[i],index_to_item[sorted_indices[j]]))

In [77]:
# output 폴더 생성
output_folder = 'output'
os.makedirs(output_folder, exist_ok=True)

def korea_date_time():
    """
    Retrieves the current date and time in the Korea Standard Time (KST) timezone.

    Returns:
        str: The current date and time formatted as 'YYYY-MM-DD_HH:MM:SS' in KST.
    """
    korea_timezone = pytz.timezone("Asia/Seoul")
    date_time = datetime.now(tz=korea_timezone)
    date_time = date_time.strftime("%Y-%m-%d_%H:%M:%S")
    
    return date_time

date_time = korea_date_time()

# CSV 파일에 데이터 저장
filename = filename = os.path.join(output_folder, f'output_{date_time}.csv')
with open(filename, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['user', 'item'])  # 컬럼명 추가
    writer.writerows(recommend_list)  # 데이터 추가

print(f"Recommendations saved to '{filename}'")

Recommendations saved to 'output/output_2023-06-16_12:20:33.csv'


### 결과
- Top-K 예측은 거의 되지 않음.