In [121]:
import os
import pandas as pd
import numpy as np

# Directory that holds the raw training files, relative to this notebook.
data_path = '../data/train'
# Full training interaction table, read from train_ratings.csv under data_path.
train_ratings_csv = os.path.join(data_path, 'train_ratings.csv')
train_df = pd.read_csv(train_ratings_csv)

In [122]:
# Preview: the last 5 rows of every user, taken in the table's existing row order.
train_df.groupby('user').tail(n=5)

Unnamed: 0,user,item,time
371,11,48738,1294796106
372,11,6291,1294796113
373,11,46578,1294796119
374,11,7153,1294796132
375,11,4226,1294796159
...,...,...,...
5154466,138493,44022,1260209449
5154467,138493,4958,1260209482
5154468,138493,68319,1260209720
5154469,138493,40819,1260209726


In [123]:
# Preview: 5 random rows per user, drawn from the full table.
# Note: the split cell further down first removes each user's last rows and
# only then samples 5 rows from what is left.
#
# GroupBy.sample is used instead of groupby().apply(lambda x: x.sample(5)):
# apply() over frames that still contain the grouping column is deprecated in
# recent pandas, and once that column is excluded 'user' would be lost after
# reset_index(drop=True).
train_df.groupby('user').sample(n=5).reset_index(drop=True)

Unnamed: 0,user,item,time
0,11,34338,1230785494
1,11,2028,1230857271
2,11,153,1230858914
3,11,158,1230853973
4,11,608,1230858873
...,...,...,...
156795,138493,26242,1255817575
156796,138493,66762,1255805408
156797,138493,2424,1255817156
156798,138493,260,1258390295


In [110]:
# Set random seed
random_seed = 42
np.random.seed(random_seed)

# TODO: tune the split ratio -- num_seq + num_ran must add up to 10.
num_seq = 5  # rows taken from the end of each user's history
num_ran = 5  # rows drawn at random from the rest of each user's history

# Hold out the last num_seq rows of every user as the sequential part of the
# validation set. "Last" means last in train_df's row order
# (presumably chronological per user -- TODO confirm).
valid_data_last = train_df.groupby('user').tail(num_seq).copy()

# Exclude those held-out rows from the training pool (matched by row index).
train_data = train_df[~train_df.index.isin(valid_data_last.index)].copy()

# Randomly select num_ran of each user's remaining rows.
# GroupBy.sample keeps every column (including 'user'), unlike
# groupby().apply(lambda x: x.sample(...)), whose handling of the grouping
# column is deprecated in recent pandas. Passing random_state makes this cell
# give the same draw on every run, regardless of earlier RNG use.
# Raises ValueError if any user has fewer than num_ran rows left.
random_data = (
    train_data.groupby('user')
    .sample(n=num_ran, random_state=random_seed)
    .reset_index(drop=True)
)

# Exclude random_data from train_data based on matching (user, item) pairs.
# A MultiIndex membership test does this in one vectorised pass instead of
# building a Python tuple for every row with apply(tuple, axis=1).
train_pairs = pd.MultiIndex.from_frame(train_data[['user', 'item']])
random_pairs = pd.MultiIndex.from_frame(random_data[['user', 'item']])
train_data = train_data[~train_pairs.isin(random_pairs)].copy()

# Validation set = sequential hold-out + random hold-out, reduced to the
# (user, item) columns and ordered by user.
valid_data = pd.concat([valid_data_last, random_data], ignore_index=True)
valid_data = valid_data[['user', 'item']].sort_values('user')

# Create a folder to store the datasets
folder_path = "datasets"
os.makedirs(folder_path, exist_ok=True)

# Save train data and valid data as .csv files
train_data.to_csv(os.path.join(folder_path, 'train_data.csv'), index=False)
valid_data.to_csv(os.path.join(folder_path, 'valid_data.csv'), index=False)


           user   item        time
371          11  48738  1294796106
372          11   6291  1294796113
373          11  46578  1294796119
374          11   7153  1294796132
375          11   4226  1294796159
...         ...    ...         ...
5154466  138493  44022  1260209449
5154467  138493   4958  1260209482
5154468  138493  68319  1260209720
5154469  138493  40819  1260209726
5154470  138493  27311  1260209807

[156800 rows x 3 columns]


In [125]:
# Duplicate check
def check_duplicates(df, lst):
    """Print the rows of ``df`` that repeat an earlier row on the columns ``lst``.

    Args:
        df: DataFrame to inspect.
        lst: Column label(s) passed to ``DataFrame.duplicated(subset=...)``.

    Returns:
        None. Prints a "no duplicates" message when nothing repeats, otherwise
        a header line followed by the repeated rows (first occurrences are not
        listed).
    """
    repeated_mask = df.duplicated(subset=lst)

    # Guard clause: report the offending rows and stop.
    if repeated_mask.any():
        print('중복된 데이터:')
        print(df.loc[repeated_mask])
        return

    print('중복된 데이터가 없습니다.')

# Stack the validation rows on top of the training rows and report any
# (user, item) pair that shows up more than once across the two sets.
check_data = pd.concat((valid_data, train_data))
check_duplicates(check_data, lst=['user', 'item'])

중복된 데이터가 없습니다.


In [130]:
# Number of validation rows held out for each user.
user_item_counts = valid_data.groupby('user').size()

# Labels previously read "Vakid data ..." -- typo for "Valid".
print("Valid data 유저별 항목 개수:")
print(user_item_counts)
print()
# nunique() over the per-user counts: 1 means every user has the same count.
print("Valid data 유저별 항목 카테고리 수:",user_item_counts.nunique())

Vakid data 유저별 항목 개수:
user
11        10
14        10
18        10
25        10
31        10
          ..
138473    10
138475    10
138486    10
138492    10
138493    10
Length: 31360, dtype: int64

Vakid data 유저별 항목 카테고리 수: 1


In [117]:
import pandas as pd

def recall_at_10(true_df, pred_df, k=10):
    """Mean per-user Recall@k of predicted items against held-out items.

    For every user present in ``true_df`` the score is
    ``len(true_items & pred_items) / min(k, len(true_items))`` and the function
    returns the plain average of those scores. With exactly ``k`` ground-truth
    items per user this equals the previous fixed ``/ 10`` denominator.

    Args:
        true_df: DataFrame with ``user`` and ``item`` columns (ground truth).
        pred_df: DataFrame with ``user`` and ``item`` columns (recommendations).
            Every row of a user is used as-is; the frame is not truncated to
            ``k`` items here, so it should already hold the top-k per user.
        k: Cut-off used in the denominator. Defaults to 10.

    Returns:
        float: mean recall over the users of ``true_df``; ``0.0`` when
        ``true_df`` contains no users.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    if true_df.empty:
        return 0.0

    # Set of ground-truth items per user (Series indexed by user).
    true_items = true_df.groupby('user')['item'].apply(set)

    # Set of predicted items per user, as a dict so each lookup is O(1)
    # instead of filtering the whole prediction frame once per user.
    if pred_df.empty:
        pred_lookup = {}
    else:
        pred_lookup = pred_df.groupby('user')['item'].apply(set).to_dict()

    # Per-user recall; a user without any prediction scores 0 instead of
    # raising IndexError.
    no_prediction = set()
    recall_scores = []
    for user, true_set in true_items.items():
        pred_set = pred_lookup.get(user, no_prediction)
        hits = len(true_set & pred_set)
        recall_scores.append(hits / min(k, len(true_set)))

    # groupby drops NaN user keys, so the list can still be empty here.
    if not recall_scores:
        return 0.0

    # Average over all users.
    return sum(recall_scores) / len(recall_scores)


# Ground truth saved by the split cell, and the recommendation file to score.
# NOTE(review): the prediction path is an absolute, machine-specific location.
valid_csv_path = 'datasets/valid_data.csv'
pred_csv_path = '/opt/ml/input/junwon/datasets/output_2023-06-12_15:35:47.csv'
true_df = pd.read_csv(valid_csv_path)
pred_df = pd.read_csv(pred_csv_path)

# Evaluate Recall@10 and report it.
recall_10 = recall_at_10(true_df=true_df, pred_df=pred_df)
print("Recall@10:", recall_10)


         user                                         true_items
0          11  {48738, 4226, 43556, 53996, 7153, 46578, 19, 6...
1          14  {1282, 3471, 2161, 594, 468, 44022, 1271, 1177...
2          18  {7456, 60482, 186, 59018, 44555, 63062, 7767, ...
3          25  {2692, 1732, 2571, 428, 1136, 1265, 337, 500, ...
4          31  {48322, 56775, 58025, 79695, 89745, 33493, 562...
...       ...                                                ...
31355  138473  {4226, 74789, 4262, 64839, 2858, 3949, 51662, ...
31356  138475  {3683, 25923, 6611, 4405, 951, 7135, 923, 7132...
31357  138486  {6242, 6502, 8360, 4105, 8874, 4720, 2288, 127...
31358  138492  {7361, 1732, 69, 357, 3210, 1197, 8623, 1968, ...
31359  138493  {2827, 48780, 38798, 27311, 594, 40819, 44022,...

[31360 rows x 2 columns]
         user                                         pred_items
0          11  {8961, 32587, 33004, 7373, 7438, 40815, 47, 71...
1          14  {4963, 1223, 6539, 588, 1198, 4016, 914, 1907,...