In [1]:
import os, sys
import pandas as pd
import numpy as np
import json

In [2]:
# Path setup: put the notebook's working directory on the module search path.
PROJECT_ROOT = os.path.abspath("")
# Append only once so that re-running this cell does not keep growing sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
print(PROJECT_ROOT)

d:\base\boostcamp\temp\code


In [3]:
# Directory that holds one score CSV per model
data_path: str = "data/eval/score"


def _read_scores(file_name: str) -> pd.DataFrame:
    """Read the score CSV called `file_name` from the `data_path` directory."""
    return pd.read_csv(os.path.join(data_path, file_name))


# One score table per model; all five are combined in the ensemble cell below
ease: pd.DataFrame = _read_scores("score_ease_1.csv")
recvae: pd.DataFrame = _read_scores("score_recvae.csv")
ract: pd.DataFrame = _read_scores("score_ract_1.csv")
bert: pd.DataFrame = _read_scores("recbole_bert4rec_top20_score.csv")
lgcn: pd.DataFrame = _read_scores("recbole_lightgcn_top20_score.csv")


In [4]:
# Preview the EASE score table (pandas truncates the display of long frames).
ease

Unnamed: 0,user,item,score
0,11,4886,0.826884
1,11,4370,0.754824
2,11,8961,0.751770
3,11,40815,0.699711
4,11,47,0.548352
...,...,...,...
627195,138493,597,0.419247
627196,138493,589,0.418782
627197,138493,8961,0.415376
627198,138493,45517,0.407788


In [5]:
def zscore_by_user(scores_df: pd.DataFrame) -> pd.Series:
    """Standardize `score` within each user's rows.

    Computes (score - user mean) / user sample standard deviation, using the
    `user` and `score` columns of `scores_df`.

    A user whose rows have no spread (a single row, or identical scores) has a
    NaN or zero standard deviation; those rows get 0.0 instead of NaN/inf so
    they stay neutral when scores are summed across models.

    Returns:
        A Series aligned with `scores_df.index` holding the per-user z-scores.
    """
    per_user_score = scores_df.groupby('user')['score']
    centered = scores_df['score'] - per_user_score.transform('mean')
    spread = per_user_score.transform('std')
    # `spread > 0` is False for both 0 and NaN, so both cases fall back to 0.0.
    return (centered / spread).where(spread > 0, 0.0)


# Add a per-user standardized score column to every model's score table
for model_scores in (ease, recvae, ract, bert, lgcn):
    model_scores['score_normalized'] = zscore_by_user(model_scores)


In [6]:
# Inspect the RACT table, which now carries the per-user `score_normalized` column.
ract

Unnamed: 0,user,item,score,score_normalized
0,11,4370,5.154140,3.178949
1,11,37386,4.681453,1.399818
2,11,55232,4.584410,1.034560
3,11,8961,4.565242,0.962417
4,11,1517,4.379100,0.261802
...,...,...,...,...
627195,138493,32,3.351886,-0.881276
627196,138493,1619,3.334370,-1.011881
627197,138493,2004,3.308694,-1.203320
627198,138493,589,3.304873,-1.231814


In [8]:
# Stack the normalized scores of all five models into one long table
merged_df = pd.concat(
    [
        model_scores[['user', 'item', 'score_normalized']]
        for model_scores in (ease, recvae, ract, bert, lgcn)
    ]
)

# Sum the normalized scores of every (user, item) pair across the models.
# NOTE(review): a pair that a model's file does not contain adds nothing to the
# sum, i.e. it counts like that model's per-user mean (z = 0), so it can outrank
# a pair the model did list with a below-mean (negative) score — confirm intended.
ensemble_df = merged_df.groupby(['user', 'item'], as_index=False)['score_normalized'].sum()

# For each user keep the 10 items with the highest summed score
final_df = (
    ensemble_df
    .sort_values(by=['user', 'score_normalized'], ascending=[True, False])
    .groupby('user')
    .head(10)
    .reset_index(drop=True)
)

final_df

Unnamed: 0,user,item,score_normalized
0,11,4370,8.988302
1,11,4886,5.115105
2,11,37386,4.068091
3,11,8961,3.483744
4,11,48780,1.996654
...,...,...,...
313595,138493,2012,1.997651
313596,138493,551,1.617706
313597,138493,593,1.565662
313598,138493,2628,1.437350


In [9]:
# Reduce the ranked table to its user and item columns (the summed score is dropped)
submission = final_df[['user', 'item']]
submission


Unnamed: 0,user,item
0,11,4370
1,11,4886
2,11,37386
3,11,8961
4,11,48780
...,...,...
313595,138493,2012
313596,138493,551
313597,138493,593
313598,138493,2628


In [10]:
# Write the (user, item) recommendations to disk without the DataFrame index
ensemble_path = "data/eval/score/ensemble_soft_1.csv"
submission.to_csv(ensemble_path, index=False)


In [11]:
# Load the recommendation list stored in ensemble_rank_normalization.csv;
# it is compared against `submission` in the next cell.
top20 = "data/eval/top20"
rank_ensemble_file = os.path.join(top20, "ensemble_rank_normalization.csv")
rec = pd.read_csv(rank_ensemble_file)
rec

Unnamed: 0,user,item
0,11,4886
1,11,8961
2,11,4370
3,11,37386
4,11,7438
...,...,...
313595,138493,2011
313596,138493,2174
313597,138493,2762
313598,138493,593


In [12]:
# Check how many items each user receives in either recommendation table
print("rec의 유저당 아이템 수:", rec.groupby('user').size().unique())
print("submission의 유저당 아이템 수:", submission.groupby('user').size().unique())

# Overall difference: (user, item) pairs present in rec but absent from submission
total_diff = len(set(zip(rec['user'], rec['item'])) - set(zip(submission['user'], submission['item'])))
print("\n총 다른 추천 수:", total_diff)

# Per-user difference: number of rec items that submission does not contain.
# Each frame is grouped once instead of being re-filtered with a boolean mask
# for every user, which scanned both full tables once per user.
sub_items_by_user = {
    user: set(items) for user, items in submission.groupby('user')['item']
}
no_items = set()  # users that appear only in rec have nothing in submission
user_diff = [
    len(set(items) - sub_items_by_user.get(user, no_items))
    # sort=False keeps users in order of first appearance, like rec['user'].unique()
    for user, items in rec.groupby('user', sort=False)['item']
]

print("유저당 평균 다른 추천 수:", np.mean(user_diff))
print("유저당 최소 다른 추천 수:", min(user_diff))
print("유저당 최대 다른 추천 수:", max(user_diff))


rec의 유저당 아이템 수: [10]
submission의 유저당 아이템 수: [10]

총 다른 추천 수: 121389
유저당 평균 다른 추천 수: 3.8708227040816325
유저당 최소 다른 추천 수: 0
유저당 최대 다른 추천 수: 9
