In [57]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from sklearn.metrics import pairwise_distances_argmin_min

# movie.csv와 user, item, cluster, tsne좌표로 이루어진 데이터프레임을 생성합니다.
# 파일 경로와 실제 데이터에 맞게 코드를 수정해야 합니다.
data_dir = "/data/ephemeral/level2-movierecommendation-recsys-06/data/train/"
cluster_2d = "kmeans2d"
movie_df = pd.read_csv(data_dir + f"movie_{cluster_2d}.csv")
user_movie_df = pd.read_csv(data_dir + f"user_movie_{cluster_2d}.csv")

In [58]:
# 각 유저의 상위 10개 클러스터를 추출합니다.
user_top_clusters = pd.Series(
    user_movie_df.groupby(["user", "cluster"])
    .size()
    .groupby("user")
    .nlargest(5)
    .reset_index(level=1, drop=True)
    , name="count"
).to_frame()

In [59]:
print(user_top_clusters)

                count
user   cluster       
11     10          70
       0           45
       17          37
       11          26
       18          23
...               ...
138493 17          48
       10          23
       12          23
       11          21
       9           20

[156800 rows x 1 columns]


In [60]:
idx = user_movie_df[user_movie_df.set_index(["user", "cluster"]).index.isin(user_top_clusters.index)]
idx

Unnamed: 0,user,item,time,genre,year,cluster,t-SNE1,t-SNE2
0,11,880,1230787643,"['Sci-Fi', 'Thriller']",1996,0,-12.032172,48.861572
1,11,2232,1230788590,"['Thriller', 'Sci-Fi', 'Horror', 'Mystery']",1997,0,-22.246576,34.088640
2,11,2720,1230854280,"['Children', 'Action', 'Comedy', 'Adventure']",1999,17,-67.312294,14.625511
3,11,2642,1230788958,"['Action', 'Sci-Fi', 'Adventure']",1983,10,-77.085740,-17.196512
5,11,1917,1230785499,"['Romance', 'Action', 'Sci-Fi', 'Thriller']",1998,0,-25.862740,42.331676
...,...,...,...,...,...,...,...,...
5154463,138493,47465,1256680620,"['Fantasy', 'Drama', 'Thriller']",2005,11,-31.799116,22.496243
5154464,138493,2124,1256680192,"['Children', 'Fantasy', 'Comedy']",1991,11,-38.081944,22.059010
5154465,138493,2085,1256750533,"['Animation', 'Children', 'Adventure']",1961,17,-63.225830,12.620463
5154469,138493,1884,1255807098,"['Comedy', 'Adventure', 'Drama']",1998,12,-48.997196,-39.552235


In [61]:
# "user"와 "cluster"를 기준으로 그룹화하고 T-SNE1 및 T-SNE2의 평균 계산
avg_tsne_values = user_movie_df.groupby(["user", "cluster"])[["t-SNE1", "t-SNE2"]].mean().reset_index()

# user_movie_df와 avg_tsne_values를 user 및 cluster를 기준으로 조인
user_movie_df_with_avg_tsne = pd.merge(user_top_clusters, avg_tsne_values, on=["user", "cluster"], how="left")

user_movie_df_with_avg_tsne

Unnamed: 0,user,cluster,count,t-SNE1,t-SNE2
0,11,10,70,-72.502725,-12.376598
1,11,0,45,-20.286383,43.435302
2,11,17,37,-59.107727,4.301952
3,11,11,26,-32.529107,28.154112
4,11,18,23,-18.362526,6.791109
...,...,...,...,...,...
156795,138493,17,48,-58.142424,4.566223
156796,138493,10,23,-74.565766,-13.042378
156797,138493,12,23,-53.253760,-29.342021
156798,138493,11,21,-31.111905,24.040460


In [77]:
cluster = defaultdict(list)
for i, ser in movie_df.iterrows():
    cluster[ser["cluster"]].append(ser["item"])
cluster

defaultdict(list,
            {17: [1,
              2,
              8,
              10,
              13,
              48,
              60,
              95,
              107,
              126,
              158,
              169,
              170,
              362,
              364,
              380,
              421,
              434,
              455,
              464,
              494,
              551,
              558,
              588,
              594,
              595,
              596,
              609,
              631,
              648,
              661,
              709,
              711,
              733,
              736,
              783,
              798,
              828,
              836,
              901,
              908,
              935,
              986,
              1009,
              1011,
              1015,
              1017,
              1022,
              1023,
              1024,
              1025,
            

In [100]:
temp = movie_df.set_index("item")
temp.loc[10]

genre      ['Thriller', 'Action', 'Adventure']
year                                      1995
cluster                                     17
t-SNE1                               -75.54232
t-SNE2                               13.230308
Name: 10, dtype: object

In [101]:
submission = pd.DataFrame(columns=["user", "item"])
dic = {"user": [], "item": []}

for i, avg in tqdm(user_movie_df_with_avg_tsne.iterrows(), total=user_movie_df_with_avg_tsne.shape[0]):
    tmp = []
    for item_id in cluster[avg["cluster"]]:
        dis = np.sqrt((avg["t-SNE1"] - temp.loc[item_id]["t-SNE1"]) ** 2 + (avg["t-SNE2"] - temp.loc[item_id]["t-SNE2"]) ** 2)
        if len(tmp) != 2:
            tmp.append([item_id, dis])
            tmp.sort(key=lambda x: x[1])
        else:
            if tmp[0][1] > dis:
                tmp[1] = [item_id, dis]
                tmp.sort(key=lambda x: x[1])
            elif tmp[0][1] == dis or tmp[1][1] >= dis:
                tmp[1] = [item_id, dis]
    dic["user"].append(avg["user"])
    dic["item"].append(tmp[0][0])
    dic["user"].append(avg["user"])
    dic["item"].append(tmp[1][0])
submission = submission.append(pd.DataFrame(dic))
submission

  0%|          | 0/156800 [00:00<?, ?it/s]


KeyError: 'item'

In [None]:
submission.to_csv("../data/eval/submission.csv")