In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from sklearn.metrics import pairwise_distances_argmin_min

# movie.csv와 user, item, cluster, tsne좌표로 이루어진 데이터프레임을 생성합니다.
# 파일 경로와 실제 데이터에 맞게 코드를 수정해야 합니다.
data_dir = "/data/ephemeral/level2-movierecommendation-recsys-06/data/train/"
cluster_3d = "kmeans3d"
movie_df = pd.read_csv(data_dir + f"movie_{cluster_3d}.csv")
user_movie_df = pd.read_csv(data_dir + f"user_movie_{cluster_3d}.csv")

In [2]:
# 각 유저의 상위 10개 클러스터를 추출합니다.
user_top_clusters = pd.Series(
    user_movie_df.groupby(["user", "cluster"])
    .size()
    .groupby("user")
    .nlargest(5)
    .reset_index(level=1, drop=True)
    , name="count"
).to_frame()

In [3]:
print(user_top_clusters)

                count
user   cluster       
11     9           89
       0           84
       12          31
       2           30
       22          24
...               ...
138493 2           38
       12          37
       9           36
       0           26
       22          24

[156799 rows x 1 columns]


In [4]:
idx = user_movie_df[user_movie_df.set_index(["user", "cluster"]).index.isin(user_top_clusters.index)]
idx

Unnamed: 0,user,item,time,genre,year,cluster,t-SNE1,t-SNE2,t-SNE3
0,11,880,1230787643,"['Sci-Fi', 'Thriller']",1996,0,-5.086464,19.359690,-4.001454
1,11,2232,1230788590,"['Thriller', 'Sci-Fi', 'Horror', 'Mystery']",1997,0,-7.284986,10.342355,-3.754827
2,11,2720,1230854280,"['Children', 'Action', 'Comedy', 'Adventure']",1999,2,-21.192728,1.290395,11.809307
3,11,2642,1230788958,"['Action', 'Sci-Fi', 'Adventure']",1983,9,-6.052561,-9.233346,16.998371
5,11,1917,1230785499,"['Romance', 'Action', 'Sci-Fi', 'Thriller']",1998,0,-10.034015,17.157347,0.271891
...,...,...,...,...,...,...,...,...,...
5154463,138493,47465,1256680620,"['Fantasy', 'Drama', 'Thriller']",2005,12,-5.841721,5.158888,8.178510
5154464,138493,2124,1256680192,"['Children', 'Fantasy', 'Comedy']",1991,12,-9.245488,3.052117,5.466531
5154465,138493,2085,1256750533,"['Animation', 'Children', 'Adventure']",1961,2,-19.793783,-2.205940,14.098822
5154469,138493,1884,1255807098,"['Comedy', 'Adventure', 'Drama']",1998,22,-22.171705,-13.674581,-1.426584


In [5]:
# "user"와 "cluster"를 기준으로 그룹화하고 T-SNE1 및 T-SNE2의 평균 계산
avg_tsne_values = user_movie_df.groupby(["user", "cluster"])[["t-SNE1", "t-SNE2", "t-SNE3"]].mean().reset_index()

# user_movie_df와 avg_tsne_values를 user 및 cluster를 기준으로 조인
user_movie_df_with_avg_tsne = pd.merge(user_top_clusters, avg_tsne_values, on=["user", "cluster"], how="left")

user_movie_df_with_avg_tsne

Unnamed: 0,user,cluster,count,t-SNE1,t-SNE2,t-SNE3
0,11,9,89,-6.617670,-7.057373,12.904354
1,11,0,84,-9.473680,16.175683,-5.235720
2,11,12,31,-12.657048,9.384611,7.785877
3,11,2,30,-18.755066,-1.401491,7.111600
4,11,22,24,-17.270436,-11.712347,2.724435
...,...,...,...,...,...,...
156794,138493,2,38,-19.058923,-1.839791,6.721616
156795,138493,12,37,-11.632960,8.750837,9.242478
156796,138493,9,36,-5.491334,-7.621428,11.906937
156797,138493,0,26,-8.648714,14.719537,-6.114582


In [20]:
cluster = defaultdict(dict)
for i, ser in movie_df.iterrows():
    cluster[ser["cluster"]][ser["item"]] = 0
cluster

defaultdict(dict,
            {2: {1: 0,
              2: 0,
              8: 0,
              10: 0,
              13: 0,
              60: 0,
              95: 0,
              107: 0,
              126: 0,
              158: 0,
              169: 0,
              170: 0,
              362: 0,
              364: 0,
              380: 0,
              421: 0,
              434: 0,
              455: 0,
              464: 0,
              494: 0,
              533: 0,
              558: 0,
              588: 0,
              609: 0,
              648: 0,
              661: 0,
              709: 0,
              711: 0,
              733: 0,
              736: 0,
              798: 0,
              828: 0,
              836: 0,
              908: 0,
              919: 0,
              986: 0,
              1009: 0,
              1011: 0,
              1015: 0,
              1017: 0,
              1030: 0,
              1031: 0,
              1032: 0,
              1367: 0,
             

In [21]:
user = defaultdict(list)
for i, ser in user_movie_df.iterrows():
    user[ser["user"]].append(ser["item"])
    cluster[ser["cluster"]][ser["item"]] += 1
user

defaultdict(list,
            {11: [880,
              2232,
              2720,
              2642,
              8977,
              1917,
              2916,
              2858,
              2959,
              6979,
              5444,
              3994,
              1591,
              5152,
              256,
              36509,
              2105,
              6503,
              54771,
              3083,
              33166,
              41571,
              3752,
              356,
              33660,
              364,
              1321,
              5283,
              48304,
              2827,
              7481,
              44191,
              19,
              48738,
              173,
              7099,
              3826,
              3969,
              2028,
              784,
              589,
              5219,
              67295,
              27441,
              5816,
              51412,
              608,
              2793,
              245

In [25]:
sorted_cluster = defaultdict(dict)
for key, values in cluster.items():
    for clus, cnt in sorted(values.items(), key=lambda x: x[1], reverse=True):
        sorted_cluster[key][clus] = cnt
sorted_cluster

defaultdict(dict,
            {2: {4306: 13495,
              1: 12217,
              6377: 12105,
              8961: 11857,
              4886: 10826,
              364: 9568,
              49272: 8800,
              6016: 7933,
              588: 7702,
              8360: 6898,
              3114: 6869,
              68954: 6587,
              648: 6583,
              40815: 6494,
              4896: 6431,
              919: 5859,
              733: 5853,
              380: 5679,
              1610: 5635,
              5218: 5176,
              2987: 5118,
              49530: 5043,
              2355: 4521,
              10: 4286,
              908: 4276,
              30793: 4076,
              78499: 4040,
              41566: 4038,
              2617: 3535,
              76093: 3394,
              2: 3364,
              1370: 3290,
              736: 3211,
              2947: 3208,
              41569: 3105,
              38038: 3099,
              2005: 2999,
              3623

In [34]:
dic = defaultdict(list)
for u, c in user_top_clusters.index:
    cnt = 0
    for key, _ in sorted_cluster[c].items():
        if cnt == 2:
            break
        if key not in user[u]:
            dic["user"].append(u)
            dic["item"].append(key)
            cnt += 1
dic

defaultdict(list,
            {'user': [11,
              11,
              11,
              11,
              11,
              11,
              11,
              11,
              11,
              11,
              14,
              14,
              14,
              14,
              14,
              14,
              14,
              14,
              14,
              14,
              18,
              18,
              18,
              18,
              18,
              18,
              18,
              18,
              18,
              18,
              25,
              25,
              25,
              25,
              25,
              25,
              25,
              25,
              25,
              25,
              31,
              31,
              31,
              31,
              31,
              31,
              31,
              31,
              31,
              31,
              35,
              35,
              35,
              35,
  

In [35]:
submission = pd.DataFrame.from_dict(data=dic, orient="columns")
submission = submission.astype(int)
submission

Unnamed: 0,user,item
0,11,1682
1,11,2502
2,11,1206
3,11,79132
4,11,5418
...,...,...
313593,138493,5349
313594,138493,589
313595,138493,32
313596,138493,1197


In [None]:
submission.to_csv("../data/eval/submission_3d_top_count.csv", index=False)

In [8]:
movie = movie_df.set_index("item")
movie.loc[10]

genre      ['Thriller', 'Action', 'Adventure']
year                                      1995
cluster                                      2
t-SNE1                              -25.579601
t-SNE2                               -0.761631
t-SNE3                                7.006908
Name: 10, dtype: object

In [11]:
def inference(user_movie_df_with_avg_tsne, cluster, user, movie):
    dic = {"user": [], "item": []}

    for i, avg in tqdm(user_movie_df_with_avg_tsne.iterrows(), total=user_movie_df_with_avg_tsne.shape[0]):
        tmp = []
        for item_id in cluster[avg["cluster"]]:
            dis = (
                (avg["t-SNE1"] - movie.loc[item_id]["t-SNE1"]) ** 2
                + (avg["t-SNE2"] - movie.loc[item_id]["t-SNE2"]) ** 2
                + (avg["t-SNE3"] - movie.loc[item_id]["t-SNE3"]) ** 2
            )
            if item_id not in user[avg["user"]]:
                if len(tmp) != 2:
                    tmp.append([item_id, dis])
                    tmp.sort(key=lambda x: x[1])
                else:
                    if tmp[0][1] > dis:
                        tmp[1] = [item_id, dis]
                        tmp.sort(key=lambda x: x[1])
                    elif tmp[0][1] == dis or tmp[1][1] >= dis:
                        tmp[1] = [item_id, dis]
        dic["user"].append(avg["user"])
        dic["item"].append(tmp[0][0])
        dic["user"].append(avg["user"])
        dic["item"].append(tmp[1][0])

    return dic

In [12]:
dic = inference(user_movie_df_with_avg_tsne, cluster, user, movie)

100%|██████████| 156799/156799 [2:49:19<00:00, 15.43it/s]  


In [13]:
submission = pd.DataFrame.from_dict(data=dic, orient="columns")
submission = submission.astype(int)
submission

Unnamed: 0,user,item
0,11,108979
1,11,61160
2,11,4618
3,11,1590
4,11,1264
...,...,...
313593,138493,84772
313594,138493,87306
313595,138493,5062
313596,138493,94015


In [14]:
submission.to_csv("../data/eval/submission_3d.csv", index=False)