In [1]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from tqdm import tqdm
from collections import Counter, defaultdict

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Base train_ratings.csv 데이터프레임으로 저장
train_df = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv') 
train_df.head(5)

Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563


In [3]:
# 기존 train_ratings.csv 에 rating 추가
train_df['rating'] = 1
train_df.head(5)

Unnamed: 0,user,item,time,rating
0,11,4643,1230782529,1
1,11,170,1230782534,1
2,11,531,1230782539,1
3,11,616,1230782542,1
4,11,2140,1230782563,1


In [4]:
# 모든 유저 x 모든 아이템의 sparse matrix 생성 user x item 
user_item_matrix = train_df.pivot_table('rating', 'user', 'item').fillna(0) # 13sec
user_item_matrix = user_item_matrix.astype(int)
user_item_matrix.head(5)

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [24]:
bad_user_list = pd.read_csv("./bad_user_id03.csv", header=None).to_numpy().squeeze().tolist()

In [25]:
bad_user_item_matrix = user_item_matrix.loc[bad_user_list]
bad_user_item_matrix

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138437,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
138470,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
138473,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
138475,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
users = sorted(bad_user_item_matrix.index.to_list())
items = sorted(bad_user_item_matrix.columns.to_list())
print (f"Total # of users : {len(users)}, total # of items : {len(items)}")
print (f"Users example : {users[:7]}")
print (f"Items example : {items[:7]}")

Total # of users : 10779, total # of items : 6807
Users example : [50, 60, 65, 72, 77, 82, 85]
Items example : [1, 2, 3, 4, 5, 6, 7]


In [27]:
# 모든 유저간의 similarity 계산 (cosine or euclidean)
user_similarity = -euclidean_distances(bad_user_item_matrix, bad_user_item_matrix) # 1m
# user_similarity = cosine_similarity(user_item_matrix, user_item_matrix)

user_similarity_df = pd.DataFrame(user_similarity, index=bad_user_item_matrix.index, columns=bad_user_item_matrix.index)
user_similarity_df.head(5)

user,50,60,65,72,77,82,85,98,99,121,...,138388,138403,138419,138429,138435,138437,138470,138473,138475,138492
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50,-0.0,-10.908712,-11.090537,-10.630146,-11.313708,-15.779734,-11.74734,-11.489125,-10.488088,-12.0,...,-11.789826,-14.317821,-15.0,-12.409674,-11.74734,-23.345235,-11.269428,-10.677078,-14.177447,-11.532563
60,-10.908712,-0.0,-9.380832,-10.29563,-10.630146,-15.556349,-9.949874,-9.746794,-9.219544,-10.908712,...,-11.74734,-13.928388,-16.492423,-11.7047,-10.723805,-23.832751,-10.0,-10.049876,-13.038405,-10.29563
65,-11.090537,-9.380832,-0.0,-10.29563,-10.34408,-15.231546,-10.535654,-10.440307,-9.327379,-11.18034,...,-12.247449,-14.422205,-16.248077,-12.369317,-10.816654,-23.452079,-10.0,-10.34408,-13.038405,-10.77033
72,-10.630146,-10.29563,-10.29563,-0.0,-10.535654,-15.874508,-11.0,-11.0,-9.746794,-11.269428,...,-12.165525,-14.422205,-15.748016,-12.288206,-11.0,-23.452079,-10.77033,-10.630146,-13.711309,-10.29563
77,-11.313708,-10.630146,-10.34408,-10.535654,-0.0,-15.588457,-10.86278,-11.135529,-10.0,-11.401754,...,-12.60952,-14.387495,-16.093477,-12.409674,-10.954451,-23.0,-10.908712,-10.29563,-13.527749,-11.269428


In [28]:
# 각 user 별로 가장 유사한 50명 추출
u_sim_top = pd.DataFrame()
TOP_N = 50

for user in tqdm(user_similarity_df.index):
    temp = pd.DataFrame(user_similarity_df[user].sort_values(ascending=False)[1 : TOP_N + 1].index, index=None).T # 0번째는 자기 자신인 1.0이라 뺌
    u_sim_top = pd.concat([u_sim_top, temp], axis=0)
    
u_sim_top.index = user_similarity_df.index
u_sim_top.head(5) # user별로 행 # 첫 번째 행에 있는 11번 유저와 가장 비슷한 순서대로 열 0,1,2...

100%|██████████| 10779/10779 [00:34<00:00, 314.24it/s]


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50,117768,2052,53188,124866,6403,46173,137608,17482,93225,133232,...,17431,25281,31805,9869,2391,72029,114143,58552,58797,13550
60,53188,83550,30670,105752,11211,75531,64204,68041,69890,68606,...,38673,21495,33423,17120,118960,112163,85614,85845,50473,114351
65,53188,68606,11211,63524,43447,89895,90316,32132,91525,126956,...,96536,56737,64617,43807,56920,37153,101380,85845,81639,77939
72,53188,44016,33155,61857,112953,68606,81626,16706,11211,137661,...,46936,17482,7209,128756,93225,70605,115377,62329,129324,117556
77,53188,137661,68606,58529,5770,43468,137532,111486,125881,36579,...,51754,110739,128756,91274,20755,9859,83756,70103,112403,87978


In [29]:
LEAST_VIEW = 1000

view_count = bad_user_item_matrix.sum(axis=0).to_list() # 각 item 별로 몇 번 시청되었는지 계산

In [30]:
# LEAST_VIEW (default: 1000) 미만 시청된 item들의 상호작용 정보 제거
# 이후에 해당 item들이 추천되지 않게 하기 위함

unpopular_list = list()
for idx, item in tqdm(enumerate(items)):
    if view_count[idx] < LEAST_VIEW :
        unpopular_list.append(item)

6807it [00:00, 588891.29it/s]


In [31]:
# 영화의 출시년도 dataframe으로 저장
years = pd.read_csv("/opt/ml/input/data/train/years.tsv", delimiter="\t")

# item을 넣으면, 해당 item의 출시년도를 반환하는 dict
item2year = dict()
for item, item_year in tqdm(zip(years["item"], years["year"])):
    item2year[item] = item_year

6799it [00:00, 815379.22it/s]


In [32]:
# 각 유저가 시청한 시간 스펙트럼 확인 -> 가장 마지막에 상호작용한 시간 확인
group = train_df.groupby("user")["time"].apply(max)
group = group.apply(dt.datetime.fromtimestamp)

for user in tqdm(group.keys()):
    group[user] = group[user].year + 1

100%|██████████| 31360/31360 [00:00<00:00, 43860.70it/s]


In [33]:
user_item_count = bad_user_item_matrix.copy()

def to_minus(x) : # 이미 본 아이템은 추천 안 하게 하기 위해서 음수로 설정 
    if x != 0 :
        return -TOP_N
    else : return 0

user_item_count = user_item_count.applymap(to_minus) #4분 소요
user_item_count_cp = user_item_count.copy()
user_item_count.head()

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
u_sim_top_np = u_sim_top.to_numpy()
user_item_matrix_np = bad_user_item_matrix.to_numpy()
user_item_count_np = user_item_count_cp.to_numpy()
users = u_sim_top.index
items = bad_user_item_matrix.columns

for i, user in tqdm(enumerate(users), total=len(users)): 
    top_per_user = u_sim_top_np[i, :] # user i 와 가장 유사한 user 50명 [1 x 50]
    for top in top_per_user:
        user_item_count_np[i, :] += user_item_matrix_np[np.array(np.where(users == top))[0][0],:]

100%|██████████| 10779/10779 [01:19<00:00, 135.46it/s]


In [35]:
count_result = pd.DataFrame(user_item_count_np, columns=items, index=users) #numpy->dataframe->저장
pd.DataFrame(count_result).to_csv("bad-user-based_future_count.csv", index=False)
count_result

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50,13,0,0,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60,5,1,0,0,0,0,0,0,0,1,...,0,0,0,0,2,0,0,0,0,0
65,3,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72,11,1,0,0,0,4,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
77,4,0,0,0,1,2,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138437,-25,5,0,0,0,-26,0,0,0,10,...,0,0,0,0,0,0,0,0,0,0
138470,3,0,0,0,0,3,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
138473,-40,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
138475,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# 위의 코드를 저장한 파일 불러오기
user_item_count_np = pd.read_csv("bad-user-based_future_count.csv") #30초 소요
user_item_count_np.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
0,13,0,0,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,1,0,0,0,0,0,0,0,1,...,0,0,0,0,2,0,0,0,0,0
2,3,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,11,1,0,0,0,4,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,1,2,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [37]:
users

Int64Index([    50,     60,     65,     72,     77,     82,     85,     98,
                99,    121,
            ...
            138388, 138403, 138419, 138429, 138435, 138437, 138470, 138473,
            138475, 138492],
           dtype='int64', name='user', length=10779)

In [38]:
result=[]
user_item_count = pd.DataFrame(user_item_count_np)

# 3분 소요
for user in tqdm(range(len(users))): # user의 id가 아닌 index로 돈다.
    user_num = users[user]
    rec_cnt = 0
    
    while rec_cnt < 10: #top 10개 추천
        item = int(user_item_count.iloc[user,:].idxmax()) # item의 id가 들어감
        user_item_count.loc[user,str(item)] = 0 # 추천했으니까 빼줌
        
        # 해당 아이템이 인기가 없는 것이라면 추천하지 않음
        if item in unpopular_list:
            continue
        
        # 해당 아이템이 유저 활동기간 이후에 나온것이라면 추천하지 않음
        item_year = item2year[item]
        if (item_year > group[user_num]):
            continue
        
        result.append([user_num, item])
        rec_cnt += 1

print(result[:5])

100%|██████████| 10779/10779 [01:06<00:00, 161.91it/s]

[[50, 4993], [50, 4226], [50, 527], [50, 858], [50, 2329]]





In [39]:
# assert len(result) == 31360 * 10

pd.DataFrame(result, columns=["user", "item"]).to_csv("bad-submission2.csv", index=False)