In [1]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from tqdm import tqdm
from collections import Counter, defaultdict

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Base train_ratings.csv 데이터프레임으로 저장
train_df = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv') 
train_df.head(5)

Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563


In [3]:
# 기존 train_ratings.csv 에 rating 추가
train_df['rating'] = 1
train_df.head(5)

Unnamed: 0,user,item,time,rating
0,11,4643,1230782529,1
1,11,170,1230782534,1
2,11,531,1230782539,1
3,11,616,1230782542,1
4,11,2140,1230782563,1


In [4]:
# 모든 유저 x 모든 아이템의 sparse matrix 생성 user x item 
user_item_matrix = train_df.pivot_table('rating', 'user', 'item').fillna(0) # 13sec
user_item_matrix = user_item_matrix.astype(int)
user_item_matrix.head(5)

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [6]:
bad_user_list = pd.read_csv("./bad_user_id03.csv", header=None).to_numpy().squeeze().tolist()

In [13]:
all_user_set = set(train_df["user"].unique())
bad_user_set = set(bad_user_list)
good_user_set = all_user_set - bad_user_set
good_user_list = sorted(list(good_user_set))

In [15]:
good_user_item_matrix = user_item_matrix.loc[good_user_list]
good_user_item_matrix

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138461,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
138471,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
138472,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
138486,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
users = sorted(good_user_item_matrix.index.to_list())
items = sorted(good_user_item_matrix.columns.to_list())
print (f"Total # of users : {len(users)}, total # of items : {len(items)}")
print (f"Users example : {users[:7]}")
print (f"Items example : {items[:7]}")

Total # of users : 20581, total # of items : 6807
Users example : [11, 14, 18, 25, 31, 35, 43]
Items example : [1, 2, 3, 4, 5, 6, 7]


In [17]:
# 모든 유저간의 similarity 계산 (cosine or euclidean)
user_similarity = -euclidean_distances(good_user_item_matrix, good_user_item_matrix) # 1m
# user_similarity = cosine_similarity(user_item_matrix, user_item_matrix)

user_similarity_df = pd.DataFrame(user_similarity, index=good_user_item_matrix.index, columns=good_user_item_matrix.index)
user_similarity_df.head(5)

user,11,14,18,25,31,35,43,58,61,90,...,138443,138446,138456,138457,138459,138461,138471,138472,138486,138493
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,-0.0,-22.045408,-20.615528,-19.77372,-20.78461,-20.736441,-19.519221,-23.958297,-20.248457,-19.0,...,-19.849433,-20.273135,-21.886069,-21.023796,-21.095023,-19.697716,-19.416488,-22.315914,-19.570386,-22.045408
14,-22.045408,-0.0,-15.968719,-15.459625,-16.911535,-17.888544,-14.73092,-20.92845,-16.733201,-14.798649,...,-16.0,-14.933185,-18.083141,-17.320508,-17.349352,-15.362291,-14.798649,-20.445048,-16.522712,-20.396078
18,-20.615528,-15.968719,-0.0,-12.409674,-15.198684,-14.662878,-11.135529,-19.313208,-13.228757,-11.916375,...,-12.60952,-12.409674,-16.248077,-14.035669,-13.266499,-12.60952,-12.884099,-17.691806,-14.282857,-18.947295
25,-19.77372,-15.459625,-12.409674,-0.0,-15.32971,-14.456832,-11.135529,-18.574176,-12.845233,-11.045361,...,-11.7047,-12.727922,-15.748016,-13.152946,-13.928388,-12.041595,-12.409674,-17.0,-13.190906,-18.083141
31,-20.78461,-16.911535,-15.198684,-15.32971,-0.0,-16.613248,-13.892444,-21.213203,-15.620499,-14.247807,...,-15.491933,-13.601471,-18.303005,-16.673332,-16.643317,-14.764823,-13.152946,-18.867962,-16.340135,-19.899749


In [18]:
# 각 user 별로 가장 유사한 50명 추출
u_sim_top = pd.DataFrame()
TOP_N = 50

for user in tqdm(user_similarity_df.index):
    temp = pd.DataFrame(user_similarity_df[user].sort_values(ascending=False)[1 : TOP_N + 1].index, index=None).T # 0번째는 자기 자신인 1.0이라 뺌
    u_sim_top = pd.concat([u_sim_top, temp], axis=0)
    
u_sim_top.index = user_similarity_df.index
u_sim_top.head(5) # user별로 행 # 첫 번째 행에 있는 11번 유저와 가장 비슷한 순서대로 열 0,1,2...

100%|██████████| 20581/20581 [01:38<00:00, 208.38it/s]


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,99454,23239,132199,74110,119584,102858,90678,137891,19375,130150,...,3588,49662,56155,61324,69689,134844,102895,22973,15493,106807
14,81022,71337,35069,95631,127972,32723,55435,90854,52771,7059,...,137678,77853,83587,64865,72707,99986,38856,39566,63153,117891
18,33906,128913,112230,105578,65830,70882,131921,40842,3238,60301,...,36701,86752,39715,65771,53861,72561,21047,70171,32334,109001
25,3238,9617,20379,48058,45533,91963,71824,87402,84530,117817,...,29337,122598,23458,111925,38231,15863,124980,81063,95093,89944
31,38201,20565,4364,23331,120879,12862,89153,21875,24851,82994,...,81181,85446,89729,19432,71337,91308,115657,77148,18177,4270


In [19]:
LEAST_VIEW = 1000

view_count = good_user_item_matrix.sum(axis=0).to_list() # 각 item 별로 몇 번 시청되었는지 계산

In [20]:
# LEAST_VIEW (default: 1000) 미만 시청된 item들의 상호작용 정보 제거
# 이후에 해당 item들이 추천되지 않게 하기 위함

unpopular_list = list()
for idx, item in tqdm(enumerate(items)):
    if view_count[idx] < LEAST_VIEW :
        unpopular_list.append(item)

6807it [00:00, 970548.57it/s]


In [21]:
# 영화의 출시년도 dataframe으로 저장
years = pd.read_csv("/opt/ml/input/data/train/years.tsv", delimiter="\t")

# item을 넣으면, 해당 item의 출시년도를 반환하는 dict
item2year = dict()
for item, item_year in tqdm(zip(years["item"], years["year"])):
    item2year[item] = item_year

6799it [00:00, 806204.71it/s]


In [22]:
# 각 유저가 시청한 시간 스펙트럼 확인 -> 가장 마지막에 상호작용한 시간 확인
group = train_df.groupby("user")["time"].apply(max)
group = group.apply(dt.datetime.fromtimestamp)

for user in tqdm(group.keys()):
    group[user] = group[user].year + 1

100%|██████████| 31360/31360 [00:00<00:00, 42801.64it/s]


In [23]:
user_item_count = good_user_item_matrix.copy()

def to_minus(x) : # 이미 본 아이템은 추천 안 하게 하기 위해서 음수로 설정 
    if x != 0 :
        return -TOP_N
    else : return 0

user_item_count = user_item_count.applymap(to_minus) #4분 소요
user_item_count_cp = user_item_count.copy()
user_item_count.head()

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,-50,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,-50,0,0,0,0,0,-50,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,-50,0,0,0,0,0


In [24]:
u_sim_top_np = u_sim_top.to_numpy()
user_item_matrix_np = good_user_item_matrix.to_numpy()
user_item_count_np = user_item_count_cp.to_numpy()
users = u_sim_top.index
items = good_user_item_matrix.columns

for i, user in tqdm(enumerate(users), total=len(users)): 
    top_per_user = u_sim_top_np[i, :] # user i 와 가장 유사한 user 50명 [1 x 50]
    for top in top_per_user:
        user_item_count_np[i, :] += user_item_matrix_np[np.array(np.where(users == top))[0][0],:]

100%|██████████| 20581/20581 [03:00<00:00, 114.20it/s]


In [25]:
count_result = pd.DataFrame(user_item_count_np, columns=items, index=users) #numpy->dataframe->저장
pd.DataFrame(count_result).to_csv("good-user-based_future_count.csv", index=False)
count_result

item,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,-17,5,0,0,0,1,1,0,0,10,...,0,0,0,0,0,0,0,0,0,0
14,-11,5,1,0,1,1,-47,0,0,3,...,0,0,0,0,0,0,0,0,0,0
18,5,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
25,25,4,0,0,1,10,0,0,0,10,...,0,0,0,0,0,0,0,0,0,0
31,4,3,0,0,0,0,0,0,0,1,...,1,0,1,0,-48,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138461,23,0,0,0,0,2,0,0,0,6,...,0,0,0,0,0,0,0,0,0,0
138471,-15,0,0,0,0,0,1,0,0,5,...,1,0,0,0,0,0,0,0,0,0
138472,10,0,0,0,0,9,0,0,0,1,...,0,1,0,0,1,0,0,0,0,0
138486,-20,7,0,0,0,2,0,0,0,7,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# 위의 코드를 저장한 파일 불러오기
user_item_count_np = pd.read_csv("good-user-based_future_count.csv") #30초 소요
user_item_count_np.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,116823,117176,117533,117881,118696,118700,118900,118997,119141,119145
0,-17,5,0,0,0,1,1,0,0,10,...,0,0,0,0,0,0,0,0,0,0
1,-11,5,1,0,1,1,-47,0,0,3,...,0,0,0,0,0,0,0,0,0,0
2,5,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,25,4,0,0,1,10,0,0,0,10,...,0,0,0,0,0,0,0,0,0,0
4,4,3,0,0,0,0,0,0,0,1,...,1,0,1,0,-48,0,0,0,1,0


In [27]:
users

Int64Index([    11,     14,     18,     25,     31,     35,     43,     58,
                61,     90,
            ...
            138443, 138446, 138456, 138457, 138459, 138461, 138471, 138472,
            138486, 138493],
           dtype='int64', name='user', length=20581)

In [28]:
result=[]
user_item_count = pd.DataFrame(user_item_count_np)

# 3분 소요
for user in tqdm(range(len(users))): # user의 id가 아닌 index로 돈다.
    user_num = users[user]
    rec_cnt = 0
    
    while rec_cnt < 10: #top 10개 추천
        item = int(user_item_count.iloc[user,:].idxmax()) # item의 id가 들어감
        user_item_count.loc[user,str(item)] = 0 # 추천했으니까 빼줌
        
        # 해당 아이템이 인기가 없는 것이라면 추천하지 않음
        if item in unpopular_list:
            continue
        
        # 해당 아이템이 유저 활동기간 이후에 나온것이라면 추천하지 않음
        item_year = item2year[item]
        if (item_year > group[user_num]):
            continue
        
        result.append([user_num, item])
        rec_cnt += 1

print(result[:5])

100%|██████████| 20581/20581 [02:11<00:00, 157.05it/s]

[[11, 8961], [11, 733], [11, 4886], [11, 47], [11, 457]]





In [29]:
# assert len(result) == 31360 * 10

pd.DataFrame(result, columns=["user", "item"]).to_csv("good-submission2.csv", index=False)