In [38]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

In [39]:
train_df = pd.read_csv("/opt/ml/input/data/train/train_ratings.csv")
train_df_grp_user = train_df.groupby("user")["item"]

In [40]:
bad_user = pd.read_csv("/opt/ml/input/data/train/bad_user_id03.csv",header=None)
bad_user = bad_user.values
bad_user

array([[    50],
       [    60],
       [    65],
       ...,
       [138473],
       [138475],
       [138492]])

In [49]:
writer_df = pd.read_csv("/opt/ml/input/data/train/writers.tsv", delimiter="\t")
writer_df_grp_item = writer_df.groupby("item")["writer"]
writer_df_grp_item.get_group(327)

2873    nm0093328
Name: writer, dtype: object

In [52]:
bad_user_writers = dict()
bad_user_top5_writer = dict()

for user in tqdm(bad_user):
    user = user[0]
    items = train_df_grp_user.get_group(user)
    for item in items:
        if item in writer_df_grp_item.groups.keys():
            item_writer = writer_df_grp_item.get_group(item)
            if user in bad_user_writers.keys():
                bad_user_writers[user].extend(item_writer.tolist())
            else:
                bad_user_writers[user] = item_writer.tolist()
    
    bad_user_writer_count = Counter(bad_user_writer_count[user])
    bad_user_top5_writer[user] = bad_user_writer_count.most_common()[:5]

  0%|          | 0/10779 [00:00<?, ?it/s]

In [5]:
genre_df = pd.read_csv("/opt/ml/input/data/train/genres.tsv", delimiter="\t")
genre_df_grp_item = genre_df.groupby("item")["genre"]
genre_df_grp_item.get_group(318)

0    Crime
1    Drama
Name: genre, dtype: object

In [20]:
user_seen_genres = dict()
user_top5_genres = dict()

for user in tqdm(train_df_grp_user.groups.keys()):
    items = train_df_grp_user.get_group(user)
    for item in items:
        item_genre = genre_df_grp_item.get_group(item)
        if user in user_seen_genres.keys():
            user_seen_genres[user].extend(item_genre.tolist())
        else:
            user_seen_genres[user] = item_genre.tolist()
    
    user_genre_count = Counter(user_seen_genres[user])
    user_top5_genres[user] = user_genre_count.most_common()[:5]

100%|██████████| 31360/31360 [07:38<00:00, 68.46it/s]


In [21]:
user_top5_genres

{11: [('Sci-Fi', 176),
  ('Action', 167),
  ('Adventure', 133),
  ('Thriller', 130),
  ('Drama', 112)],
 14: [('Comedy', 106),
  ('Drama', 66),
  ('Adventure', 58),
  ('Children', 57),
  ('Romance', 55)],
 18: [('Drama', 62),
  ('Comedy', 28),
  ('Romance', 25),
  ('Crime', 20),
  ('Thriller', 9)],
 25: [('Drama', 41),
  ('Comedy', 40),
  ('Action', 22),
  ('Crime', 21),
  ('Adventure', 21)],
 31: [('Adventure', 94),
  ('Action', 92),
  ('Sci-Fi', 49),
  ('Comedy', 49),
  ('Fantasy', 47)],
 35: [('Drama', 90),
  ('Action', 69),
  ('Thriller', 62),
  ('Comedy', 60),
  ('Adventure', 48)],
 43: [('Drama', 33),
  ('Romance', 24),
  ('Comedy', 23),
  ('Action', 18),
  ('Adventure', 15)],
 50: [('Drama', 56),
  ('Comedy', 29),
  ('Romance', 25),
  ('Action', 18),
  ('Fantasy', 18)],
 58: [('Drama', 197),
  ('Comedy', 117),
  ('Romance', 75),
  ('Action', 67),
  ('Thriller', 61)],
 60: [('Adventure', 26),
  ('Drama', 25),
  ('Fantasy', 17),
  ('Comedy', 13),
  ('Action', 10)],
 61: [('Drama',

In [26]:
# 아이템(영화)별 시청 횟수

item_watch_count = dict()

for item in tqdm(train_df["item"]):
    if item in item_watch_count.keys():
        item_watch_count[item] += 1
    else:
        item_watch_count[item] = 1
    

100%|██████████| 5154471/5154471 [00:03<00:00, 1388837.96it/s]


In [125]:
# 장르별 인기 Top20 영화 리스트 생성
genre_pop20 = dict()

for genre in tqdm(genre_df["genre"].unique()):
    genre_item = genre_df[genre_df["genre"] == genre]["item"]
    genre_movies = list()
    
    for item in genre_item:
        genre_movies.append((item, item_watch_count[item]))
        
    genre_movies.sort(key=lambda genre_movies: genre_movies[1], reverse=True)
    genre_pop20[genre] = genre_movies[:300]

100%|██████████| 18/18 [00:00<00:00, 442.51it/s]


In [135]:
final_rec = dict()
proportion = [4, 2, 2, 1, 1] # CHANGE HERE !!

for user in tqdm(train_df_grp_user.groups.keys()):
    user_seen = train_df_grp_user.get_group(user)
    
    rec_list = list()
    for genre_idx, (top_genre, genre_counts) in enumerate(user_top5_genres[user]): # 유저가 즐겨보는 top5 장르
        
        cnt = 0
        for (item, item_counts) in genre_pop20[top_genre]: # 장르의 인기있는 영화 20개
            if cnt == proportion[genre_idx]:
                break
            
            if (item not in user_seen.values) and (item not in rec_list):
                rec_list.append(item)
                cnt += 1
            
            
    final_rec[user] = rec_list
    
    
    
final_list = list()
for user in final_rec.keys():
    assert len(set(final_rec[user])) == 10
    for item in final_rec[user]:
        final_list.append([user, item])
        
submission_df = pd.DataFrame(data=final_list, columns=["user", "item"])
submission_df.to_csv("./submission_voted.csv", index=False)

assert len(final_list) == 313600

100%|██████████| 31360/31360 [00:08<00:00, 3894.01it/s]
