In [20]:
import numpy as np
import pandas as pd
import json
import os
from tqdm import tqdm
from collections import Counter, defaultdict

In [3]:
# original train csv file : [user - item - timestamp]
train_df = pd.read_csv("/opt/ml/input/data/train/train_ratings.csv")
train_df_grp_user = train_df.groupby("user")["item"]

In [4]:
# genre tsv file : [item - genre]
genre_df = pd.read_csv("/opt/ml/input/data/train/genres.tsv", delimiter="\t")
genre_df_grp_item = genre_df.groupby("item")["genre"]

In [5]:
# 유저가 가장 좋아하는 top 5 찾아내기
user_top_genres  = dict()
user_top5_genres = dict()

def get_top_genres():
    user_seen_genres = dict()

    for user in tqdm(train_df_grp_user.groups.keys()): # 전체 유저 순회
        items = train_df_grp_user.get_group(user) # 유저가 시청한 영화 목록
        for item in items:
            item_genre = genre_df_grp_item.get_group(item) # 영화의 장르
            
            #-- 유저가 시청한 장르 목록 나열 ex. [comedy, horror, horror, drama, family, comedy, comedy, ...]
            if user in user_seen_genres.keys():
                user_seen_genres[user].extend(item_genre.tolist())
            else:
                user_seen_genres[user] = item_genre.tolist()
        
        #-- 장르별로 등장 횟수 count. ex. [("comedy", 3), ("horror", 2), ("Drama", 1), ...]
        user_genre_count = Counter(user_seen_genres[user]) 
        user_top_genres[user] = user_genre_count.most_common() # (정렬) 좋아하는 장르 전체
        user_top5_genres[user] = user_genre_count.most_common()[:5] # (정렬) 좋아하는 장르 top5

    #-- json으로 저장
    print ("Saving 'user_top_genres dict' as json file...")
    with open("./user_top_genres.json", "w") as json_file:
        json.dump(user_top_genres, json_file, indent=4)
        
    print ("Saving 'user_top5_genres dict' as json file...")
    with open("./user_top5_genres.json", "w") as json_file:
        json.dump(user_top5_genres, json_file, indent=4)
        
        

if (os.path.exists("./user_top_genres.json")):
    print ("File exists! Load 'user_top_genres' & 'user_top5_genres' from json file..")
    with open("./user_top_genres.json", "r") as json_file:
        user_top_genres = json.load(json_file)
    with open("./user_top5_genres.json", "r") as json_file:
        user_top5_genres = json.load(json_file)
        
else:
    get_top_genres()
    
print ("user top genres has been loaded.")

File exists! Load 'user_top_genres' & 'user_top5_genres' from json file..
user top genres has been loaded.


In [6]:
user_top5_genres["11"] # user 11 의 선호장르 top5

[['Sci-Fi', 176],
 ['Action', 167],
 ['Adventure', 133],
 ['Thriller', 130],
 ['Drama', 112]]

In [7]:
# 아이템(영화)별 시청 횟수
item_watch_count = dict()

for item in tqdm(train_df["item"]):
    if item in item_watch_count.keys():
        item_watch_count[item] += 1
    else:
        item_watch_count[item] = 1
    

100%|██████████| 5154471/5154471 [00:03<00:00, 1479565.79it/s]


In [28]:
# 장르별 인기순 영화 리스트 생성
genre_pop20 = dict()

for genre in tqdm(genre_df["genre"].unique()):
    genre_item = genre_df[genre_df["genre"] == genre]["item"]
    genre_movies = list()
    
    for item in genre_item:
        genre_movies.append((item, item_watch_count[item]))
        
    genre_movies.sort(key=lambda genre_movies: genre_movies[1], reverse=True)
    genre_pop20[genre] = genre_movies

100%|██████████| 18/18 [00:00<00:00, 379.09it/s]


In [None]:
# Rcommendation by proportion & user prefer top5 genres.
final_rec = dict()
proportion = [10, 10, 10, 10, 10] # CHANGE HERE !!

for user in tqdm(train_df_grp_user.groups.keys()):
    user_seen = train_df_grp_user.get_group(user)
    
    rec_list = list()
    for genre_idx, (top_genre, genre_counts) in enumerate(user_top5_genres[user]): # 유저가 즐겨보는 top5 장르
        
        cnt = 0
        for (item, item_counts) in genre_pop20[top_genre]: # 장르의 인기있는 영화 20개
            if cnt == proportion[genre_idx]:
                break
            
            if (item not in user_seen.values) and (item not in rec_list):
                rec_list.append(item)
                cnt += 1
            
            
    final_rec[user] = rec_list
    
    
    
final_list = list()
for user in final_rec.keys():
    assert len(set(final_rec[user])) == 10
    for item in final_rec[user]:
        final_list.append([user, item])
        
submission_df = pd.DataFrame(data=final_list, columns=["user", "item"])
submission_df.to_csv("./submission_voted.csv", index=False)

assert len(final_list) == 313600

In [45]:
final_rec = dict()
proportion = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1] # CHANGE HERE !!

for user in tqdm(train_df_grp_user.groups.keys()):
    user_seen = train_df_grp_user.get_group(user)
    
    rec_list = list()
    for genre_idx, (top_genre, genre_counts) in enumerate(user_top_genres[str(user)][:5]):
        
        cnt = 0
        for (item, item_counts) in genre_pop20[top_genre]: # 장르의 인기있는 영화 20개
            if cnt == proportion[genre_idx]:
                break
            
            if (item not in user_seen.values):
                rec_list.append(item)
                cnt += 1
            
    
    rec_list_cnt = Counter(rec_list).most_common()
    user_rec_list = list()
    for item, cnt in rec_list_cnt[:10]:
        user_rec_list.append(item)
    final_rec[user] = user_rec_list
    
    
    
final_list = list()
for user in final_rec.keys():
    assert len(set(final_rec[user])) == 10
    for item in final_rec[user]:
        final_list.append([user, item])
        
submission_df = pd.DataFrame(data=final_list, columns=["user", "item"])
submission_df.to_csv("./submission_clustering.csv", index=False)

assert len(final_list) == 313600

100%|██████████| 31360/31360 [00:15<00:00, 1988.96it/s]


---
EDA // Real distribution for bad users. - `Genres`
---

In [10]:
bad_user_list = pd.read_csv("./bad_user_id03.csv", header=None).to_numpy().squeeze().tolist()
print (f"bad_user_list size = {len(bad_user_list)}")

bad_user_list size = 10779


In [11]:
# real distrbution for bad users
with open("./answers.json", "r") as answer_json:
    user_answer = json.load(answer_json)

In [12]:
ml_20m_df = pd.read_csv("./movielens-20m/movies.csv")
ml_20m_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


In [13]:
ml_20m_movie_genres = dict()

for movie in tqdm(ml_20m_df["movieId"]):
    genres = ml_20m_df[ml_20m_df["movieId"] == movie]["genres"]
    genre_list = genres.item().split("|")
    
    if genre_list[0] == "(no genres listed)":
        continue
    
    ml_20m_movie_genres[movie] = genre_list

100%|██████████| 27278/27278 [00:14<00:00, 1843.18it/s]


In [14]:
ml_20m_movie_genres

{1: ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'],
 2: ['Adventure', 'Children', 'Fantasy'],
 3: ['Comedy', 'Romance'],
 4: ['Comedy', 'Drama', 'Romance'],
 5: ['Comedy'],
 6: ['Action', 'Crime', 'Thriller'],
 7: ['Comedy', 'Romance'],
 8: ['Adventure', 'Children'],
 9: ['Action'],
 10: ['Action', 'Adventure', 'Thriller'],
 11: ['Comedy', 'Drama', 'Romance'],
 12: ['Comedy', 'Horror'],
 13: ['Adventure', 'Animation', 'Children'],
 14: ['Drama'],
 15: ['Action', 'Adventure', 'Romance'],
 16: ['Crime', 'Drama'],
 17: ['Drama', 'Romance'],
 18: ['Comedy'],
 19: ['Comedy'],
 20: ['Action', 'Comedy', 'Crime', 'Drama', 'Thriller'],
 21: ['Comedy', 'Crime', 'Thriller'],
 22: ['Crime', 'Drama', 'Horror', 'Mystery', 'Thriller'],
 23: ['Action', 'Crime', 'Thriller'],
 24: ['Drama', 'Sci-Fi'],
 25: ['Drama', 'Romance'],
 26: ['Drama'],
 27: ['Children', 'Drama'],
 28: ['Drama', 'Romance'],
 29: ['Adventure', 'Drama', 'Fantasy', 'Mystery', 'Sci-Fi'],
 30: ['Crime', 'Drama'],
 31: ['D

In [23]:
bad_user_genre_top = defaultdict(list)

for bad_user in tqdm(bad_user_list):
    bad_user_genre_top[bad_user] = list()
    for item in user_answer[str(bad_user)]: # bad_user 의 정답 아이템들 
        if item not in ml_20m_movie_genres.keys():
            continue
        item_genres = ml_20m_movie_genres[item]
        bad_user_genre_top[bad_user].extend(item_genres)
    
    bad_user_genre_top[bad_user] = Counter(bad_user_genre_top[bad_user]).most_common()

100%|██████████| 10779/10779 [00:00<00:00, 24277.80it/s]


In [25]:
#-- json으로 저장
print ("Saving 'bad_user_top_genres dict' as json file...")
with open("./bad_user_top_genres.json", "w") as json_file:
    json.dump(bad_user_genre_top, json_file, indent=4)

Saving 'bad_user_top_genres dict' as json file...


In [26]:
for user in bad_user_list:
    print (bad_user_genre_top[user][:5])

[('Drama', 8), ('Comedy', 6), ('Thriller', 5), ('Romance', 5), ('Mystery', 2)]
[('Drama', 12), ('Comedy', 9), ('Adventure', 7), ('Thriller', 6), ('Action', 6)]
[('Drama', 11), ('Adventure', 10), ('Comedy', 9), ('Action', 8), ('Sci-Fi', 6)]
[('Comedy', 11), ('Drama', 9), ('Romance', 8), ('Action', 6), ('Thriller', 4)]
[('Comedy', 22), ('Action', 11), ('Adventure', 9), ('Sci-Fi', 8), ('Children', 7)]
[('Comedy', 26), ('Drama', 26), ('Action', 25), ('Thriller', 19), ('Adventure', 18)]
[('Comedy', 5), ('Romance', 4), ('Drama', 4), ('Animation', 2), ('Children', 2)]
[('Romance', 3), ('Children', 3), ('Comedy', 3), ('Drama', 3), ('Adventure', 3)]
[('Mystery', 5), ('Sci-Fi', 3), ('Thriller', 3), ('Adventure', 2), ('Action', 2)]
[('Comedy', 10), ('Action', 8), ('Adventure', 6), ('Drama', 6), ('Romance', 5)]
[('Action', 3), ('Sci-Fi', 3), ('Thriller', 3), ('Comedy', 2), ('Mystery', 2)]
[('Action', 19), ('Thriller', 18), ('Comedy', 14), ('Crime', 13), ('Drama', 10)]
[('Adventure', 7), ('Fantasy'