### 장르를 기준으로 user 간의 유사성 보기

In [1]:
import pandas as pd
import numpy as np
import json
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import seaborn as sns
import time
from collections import Counter
# pd.options.plotting.backend = 'plotly'
%matplotlib inline

#### 데이터 불러오기 및 합치기

In [2]:
rating = pd.read_csv("../data/train/train_ratings.csv")
genre = pd.read_csv("../data/train/genres.tsv", sep='\t')
director = pd.read_csv("../data/train/directors.tsv",sep='\t')
title = pd.read_csv("../data/train/titles.tsv",sep='\t')
writer = pd.read_csv("../data/train/writers.tsv",sep='\t')
year = pd.read_csv("../data/train/years.tsv",sep='\t')
with open("../data/train/Ml_item2attributes.json") as f:
    ml_item2att = json.load(f)
    f.close()

In [3]:
df = pd.merge(
    pd.merge(
        pd.merge(
            rating, 
            pd.get_dummies(genre).groupby("item").agg(lambda x:1 if sum(x)>0 else 0).reset_index()),
        pd.merge(director,title)),
    pd.merge(writer, year))

In [4]:
df.head(1)

Unnamed: 0,user,item,time,genre_Action,genre_Adventure,genre_Animation,genre_Children,genre_Comedy,genre_Crime,genre_Documentary,...,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Thriller,genre_War,genre_Western,director,title,writer,year
0,11,4643,1230782529,1,1,0,0,0,0,0,...,0,0,1,0,0,0,nm0000318,Planet of the Apes (2001),nm0099541,2001


In [5]:
genre_columns = df.columns[3:-4]
print(len(genre_columns))
genre_columns

18


Index(['genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Children',
       'genre_Comedy', 'genre_Crime', 'genre_Documentary', 'genre_Drama',
       'genre_Fantasy', 'genre_Film-Noir', 'genre_Horror', 'genre_Musical',
       'genre_Mystery', 'genre_Romance', 'genre_Sci-Fi', 'genre_Thriller',
       'genre_War', 'genre_Western'],
      dtype='object')

#### 연도별 장르 분포

In [107]:
df.head(1)

Unnamed: 0,user,item,time,genre_Action,genre_Adventure,genre_Animation,genre_Children,genre_Comedy,genre_Crime,genre_Documentary,...,genre_Romance,genre_Sci-Fi,genre_Thriller,genre_War,genre_Western,director,title,writer,year,genre_list
0,11,4643,1230782529,1,1,0,0,0,0,0,...,0,1,0,0,0,nm0000318,Planet of the Apes (2001),nm0099541,2001,genre_Action genre_Adventure genre_Drama genre...


In [6]:
year_genre_df = df.groupby(by="year")[genre_columns].apply(lambda x:x[x==1].count()).reset_index()
year_genre_df

Unnamed: 0,year,genre_Action,genre_Adventure,genre_Animation,genre_Children,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Fantasy,genre_Film-Noir,genre_Horror,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Thriller,genre_War,genre_Western
0,1922,0,0,0,0,0,186,0,0,0,0,1060,0,186,0,0,186,0,0
1,1923,1736,0,0,0,2141,0,0,0,0,0,0,0,0,1736,0,0,0,0
2,1924,0,0,0,0,1311,0,0,126,945,0,0,0,0,945,0,0,0,0
3,1925,0,817,0,0,1705,0,0,942,0,0,339,0,0,817,0,0,603,0
4,1926,0,0,0,0,2283,0,0,0,0,0,0,0,0,0,0,0,2283,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,2010,77961,93337,69324,70889,71090,25628,4714,71384,80427,0,4586,10253,20683,22088,29753,57171,2267,15878
89,2011,70815,59264,7609,4887,30544,12536,736,65356,24725,3161,892,51,15354,9214,48319,56679,27472,3558
90,2012,72260,63009,9535,7096,22353,24628,221,38348,13290,0,4615,1352,130,5721,41136,23833,829,2979
91,2013,35731,27291,7636,2194,14702,7769,0,21130,13156,0,3646,2298,2522,6979,28289,13858,330,429


In [110]:
year_genre_df = df.drop(["user", "time"], axis=1).drop_duplicates().groupby(by="year")[genre_columns].apply(lambda x:x[x==1].count()).reset_index()
year_genre_df

Unnamed: 0,year,genre_Action,genre_Adventure,genre_Animation,genre_Children,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Fantasy,genre_Film-Noir,genre_Horror,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Thriller,genre_War,genre_Western
0,1922,0,0,0,0,0,3,0,0,0,0,1,0,3,0,0,3,0,0
1,1923,8,0,0,0,11,0,0,0,0,0,0,0,0,8,0,0,0,0
2,1924,0,0,0,0,6,0,0,1,3,0,0,0,0,3,0,0,0,0
3,1925,0,1,0,0,11,0,0,4,0,0,3,0,0,1,0,0,1,0
4,1926,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,2010,97,70,44,41,120,22,21,126,58,0,23,5,14,55,24,67,7,8
89,2011,90,68,37,26,102,26,7,132,40,3,10,1,13,32,55,70,17,8
90,2012,86,69,41,25,83,26,4,75,27,0,12,8,1,18,52,38,2,1
91,2013,87,61,30,5,57,29,0,64,38,0,15,3,10,22,48,44,2,3


#### 특정 유저가 봤던 영화들 중 80% 이상이 동일한 장르를 가지는 유저가 있는지 보자
- 80%와 같은 수치는 매개변수로 받도록 하자.
- 예를 들어, user A가 봤던 영화들 중에 80%가 "Drama"라는 장르를 포함한다면, 이 유저는 "Drama" 장르를 선호하는 유저라고 볼 수 있다.
- 반대로, 특정 장르에 지배적이지 않은 유저가 있을 수 있다. 이 유저는 골고루 보는 유저일 수 있다.
- 유저와 장르를 클러스터링하는 것도 생각해볼 수 있다.

In [12]:
# 영화별로 포함하는 장르들 모두 찾기
df_genre = df.groupby(by="title")[genre_columns].apply(lambda x:x[x==1].count()).reset_index()

In [13]:
temp = []
for movie_title in df_genre.index:
    non_zero_genre_list = []
    for genre_name in genre_columns:
        if df_genre.loc[movie_title][genre_name] != 0:
            non_zero_genre_list.append(genre_name)
    non_zero_genre_list.sort()
    temp.append(" ".join(non_zero_genre_list))

In [14]:
df_genre["genre_list"] = temp

In [15]:
df_genre

Unnamed: 0,title,genre_Action,genre_Adventure,genre_Animation,genre_Children,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Fantasy,genre_Film-Noir,genre_Horror,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Thriller,genre_War,genre_Western,genre_list
0,'Salem's Lot (2004),0,0,0,0,0,0,0,78,0,0,78,0,78,0,0,78,0,0,genre_Drama genre_Horror genre_Mystery genre_T...
1,"'burbs, The (1989)",0,0,0,0,573,0,0,0,0,0,0,0,0,0,0,0,0,0,genre_Comedy
2,(500) Days of Summer (2009),0,0,0,0,6784,0,0,6784,0,0,0,0,0,6784,0,0,0,0,genre_Comedy genre_Drama genre_Romance
3,*batteries not included (1987),0,0,0,2725,2725,0,0,0,2725,0,0,0,0,0,2725,0,0,0,genre_Children genre_Comedy genre_Fantasy genr...
4,...And Justice for All (1979),0,0,0,0,0,0,0,402,0,0,0,0,0,0,0,402,0,0,genre_Drama genre_Thriller
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4962,[REC]² (2009),0,0,0,0,0,0,0,0,0,0,376,0,0,0,0,376,0,0,genre_Horror genre_Thriller
4963,eXistenZ (1999),1194,0,0,0,0,0,0,0,0,0,0,0,0,0,1194,1194,0,0,genre_Action genre_Sci-Fi genre_Thriller
4964,xXx (2002),769,0,0,0,0,769,0,0,0,0,0,0,0,0,0,769,0,0,genre_Action genre_Crime genre_Thriller
4965,xXx: State of the Union (2005),310,0,0,0,0,310,0,0,0,0,0,0,0,0,0,310,0,0,genre_Action genre_Crime genre_Thriller


In [20]:
df = pd.merge(df, df_genre[["title", "genre_list"]], on="title", how="inner")

In [22]:
df.head(1)

Unnamed: 0,user,item,time,genre_Action,genre_Adventure,genre_Animation,genre_Children,genre_Comedy,genre_Crime,genre_Documentary,...,genre_Romance,genre_Sci-Fi,genre_Thriller,genre_War,genre_Western,director,title,writer,year,genre_list
0,11,4643,1230782529,1,1,0,0,0,0,0,...,0,1,0,0,0,nm0000318,Planet of the Apes (2001),nm0099541,2001,genre_Action genre_Adventure genre_Drama genre...


In [80]:
tmp1 = df.head(100)

In [101]:
def get_genre_preference(x, topk=1):
    x = Counter(" ".join(x).split())
    union = sum(value for key, value in x.items())
    topk_genre = x.most_common(topk)
    topk_genre = [(round(value / union, 3)) for key, value in topk_genre]
    return topk_genre[0]

def get_genre_preference_name(x, topk=1):
    x = Counter(" ".join(x).split())
    union = sum(value for key, value in x.items())
    topk_genre = x.most_common(topk)
    topk_genre = [key for key, value in topk_genre]
    return topk_genre[0]

In [102]:
user_genre_list = df.groupby(by="user")["genre_list"].agg(lambda x: get_genre_preference(x)).reset_index()
user_genre_list.rename(columns={"genre_list": "genre_preference"}, inplace=True)
user_genre_list_ = df.groupby(by="user")["genre_list"].agg(lambda x: get_genre_preference_name(x)).reset_index()
user_genre_list_.rename(columns={"genre_list": "genre_preference_name"}, inplace=True)
user_genre_list = pd.merge(user_genre_list, user_genre_list_, on="user")
user_genre_list.head(3)

Unnamed: 0,user,genre_preference,genre_preference_name
0,11,0.127,genre_Adventure
1,14,0.181,genre_Children
2,18,0.199,genre_Drama


In [128]:
print(user_genre_list[user_genre_list["genre_preference"] >= 0.6]["genre_preference_name"].nunique())
user_genre_list[user_genre_list["genre_preference"] >= 0.6]

1


Unnamed: 0,user,genre_preference,genre_preference_name
4112,18067,0.663,genre_Romance
5193,22701,0.654,genre_Romance
9493,41458,0.641,genre_Romance
17918,78747,0.61,genre_Romance
24450,107347,0.604,genre_Romance
25723,113127,0.64,genre_Romance


In [129]:
print(user_genre_list[user_genre_list["genre_preference"] >= 0.5]["genre_preference_name"].nunique())
user_genre_list[user_genre_list["genre_preference"] >= 0.5]

1


Unnamed: 0,user,genre_preference,genre_preference_name
173,756,0.508,genre_Romance
4112,18067,0.663,genre_Romance
5193,22701,0.654,genre_Romance
8499,37153,0.59,genre_Romance
9493,41458,0.641,genre_Romance
17918,78747,0.61,genre_Romance
18651,81967,0.587,genre_Romance
18946,83261,0.587,genre_Romance
24450,107347,0.604,genre_Romance
24588,108009,0.507,genre_Romance


In [127]:
print(user_genre_list[user_genre_list["genre_preference"] >= 0.4]["genre_preference_name"].nunique())
user_genre_list[user_genre_list["genre_preference"] >= 0.4]

4


Unnamed: 0,user,genre_preference,genre_preference_name
173,756,0.508,genre_Romance
544,2310,0.424,genre_Romance
2492,11017,0.419,genre_Romance
2765,12189,0.414,genre_Drama
2837,12501,0.408,genre_Drama
...,...,...,...
29917,131776,0.413,genre_Drama
30083,132484,0.510,genre_Romance
30890,136428,0.437,genre_Drama
30894,136442,0.413,genre_Romance


In [130]:
print(user_genre_list[user_genre_list["genre_preference"] >= 0.3]["genre_preference_name"].nunique())
user_genre_list[user_genre_list["genre_preference"] >= 0.3]

7


Unnamed: 0,user,genre_preference,genre_preference_name
9,60,0.319,genre_Romance
91,407,0.314,genre_Drama
103,442,0.332,genre_Drama
106,455,0.303,genre_Drama
111,469,0.311,genre_Drama
...,...,...,...
31005,136949,0.309,genre_Romance
31039,137079,0.400,genre_Comedy
31149,137581,0.301,genre_Drama
31221,137904,0.332,genre_Drama


In [126]:
print(user_genre_list[user_genre_list["genre_preference"] >= 0.2]["genre_preference_name"].nunique())
user_genre_list[user_genre_list["genre_preference"] >= 0.2]

15


Unnamed: 0,user,genre_preference,genre_preference_name
9,60,0.319,genre_Romance
20,99,0.232,genre_Comedy
23,121,0.225,genre_Comedy
38,182,0.248,genre_Action
41,201,0.264,genre_Drama
...,...,...,...
31342,138435,0.204,genre_Drama
31348,138456,0.220,genre_Comedy
31352,138470,0.210,genre_Drama
31356,138475,0.209,genre_Drama


In [103]:
user_genre_list["genre_preference_name"].str.contains("genre_Adventure")

0         True
1        False
2        False
3         True
4         True
         ...  
31355    False
31356    False
31357     True
31358    False
31359    False
Name: genre_preference_name, Length: 31360, dtype: bool

In [104]:
user_genre_list

Unnamed: 0,user,genre_preference,genre_preference_name
0,11,0.127,genre_Adventure
1,14,0.181,genre_Children
2,18,0.199,genre_Drama
3,25,0.181,genre_Adventure
4,31,0.173,genre_Adventure
...,...,...,...
31355,138473,0.151,genre_Drama
31356,138475,0.209,genre_Drama
31357,138486,0.153,genre_Adventure
31358,138492,0.281,genre_Comedy
