# 인기도 기반 추천 목록 생성

* 탐색한 내용 : "무엇을 누구에게 추천하여 어떻게 평가할 것인지에 대한 대략적인 그림"
  * 추천에 활용할 피드백 데이터 탐색
  * 추천 서비스에 응용할 수 있는 시간대 정보 탐색
  * 몇 명의 유저가 있는지, 몇 개의 아이템이 있는지
* 유저와 아이템의 메타 정보 탐색 : "추천의 Rule로 활용할 수 있을 만한 요소들 탐색"
  * 영화 데이터에서 피처로 활용 가능한 요소들 (장르, 개봉연도)
  * 유저의 demographic infomation (성별, 나이)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

# csv 불러오기
import pandas as pd

movie_df = pd.read_csv('/content/drive/MyDrive/2307_추천시스템_101/movie_df.csv', index_col=0)
user_df = pd.read_csv('/content/drive/MyDrive/2307_추천시스템_101/user_df.csv', index_col=0)

Mounted at /content/drive


## 1. 데이터 셋 준비

In [2]:
rating_url = 'https://raw.githubusercontent.com/yoonkt200/python-data-analysis/master/data/ml-1m/ratings.dat'
rating_df = pd.read_csv(rating_url, names=['user_id', 'movie_id', 'rating', 'time'], delimiter='::', engine ='python')
rating_df.head()

Unnamed: 0,user_id,movie_id,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
movie_df['genre_list'] = list(map(lambda x: x.split('|'), movie_df['genre']))

In [4]:
movie_df.head()

Unnamed: 0,movie_id,title,genre,release_year,genre_list
0,1,Toy Story (1995),Animation|Children's|Comedy,199,"[Animation, Children's, Comedy]"
1,2,Jumanji (1995),Adventure|Children's|Fantasy,199,"[Adventure, Children's, Fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,199,"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama,199,"[Comedy, Drama]"
4,5,Father of the Bride Part II (1995),Comedy,199,[Comedy]


In [5]:
user_df.head()

Unnamed: 0,user_id,gender,age,occupation,zipcode,age_bucket
0,1,F,1,10,48067,False
1,2,M,56,16,70072,50~
2,3,M,25,15,55117,20
3,4,M,45,7,2460,40
4,5,M,25,20,55455,20


In [6]:
# 점수데이터에 영화데이터를 join

rating_movie_df = pd.merge(rating_df, movie_df, on='movie_id', how='left')
RDF = pd.merge(rating_movie_df, user_df, on='user_id', how='left')
RDF.head(5)

Unnamed: 0,user_id,movie_id,rating,time,title,genre,release_year,genre_list,gender,age,occupation,zipcode,age_bucket
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,197,[Drama],F,1,10,48067,False
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,199,"[Animation, Children's, Musical]",F,1,10,48067,False
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,196,"[Musical, Romance]",F,1,10,48067,False
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,200,[Drama],F,1,10,48067,False
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,199,"[Animation, Children's, Comedy]",F,1,10,48067,False


In [7]:
# 결측치 확인 -> 없음
RDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000208
Data columns (total 13 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   user_id       1000209 non-null  int64 
 1   movie_id      1000209 non-null  int64 
 2   rating        1000209 non-null  int64 
 3   time          1000209 non-null  int64 
 4   title         1000209 non-null  object
 5   genre         1000209 non-null  object
 6   release_year  1000209 non-null  int64 
 7   genre_list    1000209 non-null  object
 8   gender        1000209 non-null  object
 9   age           1000209 non-null  int64 
 10  occupation    1000209 non-null  int64 
 11  zipcode       1000209 non-null  object
 12  age_bucket    1000209 non-null  object
dtypes: int64(7), object(6)
memory usage: 106.8+ MB


## 2. 점수 기반 인기 목록 생성

In [8]:
RDF

Unnamed: 0,user_id,movie_id,rating,time,title,genre,release_year,genre_list,gender,age,occupation,zipcode,age_bucket
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,197,[Drama],F,1,10,48067,False
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,199,"[Animation, Children's, Musical]",F,1,10,48067,False
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,196,"[Musical, Romance]",F,1,10,48067,False
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,200,[Drama],F,1,10,48067,False
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,199,"[Animation, Children's, Comedy]",F,1,10,48067,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,Weekend at Bernie's (1989),Comedy,198,[Comedy],M,25,6,11106,20
1000205,6040,1094,5,956704887,"Crying Game, The (1992)",Drama|Romance|War,199,"[Drama, Romance, War]",M,25,6,11106,20
1000206,6040,562,5,956704746,Welcome to the Dollhouse (1995),Comedy|Drama,199,"[Comedy, Drama]",M,25,6,11106,20
1000207,6040,1096,4,956715648,Sophie's Choice (1982),Drama,198,[Drama],M,25,6,11106,20


In [9]:
# 영화 아이디 별 점수의 평균을 group by로 계산

movie_rating_gb = RDF.groupby('movie_id')['rating']

In [10]:
movie_rating_df = pd.concat([movie_rating_gb.mean(),movie_rating_gb.count()], axis=1)
movie_rating_df.columns = ['mean', 'count']

In [11]:
filtered_RDF = RDF[RDF['movie_id'].isin(movie_rating_df.query('count >= 10').index)]

In [12]:
movie_rating_df.query('count >= 10')['mean'].sort_values(ascending=False)[:20]

movie_id
2905    4.608696
2019    4.560510
318     4.554558
858     4.524966
745     4.520548
50      4.517106
527     4.510417
1148    4.507937
922     4.491489
1198    4.477725
904     4.476190
1178    4.473913
260     4.453694
1212    4.452083
750     4.449890
3338    4.444444
720     4.426941
1207    4.425647
3435    4.415608
912     4.412822
Name: mean, dtype: float64

In [13]:
# 함수화
def get_rating_top_20():
  result_ls = movie_rating_df.query('count >= 10')['mean'].sort_values(ascending=False)[:20].index
  return list(result_ls)

In [14]:
get_rating_top_20()

[2905,
 2019,
 318,
 858,
 745,
 50,
 527,
 1148,
 922,
 1198,
 904,
 1178,
 260,
 1212,
 750,
 3338,
 720,
 1207,
 3435,
 912]

## 3. 장르별 인기 목록 생성

In [15]:
genre_df = RDF[['movie_id', 'genre_list', 'rating']]

In [16]:
genre_df.head()

Unnamed: 0,movie_id,genre_list,rating
0,1193,[Drama],5
1,661,"[Animation, Children's, Musical]",3
2,914,"[Musical, Romance]",3
3,3408,[Drama],4
4,2355,"[Animation, Children's, Comedy]",5


In [17]:
genre_dict = {}

for idx, row in genre_df.iterrows():
  for gl in row['genre_list']:
    if gl not in genre_dict.keys():
      genre_dict[gl] = {}
      genre_dict[gl][row['movie_id']] = {}
      genre_dict[gl][row['movie_id']] = {'sum':row['rating'], 'count':1}
    else:
      if row['movie_id'] not in genre_dict[gl].keys():
        genre_dict[gl][row['movie_id']] = {'sum':row['rating'], 'count':1}
      else:
        genre_dict[gl][row['movie_id']]['sum'] += row['rating']
        genre_dict[gl][row['movie_id']]['count'] += 1

In [18]:
for key, val in genre_dict.items():
  for m_id, result in genre_dict[key].items():
    genre_dict[key][m_id]['score'] = result['sum']/result['count']

In [19]:
GENRE_DF = pd.DataFrame()

for genre in genre_dict.keys():
  tmp_df = pd.DataFrame(genre_dict[genre]).T
  tmp_df['genre'] = genre
  tmp_df = tmp_df.reset_index().rename({'index':'movie_id'})
  tmp_df = tmp_df[tmp_df['count']>10]
  GENRE_DF = pd.concat([GENRE_DF, tmp_df])

In [20]:
for gu in GENRE_DF['genre'].unique():
  tmp_df = GENRE_DF.query(f'genre == "{gu}"')
  tmp_df = tmp_df.sort_values('score', ascending=False).head()
  print(tmp_df)

     index      sum   count     score  genre
414   2019   2864.0   628.0  4.560510  Drama
88     318  10143.0  2227.0  4.554558  Drama
305    858  10059.0  2223.0  4.524966  Drama
7      527  10392.0  2304.0  4.510417  Drama
822   1178   1029.0   230.0  4.473913  Drama
    index     sum  count     score      genre
8     745  2970.0  657.0  4.520548  Animation
29   1148  3976.0  882.0  4.507937  Animation
6     720  1939.0  438.0  4.426941  Animation
30   1223  2063.0  473.0  4.361522  Animation
68   3429  1188.0  274.0  4.335766  Animation
    index     sum   count     score       genre
3     919  7298.0  1718.0  4.247963  Children's
19   3114  6687.0  1585.0  4.218927  Children's
15      1  8613.0  2077.0  4.146846  Children's
87   2761  2728.0   674.0  4.047478  Children's
62   1023   881.0   221.0  3.986425  Children's
    index     sum   count     score    genre
45    899  3217.0   751.0  4.283622  Musical
3     919  7298.0  1718.0  4.247963  Musical
48   1288  4673.0  1118.0  4.17

In [21]:
# 함수화
genre_list = list(GENRE_DF['genre'].unique())
def get_rating_top_k_with_genre(genre, k=5):
  result_df = GENRE_DF.query(f'genre == "{genre}"').sort_values('score', ascending=False)[:k]['index']
  return list(result_df)

In [22]:
get_rating_top_k_with_genre(genre_list[1])

[745, 1148, 720, 1223, 3429]

## 4. 연도별 인기 목록 생성

In [23]:
movie_rate_df = pd.merge(movie_df, rating_df)

In [24]:
movie_count_sr = movie_rate_df.groupby('movie_id')['rating'].count()
filtered_movie_ls = movie_count_sr[movie_count_sr>10].index.tolist()

In [25]:
filtered_MDF = movie_rate_df[movie_rate_df['movie_id'].isin(filtered_movie_ls)]

In [26]:
release_year_df = pd.DataFrame(filtered_MDF.groupby(['release_year','movie_id'])['rating'].mean()).reset_index()

In [27]:
for ry in release_year_df['release_year'].unique():
  tmp_df = release_year_df.query(f'release_year == {ry}')
  tmp_df = tmp_df.sort_values('rating', ascending = False).head()
  print(tmp_df)

   release_year  movie_id    rating
0           191      3132  3.631579
    release_year  movie_id    rating
8            192      3022  4.368932
14           192      3629  4.189091
5            192      2010  4.082474
2            192      1348  3.991597
15           192      3742  3.970085
    release_year  movie_id    rating
68           193      3307  4.387454
65           193      3134  4.339394
42           193      1260  4.301948
17           193       905  4.280749
20           193       919  4.247963
     release_year  movie_id    rating
108           194      1212  4.452083
161           194      3435  4.415608
79            194       912  4.412822
80            194       913  4.395973
82            194       923  4.388889
     release_year  movie_id    rating
235           195      2019  4.560510
193           195       922  4.491489
188           195       904  4.476190
219           195      1178  4.473913
182           195       670  4.410714
     release_year  movie_id 

In [28]:
# 함수화
release_year_ls = list(release_year_df['release_year'].unique())
def get_rating_top_k_with_year(year, k=5):
  result_df = release_year_df.query(f'release_year == {year}').sort_values('rating', ascending=False)[:k]
  return list(result_df)

In [29]:
get_rating_top_k_with_year(200)

['release_year', 'movie_id', 'rating']

## 5. 시청 횟수 기반 인기 목록 생성

In [30]:
# 함수화
def get_view_top_20():
  return rating_df['movie_id'].value_counts().sort_values(ascending=False).index.tolist()[:20]

In [31]:
get_view_top_20()

[2858,
 260,
 1196,
 1210,
 480,
 2028,
 589,
 2571,
 1270,
 593,
 1580,
 1198,
 608,
 2762,
 110,
 2396,
 1197,
 527,
 1617,
 1265]

## 6. 연령대별 인기 목록 생성

In [32]:
age_dict = {}

for idx, row in RDF.iterrows():
  age_bucket = row['age_bucket']
  if age_bucket == False:
    continue
  if age_bucket not in age_dict.keys():
    age_dict[age_bucket] = {}
    age_dict[age_bucket][row['movie_id']] = {}
    age_dict[age_bucket][row['movie_id']] = {'sum':row['rating'], 'count':1}
  else:
    if row['movie_id'] not in age_dict[age_bucket].keys():
      age_dict[age_bucket][row['movie_id']] = {'sum':row['rating'], 'count':1}
    else:
      age_dict[age_bucket][row['movie_id']]['sum'] += row['rating']
      age_dict[age_bucket][row['movie_id']]['count'] += 1

In [33]:
for key, val in age_dict.items():
  for m_id, result in age_dict[key].items():
    age_dict[key][m_id]['score'] = result['sum']/result['count']

In [34]:
age_bucket_DF = pd.DataFrame()

for age_bucket in age_dict.keys():
  tmp_df = pd.DataFrame(age_dict[age_bucket]).T
  tmp_df['age_bucket'] = age_bucket
  tmp_df = tmp_df.reset_index().rename({'index':'movie_id'})
  tmp_df = tmp_df[tmp_df['count']>10]
  age_bucket_DF = pd.concat([age_bucket_DF, tmp_df])

In [35]:
for ab in age_bucket_DF['age_bucket'].unique():
  tmp_df = age_bucket_DF.query(f'age_bucket == "{ab}"')
  tmp_df = tmp_df.sort_values('score', ascending=False).head()
  print(tmp_df)

     index    sum  count     score age_bucket
496   1213  121.0   25.0  4.840000      False
970   1212   53.0   11.0  4.818182      False
711   1228   66.0   14.0  4.714286      False
736    916   75.0   16.0  4.687500      False
642    923  117.0   25.0  4.680000      False
      index    sum  count     score age_bucket
2229   2905   53.0   11.0  4.818182        50~
1164   1797   52.0   11.0  4.727273        50~
17     3030  179.0   38.0  4.710526        50~
1170   1148  318.0   68.0  4.676471        50~
584    2019  493.0  107.0  4.607477        50~
      index     sum   count     score age_bucket
2124   2905   120.0    26.0  4.615385         20
142     318  4028.0   878.0  4.587699         20
33     1198  4519.0   987.0  4.578521         20
468     858  3737.0   817.0  4.574051         20
19      260  5158.0  1128.0  4.572695         20
      index    sum  count     score age_bucket
2394   3739   57.0   12.0  4.750000         40
562    1148  237.0   51.0  4.647059         40
2449   

In [36]:
# 함수화
age_bucket_list = list(age_bucket_DF['age_bucket'].unique())
def get_rating_top_k_with_age(age_bucket, k=20):
  result_df = age_bucket_DF.query(f'age_bucket == "{age_bucket}"').sort_values('score', ascending=False)[:k]['index']
  return list(result_df)

In [37]:
get_rating_top_k_with_age(age_bucket_list[0])

[1213,
 1212,
 1228,
 916,
 923,
 3000,
 1250,
 3949,
 903,
 1208,
 3504,
 318,
 2125,
 1262,
 750,
 2248,
 2019,
 2067,
 1230,
 296]

# 개인별 추천 목록 매칭

* 개인별 추천 목록에 포함될 내용
  * 점수기반 인기 영화 20개
  * 장르별 인기 영화 장르당 5개
  * 연도별 인기 영화 연도당 5개
  * 많이 시청한 영화 20개
  * 해당 유저 연령대별 인기 영화 20개

## 추천 목록 생성

In [38]:
def make_recommendation_list(user_age_bucket):
  recommendation_dict = {}
  added_list = []

  recommendation_dict['age'] = get_rating_top_k_with_age(user_age_bucket)
  added_list.extend(recommendation_dict['age'])

  recommendation_dict['high_score'] = get_rating_top_20()
  added_list.extend(recommendation_dict['high_score'])

  genre_popular_list = []
  for genre in genre_list:
    genre_popular_list.append((genre, get_rating_top_k_with_genre(genre)))
    added_list.extend(get_rating_top_k_with_genre(genre))
  recommendation_dict['genre'] = genre_popular_list

  year_popular_list = []
  for year in release_year_ls:
    year_popular_list.append((year, get_rating_top_k_with_year(year)))
    added_list.extend(get_rating_top_k_with_year(year))
  recommendation_dict['year'] = year_popular_list

  recommendation_dict['high_view'] = get_view_top_20()
  added_list.extend(recommendation_dict['high_view'])

  recommendation_dict['total_recommendations'] = list(set(added_list))

  return recommendation_dict

In [39]:
user_df['recommendations'] = user_df['age_bucket'].apply(make_recommendation_list)

## 추천 목록 저장

In [40]:
user_rec_dict = user_df.set_index('user_id')[['recommendations']].to_dict()['recommendations']

In [41]:
output_path = '/content/drive/MyDrive/2307_추천시스템_101/user_rec_dict.pickle'

import pickle

with open(output_path, 'wb') as f:
  pickle.dump(user_rec_dict, f)

In [42]:
with open(output_path, 'rb') as f:
  user_result_dict = pickle.load(f)

In [43]:
user_result_dict

Output hidden; open in https://colab.research.google.com to view.