📖 참고 : 패스트캠퍼스 - 딥러닝을 활용한 추천시스템 구현 올인원 패키지 Online.

# Contents-based Filtering_movie

------

# 1. 라이브러리 호출 및 데이터 읽기

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
path = 'data/movielens/'

ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')

In [3]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
len(ratings_df)

100836

In [5]:
len(movies_df)

9742

In [6]:
len(tags_df)

3683

In [7]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [8]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


# 2. EDA

## 장르를 이용한 영화 탐색

In [9]:
total_count = len(movies_df.index)
total_genres = list(set([genre for sublist in list(map(lambda x: x.split('|'), movies_df['genres'])) for genre in sublist]))

In [10]:
print(f"전체 영화 수: {total_count}")
print(f"장르: {total_genres}")

전체 영화 수: 9742
장르: ['Romance', 'IMAX', 'Action', 'Animation', 'War', 'Fantasy', '(no genres listed)', 'Western', 'Crime', 'Children', 'Adventure', 'Drama', 'Horror', 'Documentary', 'Sci-Fi', 'Musical', 'Thriller', 'Comedy', 'Film-Noir', 'Mystery']


In [11]:
genre_count = dict.fromkeys(total_genres)

for each_genre_list in movies_df['genres']:
    for genre in each_genre_list.split('|'):
        if genre_count[genre] == None:
            genre_count[genre] = 1
        else:
            genre_count[genre] = genre_count[genre] + 1

In [12]:
genre_count

{'Romance': 1596,
 'IMAX': 158,
 'Action': 1828,
 'Animation': 611,
 'War': 382,
 'Fantasy': 779,
 '(no genres listed)': 34,
 'Western': 167,
 'Crime': 1199,
 'Children': 664,
 'Adventure': 1263,
 'Drama': 4361,
 'Horror': 978,
 'Documentary': 440,
 'Sci-Fi': 980,
 'Musical': 334,
 'Thriller': 1894,
 'Comedy': 3756,
 'Film-Noir': 87,
 'Mystery': 573}

## 장르별 가중치 계산
- idf

In [13]:
for each_genre in genre_count:
    genre_count[each_genre] = np.log10(total_count/genre_count[each_genre])

genre_count

{'Romance': 0.7856152382210405,
 'IMAX': 1.7899910382813284,
 'Action': 0.7266719338379385,
 'Animation': 1.2026069149931968,
 'War': 1.4065847623240424,
 'Fantasy': 1.0971106675631865,
 '(no genres listed)': 2.457169208193496,
 'Western': 1.7659316540881678,
 'Crime': 0.9098289421369025,
 'Children': 1.1664800458677336,
 'Adventure': 0.8872447746804204,
 'Drama': 0.3490620385623247,
 'Horror': 0.9983092704481497,
 'Documentary': 1.3451954487495636,
 'Sci-Fi': 0.9974220495432563,
 'Musical': 1.4649016584241867,
 'Thriller': 0.7112681505684965,
 'Comedy': 0.4139225416416778,
 'Film-Noir': 2.0491288726171324,
 'Mystery': 1.2304935032683613}

In [14]:
genre_representation = pd.DataFrame(columns=sorted(total_genres), index=movies_df.index)
for index, each_row in tqdm(movies_df.iterrows()): # tqdm : 진행바
    dict_temp = {i: genre_count[i] for i in each_row['genres'].split('|')}
    row_to_add = pd.DataFrame(dict_temp, index=[index])
    genre_representation.update(row_to_add)

genre_representation

9742it [01:04, 150.52it/s]


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,0.887245,1.202607,1.16648,0.413923,,,,1.097111,,,,,,,,,,
2,,,0.887245,,1.16648,,,,,1.097111,,,,,,,,,,
3,,,,,,0.413923,,,,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
5,,,,,,0.413923,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,0.726672,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193583,,,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193585,,,,,,,,,0.349062,,,,,,,,,,,
193587,,0.726672,,1.202607,,,,,,,,,,,,,,,,


## 태그를 이용한 영화 탐색

In [15]:
tag_column = list(map(lambda x: x.split(','), tags_df['tag']))
unique_tags = list(set(list(map(lambda x: x.strip(), list([tag for sublist in tag_column for tag in sublist])))))

print(unique_tags)

['good cinematography', 'fantasy world', 'independent', 'televangelist', 'representation of children', 'too long', 'Deep Throat', 'police corruption', 'Highly quotable', 'Guardians of the Galaxy', 'hotel', 'Van Gogh', 'alternate endings', 'creativity', 'way too long', 'Palahnuik', 'indiana jones', 'weddings', 'Angelina Jolie', 'Tolkein', 'pool', 'Journalism', 'symbolic', 'ancient Rome', 'gambling', 'hitman', 'marriage', 'Cerebral', 'smart', 'vertriloquism', 'invisibility', 'stiller', 'embarassing scenes', 'jon hamm', 'menacing', 'creative', 'Hawkeye', 'suburbia', 'widows/widowers', "80's", 'EPIC', 'Studio Ghibli', 'sexy', 'George Bernard Shaw', 'gun tactics', 'Norman Bates', 'stapler', 'classic movie', 'good writing', 'ummarti2006', 'Shakespeare', 'genocide', 'space station', 'birds', 'coen brothers', 'Stupid ending', 'will ferrell', 'Oscar (Best Cinematography)', 'Charlize Theron', 'I see dead people', 'Shia LaBeouf', 'goofy', 'Western', 'multiple short stories', 'camp', 'Adrien Brody

In [16]:
print(len(tag_column))
print(len(unique_tags))

3683
1589


In [17]:
total_movie_count = len(set(tags_df['movieId']))
tag_count_dict = dict.fromkeys(unique_tags)

for each_movie_tag_list in tags_df['tag']:
    for tag in each_movie_tag_list.split(","):
        if tag_count_dict[tag.strip()] == None:
            tag_count_dict[tag.strip()] = 1
        else:
            tag_count_dict[tag.strip()] += 1

tag_idf = dict()
for each_tag in tag_count_dict:
    tag_idf[each_tag] = np.log10(total_movie_count / tag_count_dict[each_tag])

tag_idf

{'good cinematography': 3.196452541703389,
 'fantasy world': 2.7193312869837265,
 'independent': 3.196452541703389,
 'televangelist': 3.196452541703389,
 'representation of children': 3.196452541703389,
 'too long': 2.895422546039408,
 'Deep Throat': 3.196452541703389,
 'police corruption': 3.196452541703389,
 'Highly quotable': 2.7193312869837265,
 'Guardians of the Galaxy': 3.196452541703389,
 'hotel': 3.196452541703389,
 'Van Gogh': 3.196452541703389,
 'alternate endings': 2.895422546039408,
 'creativity': 3.196452541703389,
 'way too long': 3.196452541703389,
 'Palahnuik': 3.196452541703389,
 'indiana jones': 2.895422546039408,
 'weddings': 2.895422546039408,
 'Angelina Jolie': 3.196452541703389,
 'Tolkein': 2.5943925503754266,
 'pool': 3.196452541703389,
 'Journalism': 3.196452541703389,
 'symbolic': 3.196452541703389,
 'ancient Rome': 3.196452541703389,
 'gambling': 2.4974825373673704,
 'hitman': 2.7193312869837265,
 'marriage': 2.4974825373673704,
 'Cerebral': 3.196452541703389,

In [18]:
len(tag_idf.keys())

1589

In [19]:
tag_representation = pd.DataFrame(columns=sorted(unique_tags), index=list(set(tags_df['movieId'])))
for name, group in tqdm(tags_df.groupby(by='movieId')):
    temp_list = list(map(lambda x: x.split(','), list(group['tag'])))
    temp_tag_list = list(set(list(map(lambda x: x.strip(), list([tag for sublist in temp_list for tag in sublist])))))

    dict_temp = {i: tag_idf[i.strip()] for i in temp_tag_list}
    row_to_add = pd.DataFrame(dict_temp, index=[group['movieId'].values[0]])
    tag_representation.update(row_to_add)

tag_representation = tag_representation.sort_index(0)
tag_representation

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1572/1572 [05:17<00:00,  4.95it/s]


Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183611,,,,,,,,,,,...,,,,,,,,,,
184471,,,,,,,,,,,...,,,,,,,,,,
187593,,,,,,,,,,,...,,,,,,,,,,
187595,,,,,,,,,,,...,,,,,,,,,,


In [20]:
tag_representation.loc[1].dropna()

fun      2.497483
pixar    2.895423
Name: 1, dtype: object

## 장르 + 태그를 이용한 영화 탐색

In [21]:
movie_representation = pd.concat([genre_representation, tag_representation], axis=1).fillna(0)

# 3. Contents 유사도 평가
- 코사인 유사도 사용

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    result_df = pd.DataFrame(data=cos_sim, index=[a.index])

    return result_df

In [23]:
print(movie_representation.head())

   (no genres listed)  Action  Adventure  Animation  Children    Comedy  \
1                 0.0     0.0   0.887245   1.202607   1.16648  0.413923   
2                 0.0     0.0   0.887245   0.000000   1.16648  0.000000   
3                 0.0     0.0   0.000000   0.000000   0.00000  0.413923   
4                 0.0     0.0   0.000000   0.000000   0.00000  0.413923   
5                 0.0     0.0   0.000000   0.000000   0.00000  0.413923   

   Crime  Documentary     Drama   Fantasy  ...  women  wonderwoman  workplace  \
1    0.0          0.0  0.000000  1.097111  ...    0.0          0.0        0.0   
2    0.0          0.0  0.000000  1.097111  ...    0.0          0.0        0.0   
3    0.0          0.0  0.000000  0.000000  ...    0.0          0.0        0.0   
4    0.0          0.0  0.349062  0.000000  ...    0.0          0.0        0.0   
5    0.0          0.0  0.000000  0.000000  ...    0.0          0.0        0.0   

   writing  wrongful imprisonment  wry  younger men  zither  z

In [24]:
cs_df = cos_sim_matrix(movie_representation, movie_representation)
cs_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
1,1.0,0.124438,0.008403,0.040571,0.011755,0.0,0.016339,0.331122,0.0,0.131794,...,0.064466,0.260941,0.071492,0.27171,0.0,0.348295,0.379492,0.0,0.232553,0.093519
2,0.124438,1.0,0.0,0.0,0.0,0.0,0.0,0.240843,0.0,0.095861,...,0.0,0.0,0.0,0.0,0.0,0.108082,0.117763,0.0,0.0,0.0
3,0.008403,0.0,1.0,0.179391,0.011294,0.0,0.072246,0.0,0.0,0.0,...,0.00656,0.0,0.068686,0.0,0.0,0.020322,0.022142,0.0,0.0,0.089849
4,0.040571,0.0,0.179391,1.0,0.05453,0.0,0.348828,0.0,0.0,0.0,...,0.031674,0.101979,0.567487,0.0,0.0,0.098119,0.106908,0.365843,0.0,0.433821
5,0.011755,0.0,0.011294,0.05453,1.0,0.0,0.640342,0.0,0.0,0.0,...,0.009177,0.0,0.096091,0.0,0.0,0.028429,0.030976,0.0,0.0,0.125697


In [25]:
print(cs_df.shape)
print(cs_df[1].sort_values(ascending=False))

(9742, 9742)
2         1.000000
46972     0.322201
158813    0.300850
119655    0.300850
80748     0.300850
            ...   
4921      0.000000
4920      0.000000
4919      0.000000
4917      0.000000
193609    0.000000
Name: 1, Length: 9742, dtype: float64


In [26]:
print(movies_df.loc[1])
print(movies_df.loc[46972])
print(movies_df.loc[126142])
print(movies_df.loc[2043])
print(movies_df.loc[2399])

title                                Toy Story (1995)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 1, dtype: object
title     Night at the Museum (2006)
genres    Action|Comedy|Fantasy|IMAX
Name: 46972, dtype: object
title     The Cave of the Golden Rose (1991)
genres            Adventure|Children|Fantasy
Name: 126142, dtype: object
title     Darby O'Gill and the Little People (1959)
genres                   Adventure|Children|Fantasy
Name: 2043, dtype: object
title     Santa Claus: The Movie (1985)
genres       Adventure|Children|Fantasy
Name: 2399, dtype: object


# 4. 추천시스템 성능 평가

In [27]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=43)

In [28]:
test_userids = list(set(test_df.userId.values))
test_userids

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [29]:
result_df = pd.DataFrame()

for user_id in tqdm(test_userids):
    user_record_df = train_df.loc[train_df.userId == int(user_id), :]
    
    user_sim_df = cs_df.loc[user_record_df['movieId']]
    user_rating_df = user_record_df[['rating']]
    sim_sum = np.sum(user_sim_df.T.to_numpy(), -1)

    prediction = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (sim_sum+1)

    prediction_df = pd.DataFrame(prediction, index=cs_df.index).reset_index()
    prediction_df.columns = ['movieId', 'pred_rating']    
    prediction_df = prediction_df[['movieId', 'pred_rating']][prediction_df.movieId.isin(test_df[test_df.userId == user_id]['movieId'].values)]

    temp_df = prediction_df.merge(test_df[test_df.userId == user_id], on='movieId')
    result_df = pd.concat([result_df, temp_df], axis=0)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 610/610 [00:12<00:00, 49.71it/s]


In [30]:
result_df.head(10)

Unnamed: 0,movieId,pred_rating,userId,rating,timestamp
0,101,4.079146,1,5.0,964980868
1,223,2.467171,1,3.0,964980985
2,552,4.116076,1,4.0,964982653
3,733,3.905327,1,4.0,964982400
4,736,4.073862,1,3.0,964982653
5,780,4.08545,1,3.0,964984086
6,923,2.962807,1,5.0,964981529
7,1024,4.387218,1,5.0,964982876
8,1029,4.395651,1,5.0,964982855
9,1030,4.389013,1,3.0,964982903


In [31]:
mse = mean_squared_error(y_true=result_df['rating'].values, y_pred=result_df['pred_rating'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.3653768509679165 1.168493410750748


# 5. 후기

- TF-IDF를 이용해 가중치를 만들고, 이를 이용하여 추천 알고리즘에 접목하는 것이 신기했다.
- 아직 코드를 따라서 쳐보는 정도이지만 향후 해당 코드를 수정해서 내 것으로 만드는 작업을 진행해야겠다.
- 컨텐츠 기반 추천 시스템의 흐름을 알아볼 수 있어 좋았다.