# 영화추천 AI

## 1.데이터 준비

### (1)데이터 전처리

In [84]:
import os
import pandas as pd

rating_file_path=os.getenv('HOME') + '/workplace/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)
ratings.head()


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


좋아하는 영화를 추천하는 것이기 때문에 3이상만 남깁니다.

In [85]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [86]:
# rating 컬럼의 이름을 count로 바꿉니다.
ratings.rename(columns={'rating':'count'}, inplace=True)

In [87]:
ratings['count']
ratings = pd.DataFrame(ratings)
display(ratings)

Unnamed: 0,user_id,movie_id,count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [88]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/workplace/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding = 'ISO-8859-1')
movies.head(30)

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


### (2)데이터 분석

유저의 수와 영화의 수를 살펴봅니다.

In [89]:
# 영화 수
ratings['movie_id'].nunique()

3628

In [90]:
# 유저 수
ratings['user_id'].nunique()

6039

In [91]:
# 가장 인기있는 영화 30개 (인기순)
movie_count = ratings.groupby('movie_id')['count'].count()
movie_count.sort_values(ascending=False).head(30)

movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: count, dtype: int64

In [92]:
# 유저별 몇 개의 영화를 보고 있는지에 대한 통계
user_count = ratings.groupby('user_id')['movie_id'].count()
user_count.describe()

count    6039.000000
mean      138.512668
std       156.241599
min         1.000000
25%        38.000000
50%        81.000000
75%       177.000000
max      1968.000000
Name: movie_id, dtype: float64

In [93]:
# 유저별 count 중앙값에 대한 통계
user_median = ratings.groupby('user_id')['count'].median()
user_median.describe()

count    6039.000000
mean        4.055970
std         0.432143
min         3.000000
25%         4.000000
50%         4.000000
75%         4.000000
max         5.000000
Name: count, dtype: float64

실제로 좋아하는 영화를 찾기가 힘들어서 탑 30중 아는 영화들을 임의로 넣어봤습니다.

In [94]:
# 내가 선호하는 영화 5가지를 추가.
my_favorite = ['2858' , '2028' ,'2762' ,'110' ,'1']

# '6041'이라는 user_id가 위 영화를 5회씩 감상했다고 가정하겠습니다.
my_playlist = pd.DataFrame({'user_id': ['6041']*5, 'movie_id': my_favorite, 'count':[5.0]*5})

if not ratings.isin({'user_id':['6041']})['user_id'].any():
    # user_id에 '6041'이라는 데이터가 없다면
    ratings = ratings.append(my_playlist)
    # 위에 임의로 만든 my_favorite 데이터를 추가해 줍니다. 

ratings.drop('timestamp', inplace=True, axis=1)
ratings.tail(10)       # 잘 추가되었는지 확인해 봅시다.

Unnamed: 0,user_id,movie_id,count
1000203,6040,1090,3.0
1000205,6040,1094,5.0
1000206,6040,562,5.0
1000207,6040,1096,4.0
1000208,6040,1097,4.0
0,6041,2858,5.0
1,6041,2028,5.0
2,6041,2762,5.0
3,6041,110,5.0
4,6041,1,5.0


영화이름이 따로있어 불편하니 하나로 합쳐줍니다.

In [95]:
movies = pd.DataFrame(movies)

data = pd.merge(ratings, movies)
data

Unnamed: 0,user_id,movie_id,count,title,genre
0,1,1193,5.0,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5.0,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4.0,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4.0,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5.0,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...
836473,5851,3607,5.0,One Little Indian (1973),Comedy|Drama|Western
836474,5854,3026,4.0,Slaughterhouse (1987),Horror
836475,5854,690,3.0,"Promise, The (Versprechen, Das) (1994)",Romance
836476,5938,2909,4.0,"Five Wives, Three Secretaries and Me (1998)",Documentary


### (3)Indexing

원활한 인덱싱 작업을 위해 유니크한 값들로 모아줍니다.

In [96]:
# 고유한 유저, 영화를 찾아내는 코드
user_unique = data['user_id'].unique()
movie_unique = data['title'].unique()

# 유저, 영화 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [97]:
# 인덱싱이 잘 되었는지 확인해 봅니다. 
print(user_to_idx[6040]) 
print(movie_to_idx['Tom and Huck (1995)'])

1679
1980


In [98]:
# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드
# dictionary 자료형의 get 함수는 https://wikidocs.net/16 을 참고하세요.

# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다. 
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거합니다. 
temp_user_data = data['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(data):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    data['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_data = data['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(data):
    print('movie column indexing OK!!')
    data['title'] = temp_movie_data
else:
    print('movie_id column indexing Fail!!')

data

user_id column indexing OK!!
movie column indexing OK!!


Unnamed: 0,user_id,movie_id,count,title,genre
0,0,1193,5.0,0,Drama
1,1,1193,5.0,0,Drama
2,2,1193,4.0,0,Drama
3,3,1193,4.0,0,Drama
4,4,1193,5.0,0,Drama
...,...,...,...,...,...
836473,1621,3607,5.0,3623,Comedy|Drama|Western
836474,3481,3026,4.0,3624,Horror
836475,3481,690,3.0,3625,Romance
836476,4159,2909,4.0,3626,Documentary


In [99]:
# 1회만 play한 데이터의 비율을 보는 코드
only_one = data[data['count']<3]
one, all_data = len(only_one), len(data)
print(f'{one},{all_data}')
print(f'Ratio of only_one over all data is {one/all_data:.2%}')
# f-format에 대한 설명은 https://bit.ly/2DTLqYU

0,836478
Ratio of only_one over all data is 0.00%


### (4)CSR Matrix 만들기

In [100]:
# 실습 위에 설명보고 이해해서 만들어보기
from scipy.sparse import csr_matrix

num_user = data['user_id'].nunique()
num_movie = data['title'].nunique()

csr_data = csr_matrix((data['count'], (data.user_id, data.title)), shape= (num_user, num_movie))
csr_data

<6039x3628 sparse matrix of type '<class 'numpy.float64'>'
	with 836478 stored elements in Compressed Sparse Row format>

In [101]:
csr_data.data

array([5., 3., 3., ..., 3., 3., 3.])

In [102]:
csr_data.indices

array([   0,    1,    2, ..., 2158, 3512, 2947], dtype=int32)

In [103]:
csr_data.indptr

array([     0,     53,    169, ..., 836460, 836477, 836478], dtype=int32)

## 2.모델 학습하기

In [104]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [105]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [106]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6039 sparse matrix of type '<class 'numpy.float64'>'
	with 836478 stored elements in Compressed Sparse Column format>

In [107]:
# 모델 훈련
als_model.fit(csr_data_transpose)

100%|██████████| 15/15 [00:02<00:00,  5.17it/s]


In [108]:
my_user_id, my_favorite_movie = user_to_idx[6040], movie_to_idx['Toy Story (1995)']
my_user_id_vector, my_favorite_movie_vector = als_model.user_factors[my_user_id], als_model.item_factors[my_favorite_movie]


In [109]:
my_user_id_vector

array([-2.9799657 , -0.19578636,  0.98495466, -0.94070935,  0.30443957,
       -2.7571242 , -0.63466364, -1.0983442 , -3.0963655 , -0.8766545 ,
        3.4119813 , -1.6619668 , -0.44841632, -0.87935996,  4.2904773 ,
        1.5283465 , -0.78724366,  2.571357  ,  0.9141369 , -2.2462075 ,
        3.852119  , -2.0921674 ,  1.9276204 , -0.21155882, -3.2359107 ,
        4.8810215 ,  3.6684494 , -1.6069667 ,  3.894351  ,  3.2374759 ,
       -3.3065093 ,  3.7665746 ,  1.627389  , -0.13452272,  7.2186003 ,
        3.7849846 ,  1.3347517 ,  1.0247644 , -1.9007475 , -0.17294303,
       -3.1685991 ,  3.073799  , -0.10902148, -0.7115511 ,  2.5918095 ,
       -4.8107624 , -1.0469105 ,  0.5735138 ,  0.7428269 , -2.3426795 ,
        1.3974453 ,  0.9722313 , -2.5821223 ,  0.49079236,  3.9853199 ,
        2.5175095 ,  3.4083261 , -1.27448   , -0.96208745,  2.2606509 ,
        0.5763981 ,  1.8012725 , -0.3941518 , -0.38638654,  3.2937198 ,
       -3.3980439 ,  4.4403443 ,  2.7315793 ,  1.0014318 ,  1.20

In [110]:
my_favorite_movie_vector

array([-0.0138505 , -0.03048646,  0.04808879,  0.02657401,  0.00247965,
       -0.00116293, -0.00986978, -0.00793661,  0.00516484,  0.00570635,
        0.03655091, -0.00172029, -0.0136339 ,  0.03260032, -0.02592358,
       -0.02724183,  0.03371146, -0.00064807,  0.02155118,  0.01027151,
        0.01678084,  0.00475532, -0.01009163,  0.01787827,  0.00812158,
        0.04022009,  0.00607376, -0.00629774,  0.03441702,  0.01302734,
        0.00494863,  0.00665499, -0.00571671,  0.02321709,  0.00925763,
       -0.00213682, -0.00428238, -0.0380539 , -0.02749609, -0.00525484,
        0.00328178,  0.01746348, -0.00913205,  0.01753276,  0.01833621,
       -0.00185144, -0.0343002 ,  0.00887396, -0.00192347,  0.01486891,
        0.02259343,  0.04220835,  0.00468416,  0.01909536,  0.0242788 ,
       -0.00307549, -0.0025563 , -0.01728071,  0.01978232, -0.02337266,
        0.01025276,  0.0492621 ,  0.01546435, -0.00809153, -0.00882009,
       -0.02048151,  0.00106605,  0.03050644, -0.01300792,  0.02

In [111]:
# zimin과 black_eyed_peas를 내적하는 코드
np.dot(my_user_id_vector, my_favorite_movie_vector)

0.8292246

## 3.테스트하기

In [112]:
test = movie_to_idx['Heat (1995)']
test_vector = als_model.item_factors[test]
np.dot(my_user_id_vector, test_vector)

0.002244696

In [113]:
favorite_movie = 'Tom and Huck (1995)'
movie_idx = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_idx, N=15)
similar_movie

[(1980, 0.99999994),
 (2017, 0.80509835),
 (2004, 0.794873),
 (2002, 0.79272735),
 (1995, 0.77816135),
 (3108, 0.7643511),
 (1978, 0.75816286),
 (2339, 0.75607073),
 (2659, 0.74872464),
 (2021, 0.7472839),
 (2023, 0.74525076),
 (1617, 0.73336375),
 (593, 0.73332226),
 (2001, 0.72741926),
 (2006, 0.7217523)]

In [114]:
#artist_to_idx 를 뒤집어, index로부터 artist 이름을 얻는 dict를 생성합니다. 
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
[idx_to_movie[i[0]] for i in similar_movie]

['Tom and Huck (1995)',
 'Pagemaster, The (1994)',
 'Far From Home: The Adventures of Yellow Dog (1995)',
 'Amazing Panda Adventure, The (1995)',
 'Wild America (1997)',
 'Lassie (1994)',
 'Andre (1994)',
 'Alaska (1996)',
 'Tall Tale (1994)',
 'Theodore Rex (1995)',
 'Flipper (1996)',
 'Little Rascals, The (1994)',
 'Homeward Bound: The Incredible Journey (1993)',
 'King and I, The (1999)',
 'Free Willy 2: The Adventure Home (1995)']

In [115]:
def get_similar_movies(title: str):
    
    movie_idx = movie_to_idx[title]

    similar_movies = als_model.similar_items(movie_idx)
    similar_movies = [idx_to_movie[i[0]] for i in similar_movies]

    return similar_movies


비슷한 영화가 추천되는지 확인해봅니다.

In [116]:
get_similar_movies('Nixon (1995)')

['Nixon (1995)',
 'He Got Game (1998)',
 'People vs. Larry Flynt, The (1996)',
 'Primary Colors (1998)',
 'Cobb (1994)',
 'American Buffalo (1996)',
 'Blue Chips (1994)',
 "Mo' Better Blues (1990)",
 'Ghosts of Mississippi (1996)',
 'Rosewood (1997)']

해당 유저가 좋아할만한 영화를 추천해줍니다.

In [117]:
user = user_to_idx[6040]
# recommend에서는 user*item CSR Matrix를 받습니다.
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(1412, 1.1387906),
 (1143, 1.0887308),
 (1176, 1.0693319),
 (1000, 1.0412812),
 (83, 1.0078323),
 (169, 0.9790127),
 (707, 0.96910954),
 (1363, 0.9637522),
 (1512, 0.9473661),
 (784, 0.9432775),
 (47, 0.9361787),
 (868, 0.92718375),
 (133, 0.9053448),
 (312, 0.90292907),
 (481, 0.88084376),
 (178, 0.87035483),
 (243, 0.8593823),
 (203, 0.8587768),
 (1501, 0.8431135),
 (231, 0.82074875)]

In [118]:
[idx_to_movie[i[0]] for i in movie_recommended]

['Right Stuff, The (1983)',
 'Do the Right Thing (1989)',
 'Sex, Lies, and Videotape (1989)',
 'Roger & Me (1989)',
 'Gandhi (1982)',
 'Fish Called Wanda, A (1988)',
 'Sting, The (1973)',
 'Drugstore Cowboy (1989)',
 'Buena Vista Social Club (1999)',
 '400 Blows, The (Les Quatre cents coups) (1959)',
 'To Kill a Mockingbird (1962)',
 'Heavenly Creatures (1994)',
 'Ed Wood (1994)',
 'Glengarry Glen Ross (1992)',
 'Kolya (1996)',
 'Unforgiven (1992)',
 'Ghostbusters (1984)',
 'Delicatessen (1991)',
 'When We Were Kings (1996)',
 'Insider, The (1999)']

이 영화들을 왜 추천해줬는지 알아보기 위해 임의의 영화를 한번 넣어봅니다.

In [119]:
recommended_movie = movie_to_idx['Do the Right Thing (1989)']
explain = als_model.explain(user, csr_data, itemid=recommended_movie)

In [120]:
[(idx_to_movie[i[0]], i[1]) for i in explain[1]]

[('Crimes and Misdemeanors (1989)', 0.06650850358194574),
 ('Raging Bull (1980)', 0.053629926434732605),
 ('Say Anything... (1989)', 0.04837020374698174),
 ('Heathers (1989)', 0.047523874034109535),
 ('Dog Day Afternoon (1975)', 0.03899915980939479),
 ('Player, The (1992)', 0.03675375696970611),
 ('12 Angry Men (1957)', 0.036025131747606354),
 ('Wings of Desire (Der Himmel über Berlin) (1987)', 0.0349416658923024),
 ('Shall We Dance? (Shall We Dansu?) (1996)', 0.033633106226896615),
 ('Spartacus (1960)', 0.03358890650563135)]

나름 비슷한 영화를 추천한것을 확인할 수 있습니다.

## 4.회고

이번 프로젝트는 다른 것도 중요하지만 dictionary와 merge의 사용법을 익힌 것이 기억에 남는다. 두가지를 사용하지 않고 테스트 부분까지 와서 데이터 처리하기가 까다로워 전부 뒤집어 엎고 처음부터 다시 만들었는데 어느 쪽이 더 좋은 선택이었는지는 모르겠지만 저 두 기능을 사용한 것이 코드가 더 간결한 것 같다.