# 잠재요인 협업 필터링 - SVD

In [1]:
import pandas as pd
import numpy as np

In [14]:
# Load the movie table from movies.csv and preview its first three rows.
movie_df = pd.read_csv('movies.csv')
movie_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [16]:
# (rows, columns) of the movie table.
movie_df.shape

(9742, 3)

In [10]:
# Load the ratings table from ratings.csv and preview its first three rows.
ratings = pd.read_csv('ratings.csv')
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [12]:
# (rows, columns) of the ratings table.
ratings.shape

(100836, 4)

In [21]:
# Number of distinct userId values and distinct movieId values in the ratings table.
ratings.userId.nunique(), ratings.movieId.nunique()

(610, 9724)

In [9]:
# Load the tag table from tags.csv and preview its first three rows.
tag_df = pd.read_csv('tags.csv')
tag_df.head(3)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992


In [None]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel rather than whatever `pip` is on PATH.
%pip install scikit-surprise

In [19]:
from pathlib import Path

from surprise import SVD, Reader
from surprise.dataset import DatasetAutoFolds

# The Reader below is configured without skipping any header line, so the
# ratings file must contain data rows only. 'ratings_noh.csv' is not produced
# anywhere else in this notebook, so derive it from `ratings` when it is
# missing; an existing file is left untouched.
ratings_noh_path = Path('ratings_noh.csv')
if not ratings_noh_path.exists():
    ratings[['userId', 'movieId', 'rating', 'timestamp']].to_csv(
        ratings_noh_path, index=False, header=False
    )

# Each line is "user,item,rating,timestamp"; ratings lie on a 0.5 to 5 scale.
reader = Reader(line_format = 'user item rating timestamp', sep = ',',
                rating_scale = (0.5, 5))
data_folds = DatasetAutoFolds('ratings_noh.csv', reader = reader)

In [23]:
# Use the entire dataset as training data.
trainset = data_folds.build_full_trainset()

In [24]:
# Check which class build_full_trainset() returned.
type(trainset)

surprise.trainset.Trainset

In [None]:
# List the attribute and method names available on the trainset object.
dir(trainset)

## 모델 생성 및 학습

In [28]:
# SVD model with 50 latent factors, trained for 20 epochs with a fixed
# random_state. fit() finds the latent factors on its own.
model = SVD(n_factors=50, n_epochs=20, random_state=2022)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f14d1e71cd0>

- 사용자 ID : 9
- 영화 ID : 42 (Dead Presidents (1995))

In [30]:
# Movie details for movieId 42.
movie_df[movie_df.movieId == 42]

Unnamed: 0,movieId,title,genres
38,42,Dead Presidents (1995),Action|Crime|Drama


In [32]:
# Check whether user 9 has rated movie 42: count the matching rows among
# user 9's ratings (0 means there is no rating for that movie).
movieIds = ratings[ratings.userId == 9]['movieId']
movieIds[movieIds == 42].count()

0

- verbose=True  : 상세정보 추출

In [35]:
# Predicted rating of user 9 for movie 42.
# Ids are passed as strings. NOTE(review): the original note says the SVD model
# only accepts strings at prediction time; presumably this is because the
# ratings were read from a file, so the raw ids are held as str — confirm
# against the Surprise documentation on raw ids.
pred = model.predict(str(9), str(42), verbose=True)
pred

user: 9          item: 42         r_ui = None   est = 3.25   {'was_impossible': False}


Prediction(uid='9', iid='42', r_ui=None, est=3.249924377339538, details={'was_impossible': False})

- 사용자 9번이 보지 않은 영화 중 예측점수가 가장 높은 top10

In [36]:
# Movies user 9 has already rated, and every movieId in the movie table.
seen= ratings[ratings.userId == 9]['movieId'].tolist()
total_movies = movie_df.movieId.tolist()
# Test membership against a set: checking `movie not in seen` on the list
# rescans the whole list for every movie in the table.
seen_ids = set(seen)
unseen = [movie for movie in total_movies if movie not in seen_ids]

In [38]:
# Sanity check: how many movies user 9 has not rated vs. has rated.
len(unseen), len(seen)

(9696, 46)

In [42]:
def sortkey_est(pred):
    """Return the `est` attribute of a prediction, for use as a sort key."""
    estimated_rating = pred.est
    return estimated_rating

In [43]:
# Predict user 9's rating for every movie in `unseen`.
target_user = str(9)
predict_lst = [model.predict(target_user, str(movie_id)) for movie_id in unseen]

In [88]:
# Rank the predictions by estimated rating (highest first) and keep the top N.
# The sort is deliberately in place: predict_lst[:10] is displayed further down.
TOP_N = 10
predict_lst.sort(key = sortkey_est, reverse = True)
top_preds = predict_lst[:TOP_N]
top_movie_ids = [int(pred.iid) for pred in top_preds]
top_movie_ratings = [pred.est for pred in top_preds]
# Look the title up by column label instead of column position, and avoid
# naming the loop variable `id`, which shadows the builtin.
top_movie_title = [
    movie_df.loc[movie_df.movieId == movie_id, 'title'].iloc[0]
    for movie_id in top_movie_ids
]

In [87]:
# Spot-check the title lookup for one movie (movieId 32): first matching row, column 1.
movie_df[movie_df.movieId == 32].iloc[0,1]

'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)'

In [89]:
# Titles of the top-ranked movies for user 9.
top_movie_title

['Shawshank Redemption, The (1994)',
 'Ran (1985)',
 'Evil Dead II (Dead by Dawn) (1987)',
 'Lawrence of Arabia (1962)',
 'Boondock Saints, The (2000)',
 "Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",
 'Spotlight (2015)',
 'Boogie Nights (1997)',
 'Usual Suspects, The (1995)',
 'Philadelphia Story, The (1940)']

In [65]:
# Re-inspect the column layout of the movie table.
movie_df.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [69]:
# Column names, dtypes and non-null counts of the movie table.
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [51]:
# First ten entries of predict_lst (the highest estimates once the list has been sorted).
predict_lst[:10]

[Prediction(uid='9', iid='318', r_ui=None, est=4.070330794979969, details={'was_impossible': False}),
 Prediction(uid='9', iid='1217', r_ui=None, est=4.063731956995097, details={'was_impossible': False}),
 Prediction(uid='9', iid='1261', r_ui=None, est=4.051908410348554, details={'was_impossible': False}),
 Prediction(uid='9', iid='1204', r_ui=None, est=4.0227662213503805, details={'was_impossible': False}),
 Prediction(uid='9', iid='3275', r_ui=None, est=4.011500870494226, details={'was_impossible': False}),
 Prediction(uid='9', iid='4973', r_ui=None, est=3.9996958730949137, details={'was_impossible': False}),
 Prediction(uid='9', iid='142488', r_ui=None, est=3.9853484799628194, details={'was_impossible': False}),
 Prediction(uid='9', iid='1673', r_ui=None, est=3.9798846903676015, details={'was_impossible': False}),
 Prediction(uid='9', iid='50', r_ui=None, est=3.9786251266485744, details={'was_impossible': False}),
 Prediction(uid='9', iid='898', r_ui=None, est=3.978415496006661, det

In [47]:
# movieIds of the top-ranked predictions, as integers.
top_movie_ids

[318, 1217, 1261, 1204, 3275, 4973, 142488, 1673, 50, 898]

In [56]:
# Estimated ratings of the top-ranked predictions.
top_movie_ratings

[4.070330794979969,
 4.063731956995097,
 4.051908410348554,
 4.0227662213503805,
 4.011500870494226,
 3.9996958730949137,
 3.9853484799628194,
 3.9798846903676015,
 3.9786251266485744,
 3.978415496006661]

In [57]:
# NOTE(review): the saved output of this cell shows a one-row Series rather
# than the list of ten titles built earlier — it looks stale; re-run to refresh.
top_movie_title

38    Dead Presidents (1995)
Name: title, dtype: object

In [58]:
# Look up the movie row for movieId 318.
movie_df[movie_df.movieId ==318]

Unnamed: 0,movieId,title,genres
277,318,"Shawshank Redemption, The (1994)",Crime|Drama


In [90]:
# Combine the top titles and their estimated ratings into one table.
# The column labels are Korean for "movie title" and "predicted rating".
top_df = pd.DataFrame({
    '영화명' : top_movie_title,
    '예측평점' : top_movie_ratings,
})
top_df

Unnamed: 0,영화명,예측평점
0,"Shawshank Redemption, The (1994)",4.070331
1,Ran (1985),4.063732
2,Evil Dead II (Dead by Dawn) (1987),4.051908
3,Lawrence of Arabia (1962),4.022766
4,"Boondock Saints, The (2000)",4.011501
5,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",3.999696
6,Spotlight (2015),3.985348
7,Boogie Nights (1997),3.979885
8,"Usual Suspects, The (1995)",3.978625
9,"Philadelphia Story, The (1940)",3.978415
