In [12]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# 데이터 불러오기
movies = pd.read_csv('ml-latest-small\movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

print("movie shape : ", movies.shape)
print("rating shape : ", ratings.shape)

display(movies.head())
display(ratings.head())

movie shape :  (9742, 3)
rating shape :  (100836, 4)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [18]:
# 유저-영화 평점 매트릭스
rating_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)

# 영화 간 유사도 계산
movie_similarity = cosine_similarity(rating_matrix.T)

movie_similarity[0][:10]

  ret = a @ b


array([1.        , 0.41056206, 0.2969169 , 0.03557272, 0.3087623 ,
       0.37631587, 0.27749056, 0.131629  , 0.23258593, 0.39557323])

In [36]:
# 영화 제목으로 추천 리스트 출력하는 로직 만들기
title = 'Jumanji (1995)'
movieId = movies[movies.title==title].movieId.values[0]
print("movieId : ", movieId)

similar_scores = list(enumerate(movie_similarity[movieId-1]))
sorted_scores = sorted(similar_scores, key=lambda x: x[1], reverse=True) 
# lambda x: x[1]는 각 튜플의 두 번째 요소(유사도 값)를 기준으로 정렬
top_similar_movies = sorted_scores[1:11] 
print(top_similar_movies)

movieId :  2
[(322, 0.5884377258584123), (436, 0.5498181061555003), (325, 0.5449810767978691), (418, 0.5380455669772968), (504, 0.5248764206089308), (483, 0.5181613195590727), (506, 0.515619976850775), (512, 0.5074579891325981), (18, 0.497560264136898), (276, 0.4973675079070729)]


In [73]:
# 영화별 평균 평점과 평점 수 집계
movie_stats = ratings.groupby('movieId')['rating'].agg(['mean', 'count']).reset_index()
movie_stats.columns = ['movieId', 'average_rating', 'rating_count']
display(movie_stats.head())

Unnamed: 0,movieId,average_rating,rating_count
0,1,3.92093,215
1,2,3.431818,110
2,3,3.259615,52
3,4,2.357143,7
4,5,3.071429,49


In [76]:
rank = 1
result = []
for movieId, sim in top_similar_movies:
    target_movieId = movieId + 1  # movieId가 0부터 시작한 경우라면 +1

    # 해당 movieId가 movies에 존재하지 않는 경우는 skip
    if movies[movies.movieId == target_movieId].empty:
        continue
    title = movies[movies.movieId ==(target_movieId)].title.values[0]
    genres = movies[movies.movieId ==(target_movieId)].genres.values[0]
    rating = movie_stats[movie_stats.movieId ==(target_movieId)].average_rating.values[0]
    count = movie_stats[movie_stats.movieId ==(target_movieId)].rating_count.values[0]
    result.append({
        'Rank': rank,
        'Title': title,
        'Genres': genres,
        'Similarity': sim,
        'Average Rating': rating,
        'Rating Count': count
    })
    rank = rank+1
print(result)

[{'Rank': 1, 'Title': 'Cops and Robbersons (1994)', 'Genres': 'Comedy', 'Similarity': 0.5498181061555003, 'Average Rating': 2.1666666666666665, 'Rating Count': 9}, {'Rank': 2, 'Title': 'To Live (Huozhe) (1994)', 'Genres': 'Drama', 'Similarity': 0.5449810767978691, 'Average Rating': 4.0, 'Rating Count': 3}, {'Rank': 3, 'Title': 'Beverly Hillbillies, The (1993)', 'Genres': 'Comedy', 'Similarity': 0.5380455669772968, 'Average Rating': 2.675, 'Rating Count': 20}, {'Rank': 4, 'Title': 'North (1994)', 'Genres': 'Comedy', 'Similarity': 0.5248764206089308, 'Average Rating': 2.2142857142857144, 'Rating Count': 7}, {'Rank': 5, 'Title': 'Lassie (1994)', 'Genres': 'Adventure|Children', 'Similarity': 0.5181613195590727, 'Average Rating': 3.0, 'Rating Count': 2}, {'Rank': 6, 'Title': 'Perfect World, A (1993)', 'Genres': 'Crime|Drama|Thriller', 'Similarity': 0.515619976850775, 'Average Rating': 3.3846153846153846, 'Rating Count': 13}, {'Rank': 7, 'Title': 'Radioland Murders (1994)', 'Genres': 'Comedy

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

class MovieRecommender:
    def __init__(self, movies_path, ratings_path):
        self.movies = pd.read_csv(movies_path)
        self.ratings = pd.read_csv(ratings_path)
        self.movie_stats = None
        self.rating_matrix = None
        self.movie_similarity = None
        self._prepare_data()
    def _prepare_data(self):
        # 유저-영화 평점 매트릭스
        self.rating_matrix = self.ratings.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)

        # 영화 간 유사도 계산
        self.movie_similarity = cosine_similarity(self.rating_matrix.T)
        
        # 영화별 평균 평점과 평점 수 집계
        self.movie_stats = self.ratings.groupby('movieId')['rating'].agg(['mean', 'count']).reset_index()
        self.movie_stats.columns = ['movieId', 'average_rating', 'rating_count']
    
    def get_movieId(self, title):
        movieId = movies[movies.title==title].movieId.values[0]
        return movieId + 1
    
    def get_similar_movies(self, movieId, n=10):
        similar_scores = list(enumerate(self.movie_similarity[movieId]))
        sorted_scores = sorted(similar_scores, key=lambda x: x[1], reverse=True) 
        # lambda x: x[1]는 각 튜플의 두 번째 요소(유사도 값)를 기준으로 정렬
        top_similar_movies = sorted_scores[1:n+1] 
        
        rank = 1
        result = []
        for movieId, sim in top_similar_movies:
            target_movieId = movieId + 1  # movieId가 0부터 시작한 경우라면 +1

            # 해당 movieId가 movies에 존재하지 않는 경우는 skip
            if movies[movies.movieId == target_movieId].empty:
                continue
            title = movies[movies.movieId ==(target_movieId)].title.values[0]
            genres = movies[movies.movieId ==(target_movieId)].genres.values[0]
            rating = movie_stats[movie_stats.movieId ==(target_movieId)].average_rating.values[0]
            count = movie_stats[movie_stats.movieId ==(target_movieId)].rating_count.values[0]
            result.append({
                'Rank': rank,
                'Title': title,
                'Genres': genres,
                'Similarity': sim,
                'Average Rating': rating,
                'Rating Count': count
            })
            rank = rank+1
        print(result)