In [1]:
import pandas as pd
import os
from scipy.sparse import csr_matrix

# 데이터 준비와 전처리

In [2]:
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# rating 컬럼의 이름을 count로 바꿉니다.
ratings.rename(columns={'rating':'count'}, inplace=True)

In [5]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


# 분석해 봅시다.

## ratings에 있는 유니크한 영화 개수

In [6]:
movie_unique = ratings['movie_id'].unique()
ratings['movie_id'].nunique()
# movie_unique

3628

## ratings에 있는 유니크한 사용자 수

In [7]:
user_unique = ratings['user_id'].unique()    
ratings['user_id'].nunique()

6039

## 가장 인기있는 영화 30개(인기순)

In [8]:
movie_unique_to_idx =  {k:movies.loc[movies['movie_id'] == k,'title'].iloc[0] for k in movie_unique}
movie_unique_to_idx

#ratings dataFrame 의 movie_id 컬럼에 들어있는 값 (movie_id series) 에 맞춰서 title 을 입력한
#temp_ratings series 생성
temp_ratings = ratings['movie_id'].map(movie_unique_to_idx.get).dropna()
temp_ratings
ratings['movie_title'] = temp_ratings
ratings

Unnamed: 0,user_id,movie_id,count,timestamp,movie_title
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975)
1,1,661,3,978302109,James and the Giant Peach (1996)
2,1,914,3,978301968,My Fair Lady (1964)
3,1,3408,4,978300275,Erin Brockovich (2000)
4,1,2355,5,978824291,"Bug's Life, A (1998)"
...,...,...,...,...,...
1000203,6040,1090,3,956715518,Platoon (1986)
1000205,6040,1094,5,956704887,"Crying Game, The (1992)"
1000206,6040,562,5,956704746,Welcome to the Dollhouse (1995)
1000207,6040,1096,4,956715648,Sophie's Choice (1982)


In [9]:
movie_count = ratings.groupby('movie_id')['count'].count()
movie_count.sort_values(ascending=False).head(30)

mapper = ratings.set_index('movie_id').movie_title.to_dict()
ratings.groupby('movie_id')['count'].count().sort_values(ascending=False).head(30)
rank_dict_id = dict(ratings.groupby('movie_id')['count'].count().sort_values(ascending=False).head(30))
rank_dict_title = {}
for k,v in rank_dict_id.items():
#     print(k,v)
#     print(movie_unique_to_idx[k])
    rank_dict_title['idx:{},'.format(k)+movie_unique_to_idx[k]]=v

rank_dict_title

{'idx:2858,American Beauty (1999)': 3211,
 'idx:260,Star Wars: Episode IV - A New Hope (1977)': 2910,
 'idx:1196,Star Wars: Episode V - The Empire Strikes Back (1980)': 2885,
 'idx:1210,Star Wars: Episode VI - Return of the Jedi (1983)': 2716,
 'idx:2028,Saving Private Ryan (1998)': 2561,
 'idx:589,Terminator 2: Judgment Day (1991)': 2509,
 'idx:593,Silence of the Lambs, The (1991)': 2498,
 'idx:1198,Raiders of the Lost Ark (1981)': 2473,
 'idx:1270,Back to the Future (1985)': 2460,
 'idx:2571,Matrix, The (1999)': 2434,
 'idx:480,Jurassic Park (1993)': 2413,
 'idx:2762,Sixth Sense, The (1999)': 2385,
 'idx:608,Fargo (1996)': 2371,
 'idx:110,Braveheart (1995)': 2314,
 'idx:1580,Men in Black (1997)': 2297,
 "idx:527,Schindler's List (1993)": 2257,
 'idx:1197,Princess Bride, The (1987)': 2252,
 'idx:2396,Shakespeare in Love (1998)': 2213,
 'idx:1617,L.A. Confidential (1997)': 2210,
 'idx:318,Shawshank Redemption, The (1994)': 2194,
 'idx:858,Godfather, The (1972)': 2167,
 'idx:1265,Ground

# 내가 선호하는 영화를 5가지 골라서 rating에 추가해줍시다.

In [10]:
# 사용하는 컬럼만 남겨줍니다.
using_cols = ['user_id', 'movie_id', 'count']
ratings = ratings[using_cols]

In [11]:
# 본인이 좋아하시는 아티스트 데이터로 바꿔서 추가하셔도 됩니다! 단, 이름은 꼭 데이터셋에 있는 것과 동일하게 맞춰주세요. 
my_favorite = ['Godfather, The (1972)' , 'Pulp Fiction (1994)' ,'Schindler\'s List (1993)'
               ,'Sixth Sense, The (1999)' ,'Matrix, The (1999)']

my_favorite_id = [858,296,527,2762,2571]

# ratings['user_id'].nunique()+2 이라는 user_id가 위 영화를 5번씩 봤다고 가정하겠습니다.
my_playlist = pd.DataFrame({'user_id': [ratings['user_id'].nunique()+2]*5,
#                             'movie_title': my_favorite,
                            'movie_id':my_favorite_id,
#                              'timestamp':[0]*5
                            'count':[5]*5
                           })

if not ratings.isin({'user_id':[ratings['user_id'].nunique()+2]})['user_id'].any():  # user_id에 ratings['user_id'].nunique()+2 이라는 데이터가 없다면
    ratings = ratings.append(my_playlist, ignore_index = True)                           # 위에 임의로 만든 my_favorite 데이터를 추가해 줍니다. 

ratings.tail(10)       # 잘 추가되었는지 확인해 봅시다.

Unnamed: 0,user_id,movie_id,count
836473,6040,1090,3
836474,6040,1094,5
836475,6040,562,5
836476,6040,1096,4
836477,6040,1097,4
836478,6041,858,5
836479,6041,296,5
836480,6041,527,5
836481,6041,2762,5
836482,6041,2571,5


# CSR matrix를 직접 만들어봅시다.

In [13]:
num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

print(num_user,num_movie)
csr_data = csr_matrix(
                        (ratings['count'],(ratings['user_id'], ratings['movie_id']))
                        , shape = (num_user, num_movie)
                     )
csr_data

6040 3628


ValueError: row index exceeds matrix dimensions

# als_model = AlternatingLeastSquared 모델을 직접 구성하여 훈련시켜 봅시다.

# 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해보세요.

# 내가 좋아하는 영화와 비슷한 영화를 추천받아봅시다.

# 내가 가장 좋아할만한 영화들을 추천받아 봅시다.
