## 인기제품 추천

In [None]:
import pandas as pd

# Column names passed to read_csv for ./u.user.
use_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

# User information, indexed by user_id.
users = pd.read_csv(
    './u.user',
    sep='|',
    names=use_cols,
    encoding='latin-1',
).set_index('user_id')
users.head()

In [None]:
# Column names passed to read_csv for ./u.item.
item_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Movie information, indexed by movie_id.
movies = pd.read_csv(
    './u.item',
    sep='|',
    names=item_cols,
    encoding='latin-1',
).set_index('movie_id')
movies.head()

In [None]:
# Movie ratings given by users, indexed by user_id.
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

ratings = pd.read_csv(
    './u.data',
    sep='\t',
    names=rating_cols,
    encoding='latin-1',
).set_index('user_id')
ratings.head()

In [None]:
# Best-Seller
def recom_movie1(n_items):
    """Return the titles of the ``n_items`` movies with the highest mean rating.

    Reads the module-level ``movie_mean`` (mean rating keyed by movie_id) and
    ``movies`` (expected to be indexed by movie_id, as done when ``u.item`` is
    loaded in this section).

    Args:
        n_items: Number of top-rated movies to return.

    Returns:
        A Series of movie titles, ordered from highest to lowest mean rating.
    """
    top_rated = movie_mean.sort_values(ascending=False).iloc[:n_items]
    # The index of movie_mean holds movie_id *labels*, so the lookup into
    # ``movies`` must be label-based (.loc).  A positional .iloc lookup treats
    # each id as a row position, which selects a different row whenever ids
    # and positions do not coincide and can run past the end of the frame.
    return movies.loc[top_rated.index, 'title']

def recom_movie2(n_items):
    """Return the titles of the ``n_items`` best-rated movies (compact variant).

    Reads the module-level ``movie_mean`` (mean rating keyed by movie_id) and
    ``movies`` (expected to be indexed by movie_id).

    Args:
        n_items: Number of top-rated movies to return.

    Returns:
        A Series of movie titles, ordered from highest to lowest mean rating.
    """
    # sort_values is a method and must be *called*; the ids it yields are
    # index labels, so they are resolved with .loc rather than positional .iloc.
    top_ids = movie_mean.sort_values(ascending=False).iloc[:n_items].index
    return movies.loc[top_ids, 'title']

# Mean rating of every movie, keyed by movie_id; read by the recommenders above.
movie_mean = ratings.groupby('movie_id')['rating'].mean()
# Show the five best-rated titles.
recom_movie1(5)

In [None]:
import numpy as np

def RMSE(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    return np.sqrt(np.mean(np.square(residuals)))

# Per-user RMSE when every rating is predicted as that movie's mean rating.
rmse = []
for user in set(ratings.index):
    user_rows = ratings.loc[user]  # all rows recorded for this user
    y_true = user_rows['rating']
    y_pred = movie_mean[user_rows['movie_id']]  # mean rating of each rated movie
    rmse.append(RMSE(y_true, y_pred))

# Average of the per-user errors.
print(np.mean(rmse))

## 사용자 집단별 추천
+ users, movies, ratings
+ merged_matrix, rating_matrix

In [None]:
import pandas as pd

# Load the required data.
use_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./u.user', sep='|', names=use_cols, encoding='latin-1')

item_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv('./u.item', sep='|', names=item_cols, encoding='latin-1')

rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('./u.data', sep='\t', names=rating_cols, encoding='latin-1')

In [None]:
# Drop the columns that are not needed and keep only the ones used below.
ratings = ratings.drop(columns='timestamp')
movies = movies.loc[:, ['movie_id', 'title']]

In [None]:
# train/test 분리
from sklearn.model_selection import train_test_split

# Features are the full ratings frame; the stratification target is the user id,
# so the 75/25 split is stratified by user.
x = ratings.copy()
y = ratings['user_id']

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    stratify=y,
)

In [None]:
# 지표 정의
import numpy as np

def RMSE(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    errors = np.array(y_true) - np.array(y_pred)
    mean_squared_error = np.mean(errors ** 2)
    return np.sqrt(mean_squared_error)

def score(model):
    """Return the RMSE of ``model`` over the test split ``x_test``.

    The predictions come from calling ``model(user, movie)`` for every
    (user_id, movie_id) pair in ``x_test``; the true values are the ratings
    the users actually gave those movies in ``x_test``.

    Args:
        model: Callable taking ``(user_id, movie_id)`` and returning a
            predicted rating.

    Returns:
        The root-mean-squared error between the true and predicted ratings.
    """
    predictions = np.array([
        model(user, movie)
        for user, movie in zip(x_test['user_id'], x_test['movie_id'])
    ])
    actual = np.array(x_test['rating'])
    return RMSE(actual, predictions)

# Build the user x movie pivot table of training ratings.
rating_matrix = x_train.pivot(
    index='user_id',
    columns='movie_id',
    values='rating',
)

In [None]:
def best_seller(user_id, movie_id):
    """Predict a rating as the movie's mean rating in the training split.

    Args:
        user_id: Unused; accepted so the function matches the
            ``model(user, movie)`` call made by ``score``.
        movie_id: Id of the movie to predict a rating for.

    Returns:
        The movie's mean rating from ``train_mean``, or 3.0 when the movie
        has no entry there.
    """
    try:
        return train_mean[movie_id]
    except KeyError:
        # Only a missing movie should trigger the default; any other error
        # (e.g. train_mean not yet defined) must surface instead of being
        # silently turned into 3.0.
        return 3.0

# Mean training rating of every movie, keyed by movie_id; read by best_seller.
train_mean = x_train.groupby('movie_id')['rating'].mean()
# RMSE of the best-seller model on the test ratings (x_test).
score(best_seller)

In [None]:
# Attach each user's attributes to their training ratings; with no explicit
# key, pandas joins on the columns the two frames have in common.
merged_matrix = x_train.merge(users)
# Index users by user_id so a user's row can be looked up with .loc.
users = users.set_index('user_id')

# Mean training rating for every (movie_id, sex) pair.
g_mean = merged_matrix.groupby(['movie_id', 'sex'])['rating'].mean()

In [None]:
# Gender-based recommendation
def cf_gender(user_id, movie_id):
    """Predict a rating as the movie's mean training rating for the user's sex.

    Args:
        user_id: Id of the user, looked up in ``users``.
        movie_id: Id of the movie to predict a rating for.

    Returns:
        The mean from ``g_mean`` for (movie_id, user's sex), or 3.0 when the
        movie is not a column of ``rating_matrix`` or has no mean for that sex.
    """
    default_rating = 3.0

    # Movie not present in the training pivot table.
    if movie_id not in rating_matrix:
        return default_rating

    gender = users.loc[user_id]['sex']
    means_by_gender = g_mean[movie_id]
    # The movie has no training ratings from users of this sex.
    if gender not in means_by_gender:
        return default_rating

    return means_by_gender[gender]

# RMSE of the gender-based model on the test ratings (x_test).
score(cf_gender)

## 연습문제 2-1
+ 직업별 영화 추천

In [None]:
# Mean training rating for every (movie_id, occupation) pair.
occ_mean = merged_matrix.groupby(['movie_id', 'occupation'])['rating'].mean()

def cf_occupation(user_id, movie_id):
    """Predict a rating as the movie's mean training rating for the user's occupation.

    Args:
        user_id: Id of the user, looked up in ``users``.
        movie_id: Id of the movie to predict a rating for.

    Returns:
        The mean from ``occ_mean`` for (movie_id, user's occupation), or 3.0
        when the movie is not a column of ``rating_matrix`` or has no mean for
        that occupation.
    """
    default_rating = 3.0

    # Movie not present in the training pivot table.
    if movie_id not in rating_matrix:
        return default_rating

    occupation = users.loc[user_id, :]['occupation']
    means_by_occupation = occ_mean[movie_id]
    # The movie has no training ratings from users with this occupation.
    if occupation not in means_by_occupation:
        return default_rating

    return means_by_occupation[occupation]

# RMSE of the occupation-based model on the test ratings (x_test).
score(cf_occupation)

## 연습문제 2-2
+ 성별과 직업을 동시 고려한 영화 추천

In [None]:
# Mean training rating for every (movie_id, sex, occupation) triple.
scc_mean = merged_matrix.groupby(['movie_id', 'sex', 'occupation'])['rating'].mean()

def cf_scc(user_id, movie_id):
    """Predict a rating from the movie's mean for the user's sex and occupation.

    Args:
        user_id: Id of the user, looked up in ``users``.
        movie_id: Id of the movie to predict a rating for.

    Returns:
        The mean from ``scc_mean`` for (movie_id, sex, occupation), or 3.0
        when the movie is not a column of ``rating_matrix`` or has no mean for
        that sex/occupation combination.
    """
    default_rating = 3.0

    # Movie not present in the training pivot table.
    if movie_id not in rating_matrix:
        return default_rating

    profile = users.loc[user_id, :]
    sex = profile['sex']
    occ = profile['occupation']

    movie_means = scc_mean[movie_id]
    # Short-circuits: the per-sex slice is only consulted when the sex exists.
    if sex not in movie_means or occ not in movie_means[sex]:
        return default_rating

    return movie_means[sex][occ]

# RMSE of the combined sex-and-occupation model on the test ratings (x_test).
score(cf_scc)