## 인기제품 추천

In [None]:
import pandas as pd

# Column names for the user file; passed as `names`, so the file is read without a header row.
use_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

# User information
users = pd.read_csv('./u.user', sep = '|', names = use_cols, encoding = 'latin-1')
# Use user_id as the row index.
users = users.set_index('user_id')
users.head()

In [None]:
# Column names for the movie file; passed as `names`, so the file is read without a header row.
item_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown',
            'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary',
             'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
            'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Movie information
movies = pd.read_csv('./u.item', sep='|', names=item_cols, encoding='latin-1')
# Use movie_id as the row index so movies can be selected by id with .loc.
movies = movies.set_index('movie_id')
movies.head()

In [None]:
# Users' movie ratings
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

# The rating file is tab-separated and is read without a header row.
ratings = pd.read_csv('./u.data', sep='\t', names = rating_cols, encoding = 'latin-1')
# Index by user_id so all ratings of one user can be selected with ratings.loc[user].
ratings = ratings.set_index('user_id')
ratings.head()

In [None]:
# Best-Seller
def recom_movie1(n_items):
    """Return the titles of the ``n_items`` movies with the highest mean rating.

    Reads the module-level ``movie_mean`` (mean rating per movie_id) and
    ``movies`` (indexed by movie_id).

    Args:
        n_items: Number of titles to return.

    Returns:
        The ``title`` column of ``movies`` for the selected movie ids,
        ordered from the highest to the lowest mean rating.
    """
    movie_sort = movie_mean.sort_values(ascending=False)[:n_items]
    # movie_sort.index holds movie_id labels, so the lookup must be
    # label-based (.loc). Positional .iloc treats the ids as row offsets,
    # which selects the wrong rows and can run past the end of the frame.
    recom_movies = movies.loc[movie_sort.index]
    return recom_movies['title']

# def recom_movie2(n_items):
#     return movies.loc[movie_mean.sort_values(ascending=False)[:n_items].index]['title']

movie_mean = ratings.groupby(['movie_id'])['rating'].mean()
recom_movie1(5)

In [None]:
import numpy as np

def RMSE(y_true, y_pred):
    """Return the root-mean-square error between two equal-length sequences."""
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared = np.mean(residuals ** 2)
    return np.sqrt(mean_squared)

# Per-user RMSE of the "mean rating of each movie" predictor, averaged over users.
rmse = []
for user in set(ratings.index):
    # All rating rows of this user (ratings is indexed by user_id).
    user_rows = ratings.loc[user]
    y_true = user_rows['rating']
    # Look up the overall mean rating of every movie this user rated.
    y_pred = movie_mean[user_rows['movie_id']]
    acc = RMSE(y_true, y_pred)
    rmse.append(acc)

print(np.mean(rmse))

## 사용자 집단별 추천
+ users, movies, ratings
+ merged_matrix, rating_matrix

In [None]:
import pandas as pd

# Load the required data (users, movies and ratings) again from the raw files.
# Unlike the first section, user_id / movie_id stay ordinary columns here.
use_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./u.user', sep = '|', names = use_cols, encoding = 'latin-1')
item_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown',
            'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary',
             'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
            'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('./u.item', sep='|', names=item_cols, encoding='latin-1')
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('./u.data', sep='\t', names = rating_cols, encoding = 'latin-1')

In [None]:
# Drop the columns that are not needed and keep only the ones used below.
ratings = ratings.drop('timestamp', axis = 1) 
movies = movies[['movie_id', 'title']]

In [None]:
# train/test 분리
from sklearn.model_selection import train_test_split

# The "features" are the complete rating rows; y is only used for stratification.
x = ratings.copy()
y = ratings['user_id']

# Stratify on user_id so every user's ratings are divided between train and test
# in the same 75/25 proportion.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, stratify = y)

In [None]:
# 지표 정의
import numpy as np

def RMSE(y_true, y_pred):
    """Return the root-mean-square error between actual and predicted values."""
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    squared_errors = (actual - predicted) ** 2
    return np.sqrt(np.mean(squared_errors))

# y_pred is the rating the model derives from the training data, and
# y_true is the rating the user actually gave the movie in the test data.
def score(model):
    """Return the RMSE of ``model`` over every (user, movie) pair in ``x_test``.

    Args:
        model: Callable invoked as ``model(user_id, movie_id)`` that returns
            a predicted rating.
    """
    predicted = [
        model(user, movie)
        for user, movie in zip(x_test['user_id'], x_test['movie_id'])
    ]
    actual = np.array(x_test['rating'])
    return RMSE(actual, np.array(predicted))

# Build the user x movie rating table from the training split; cells for
# (user, movie) pairs without a rating are NaN.
rating_matrix = x_train.pivot(index = 'user_id', columns = 'movie_id', values = 'rating')

In [None]:
def best_seller(user_id, movie_id):
    """Predict a rating as the movie's mean rating in the training split.

    Args:
        user_id: Unused; present so the function matches the
            ``model(user_id, movie_id)`` signature expected by ``score``.
        movie_id: Movie whose mean training rating is looked up in the
            module-level ``train_mean``.

    Returns:
        The mean training rating, or 3.0 when the movie has no training rating.
    """
    try:
        return train_mean[movie_id]
    except KeyError:
        # Only a missing movie is expected here; any other error should
        # surface instead of being silently turned into the default.
        return 3.0

train_mean = x_train.groupby(['movie_id'])['rating'].mean()
score(best_seller)

In [None]:
# Attach each training rating to its user's profile. No key is given, so pandas
# joins on the columns both frames share; this has to run before `users` is
# re-indexed on the next line, while user_id is still an ordinary column.
merged_matrix = pd.merge(x_train, users)
users = users.set_index('user_id')

# Mean training rating for every (movie_id, sex) pair.
g_mean = merged_matrix[['movie_id', 'sex', 'rating']].groupby(['movie_id', 'sex'])['rating'].mean()

In [None]:
# Gender-based recommendation
def cf_gender(user_id, movie_id):
    """Predict a rating as the movie's mean training rating among the user's gender.

    Falls back to 3.0 when the movie is not a column of ``rating_matrix`` or
    when nobody of the user's gender rated it in ``g_mean``.
    """
    if movie_id not in rating_matrix:
        return 3.0

    gender = users.loc[user_id]['sex']
    ratings_by_gender = g_mean[movie_id]
    if gender not in ratings_by_gender:
        return 3.0
    return ratings_by_gender[gender]

score(cf_gender)

## 연습문제 2-1
+ 직업별 영화 추천

In [None]:
occ_mean = merged_matrix[['movie_id', 'occupation', 'rating']].groupby(['movie_id', 'occupation'])['rating'].mean()

def cf_occupation(user_id, movie_id):
    """Predict a rating as the movie's mean training rating within the user's occupation.

    Falls back to 3.0 when the movie is not a column of ``rating_matrix`` or
    when nobody with the user's occupation rated it in ``occ_mean``.
    """
    default_rating = 3.0
    if movie_id not in rating_matrix:
        return default_rating

    occ = users.loc[user_id, :]['occupation']
    ratings_by_occ = occ_mean[movie_id]
    return ratings_by_occ[occ] if occ in ratings_by_occ else default_rating

score(cf_occupation)

## 연습문제 2-2
+ 성별과 직업을 동시 고려한 영화 추천

In [None]:
scc_mean = merged_matrix[['movie_id', 'sex', 'occupation','rating']].groupby(['movie_id', 'sex', 'occupation'])['rating'].mean()

def cf_scc(user_id, movie_id):
    """Predict a rating from users sharing both the user's sex and occupation.

    Uses the module-level ``scc_mean`` (mean rating per movie_id, sex and
    occupation). Falls back to 3.0 when the movie is not a column of
    ``rating_matrix`` or when the (sex, occupation) group has no rating for it.
    """
    if movie_id not in rating_matrix:
        return 3.0

    profile = users.loc[user_id, :]
    sex = profile['sex']
    occ = profile['occupation']

    movie_groups = scc_mean[movie_id]
    if sex not in movie_groups:
        return 3.0
    same_sex = movie_groups[sex]
    if occ not in same_sex:
        return 3.0
    return same_sex[occ]

score(cf_scc)

## 3-1

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Replace missing ratings with 0 before computing similarity.
matrix_dummy = rating_matrix.copy().fillna(0)
# Pairwise cosine similarity between the users' rating rows.
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
# Label rows and columns with user ids so similarities can be looked up by id.
user_similarity = pd.DataFrame(user_similarity, index = rating_matrix.index, columns = rating_matrix.index)

In [None]:
def CF_simple(user_id, movie_id):
    """Predict a rating as the similarity-weighted mean of all other users' ratings.

    Args:
        user_id: User to predict for; must be a column of ``user_similarity``.
        movie_id: Movie to predict.

    Returns:
        The weighted mean rating, or 3.0 when the movie is not a column of
        ``rating_matrix`` or when the similarities of its raters sum to zero.
    """
    if movie_id not in rating_matrix:
        return 3.0

    # Keep only the users who actually rated this movie, and take the
    # similarities of exactly those users so both vectors stay aligned.
    movie_ratings = rating_matrix[movie_id].dropna()
    sim_scores = user_similarity[user_id].loc[movie_ratings.index]

    sim_total = sim_scores.sum()
    if sim_total == 0:
        # Dividing by a zero total would yield NaN/inf, and a single NaN
        # prediction turns the overall RMSE into NaN.
        return 3.0
    return np.dot(sim_scores, movie_ratings) / sim_total

In [None]:
score(CF_simple)

## 3-2

In [None]:
def score(model, neighbor_size = 0):
    """Return the RMSE of ``model`` over every (user, movie) pair in ``x_test``.

    Args:
        model: Callable invoked as ``model(user_id, movie_id, neighbor_size)``
            that returns a predicted rating.
        neighbor_size: Forwarded unchanged to ``model`` as its third argument.
    """
    predicted = []
    for user, movie in zip(x_test['user_id'], x_test['movie_id']):
        predicted.append(model(user, movie, neighbor_size))

    actual = np.array(x_test['rating'])
    return RMSE(actual, np.array(predicted))

In [None]:
def cf_knn(user_id, movie_id, neighbor_size = 0):
    """Predict a rating from the most similar users who rated the movie.

    Args:
        user_id: User to predict for; must be a column of ``user_similarity``.
        movie_id: Movie to predict.
        neighbor_size: Number of most-similar raters to use; 0 uses every
            user who rated the movie.

    Returns:
        The similarity-weighted mean rating. Falls back to 3.0 when the movie
        is not a column of ``rating_matrix``, when ``neighbor_size`` is
        non-zero and at most one user rated the movie, or when the selected
        similarities sum to zero.
    """
    if movie_id not in rating_matrix:
        return 3.0

    # Ratings of this movie by the users who rated it, and this user's
    # similarity to exactly those users (same order in both).
    movie_ratings = rating_matrix[movie_id].dropna()
    sim_scores = user_similarity[user_id].loc[movie_ratings.index]

    if neighbor_size != 0:
        if len(sim_scores) <= 1:
            return 3.0
        k = min(neighbor_size, len(sim_scores))
        sim_values = np.array(sim_scores)
        rating_values = np.array(movie_ratings)
        # argsort is ascending, so the last k positions are the k most similar users.
        nearest = np.argsort(sim_values)[-k:]
        sim_scores = sim_values[nearest]
        movie_ratings = rating_values[nearest]

    sim_total = sim_scores.sum()
    if sim_total == 0:
        # Dividing by a zero total would yield NaN/inf and poison the RMSE.
        return 3.0
    return np.dot(sim_scores, movie_ratings) / sim_total

score(cf_knn, neighbor_size=30)

In [None]:
rating_matrix = ratings.pivot_table(values = 'rating', index = 'user_id', columns = 'movie_id')

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index = rating_matrix.index, columns= rating_matrix.index)

In [None]:
def recom_movie(user_id, n_items, neighbor_size = 30):
    """Recommend the ``n_items`` unseen movies with the highest predicted rating.

    Args:
        user_id: User to recommend for; must be a row of ``rating_matrix``.
        n_items: Number of titles to return.
        neighbor_size: Forwarded to ``cf_knn`` for every unseen movie.

    Returns:
        A Series of movie titles indexed by movie id, best prediction first.
    """
    user_movie = rating_matrix.loc[user_id].copy()
    for movie in rating_matrix:
        if pd.notnull(user_movie.loc[movie]):
            # Already rated: score 0 so it sorts below every predicted rating.
            user_movie.loc[movie] = 0
        else:
            user_movie.loc[movie] = cf_knn(user_id, movie, neighbor_size)
    movie_sort = user_movie.sort_values(ascending = False)[:n_items]

    # movie_sort.index holds movie_id values, so titles must be looked up by
    # movie_id. When `movies` still carries movie_id as an ordinary column its
    # row labels are positions, and a plain movies.loc[...] would return the
    # wrong rows; index by movie_id first in that case.
    if 'movie_id' in movies.columns:
        titles = movies.set_index('movie_id')['title']
    else:
        titles = movies['title']
    return titles.loc[movie_sort.index]

recom_movie(user_id = 2, n_items = 5, neighbor_size=30)

In [None]:
# Build the user x movie matrix from the TRAINING split only. score() evaluates
# on x_test, so building it from the full `ratings` frame would let the test
# ratings leak into the model being evaluated.
rating_matrix = x_train.pivot_table(values = 'rating', index = 'user_id', columns = 'movie_id')
from sklearn.metrics.pairwise import cosine_similarity
# Replace missing ratings with 0 before computing similarity.
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index = rating_matrix.index, columns= rating_matrix.index)

# Compare the test RMSE for several neighbourhood sizes.
for neighbor_size in [10, 20, 30, 40, 50, 60]:
    print(score(cf_knn, neighbor_size))

## 3-3

In [None]:
# Mean rating of each user (one value per row of rating_matrix).
rating_mean = rating_matrix.mean(axis = 1)
# Each rating expressed as a deviation from its user's mean. The matrix is
# transposed so that users become the columns the Series is aligned against,
# then transposed back.
rating_bias = (rating_matrix.T - rating_mean).T

In [None]:
def cf_knn(user_id, movie_id, neighbor_size = 0):
    """Predict a rating with KNN on mean-centred ratings.

    The neighbours' deviations from their own mean rating (``rating_bias``)
    are averaged with similarity weights and added to this user's mean.

    Args:
        user_id: User to predict for; must be present in ``rating_mean`` and
            ``user_similarity``.
        movie_id: Movie to predict.
        neighbor_size: Number of most-similar raters to use; 0 uses every
            user who rated the movie.

    Returns:
        The predicted rating. Falls back to the user's own mean rating when
        the movie is not a column of ``rating_bias``, when ``neighbor_size``
        is non-zero and at most one user rated the movie, or when the
        selected similarities sum to zero.
    """
    user_mean = rating_mean[user_id]
    if movie_id not in rating_bias:
        return user_mean

    # Deviations for this movie from the users who rated it, and this user's
    # similarity to exactly those users (same order in both).
    movie_ratings = rating_bias[movie_id].dropna()
    sim_scores = user_similarity[user_id].loc[movie_ratings.index]

    if neighbor_size != 0:
        if len(sim_scores) <= 1:
            return user_mean
        k = min(neighbor_size, len(sim_scores))
        sim_values = np.array(sim_scores)
        rating_values = np.array(movie_ratings)
        # argsort is ascending, so the last k positions are the k most similar users.
        nearest = np.argsort(sim_values)[-k:]
        sim_scores = sim_values[nearest]
        movie_ratings = rating_values[nearest]

    sim_total = sim_scores.sum()
    if sim_total == 0:
        # Dividing by a zero total would yield NaN/inf and poison the RMSE.
        return user_mean
    return np.dot(sim_scores, movie_ratings) / sim_total + user_mean

## 3-4

In [None]:
# 1.0 where a user rated a movie and 0.0 otherwise (a comparison with NaN is False).
rating_binary1 = np.array((rating_matrix > 0).astype(float))
rating_binary2 = rating_binary1.T
# Entry (i, j) is the number of movies rated by both user i and user j.
counts = np.dot(rating_binary1, rating_binary2)
# Label rows and columns with user ids so counts[user_id] can be looked up by id.
counts = pd.DataFrame(counts, index = rating_matrix.index, columns = rating_matrix.index).fillna(0)

In [None]:
def CF_knn_bias_sig(user_id, movie_id, neighbor_size = 0):
    """Predict a rating with mean-centred KNN restricted to significant neighbours.

    A neighbour is only used when they rated the movie and share at least
    ``SIG_LEVEL`` co-rated movies with the user (per ``counts``).

    Args:
        user_id: User to predict for; must be present in ``rating_mean``,
            ``user_similarity`` and ``counts``.
        movie_id: Movie to predict.
        neighbor_size: Number of most-similar eligible neighbours to use;
            0 uses every eligible neighbour.

    Returns:
        The predicted rating limited to the range 1..5. Falls back to the
        user's own mean rating when the movie is not a column of
        ``rating_bias``, when ``neighbor_size`` is non-zero and no more than
        ``MIN_RATINGS`` eligible neighbours remain, or when the selected
        similarities sum to zero.
    """
    user_mean = rating_mean[user_id]
    prediction = user_mean

    if movie_id in rating_bias:
        # Similarity between this user and every other user.
        sim_scores = user_similarity[user_id]
        # Mean-centred ratings of this movie.
        movie_ratings = rating_bias[movie_id]
        # True for users who did not rate this movie.
        no_rating = movie_ratings.isnull()
        # True for users sharing fewer than SIG_LEVEL co-rated movies with this user.
        low_significant = counts[user_id] < SIG_LEVEL
        # A neighbour is excluded if either condition holds. The same users
        # are removed from BOTH series; dropping only the NaN ratings would
        # leave the two vectors with different lengths for the dot product.
        excluded = movie_ratings[no_rating | low_significant].index
        movie_ratings = movie_ratings.drop(excluded)
        sim_scores = sim_scores.drop(excluded)

        enough_neighbors = True
        if neighbor_size != 0:
            enough_neighbors = len(sim_scores) > MIN_RATINGS
            if enough_neighbors:
                k = min(neighbor_size, len(sim_scores))
                sim_values = np.array(sim_scores)
                rating_values = np.array(movie_ratings)
                # argsort is ascending: the last k positions are the most similar users.
                nearest = np.argsort(sim_values)[-k:]
                sim_scores = sim_values[nearest]
                movie_ratings = rating_values[nearest]

        if enough_neighbors:
            sim_total = sim_scores.sum()
            # A zero total (no eligible neighbour, or all similarities 0)
            # would divide by zero; keep the user's mean in that case.
            if sim_total != 0:
                prediction = np.dot(sim_scores, movie_ratings) / sim_total + user_mean

    # Keep the prediction inside the 1..5 range.
    if prediction < 1:
        prediction = 1
    elif prediction > 5:
        prediction = 5

    return prediction

In [None]:
SIG_LEVEL=3
MIN_RATINGS = 2
score(CF_knn_bias_sig, 30)

## 3-5

In [None]:
# User x movie table built from the `ratings` frame.
rating_matrix = ratings.pivot_table(values = 'rating', index = 'user_id', columns = 'movie_id')
from sklearn.metrics.pairwise import cosine_similarity

# Transpose so that each row is a movie, then compute movie-to-movie cosine
# similarity with missing ratings replaced by 0.
rating_matrix_t = np.transpose(rating_matrix)
matrix_dummy = rating_matrix_t.copy().fillna(0)
item_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
# Label rows and columns with movie ids so similarities can be looked up by id.
item_similarity = pd.DataFrame(item_similarity, index = rating_matrix_t.index, columns= rating_matrix_t.index)

In [None]:
def CF_IBCF(user_id, movie_id):
    """Predict a rating with item-based collaborative filtering.

    The user's own ratings of other movies are averaged, weighted by each
    movie's similarity to ``movie_id``.

    Args:
        user_id: User to predict for; must be a column of ``rating_matrix_t``.
        movie_id: Movie to predict.

    Returns:
        The weighted mean rating, or 3.0 when the movie is not in
        ``item_similarity`` or when the similarities of the movies the user
        rated sum to zero.
    """
    if movie_id not in item_similarity:
        return 3.0

    # Keep only the movies this user actually rated, and take the
    # similarities of exactly those movies. Without dropping the NaN entries
    # the two vectors have different lengths and the dot product fails.
    user_rating = rating_matrix_t[user_id].dropna()
    sim_scores = item_similarity[movie_id].loc[user_rating.index]

    sim_total = sim_scores.sum()
    if sim_total == 0:
        # Dividing by a zero total would yield NaN/inf.
        return 3.0
    return np.dot(sim_scores, user_rating) / sim_total

## 4-1

In [None]:
class MF():
    """Biased matrix factorisation trained with stochastic gradient descent.

    Zero entries of the rating matrix are treated as "no rating": only the
    non-zero cells are used for training and for the RMSE.
    """

    def __init__(self, ratings, K, alpha, beta, iterations, verbose = True):
        """Store the rating matrix and the hyper-parameters.

        Args:
            ratings: 2-D array-like of ratings; converted with ``np.array``.
            K: Number of latent factors.
            alpha: Step size of each SGD update.
            beta: Regularisation weight applied to biases and factors.
            iterations: Number of passes over the training samples.
            verbose: When true, ``train`` prints the RMSE every 10th iteration.
        """
        self.R = np.array(ratings)
        self.num_users, self.num_items = np.shape(self.R)
        self.K, self.alpha, self.beta = K, alpha, beta
        self.iterations = iterations
        self.verbose = verbose

    def rmse(self):
        """Return the RMSE over all non-zero cells of ``R``.

        Also stores the per-cell predictions and errors on the instance.
        """
        rows, cols = self.R.nonzero()
        self.predictions = np.array(
            [self.get_prediction(row, col) for row, col in zip(rows, cols)]
        )
        self.errors = self.R[rows, cols] - self.predictions
        return np.sqrt(np.mean(self.errors ** 2))

    def train(self):
        """Initialise the parameters, run SGD and return ``[(iteration, rmse), ...]``."""
        factor_scale = 1./self.K
        self.P = np.random.normal(scale = factor_scale, size = (self.num_users, self.K))
        self.Q = np.random.normal(scale = factor_scale, size = (self.num_items, self.K))

        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)

        # Global bias is the mean of the observed (non-zero) ratings.
        rows, columns = self.R.nonzero()
        self.b = np.mean(self.R[rows, columns])
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]

        history = []
        for epoch in range(1, self.iterations + 1):
            np.random.shuffle(self.samples)
            self.sgd()
            current = self.rmse()
            history.append((epoch, current))
            if self.verbose and epoch % 10 == 0:
                print("Iteration: %d ; Train RMSE = %.4f" % (epoch, current))
        return history

    def get_prediction(self, i, j):
        """Return the predicted rating for row ``i`` and column ``j`` of ``R``."""
        bias_part = self.b + self.b_u[i] + self.b_d[j]
        return bias_part + self.P[i, :].dot(self.Q[j, :].T)

    def sgd(self):
        """Run one SGD pass over ``self.samples``, updating biases and factors in place."""
        lr, reg = self.alpha, self.beta
        for row, col, rating in self.samples:
            err = rating - self.get_prediction(row, col)

            self.b_u[row] += lr * (err - reg * self.b_u[row])
            self.b_d[col] += lr * (err - reg * self.b_d[col])

            # Q is updated with the already-updated P row, as in the original order.
            self.P[row, :] += lr * (err * self.Q[col, :] - reg * self.P[row, :])
            self.Q[col, :] += lr * (err * self.P[row, :] - reg * self.Q[col, :])

R_temp = ratings.pivot(index= 'user_id', columns = 'movie_id', values = 'rating').fillna(0)
mf = MF(R_temp, K = 30, alpha = 0.001, beta = 0.02, iterations =100, verbose = True)
train_process = mf.train()

## (ratings_train 여기서) 4-2

In [None]:
from sklearn.utils import shuffle
TRAIN_SIZE = 0.75  # fraction of the ratings used for training
# Shuffle with a fixed seed, then take the first 75% of rows as the training
# set and the remaining rows as the test set.
ratings = shuffle(ratings, random_state = 1)
cutoff = int(TRAIN_SIZE * len(ratings))
ratings_train = ratings.iloc[:cutoff]
ratings_test = ratings.iloc[cutoff:]

In [None]:
class NEW_MF():
    """Biased matrix factorisation (SGD) with id mapping and a held-out test set.

    Zero entries of the rating matrix are treated as "no rating". Row and
    column labels of the input frame are mapped to matrix positions so that
    predictions can be requested by the original user/item ids.
    """

    def __init__(self, ratings, K, alpha, beta, iterations, verbose = True):
        """Store the rating matrix, the id mappings and the hyper-parameters.

        Args:
            ratings: DataFrame whose row labels are user ids and whose column
                labels are item ids.
            K: Number of latent factors.
            alpha: Step size of each SGD update.
            beta: Regularisation weight applied to biases and factors.
            iterations: Number of passes over the training samples.
            verbose: When true, progress is printed every 10th iteration.
        """
        self.R = np.array(ratings)

        # Column labels are item ids: map id -> column position and back.
        self.item_id_index = {item_id: pos for pos, item_id in enumerate(ratings.columns)}
        self.index_item_id = dict(enumerate(ratings.columns))

        # Row labels are user ids: map id -> row position and back.
        self.user_id_index = {user_id: pos for pos, user_id in enumerate(ratings.index)}
        self.index_user_id = dict(enumerate(ratings.index))

        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.verbose = verbose

    def _init_parameters(self):
        """(Re)initialise factors, biases and the list of training samples from ``R``."""
        self.P = np.random.normal(scale = 1./self.K, size = (self.num_users, self.K))
        self.Q = np.random.normal(scale = 1./self.K, size = (self.num_items, self.K))

        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)

        # Global bias is the mean of the observed (non-zero) ratings.
        rows, columns = self.R.nonzero()
        self.b = np.mean(self.R[rows, columns])
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]

    def rmse(self):
        """Return the RMSE over all non-zero cells of ``R``.

        Also stores the per-cell predictions and errors on the instance.
        """
        xs, ys = self.R.nonzero()
        self.predictions = []
        self.errors = []
        for x, y in zip(xs, ys):
            prediction = self.get_prediction(x, y)
            self.predictions.append(prediction)
            self.errors.append(self.R[x, y] - prediction)
        self.predictions = np.array(self.predictions)
        self.errors = np.array(self.errors)

        return np.sqrt(np.mean(self.errors ** 2))

    def train(self):
        """Train on every non-zero cell and return ``[(iteration, train_rmse), ...]``."""
        self._init_parameters()

        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse = self.rmse()
            training_process.append((i + 1, rmse))
            if self.verbose:
                if (i + 1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.4f" % (i + 1, rmse))
        return training_process

    def get_prediction(self, i, j):
        """Return the predicted rating for row position ``i`` and column position ``j``."""
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    def set_test(self, ratings_test):
        """Hold out the given ratings as the test set.

        Each held-out cell is set to 0 in ``R`` so it is excluded from training.

        Args:
            ratings_test: DataFrame whose first three columns are user id,
                item id and rating, in that order.

        Returns:
            The test set as a list of ``[row_position, column_position, rating]``.
        """
        test_set = []
        held_out = zip(ratings_test.iloc[:, 0], ratings_test.iloc[:, 1], ratings_test.iloc[:, 2])
        for user_id, item_id, rating in held_out:
            x = self.user_id_index[user_id]
            y = self.item_id_index[item_id]
            test_set.append([x, y, rating])
            self.R[x, y] = 0
        self.test_set = test_set

        return test_set

    def test_rmse(self):
        """Return the RMSE over the test set stored by ``set_test``."""
        error = 0
        for x, y, rating in self.test_set:
            predicted = self.get_prediction(x, y)
            error += pow(rating - predicted, 2)
        return np.sqrt(error/len(self.test_set))

    def sgd(self):
        """Run one SGD pass over ``self.samples``, updating biases and factors in place."""
        for i, j, r in self.samples:
            prediction = self.get_prediction(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])

            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j, :])

    def test(self):
        """Train while tracking both errors.

        ``set_test`` must have been called first.

        Returns:
            ``[(iteration, train_rmse, test_rmse), ...]``.
        """
        self._init_parameters()

        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse1 = self.rmse()
            rmse2 = self.test_rmse()
            training_process.append((i + 1, rmse1, rmse2))
            if self.verbose:
                if (i + 1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.4f; Test RMSE = %.4f" % (i + 1, rmse1, rmse2))
        return training_process

    def get_one_prediction(self, user_id, item_id):
        """Return the predicted rating for the given original user id and item id."""
        return self.get_prediction(self.user_id_index[user_id], self.item_id_index[item_id])

    def full_prediction(self):
        """Return the full matrix of predicted ratings (users x items)."""
        return self.b + self.b_u[:, np.newaxis] + self.b_d[np.newaxis, :] + self.P.dot(self.Q.T)
    
R_temp = ratings.pivot(index= 'user_id', columns = 'movie_id', values = 'rating').fillna(0)
mf = NEW_MF(R_temp, K = 30, alpha = 0.001, beta = 0.02, iterations =100, verbose = True)
test_set = mf.set_test(ratings_test)
result = mf.test()

## 4-3

In [None]:
# One training history per candidate K, and the matching K values.
results = []
index = []
# Search for the best number of latent factors K.
for K in range(50, 261, 10):
    print('K = ', K)
    R_temp = ratings.pivot(index= 'user_id', columns = 'movie_id', values = 'rating').fillna(0)
    mf = NEW_MF(R_temp, K = K, alpha = 0.001, beta = 0.02, iterations =100, verbose = True)
    test_set = mf.set_test(ratings_test)
    # History of (iteration, train RMSE, test RMSE) for this K.
    result = mf.test()
    index.append(K)
    results.append(result)

In [None]:
# For every K, record [K, best iteration (1-based), lowest test RMSE].
# Local names deliberately avoid `RMSE` and `min`: rebinding them here would
# replace the RMSE function and shadow the builtin for the rest of the session.
summary = []
for k_value, history in zip(index, results):
    test_rmses = [step[2] for step in history]
    # argmin returns the first occurrence of the minimum.
    best_iter = int(np.argmin(test_rmses))
    summary.append([k_value, best_iter + 1, test_rmses[best_iter]])

In [None]:
# 그래프 그리기
import matplotlib.pyplot as plt
plt.plot(index, [x[2] for x in summary])
plt.ylim(0.89, 0.94)
plt.xlabel('K')
plt.ylabel('RMSE')
plt.show()

## 5-1

In [None]:
import numpy as np

from surprise import BaselineOnly
from surprise import KNNWithMeans
from surprise import SVD
from surprise import SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import cross_validate, train_test_split

data = Dataset.load_builtin('ml-100k')

trainset, testset = train_test_split(data, test_size = 0.25)

In [None]:
algo = KNNWithMeans()
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

In [None]:
# Compare several Surprise algorithms, each with default settings, on the same split.
# The class names must match the imports exactly (BaselineOnly, KNNWithMeans).
algorithms = [BaselineOnly, KNNWithMeans, SVD, SVDpp]
names = []
results = []
for option in algorithms:
    algo = option()
    names.append(option.__name__)
    algo.fit(trainset)
    predictions = algo.test(testset)
    results.append(accuracy.rmse(predictions))

names = np.array(names); results = np.array(results)

import matplotlib.pyplot as plt
# Plot the algorithms ordered from the lowest to the highest RMSE.
index = np.argsort(results)
plt.ylim(0.8, 1)
plt.plot(names[index], results[index])
results[index]

## 5-3

In [None]:
# User-based KNN with the pearson_baseline similarity and 30 neighbours.
sim_options = {'name': 'pearson_baseline', 'user_based':True}
algo = KNNWithMeans(k = 30, sim_options = sim_options)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

## 5-4

In [None]:
# RMSE of user-based KNNWithMeans for several neighbourhood sizes.
result = []
for neighbor_size in (10, 20, 30, 40, 50, 60):
    algo = KNNWithMeans(k=neighbor_size, sim_options = {'name': 'pearson_baseline', 'user_based':True})
    algo.fit(trainset)
    predictions = algo.test(testset)
    result.append([neighbor_size, accuracy.rmse(predictions)])
result

## 5-5

In [None]:
from surprise.model_selection import GridSearchCV
# Grid over neighbourhood size, similarity measure and user-/item-based mode.
# 'user_based' must list the two booleans; the single string 'True, False'
# would be one (truthy) candidate, so the item-based mode would never be tried.
param_grid = {'k':[5, 10, 15, 25],
             'sim_options':{'name':['pearson_baseline', 'cosine'],
                           'user_based':[True, False]}}
gs = GridSearchCV(KNNWithMeans, param_grid, measures = ['rmse'], cv= 4)
gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

In [None]:
# n_epochs, lr_all and reg_all are hyper-parameters of SVD, so the grid has to
# be searched with SVD rather than with the KNN class.
param_grid = {'n_epochs':[70, 80, 90],
             'lr_all':[0.005, 0.006, 0.007],
             'reg_all':[0.05, 0.07, 0.1]}
gs = GridSearchCV(SVD, param_grid, measures = ['rmse'], cv= 4)
gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

## 5-6

In [None]:
from surprise.dataset import Reader

rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('./u.data', sep='\t', names = rating_cols, encoding = 'latin-1')
reader = Reader(rating_scale =(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)

## 6-1

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Add, Flatten
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adamax

In [None]:
K = 200  # size of each latent-factor embedding
# Mean of the training ratings.
mu = ratings_train.rating.mean()
# Embedding table sizes: largest id + 1, so the raw ids can be used directly
# as embedding indices.
M = ratings.user_id.max() + 1
N = ratings.movie_id.max() + 1

def RMSE(y_true, y_pred):
    """Root-mean-square error written with TensorFlow ops; used below as loss and metric."""
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

# Inputs: one user id and one movie id per sample.
user = Input(shape = (1, ))
item = Input(shape = (1, ))
# Latent factors and scalar bias terms, each with l2 regularisation.
P_embedding = Embedding(M, K, embeddings_regularizer = l2())(user)
Q_embedding = Embedding(N, K, embeddings_regularizer = l2())(item)
user_bias = Embedding(M, 1, embeddings_regularizer = l2())(user) 
item_bias = Embedding(N, 1, embeddings_regularizer = l2())(item) 

# Prediction = dot product of the user and item factors plus both bias terms.
R = layers.dot([P_embedding, Q_embedding], axes = 2, name = 'dot_layer')
R = layers.add([R, user_bias, item_bias])
R = Flatten()(R)

model = Model(inputs = [user, item], outputs = R)
model.compile(loss = RMSE, optimizer = SGD(), metrics = [RMSE])

model.summary()

In [None]:
result = model.fit(x = [ratings_train.user_id.values, ratings_train.movie_id.values],
                  y = ratings_train.rating.values - mu,
                  epochs = 60,
                  batch_size = 256,
                  validation_data = ([ratings_test.user_id.values, ratings_test.movie_id.values],
                                    ratings_test.rating.values - mu))

In [None]:
import matplotlib.pyplot as plt
plt.plot(result.history['RMSE'], label = 'Train RMSE')
plt.plot(result.history['val_RMSE'], label = 'Test RMSE')
plt.legend()
plt.show()

In [None]:
user_ids = ratings_test.user_id.values[0:6]
movie_ids = ratings_test.movie_id.values[0:6]
predictions = model.predict([user_ids, movie_ids]) + mu
print('Actuals: \n', ratings_test[0:6])
print()
print('Predictions: \n', predictions)

In [None]:
def RMSE2(y_true, y_pred):
    """Return the NumPy root-mean-square error between actual and predicted values."""
    deltas = np.array(y_true) - np.array(y_pred)
    return np.sqrt(np.mean(deltas ** 2))

user_ids = ratings_test.user_id.values
movie_ids = ratings_test.movie_id.values
y_pred = model.predict([user_ids, movie_ids]) + mu
y_pred = np.ravel(y_pred, order = 'C')
y_true = np.array(ratings_test.rating)

RMSE2(y_true, y_pred)

## 6-2

In [None]:
user = Input(shape = (1, ))
item = Input(shape = (1, ))
P_embedding = Embedding(M, K, embeddings_regularizer = l2())(user)
Q_embedding = Embedding(N, K, embeddings_regularizer = l2())(item)
user_bias = Embedding(M, 1, embeddings_regularizer = l2())(user) 
item_bias = Embedding(N, 1, embeddings_regularizer = l2())(item) 

from tensorflow.keras.layers import Dense, Concatenate, Activation
P_embedding = Flatten()(P_embedding)
Q_embedding = Flatten()(Q_embedding)
user_bias = Flatten()(user_bias)
item_bias = Flatten()(item_bias)
R = Concatenate()([P_embedding, Q_embedding, user_bias, item_bias])

In [None]:
R = Dense(2048)(R)
R = Activation('linear')(R)
R = Dense(256)(R)
R = Activation('linear')(R)
R = Dense(1)(R)

model = Model(inputs = [user, item], outputs = R)
model.compile(loss = RMSE, optimizer = SGD(), metrics = [RMSE])

model.summary()

In [None]:
result = model.fit(x = [ratings_train.user_id.values, ratings_train.movie_id.values],
                  y = ratings_train.rating.values - mu,
                  epochs = 60,
                  batch_size = 256,
                  validation_data = ([ratings_test.user_id.values, ratings_test.movie_id.values],
                                    ratings_test.rating.values - mu))

## 6-3

In [None]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./u.user', sep = '|', names = u_cols, encoding = 'latin-1')
users = users[['user_id', 'occupation']]

# Maps each occupation label to the integer code assigned on first sight.
occupation = {}
def convert_occ(x):
    """Return the integer code of label ``x``, assigning the next unused code to a new label."""
    # len(occupation) is evaluated before any insertion, so a new label
    # receives the current number of known labels as its code.
    return occupation.setdefault(x, len(occupation))
    
users['occupation'] = users['occupation'].apply(convert_occ)

In [None]:
L = len(occupation)  # number of distinct occupation codes
# These Series are passed to the model alongside ratings_train / ratings_test,
# so row i must describe the same rating in both. how='left' keeps the rows in
# the order of the (shuffled) rating frame instead of relying on the row order
# of an inner join.
# NOTE(review): assumes user_id is unique in `users` and that every rated user
# appears there; otherwise rows would be duplicated or occupation would be NaN.
train_occ = pd.merge(ratings_train, users, on = 'user_id', how = 'left')['occupation']
test_occ = pd.merge(ratings_test, users, on = 'user_id', how = 'left')['occupation']

In [None]:
K = 200
mu = ratings_train.rating.mean()
M = ratings.user_id.max() + 1
N = ratings.movie_id.max() + 1

def RMSE(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

In [None]:
user = Input(shape = (1, ))
item = Input(shape = (1, ))
P_embedding = Embedding(M, K, embeddings_regularizer = l2())(user)
Q_embedding = Embedding(N, K, embeddings_regularizer = l2())(item)
user_bias = Embedding(M, 1, embeddings_regularizer = l2())(user) 
item_bias = Embedding(N, 1, embeddings_regularizer = l2())(item) 

from tensorflow.keras.layers import Dense, Concatenate, Activation
P_embedding = Flatten()(P_embedding)
Q_embedding = Flatten()(Q_embedding)
user_bias = Flatten()(user_bias)
item_bias = Flatten()(item_bias)
R = Concatenate()([P_embedding, Q_embedding, user_bias, item_bias])

occ = Input(shape = (1, ))
occ_embedding = Embedding(L, 3, embeddings_regularizer = l2())(occ)
occ_layer = Flatten()(occ_embedding)
R = Concatenate()([P_embedding, Q_embedding, user_bias, item_bias, occ_layer])

In [None]:
R = Dense(2048)(R)
R = Activation('linear')(R)
R = Dense(256)(R)
R = Activation('linear')(R)
R = Dense(1)(R)

model = Model(inputs = [user, item, occ], outputs = R)
model.compile(loss = RMSE, optimizer = SGD(), metrics = [RMSE])

model.summary()

In [None]:
result = model.fit(x = [ratings_train.user_id.values, ratings_train.movie_id.values, train_occ.values],
                  y = ratings_train.rating.values - mu,
                  epochs = 60,
                  batch_size = 256,
                  validation_data = ([ratings_test.user_id.values, ratings_test.movie_id.values, test_occ.values],
                                    ratings_test.rating.values - mu))

## 7-1

In [None]:
def recommender0(recomm_list):
    """Dummy recommender: return one random rating between 1 and 5 per entry.

    Args:
        recomm_list: Iterable with one element per requested prediction; the
            elements themselves are not inspected.

    Returns:
        A NumPy array with one random rating per element of ``recomm_list``.
    """
    # Imported locally because no cell of the notebook imports `random`.
    import random

    return np.array([random.random() * 4 + 1 for _ in recomm_list])

def recommender1(recomm_list):
    """Second dummy recommender: return one random rating between 1 and 5 per entry.

    Args:
        recomm_list: Iterable with one element per requested prediction; the
            elements themselves are not inspected.

    Returns:
        A NumPy array with one random rating per element of ``recomm_list``.
    """
    # Imported locally because no cell of the notebook imports `random`.
    import random

    return np.array([random.random() * 4 + 1 for _ in recomm_list])

weight = [0.8, 0.2]
recomm_list = np.array(ratings_test)
predictions0 = recommender0(recomm_list)
predictions1 = recommender1(recomm_list)
predictions = predictions0 * weight[0] + predictions1 * weight[1]
RMSE2(recomm_list[:, 2], predictions)

## 7-2

In [None]:
def recommender0(recomm_list, mf):
    """Return the predictions of ``mf`` for every (user, movie) pair in ``recomm_list``.

    Args:
        recomm_list: Iterable of (user_id, movie_id) pairs.
        mf: Model exposing ``get_one_prediction(user_id, movie_id)``.
    """
    predicted = []
    for user, movie in recomm_list:
        predicted.append(mf.get_one_prediction(user, movie))
    return np.array(predicted)

def recommender1(recomm_list, meighbor_size = 0):
    """Return bias-corrected KNN predictions for every (user, movie) pair.

    Args:
        recomm_list: Iterable of (user_id, movie_id) pairs.
        meighbor_size: Neighbourhood size forwarded to the KNN predictor
            (0 uses every available neighbour). The misspelled parameter name
            is kept so existing keyword callers keep working.

    Returns:
        A NumPy array with one prediction per pair.
    """
    # Forward the argument itself: the original body read an unrelated global
    # named `neighbor_size`, so the value passed by the caller was ignored.
    # The bias-corrected KNN predictor is defined in this file as `cf_knn`;
    # no `CF_knn_bias` is defined here.
    return np.array([cf_knn(user, movie, meighbor_size) for (user, movie) in recomm_list])

recomm_list = np.array(ratings_test.iloc[:, [0, 1]])
predictions0 = recommender0(recomm_list, mf)
RMSE2(ratings_test.iloc[:, 2], predictions0)

predictions1 = recommender1(recomm_list, 37)
RMSE2(ratings_test.iloc[:, 2], predictions1)

weight = [0.8, 0.2]
predictions = predictions0 * weight[0] + predictions1 * weight[1]
RMSE2(ratings_test.iloc[:, 2], predictions)

## 8-1

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# Three toy ratings, one per (user_id, movie_id) pair.
ratings = pd.DataFrame({
    'user_id': [1, 2, 4],
    'movie_id': [2, 3, 7],
    'rating': [4, 3, 1],
})

# Users become rows and movies become columns; pairs without a rating are filled with 0.
rating_matrix = ratings.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix = rating_matrix.fillna(0)

# Dense NumPy copy of the pivoted table.
full_matrix1 = np.array(rating_matrix)
print(full_matrix1)

In [None]:
# Build a CSR sparse matrix from (value, (row, column)) triplets:
# user_id is used directly as the row index and movie_id as the column index.
data = np.array(ratings['rating'])
row_indices = np.array(ratings['user_id'])
col_indices = np.array(ratings['movie_id'])
# No shape is given, so the matrix dimensions are inferred from the index arrays.
rating_matrix =csr_matrix((data, (row_indices, col_indices)))
print(rating_matrix)

In [None]:
# Expand the sparse matrix into a dense NumPy array.
full_matrix2 = rating_matrix.toarray()
print(full_matrix2)

In [None]:
# Operations applied to the sparse matrix: scalar multiplication, transpose,
# and the product of the matrix with its own transpose.
print(rating_matrix * 2)
print(rating_matrix.T)
print(rating_matrix.dot(rating_matrix.T))

## 8-3

In [None]:
import pandas as pd

# Load the required data.
# User table: '|'-separated file read with explicit column names.
use_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./u.user', sep = '|', names = use_cols, encoding = 'latin-1')
# Movie table: the columns listed after 'IMDB URL' are named after genres.
item_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown',
            'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary',
             'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
            'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('./u.item', sep='|', names=item_cols, encoding='latin-1')
# Rating table: tab-separated (user_id, movie_id, rating, timestamp) rows.
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('./u.data', sep='\t', names = rating_cols, encoding = 'latin-1')

In [None]:
from scipy.sparse import csr_matrix
# Convert the ratings DataFrame into a CSR sparse matrix with user_id as the
# row index and movie_id as the column index.
data = np.array(ratings['rating'])
row_indices = np.array(ratings['user_id'])
col_indices = np.array(ratings['movie_id'])
# NOTE: this rebinds `ratings` from the DataFrame to the sparse matrix.
# No shape is given, so the dimensions are inferred from the index arrays; if the ids
# start at 1, row 0 and column 0 stay empty — TODO confirm against the data.
ratings = csr_matrix((data, (row_indices, col_indices)), dtype = int)

In [None]:
class NEW_MF():
    """Biased matrix factorization trained with stochastic gradient descent.

    The rating of user ``i`` for item ``j`` is predicted as
    ``b + b_u[i] + b_d[j] + P[i] . Q[j]``. Zero entries of the rating matrix
    are treated as "not rated" and are skipped during training and scoring.

    Args:
        ratings: 2-D user x item rating matrix (SciPy sparse matrix or NumPy
            array). It is stored by reference and modified in place by
            ``set_test``.
        K: Number of latent factors.
        alpha: SGD learning rate.
        beta: L2 regularization strength.
        iterations: Number of passes over the observed ratings.
        verbose: If true, print the RMSE every 10 iterations.
    """

    def __init__(self, ratings, K, alpha, beta, iterations, verbose=True):
        self.R = ratings

        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.verbose = verbose
        # Empty until set_test() is called, so test()/test_rmse() never hit a
        # missing attribute.
        self.test_set = []

    def _observed(self):
        """Return (rows, cols, values) for every non-zero entry of ``R``."""
        rows, cols = self.R.nonzero()
        # One vectorized lookup instead of one sparse lookup per rating;
        # asarray/ravel flattens the 1 x N matrix a sparse lookup can return.
        values = np.asarray(self.R[rows, cols]).ravel()
        return rows, cols, values

    def _init_training(self):
        """Reset all model parameters and rebuild the list of training samples."""
        self.P = np.random.normal(scale=1. / self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1. / self.K, size=(self.num_items, self.K))

        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)

        rows, cols, values = self._observed()
        # Global bias: mean of the observed (non-zero) ratings.
        self.b = np.mean(values)
        self.samples = list(zip(rows, cols, values))

    def rmse(self):
        """Return the RMSE over the non-zero entries of ``R``.

        Also stores the per-entry results in ``self.predictions`` and
        ``self.errors`` (same order as ``R.nonzero()``).
        """
        rows, cols, actual = self._observed()
        latent = (self.P[rows] * self.Q[cols]).sum(axis=1)
        self.predictions = self.b + self.b_u[rows] + self.b_d[cols] + latent
        self.errors = actual - self.predictions

        return np.sqrt(np.mean(self.errors ** 2))

    def train(self):
        """Train on all non-zero ratings.

        Returns:
            A list of ``(iteration, train_rmse)`` tuples, one per iteration.
        """
        self._init_training()

        training_process = []
        for epoch in range(1, self.iterations + 1):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse = self.rmse()
            training_process.append((epoch, rmse))
            if self.verbose and epoch % 10 == 0:
                print("Iteration: %d ; Train RMSE = %.4f" % (epoch, rmse))
        return training_process

    def get_prediction(self, i, j):
        """Return the predicted rating for row ``i`` and column ``j``."""
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    def set_test(self, ratings_test):
        """Hold out the given ratings as the test set.

        Args:
            ratings_test: DataFrame whose first three columns are the row
                index, the column index and the rating. Further columns are
                ignored.

        Returns:
            The test set as a list of ``[row, col, rating]`` lists. The
            matching entries of ``R`` are set to 0 so they are excluded from
            training.
        """
        # int() keeps the indices usable even when the frame stores them as
        # floats (e.g. because the rating column is float).
        test_set = [
            [int(user), int(item), rating]
            for user, item, rating in ratings_test.iloc[:, :3].itertuples(index=False, name=None)
        ]
        if test_set:
            users = np.array([entry[0] for entry in test_set], dtype=int)
            items = np.array([entry[1] for entry in test_set], dtype=int)
            self.R[users, items] = 0
            # Sparse matrices keep the zeroed cells as stored entries; drop them
            # so the stored-value count matches the training data.
            if hasattr(self.R, "eliminate_zeros"):
                self.R.eliminate_zeros()
        self.test_set = test_set

        return test_set

    def test_rmse(self):
        """Return the RMSE over the held-out test set (NaN when it is empty)."""
        if len(self.test_set) == 0:
            return float('nan')
        error = sum(
            pow(rating - self.get_prediction(user, item), 2)
            for user, item, rating in self.test_set
        )
        return np.sqrt(error / len(self.test_set))

    def sgd(self):
        """Run one SGD pass over ``self.samples``, updating biases and factors."""
        for i, j, r in self.samples:
            prediction = self.get_prediction(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])

            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            # The Q update reads the P row that was just updated above.
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j, :])

    def test(self):
        """Train like ``train`` while also tracking the test-set RMSE.

        Returns:
            A list of ``(iteration, train_rmse, test_rmse)`` tuples.
        """
        self._init_training()

        training_process = []
        for epoch in range(1, self.iterations + 1):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse1 = self.rmse()
            rmse2 = self.test_rmse()
            training_process.append((epoch, rmse1, rmse2))
            if self.verbose and epoch % 10 == 0:
                print("Iteration: %d ; Train RMSE = %.4f; Test RMSE = %.4f" % (epoch, rmse1, rmse2))
        return training_process

    def get_one_prediction(self, user_id, item_id):
        """Return the predicted rating for one (user, item) pair."""
        return self.get_prediction(user_id, item_id)

    def full_prediction(self):
        """Return the dense matrix of predictions for every user and item."""
        return self.b + self.b_u[:, np.newaxis] + self.b_d[np.newaxis, :] + self.P.dot(self.Q.T)
    
# Hand the model a copy of the rating matrix, since set_test() zeroes entries of the
# matrix it was given.
R_temp = ratings.copy()
# The model is only constructed here; no training is run in this cell.
mf = NEW_MF(R_temp, K = 30, alpha = 0.001, beta = 0.02, iterations =100, verbose = True)

In [None]:
# Shape of the CSR row-pointer array (`indptr`); `ratings` is expected to be the CSR matrix built earlier.
ratings.indptr.shape

In [None]:
# Shape of the CSR column-index array (`indices`), which has one entry per stored value.
ratings.indices.shape

In [None]:
# Shape of the array that holds the stored values themselves.
ratings.data.shape

In [None]:
# Shape of the matrix after expanding it to a dense array.
ratings.toarray().shape

In [None]:
# Length of the row-pointer array (for CSR this is the number of rows plus one).
len(ratings.indptr)

In [None]:
# Row 0 of the sparse matrix, expanded to a dense matrix.
ratings.getrow(0).todense()

In [None]:
# Collect what iterating over `ratings` yields
# (for a SciPy sparse matrix this is one sparse row per iteration).
a = []

for i in ratings:
    a.append(i)

In [None]:
# Column index of the first stored value.
ratings.indices[0]

In [None]:
# Value of the first collected row at the column given by the first stored column index.
# NOTE(review): indices[0] belongs to the first non-empty row, which is not necessarily row 0.
a[0].toarray().ravel()[ratings.indices[0]]

In [None]:
# The first collected row as a flat dense array.
a[0].toarray().ravel()

In [None]:
# Column index of the first stored value (same expression as an earlier cell).
ratings.indices[0]

In [None]:
# Row 0 expanded to a dense matrix (same expression as an earlier cell).
ratings.getrow(0).todense()

In [None]:
# Number of stored values in the sparse matrix (explicitly stored zeros are counted too).
ratings.getnnz()

In [None]:
# Shape of the dense expansion (same expression as an earlier cell).
ratings.toarray().shape