In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import sys
sys.path.append('../')

# import re
import numpy as np
import pandas as pd

# import nltk
# from nltk.corpus import stopwords

from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer  # tf-idf
from sklearn.decomposition import LatentDirichletAllocation  # LDA

from ml.query import *
from ml.utils import *

In [2]:
def recommend_books(df_svd_preds, user_idx, ori_books_df, ori_ratings_df, num_recommendations, default):
    """Return the best-predicted books a user has not rated yet.

    Args:
        df_svd_preds: DataFrame of predicted ratings with one row per user and
            one column per book_id. Rows are addressed by position: the user
            with ``user_idx == n`` is read from row ``n - 1``. The row labels
            and the name of the column axis do not matter.
        user_idx: 1-based user index. ``0`` marks a user without usable
            history.
        ori_books_df: Book table; must contain ``book_id`` and ``description``.
        ori_ratings_df: Rating table; must contain ``user_idx`` and ``book_id``.
        num_recommendations: Maximum number of books to return.
        default: Value returned unchanged when ``user_idx`` is 0.

    Returns:
        ``default`` when ``user_idx`` is 0, otherwise a dict mapping book_id to
        description, ordered from the highest to the lowest predicted rating.

    Raises:
        IndexError: if ``user_idx - 1`` is not a valid row position of
            ``df_svd_preds``.
    """
    # user_idx 0 is the "no history" marker: nothing to personalise.
    if user_idx == 0:
        return default

    # user_idx is 1-based, the prediction rows are looked up by 0-based position.
    user_row_number = user_idx - 1

    # Name the value column and the key column explicitly. The previous version
    # renamed a column called `user_row_number`, which only exists when
    # df_svd_preds happens to carry a default RangeIndex, and relied on the
    # column axis already being named 'book_id'.
    user_predictions = (
        df_svd_preds.iloc[user_row_number]
        .rename('Predictions')
        .rename_axis('book_id')
        .reset_index()
    )

    # Books this user already rated. Filtering the book table by these ids is
    # all that is needed; joining the history against the book table first
    # does not change which books get excluded.
    rated_book_ids = ori_ratings_df.loc[ori_ratings_df['user_idx'] == user_idx, 'book_id']
    unread_books = ori_books_df[~ori_books_df['book_id'].isin(rated_book_ids)]

    # Attach the predicted rating to every unread book and keep the best ones.
    top_books = (
        unread_books.merge(user_predictions, on='book_id')
        .sort_values('Predictions', ascending=False)
        .iloc[:num_recommendations, :]
    )

    return top_books[['book_id', 'description']].set_index('book_id')['description'].to_dict()


def get_topics(components, feature_names, n=15):
    """Map every topic index to its ``n`` highest-weighted terms.

    Args:
        components: Iterable of per-topic weight arrays (each must provide
            ``argsort``); position ``i`` of an array is the weight of
            ``feature_names[i]``.
        feature_names: Indexable collection of terms.
        n: Number of terms to keep per topic.

    Returns:
        dict ``{topic_index: [term, ...]}`` with terms ordered from the
        largest weight to the smallest.
    """
    def strongest_terms(weights):
        # argsort is ascending, so reverse it and keep the first n positions.
        ranked_positions = weights.argsort()[::-1][:n]
        return [feature_names[position] for position in ranked_positions]

    return {topic_no: strongest_terms(weights) for topic_no, weights in enumerate(components)}


def clustering_books(vectorizer, num_labels, book_rcmm):
    """Group one user's recommended books into LDA topics.

    Args:
        vectorizer: Text vectorizer providing ``fit_transform`` and
            ``get_feature_names_out`` (or the legacy ``get_feature_names``).
            It is re-fitted on the given descriptions on every call.
        num_labels: Number of LDA topics.
        book_rcmm: dict mapping book_id to the book's description text.

    Returns:
        Tuple ``(result_dict, keywords)``:
        ``result_dict`` maps a topic label to the list of book ids whose
        strongest topic is that label (labels without any book are absent,
        keys are sorted ascending); ``keywords`` maps every topic index to its
        top terms as produced by ``get_topics``. Both are empty dicts when
        ``book_rcmm`` is empty.
    """
    # Nothing to cluster; fitting the vectorizer on an empty corpus would raise.
    if not book_rcmm:
        return {}, {}

    X = vectorizer.fit_transform(list(book_rcmm.values()))

    lda_model = LatentDirichletAllocation(n_components=num_labels, learning_method='online', random_state=777,
                                          max_iter=1)
    lda_top = lda_model.fit_transform(X)

    # Vocabulary learned by the vectorizer. get_feature_names() was removed in
    # scikit-learn 1.2; prefer its replacement and fall back for old releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()
    keywords = get_topics(lda_model.components_, terms)

    # Each book is assigned to the topic with the largest weight in its row.
    labels = np.argmax(lda_top, axis=1)
    book_ids = np.array(list(book_rcmm.keys()))

    result_dict = {}
    for label, book_id in zip(labels, book_ids):
        result_dict.setdefault(label, []).append(book_id)

    # Return the groups ordered by topic label.
    result_dict = dict(sorted(result_dict.items(), key=lambda item: item[0]))

    return result_dict, keywords

In [3]:
# Project root. The original author's checkout stays the fallback so existing
# runs are unchanged; set the BOOK_RECSYS_DIR environment variable to point the
# notebook at a checkout on another machine.
WORKING_DIRECTORY = os.environ.get(
    'BOOK_RECSYS_DIR',
    '/Users/jeongmoonwon/Downloads/Courses/BKMS1/team_project/bkms1-team10/book_recsys',
)

In [4]:
# 'results' directory under WORKING_DIRECTORY, created if it does not exist yet.
# Presumably where the recommendation output is meant to be written; nothing in
# the visible cells writes to it.
book_rcmm_dir_path = os.path.join(WORKING_DIRECTORY, 'results')
os.makedirs(book_rcmm_dir_path, exist_ok=True)

In [5]:
# Build the path to resources/project.db under WORKING_DIRECTORY and open it with
# `connection` (not defined in this notebook; presumably provided by the star
# imports from ml.query / ml.utils -- confirm).
db_path = os.path.join(WORKING_DIRECTORY, 'resources/project.db')
con = connection(db_path)

In [6]:
# Load the ratings and books data through `read_table` with the predefined query
# objects. `read_table`, `ratings_query` and `books_query` are not defined in this
# notebook; presumably they come from the ml.* star imports. The results are
# used as pandas DataFrames by the cells below.
ratings = read_table(con, ratings_query)
books = read_table(con, books_query)

In [7]:
# Narrow the books table to the three columns the rest of the notebook reads:
# the id, the description text and the average rating.
books_df = books[['book_id', 'description', 'average_rating']]
books_df

Unnamed: 0,book_id,description,average_rating
0,7327624,Omnibus book club edition containing the Ladie...,4.03
1,6066819,Addie Downs and Valerie Adler were eight when ...,3.49
2,89376,What is Heaven really going to be like? What w...,4.26
3,89378,In Newbery Medalist Cynthia Rylant's classic b...,4.43
4,18628482,Embrace the word of God with the inspirational...,4.70
...,...,...,...
427309,23363286,"In the city that never sleeps, spirits are equ...",4.24
427310,31522515,In the latest Death on Demand Mystery from the...,3.55
427311,23252156,"When Kirk Stanford moves into his new home, st...",4.00
427312,18069148,Derek and Blake have an understanding -- they ...,2.00


In [17]:
# Drop books that were rated fewer than 10 times.
counts = ratings['book_id'].value_counts()
valid_book_ids = counts[counts >= 10].index

# Filter rows and columns in one step and take an explicit copy, so that adding
# the user_idx column below is an assignment on an independent frame rather than
# on a slice of `ratings` (which triggers SettingWithCopyWarning).
ratings_df = ratings.loc[ratings['book_id'].isin(valid_book_ids), ['user_id', 'book_id', 'rating']].copy()

# Add an integer index for every user_id: 1-based, in order of first appearance.
# Index 0 stays unused here; recommend_books treats user_idx 0 as "no history".
user_mapping = {user_id: idx for idx, user_id in enumerate(ratings_df['user_id'].unique(), start=1)}
ratings_df['user_idx'] = ratings_df['user_id'].map(user_mapping).astype(int)
ratings_df

Unnamed: 0,user_id,book_id,rating,user_idx
1,01ec1a320ffded6b2dd47833f2c8e4fb,32993133,5,1
3,01ec1a320ffded6b2dd47833f2c8e4fb,35121403,4,1
4,01ec1a320ffded6b2dd47833f2c8e4fb,33807229,4,1
6,01ec1a320ffded6b2dd47833f2c8e4fb,32734333,4,1
7,01ec1a320ffded6b2dd47833f2c8e4fb,33802909,4,1
...,...,...,...,...
3374593,e9c60194dcde6416f661fdc947be1357,6388558,4,9999
3374595,e9c60194dcde6416f661fdc947be1357,12988016,5,9999
3374596,e9c60194dcde6416f661fdc947be1357,14866,4,9999
3374598,e9c60194dcde6416f661fdc947be1357,42899,2,9999


In [9]:
users = read_table(con, users_query)

# Explicit copy: the user_idx column is added below, and assigning a column on a
# bare selection of `users` triggers SettingWithCopyWarning.
users_df = users[['user_id']].copy()

# Look up each user's integer index; users that are missing from user_mapping
# (no rating survived the book filter) get index 0.
users_df['user_idx'] = users_df['user_id'].map(user_mapping).fillna(0).astype(int)

users_df

Unnamed: 0,user_id,user_idx
0,0000c3d51aa099745e93a4e99c4856c8,6706
1,0005f52944ea1992e95d61f287acaea9,9708
2,0007f8dd09337afd986d765569cf0110,6061
3,001abeffb7caaf02c99963352f22a8fe,3840
4,00238d8a4c276c47f5d5e242f54a8f28,4481
...,...,...
9995,fff9a27567817b04a05b8244e8b5883d,5026
9996,fffc34d137f5c5c5e1ca1d6f325a4dcf,6211
9997,fffc9cfe8fd818f574c8c219b93274c0,978
9998,fffdcfc2f90be83462ec61c91a0daf22,8881


In [10]:
# Users without enough rating history (user_idx == 0)
users_df[users_df['user_idx']==0]

Unnamed: 0,user_id,user_idx
746,13d3272409bd2c8ecee8628dc0a589f8,0


In [18]:
# Test run on a subset only: the first 5,000 rating rows and the users that
# appear in them.
ratings_df = ratings_df.iloc[:5000, :]
# Explicit copy: later cells add and drop columns on users_df, which must not
# happen on a filtered slice of the full user table. isin() accepts the Series
# directly, so no intermediate list is built.
users_df = users_df[users_df['user_idx'].isin(ratings_df['user_idx'])].copy()

In [19]:
# Reshape the long rating table into a user x book matrix: one row per user_idx,
# one column per book_id, each cell holding the rating. Pairs without a rating
# come out of the pivot as NaN and are filled with 0.
# NOTE(review): DataFrame.pivot rejects duplicate (user_idx, book_id) pairs --
# confirm the rating table cannot contain a book rated twice by the same user.
df_user_book_ratings = ratings_df.pivot(
    index='user_idx',
    columns='book_id',
    values='rating'
).fillna(0)

df_user_book_ratings

book_id,1,10025305,10054335,10059498,10108463,10112885,10123237,10126914,10163292,10169662,...,9943270,99472,99561,9957,9968116,9969571,9984522,9993763,9998680,9999
user_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0


In [20]:
# Plain numpy array holding the values of the user x book pivot table.
matrix = df_user_book_ratings.to_numpy()
# One mean per row (user), taken over every column of that row.
user_ratings_mean = matrix.mean(axis=1)
# Subtract each user's mean from that user's row; the added axis turns the
# vector into a column so it broadcasts across the book columns.
matrix_user_mean = matrix - user_ratings_mean[:, np.newaxis]

In [21]:
# Truncated SVD from scipy (k=12 singular values).
# svds returns the U matrix, the singular values and the transposed V matrix.
U, singular_values, Vt = svds(matrix_user_mean, k=12)
# The singular values arrive as a 1-D array; expand them into a diagonal matrix
# so they can be used in a matrix product.
sigma = np.diag(singular_values)

In [22]:
# Multiply the factors back together (U x Sigma x Vt) to rebuild the rating
# matrix, then add each user's mean rating back onto that user's row.
svd_user_predicted_ratings = U @ sigma @ Vt + user_ratings_mean[:, np.newaxis]
# Column labels are taken from the pivot table; no row index is passed, so rows
# are labelled 0..n-1.
df_svd_preds = pd.DataFrame(data=svd_user_predicted_ratings, columns=df_user_book_ratings.columns)

In [23]:
# Fallback for users without history: books whose average rating is above 4.0,
# ordered by how many rating rows they have, limited to the first 25.

# Number of rating rows per book as a two-column frame (book_id, cnt).
log_cnt = (
    ratings_df['book_id']
    .value_counts()
    .rename_axis('book_id')
    .reset_index(name='cnt')
)

# Filter, join the counts, rank, cut, and convert to {book_id: description}.
default = (
    books_df[books_df['average_rating'] > 4.0]
    .merge(log_cnt, on='book_id')
    .sort_values('cnt', ascending=False)
    .iloc[:25]
    .set_index('book_id')['description']
    .to_dict()
)

In [24]:
# Recommend
def _recommend_for_row(row):
    """Return up to 25 recommendations for the user described by one users_df row."""
    return recommend_books(df_svd_preds, row['user_idx'], books_df, ratings_df, 25, default)


# One {book_id: description} dict per user, stored in a new column.
users_df['book_rcmm'] = users_df.apply(_recommend_for_row, axis=1)

In [25]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)  # keep the top 1,000 terms


def _cluster_row(row):
    """Split one user's recommended books into 5 topics; returns (book ids per topic, keywords per topic)."""
    return clustering_books(vectorizer, 5, row['book_rcmm'])


# result_type="expand" spreads the returned pair over the two new columns.
users_df[['book_ids', 'keywords']] = users_df.apply(_cluster_row, axis=1, result_type="expand")
# The intermediate columns are no longer needed in the final table.
users_df = users_df.drop(columns=['book_rcmm', 'user_idx'])

In [26]:
# Final per-user table: user_id plus the clustered book ids and the topic keywords.
users_df

Unnamed: 0,user_id,book_ids,keywords
66,01ec1a320ffded6b2dd47833f2c8e4fb,"{0: ['12924304', '9477490', '15704486', '23545...","{0: ['secret', 'left', 'effortless', 'spare', ..."
553,0ef32090550901ead25cb0ea21c4d36b,"{0: ['15803163', '27833670', '20821550', '3312...","{0: ['shown', 'october', 'stolen', 'history', ..."
2707,4672eb229c808b792b8ea95f01f19784,"{0: ['11096647', '6482837', '6482981', '104258...","{0: ['ship', 'leah', 'dream', 'street', 'start..."
