In [40]:
# IPython 기본 설정 변경
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [41]:
# 기본 설정
import os
import sys
import pandas as pd
import numpy as np
import torch
import scipy
import random

import seaborn as sns

# warning을 보고 싶지 않을 경우 아래 코드를 실행
import warnings
warnings.filterwarnings('ignore')

In [42]:
data_path = 'data/'
users = pd.read_csv(data_path + 'users.csv')
books = pd.read_csv(data_path + 'books.csv')
train = pd.read_csv(data_path + 'train_ratings.csv')
test = pd.read_csv(data_path + 'test_ratings.csv')
sub = pd.read_csv(data_path + 'sample_submission.csv')

ids = pd.concat([train['user_id'], sub['user_id']]).unique()
isbns = pd.concat([train['isbn'], sub['isbn']]).unique()

idx2user = {idx:id for idx, id in enumerate(ids)}
idx2isbn = {idx:isbn for idx, isbn in enumerate(isbns)}

user2idx = {id:idx for idx, id in idx2user.items()}
isbn2idx = {isbn:idx for idx, isbn in idx2isbn.items()}

train['user_id'] = train['user_id'].map(user2idx)
sub['user_id'] = sub['user_id'].map(user2idx)
test['user_id'] = test['user_id'].map(user2idx)
users['user_id'] = users['user_id'].map(user2idx)

train['isbn'] = train['isbn'].map(isbn2idx)
sub['isbn'] = sub['isbn'].map(isbn2idx)
test['isbn'] = test['isbn'].map(isbn2idx)
books['isbn'] = books['isbn'].map(isbn2idx)

In [43]:
books.shape, users.shape, train.shape, test.shape


((149570, 10), (68092, 3), (306795, 3), (76699, 3))

In [44]:
def age_map(x: int) -> int:
    x = int(x)
    if x < 20:
        return 1
    elif x >= 20 and x < 30:
        return 2
    elif x >= 30 and x < 40:
        return 3
    elif x >= 40 and x < 50:
        return 4
    elif x >= 50 and x < 60:
        return 5
    else:
        return 6

In [45]:
def process_context_data(users, books, ratings1, ratings2):
    users['location_city'] = users['location'].apply(lambda x: x.split(',')[0])
    users['location_state'] = users['location'].apply(lambda x: x.split(',')[1])
    users['location_country'] = users['location'].apply(lambda x: x.split(',')[2])
    users = users.drop(['location'], axis=1)

    ratings = pd.concat([ratings1, ratings2]).reset_index(drop=True)

    # 인덱싱 처리된 데이터 조인
    context_df = ratings.merge(users, on='user_id', how='left').merge(books[['isbn', 'category', 'publisher', 'language', 'book_author']], on='isbn', how='left')
    train_df = ratings1.merge(users, on='user_id', how='left').merge(books[['isbn', 'category', 'publisher', 'language', 'book_author']], on='isbn', how='left')
    test_df = ratings2.merge(users, on='user_id', how='left').merge(books[['isbn', 'category', 'publisher', 'language', 'book_author']], on='isbn', how='left')

    # 인덱싱 처리
    loc_city2idx = {v:k for k,v in enumerate(context_df['location_city'].unique())}
    loc_state2idx = {v:k for k,v in enumerate(context_df['location_state'].unique())}
    loc_country2idx = {v:k for k,v in enumerate(context_df['location_country'].unique())}

    train_df['location_city'] = train_df['location_city'].map(loc_city2idx)
    train_df['location_state'] = train_df['location_state'].map(loc_state2idx)
    train_df['location_country'] = train_df['location_country'].map(loc_country2idx)
    test_df['location_city'] = test_df['location_city'].map(loc_city2idx)
    test_df['location_state'] = test_df['location_state'].map(loc_state2idx)
    test_df['location_country'] = test_df['location_country'].map(loc_country2idx)

    train_df['age'] = train_df['age'].fillna(int(train_df['age'].mean()))
    train_df['age'] = train_df['age'].apply(age_map)
    test_df['age'] = test_df['age'].fillna(int(test_df['age'].mean()))
    test_df['age'] = test_df['age'].apply(age_map)

    # book 파트 인덱싱
    category2idx = {v:k for k,v in enumerate(context_df['category'].unique())}
    publisher2idx = {v:k for k,v in enumerate(context_df['publisher'].unique())}
    language2idx = {v:k for k,v in enumerate(context_df['language'].unique())}
    author2idx = {v:k for k,v in enumerate(context_df['book_author'].unique())}

    train_df['category'] = train_df['category'].map(category2idx)
    train_df['publisher'] = train_df['publisher'].map(publisher2idx)
    train_df['language'] = train_df['language'].map(language2idx)
    train_df['book_author'] = train_df['book_author'].map(author2idx)
    test_df['category'] = test_df['category'].map(category2idx)
    test_df['publisher'] = test_df['publisher'].map(publisher2idx)
    test_df['language'] = test_df['language'].map(language2idx)
    test_df['book_author'] = test_df['book_author'].map(author2idx)

    idx = {
        "loc_city2idx":loc_city2idx,
        "loc_state2idx":loc_state2idx,
        "loc_country2idx":loc_country2idx,
        "category2idx":category2idx,
        "publisher2idx":publisher2idx,
        "language2idx":language2idx,
        "author2idx":author2idx,
    }

    return idx, train_df, test_df

In [46]:
idx, context_train, context_test = process_context_data(users, books, train, test)


In [47]:
context_train.shape, context_test.shape

((306795, 11), (76699, 11))

In [48]:
context_train.isnull().sum()

user_id             0
isbn                0
rating              0
age                 0
location_city       0
location_state      0
location_country    0
category            0
publisher           0
language            0
book_author         0
dtype: int64

In [49]:
context_train[['user_id','isbn','rating']].head()

Unnamed: 0,user_id,isbn,rating
0,0,0,4
1,1,0,7
2,2,0,8
3,3,0,8
4,4,0,9


In [50]:
# 파일이 사용자별로 정렬되었는지 확인합니다.
train = context_train[['user_id','isbn','rating']].copy().reset_index(drop=True)
train.columns = ['userID','itemID' , 'rating']

In [51]:
from recommenders.datasets.python_splitters import python_random_split, python_stratified_split

train, valid = python_random_split(train, 0.75)


In [52]:
import surprise
reader = surprise.Reader(line_format='user item rating timestamp', sep = ',', rating_scale=(0, 10))

train_set = surprise.Dataset.load_from_df(train, reader=reader).build_full_trainset()
train_set

<surprise.trainset.Trainset at 0x7fc331d9afd0>

In [53]:
from recommenders.utils.timer import Timer

svd = surprise.SVD(random_state=0, n_factors=200, n_epochs=30, verbose=True)

with Timer() as train_time:
    svd.fit(train_set)

print("Took {} seconds for training.".format(train_time.interval))

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc275155670>

Took 11.148109953850508 seconds for training.


In [54]:
from recommenders.models.surprise.surprise_utils import predict
predictions = predict(svd, valid, usercol='userID', itemcol='itemID')
predictions.head()

Unnamed: 0,userID,itemID,prediction
0,15485,1638,7.453614
1,10191,13639,7.363689
2,16627,125648,8.272895
3,3459,103855,6.88096
4,2266,795,6.399949


In [55]:
valid

Unnamed: 0,userID,itemID,rating
31332,15485,1638,7
114196,10191,13639,6
302635,16627,125648,8
279523,3459,103855,7
19094,2266,795,9
...,...,...,...
136933,612,19375,7
235172,9130,68355,10
5816,4545,206,7
167606,43543,30196,10


In [56]:
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
eval_rmse = rmse(valid, predictions)
print("RMSE:\t\t%f" % eval_rmse)

RMSE:		2.199182


In [57]:
test = pd.read_csv(data_path + 'test_ratings.csv')
test

Unnamed: 0,user_id,isbn,rating
0,11676,0002005018,0
1,116866,0002005018,0
2,152827,0060973129,0
3,157969,0374157065,0
4,67958,0399135782,0
...,...,...,...
76694,278543,1576734218,0
76695,278563,3492223710,0
76696,278633,1896095186,0
76697,278668,8408044079,0


In [58]:
test_tmp = test[['user_id','isbn','rating']].copy().reset_index(drop=True)
test_tmp.columns = ['userID','itemID' , 'rating']
test_tmp

Unnamed: 0,userID,itemID,rating
0,11676,0002005018,0
1,116866,0002005018,0
2,152827,0060973129,0
3,157969,0374157065,0
4,67958,0399135782,0
...,...,...,...
76694,278543,1576734218,0
76695,278563,3492223710,0
76696,278633,1896095186,0
76697,278668,8408044079,0


In [59]:
##########Predict test
predictions = predict(svd, test_tmp, usercol='userID', itemcol='itemID')
predictions.head()
predicts = predictions['prediction']

Unnamed: 0,userID,itemID,prediction
0,11676,2005018,7.211495
1,116866,2005018,7.068137
2,152827,60973129,7.068137
3,157969,374157065,7.068137
4,67958,399135782,7.068137


In [60]:
predicts

0        7.211495
1        7.068137
2        7.068137
3        7.068137
4        7.068137
           ...   
76694    7.068137
76695    7.068137
76696    7.068137
76697    7.068137
76698    7.068137
Name: prediction, Length: 76699, dtype: float64

In [61]:

submission = pd.read_csv(data_path + 'sample_submission.csv')
submission['rating'] = list(predicts)
submission



Unnamed: 0,user_id,isbn,rating
0,11676,0002005018,7.211495
1,116866,0002005018,7.068137
2,152827,0060973129,7.068137
3,157969,0374157065,7.068137
4,67958,0399135782,7.068137
...,...,...,...
76694,278543,1576734218,7.068137
76695,278563,3492223710,7.068137
76696,278633,1896095186,7.068137
76697,278668,8408044079,7.068137


In [62]:
import time

now = time.localtime()
now_date = time.strftime('%Y%m%d', now)
now_hour = time.strftime('%X', now)
save_time = now_date + '_' + now_hour.replace(':', '')
submission.to_csv('submit/{}_{}.csv'.format(save_time, 'SVD'), index=False)