In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split

# Show at most 50 rows when a DataFrame is displayed. The full option name is
# spelled out because pandas resolves abbreviated names by pattern matching,
# which can become ambiguous if a similarly named option is added later.
pd.set_option('display.max_rows', 50)

# Load the train/test rating tables and the book and user tables.
train = pd.read_csv('../data/train_ratings.csv')
test = pd.read_csv('../data/test_ratings.csv')
books = pd.read_csv('../data/books.csv')
users = pd.read_csv('../data/users.csv')

def rmse(real: list, predict: list) -> float:
    """Return the root-mean-square error between two equally sized inputs.

    Both inputs are converted to flat float arrays before subtracting, so a
    column vector of shape (n, 1) compared with a flat vector of shape (n,)
    is matched element by element instead of being broadcast into an (n, n)
    matrix of differences.

    Args:
        real: Ground-truth values (list, NumPy array or pandas Series).
        predict: Predicted values, with as many elements as ``real``.

    Returns:
        The square root of the mean squared difference.

    Raises:
        ValueError: If the inputs hold a different number of elements.
    """
    truth = np.asarray(real, dtype=float).ravel()
    pred = np.asarray(predict, dtype=float).ravel()
    if truth.size != pred.size:
        raise ValueError(
            f"real and predict must have the same number of elements, "
            f"got {truth.size} and {pred.size}"
        )
    return float(np.sqrt(np.mean((truth - pred) ** 2)))

SEED = 42


def seed_everything(seed: int = 42):
    """Seed the random number sources used in this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and stores
    the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every source.
    """
    # NOTE: hash randomization is configured when an interpreter starts, so
    # this variable affects processes launched afterwards, not the running one.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


seed_everything(seed=SEED)

In [22]:
# Build the id -> position lookups from train and test together so that every
# user and book appearing in either table gets an index.
_data = pd.concat([train, test])

user2idx = {uid: pos for pos, uid in enumerate(_data['user_id'].unique())}
book2idx = {isbn: pos for pos, isbn in enumerate(_data['isbn'].unique())}

# Replace the raw identifiers with their integer positions.
train['user_id'] = train['user_id'].map(user2idx)
train['isbn'] = train['isbn'].map(book2idx)

test['user_id'] = test['user_id'].map(user2idx)
test['isbn'] = test['isbn'].map(book2idx)

# Hold out 20% of the training ratings for validation.
X_tr, X_val, y_tr, y_val = train_test_split(
    train.drop(columns=['rating']),
    train['rating'],
    test_size=0.2,
    random_state=SEED,
)

# One dense zero matrix per split, shaped (number of users, number of books).
_train, _valid, _test = (
    np.zeros((len(user2idx), len(book2idx))) for _ in range(3)
)


In [24]:
def df_to_arr(X: pd.DataFrame, y: pd.Series, arr: np.ndarray) -> np.ndarray:
    """Add each value of ``y`` into ``arr`` at the position given by ``X``.

    The first column of ``X`` supplies the row index and the second column
    the column index; any further columns are ignored. Values that land on
    the same cell are summed. ``arr`` is modified in place and also returned.

    Args:
        X: Frame whose first two columns hold integer row/column indices.
        y: Values to add, aligned with ``X`` by position (not by index label).
        arr: Two-dimensional array that receives the values.

    Returns:
        The same ``arr`` object, after accumulation.

    Raises:
        ValueError: If ``X`` and ``y`` have different lengths.
    """
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )
    if len(X) == 0:
        # Nothing to add; also avoids indexing with empty non-integer columns.
        return arr

    # Read the index columns one at a time: DataFrame.values would upcast
    # them to float as soon as X carried any float column, and float indices
    # cannot address an array.
    rows = X.iloc[:, 0].to_numpy()
    cols = X.iloc[:, 1].to_numpy()

    # Unbuffered add, so repeated (row, col) pairs accumulate like ``+=`` in a loop.
    np.add.at(arr, (rows, cols), y.to_numpy())
    return arr

# Fill freshly allocated zero matrices instead of the existing ones:
# df_to_arr adds into the array it is given, so re-running this cell on
# already-filled matrices would count every rating a second time.
_train = df_to_arr(X_tr, y_tr, np.zeros(_train.shape, dtype=_train.dtype))
_valid = df_to_arr(X_val, y_val, np.zeros(_valid.shape, dtype=_valid.dtype))
# Not needed: this would only write zeros into _test anyway.
#_test = df_to_arr(test[['user_id', 'isbn']], test['rating'], _test)

In [25]:
# Display the training matrix filled in by df_to_arr.
_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [7., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])