In [None]:
#### import module ####  

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split

import random
import os
import re

import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
#### load data and set seed ####

path = './data/'


def _read_table(filename):
    """Read one CSV file located under `path` into a DataFrame."""
    return pd.read_csv(path + filename)


# Side-information tables.
users = _read_table('my_users.csv')
books = _read_table('my_books.csv')

# Rating tables and the submission template.
train_ratings = _read_table('train_ratings.csv')
test_ratings = _read_table('test_ratings.csv')
submit = _read_table('sample_submission.csv')


def rmse(real, predict):
    """Return the root-mean-squared error between `real` and `predict`.

    `predict` is converted to a NumPy array before subtraction, so it may be
    any array-like (list, array, ...). `real` is used as given.
    """
    residual = real - np.array(predict)
    mean_squared_error = np.mean(residual ** 2)
    return np.sqrt(mean_squared_error)

SEED = 42


def seed_everything(seed = 42):
    """Seed Python's `random`, NumPy's global RNG and set PYTHONHASHSEED.

    NOTE(review): PYTHONHASHSEED is only written to `os.environ` here;
    whether the already-running interpreter honours it should be confirmed.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything(SEED)


In [None]:
#### merge data with ratings tables ####

# Re-read the raw rating tables so this cell starts from pristine data every
# time it is executed (it can be re-run without re-running the load cell).
train_ratings = pd.read_csv(path+'train_ratings.csv')
test_ratings = pd.read_csv(path+'test_ratings.csv')


def _attach_metadata(ratings, how):
    """Join the book and user tables onto a ratings table.

    Parameters
    ----------
    ratings : DataFrame with at least `isbn` and `user_id` columns.
    how : join type passed to `pd.merge` ('inner' or 'left'). Both keep the
        row order of `ratings`, so no index/sort round-trip is needed.

    Returns
    -------
    DataFrame with the columns of `ratings`, then `books`, then `users`.
    """
    with_books = pd.merge(ratings, books, how=how, on='isbn')
    return pd.merge(with_books, users, how=how, on='user_id')


# Train: keep only rows that have both book and user metadata and a label.
# This is the same row set the previous right-join + dropna('rating') produced,
# but in the order of the ratings file.
train_ratings = (
    _attach_metadata(train_ratings, how='inner')
    .dropna(subset=['rating'])
    .reset_index(drop=True)
)

# Test: a left join keeps *every* test row in its original order, so the
# submission always lines up row-for-row with the test file. (The previous
# right-join + dropna('rating') silently dropped test rows without metadata
# and depended on the test file's placeholder `rating` values being non-null.)
test_ratings = _attach_metadata(test_ratings, how='left')

# These columns are cast to strings in both frames (missing values become the
# literal string 'nan').
for col in ('user_id', 'location_city', 'location_state', 'location_country'):
    train_ratings[col] = train_ratings[col].astype('str')
    test_ratings[col] = test_ratings[col].astype('str')


**KSY1526 님의 [github](https://github.com/KSY1526/Book-Project-BoostCamp)를 참고하여 작성하였습니다**

In [None]:
#### using stratified 10-fold ####

from sklearn.model_selection import StratifiedKFold

# Shuffled 10-fold split stratified on the rating value; each element of
# `folds` is a (train_idx, valid_idx) pair of positional index arrays.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
folds = list(skf.split(train_ratings, train_ratings['rating']))

In [None]:
#### set optuna object function and train/test catboost model ####

def objective(trial):
    """Optuna objective: train one CatBoost model and return its validation RMSE.

    Reads the module-level `X_train`, `y_train`, `X_valid` and `y_valid`
    that the fold loop assigns before calling `study.optimize`.

    Parameters
    ----------
    trial : optuna.trial.Trial used to sample the hyper-parameters.

    Returns
    -------
    RMSE of the fitted model on (`X_valid`, `y_valid`); lower is better.
    """
    param = {
        "random_state": SEED,
        "objective" : "RMSE",
        # Every non-target column is passed to CatBoost as a categorical feature.
        "cat_features" : list(train_ratings.drop(['rating'],axis = 1).columns),
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform; the search range is unchanged.
        'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'bagging_temperature' : trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
        "n_estimators":trial.suggest_int("n_estimators", 1000, 10000),
        "max_depth":trial.suggest_int("max_depth", 4, 16),
        'random_strength' :trial.suggest_int('random_strength', 0, 100),
        # NOTE(review): this range is very small for an L2 coefficient — confirm intended.
        "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
    }

    model = CatBoostRegressor(**param, task_type='GPU', devices='0')

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=100
    )

    cat_pred = model.predict(X_valid)
    score = rmse(y_valid, cat_pred)

    return score

# Fix the feature set once, from the training frame. Selecting these columns
# from `test_ratings` (instead of dropping only 'rating') keeps the
# `pred_{fold}` columns written below out of the test features for later folds
# and makes this cell safe to re-run.
feature_cols = [col for col in train_ratings.columns if col != 'rating']
X_all = train_ratings[feature_cols]
y_all = train_ratings['rating']
X_test = test_ratings[feature_cols]

for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'===================================={fold+1}============================================')
    # Module-level names: `objective` reads X_train / y_train / X_valid / y_valid.
    X_train, X_valid = X_all.iloc[train_idx], X_all.iloc[valid_idx]
    y_train, y_valid = y_all.iloc[train_idx], y_all.iloc[valid_idx]

    # NOTE(review): the sampler is re-created with the same seed for every fold
    # and only 10 trials are run — confirm that each fold is not simply
    # evaluating the same initial random candidates.
    sampler = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(
        study_name = 'cat_parameter_opt',
        direction = 'minimize',
        sampler = sampler,
    )
    study.optimize(objective, n_trials=10)

    # Refit on this fold's training part with the best parameters found.
    model = CatBoostRegressor(**study.best_params, task_type='GPU', devices='0', random_state=SEED, objective='RMSE',
                              cat_features=feature_cols)
    # verbose=100 matches the logging interval used inside `objective`.
    model.fit(X_train, y_train, verbose=100)

    pred = model.predict(X_test)
    test_ratings[f'pred_{fold}'] = pred
    print('================================================================================\n\n')

In [None]:
#### Average the 10 per-fold predictions ####
# Columns pred_0 .. pred_9 are added left to right, then divided by 10.
test_ratings['rating'] = sum(
    (test_ratings[f'pred_{fold_no}'] for fold_no in range(1, 10)),
    test_ratings['pred_0'],
) / 10

# Keep only the submission columns and write them out.
test = test_ratings.loc[:, ['user_id', 'isbn', 'rating']]
test.to_csv('submit/CatBoostFinal.csv', index = False)

In [None]:
#### Blend with Eunbi's LGBM submission at a 9 : 1 ratio ####
data1 = pd.read_csv('./submit/CatBoostFinal.csv')  # CatBoost submission
data2 = pd.read_csv('./submit/eunbi.csv')          # LGBM submission

# Weighted average of the two `rating` columns (0.9 CatBoost, 0.1 LGBM);
# rows are paired by the frames' index labels.
data1 = data1.assign(rating=data1['rating'].mul(0.9).add(data2['rating'].mul(0.1)))
data1.to_csv('submit/EnsembleFinal.csv', index = False)