In [None]:
! pip install catboost
! pip install optuna

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

from scipy.stats import randint
from sklearn.utils.fixes import loguniform

from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold 

In [None]:
class DataLoader:
    """Load the rating CSV files and join them with user and book metadata.

    ``load_data`` reads ``train_ratings.csv``, ``test_ratings.csv`` and
    ``sample_submission.csv`` from ``path``, left-joins ``users`` on
    ``user_id`` and ``books`` on ``isbn``, and stores the results in
    ``self.data``. ``split`` then adds a train/validation split of the
    training frame to that same dictionary.

    Parameters
    ----------
    path : str
        Prefix that is concatenated directly with the file names, so it must
        already end with a path separator.
    users : pandas.DataFrame
        User table; must contain ``user_id`` and ``age``.
    books : pandas.DataFrame
        Book table; must contain ``isbn``.
    """

    def __init__(self, path, users, books):
        self.path = path
        self.users = users
        self.books = books
        self.train = None
        self.test = None
        self.sub = None
        # Filled by load_data(); kept as None so split() can detect misuse.
        self.data = None

    def change_age(self, x):
        """Map an age to a decade bucket in ``0..9``.

        Ages below 10 (including negative values) map to 0, ages in
        ``[10, 90)`` map to their tens digit, and everything else -- 90 and
        above, as well as NaN, which fails every comparison -- maps to 9.
        """
        if x < 10:
            return 0
        if x < 90:
            # Here 10 <= x < 90, so floor division yields 1..8.
            return int(x // 10)
        return 9

    @staticmethod
    def _add_features(ratings, users_, books_):
        """Left-join user and book columns onto ``ratings`` and cast to int.

        ``avg_rating`` and ``year_of_publication`` are truncated to integers
        (presumably because every feature is later treated as categorical --
        TODO confirm). The cast raises if either column contains missing
        values after the left joins.
        """
        merged = ratings.merge(users_, on='user_id', how='left')
        merged = merged.merge(books_, on='isbn', how='left')
        return merged.astype({'avg_rating': int, 'year_of_publication': int})

    def load_data(self):
        """Read the three rating files and build ``self.data``.

        The age column is bucketed on a private copy of ``self.users``: the
        frame supplied by the caller is left untouched, so calling this method
        repeatedly always produces the same result instead of re-bucketing
        ages that were already bucketed.

        After the call ``self.data`` holds:

        * ``'train'`` -- training ratings with user and book columns;
        * ``'test'``  -- test ratings with features, ``rating`` dropped;
        * ``'sub'``   -- submission frame with features, its ``rating`` column
          reset to 0 and moved to the last position;
        * ``'users'`` -- the user table with bucketed ages;
        * ``'books'`` -- the book table as passed to the constructor.
        """
        raw_train = pd.read_csv(self.path + 'train_ratings.csv')
        raw_test = pd.read_csv(self.path + 'test_ratings.csv')
        raw_sub = pd.read_csv(self.path + 'sample_submission.csv')

        # Work on copies so the caller's frames are never mutated.
        users_ = self.users.copy()
        users_['age'] = users_['age'].apply(self.change_age)
        books_ = self.books.copy()

        self.train = self._add_features(raw_train, users_, books_)
        self.sub = self._add_features(raw_sub, users_, books_)
        self.test = self._add_features(raw_test, users_, books_)

        # Drop and re-add so the placeholder rating ends up as the last column.
        self.sub = self.sub.drop(columns='rating')
        self.sub["rating"] = 0

        self.data = {'train': self.train,
                     'test': self.test.drop(['rating'], axis=1),
                     'users': users_,
                     'books': self.books,
                     'sub': self.sub}

    def split(self, test_size=0.2, random_state=None):
        """Add a shuffled train/validation split of the training data.

        Parameters
        ----------
        test_size : float, default 0.2
            Forwarded to ``train_test_split``.
        random_state : int or None, default None
            Seed forwarded to ``train_test_split``. The default keeps the
            split unseeded; pass an integer for a repeatable split.

        Returns
        -------
        dict
            ``self.data`` extended with ``'X_train'``, ``'X_valid'``,
            ``'y_train'`` and ``'y_valid'``.

        Raises
        ------
        RuntimeError
            If ``load_data`` has not been called yet.
        """
        if self.data is None:
            raise RuntimeError("load_data() must be called before split()")

        train_frame = self.data['train']
        X_train, X_valid, y_train, y_valid = train_test_split(
            train_frame.drop(['rating'], axis=1),
            train_frame['rating'],
            test_size=test_size,
            shuffle=True,
            random_state=random_state)

        self.data.update({'X_train': X_train, 'X_valid': X_valid,
                          'y_train': y_train, 'y_valid': y_valid})

        return self.data

In [None]:
class CatBoost:
    """Tune, train and apply a ``CatBoostRegressor`` on the prepared data.

    Every column of ``data['X_train']`` is declared categorical. Training is
    hard-wired to GPU device ``'0'``.

    Parameters
    ----------
    data : dict
        Must provide ``'X_train'``, ``'X_valid'``, ``'y_train'``,
        ``'y_valid'`` and ``'sub'`` (pandas objects).
    n_splits : int, default 5
        Number of cross-validation folds used during hyperparameter search.
    random_state : int, default 15
        Seed used for the fold assignment, the CatBoost models and the Optuna
        sampler.
    """

    # Hyperparameters that Optuna tunes and that train() expects in `params`.
    _TUNED_KEYS = ('learning_rate', 'n_estimators', 'bagging_temperature',
                   'random_strength', 'max_depth', 'l2_leaf_reg',
                   'min_child_samples', 'max_bin', 'od_type')

    def __init__(self, data, n_splits=5, random_state=15):
        self.data = data
        # Treat every training column as a categorical feature.
        self.cat_features = list(range(self.data['X_train'].shape[1]))
        self.n_splits = n_splits
        self.random_state = random_state
        # Set by train(); None lets predict()/feature() report misuse clearly.
        self.model = None

        self.X_train = self.data["X_train"].values
        self.X_valid = self.data["X_valid"].values
        self.y_train = self.data["y_train"].values
        self.y_valid = self.data["y_valid"].values

        # The submission frame may carry columns the model is not trained on
        # (e.g. a placeholder 'rating') or a different column order. Select
        # the training columns by name when they are all present so the
        # prediction matrix has exactly the training layout; otherwise fall
        # back to the frame as given.
        feature_names = list(self.data['X_train'].columns)
        sub_frame = self.data['sub']
        if set(feature_names).issubset(sub_frame.columns):
            sub_frame = sub_frame[feature_names]
        self.sub = sub_frame.values

        # Folds are stratified on the target values.
        # NOTE(review): this assumes a discrete rating scale -- confirm.
        self.cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True,
                                  random_state=self.random_state)

    def _build_params(self, tuned):
        """Combine the fixed settings with the tuned values from ``tuned``.

        Only the keys listed in ``_TUNED_KEYS`` are read; a missing key raises
        ``KeyError`` and extra keys are ignored.
        """
        param = {'random_state': self.random_state,
                 'objective': "RMSE",
                 'cat_features': self.cat_features}
        for key in self._TUNED_KEYS:
            param[key] = tuned[key]
        return param

    def _fit_model(self, param, X_fit, y_fit, X_eval, y_eval, verbose):
        """Fit a GPU ``CatBoostRegressor`` with early stopping on the eval set."""
        model = CatBoostRegressor(**param, task_type='GPU', devices='0')
        model.fit(X_fit, y_fit,
                  cat_features=self.cat_features,
                  eval_set=(X_eval, y_eval),
                  early_stopping_rounds=100,
                  verbose=verbose)
        return model

    def objective(self, trial):
        """Optuna objective: mean cross-validated RMSE for one trial.

        Parameters
        ----------
        trial : optuna.Trial
            Trial used to sample the hyperparameters.

        Returns
        -------
        float
            Mean RMSE over the ``n_splits`` validation folds.
        """
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform; the search ranges are unchanged.
        tuned = {
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
            'n_estimators': trial.suggest_int("n_estimators", 1000, 10000),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
            'random_strength': trial.suggest_int('random_strength', 0, 100),
            'max_depth': trial.suggest_int("max_depth", 4, 12),
            'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1e-8, 3e-5),
            'min_child_samples': trial.suggest_int("min_child_samples", 40, 100),
            'max_bin': trial.suggest_int("max_bin", 200, 500),
            'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        }
        param = self._build_params(tuned)

        rmse_scores = []

        for train_idx, valid_idx in self.cv.split(self.X_train, self.y_train):
            X_train_fold, X_valid_fold = self.X_train[train_idx], self.X_train[valid_idx]
            y_train_fold, y_valid_fold = self.y_train[train_idx], self.y_train[valid_idx]

            model = self._fit_model(param, X_train_fold, y_train_fold,
                                    X_valid_fold, y_valid_fold, verbose=500)

            y_pred_fold = model.predict(X_valid_fold)
            # Take the square root explicitly: the `squared=False` argument of
            # mean_squared_error is not available in every scikit-learn release.
            rmse_fold = np.sqrt(mean_squared_error(y_valid_fold, y_pred_fold))
            rmse_scores.append(rmse_fold)

        return np.mean(rmse_scores)

    def hyperparameter_search(self, n_trials):
        """Run an Optuna study minimising ``objective``.

        The TPE sampler is seeded with ``random_state`` so the sampler itself
        draws the same sequence on every run.

        Parameters
        ----------
        n_trials : int
            Number of trials to evaluate.

        Returns
        -------
        dict
            Best hyperparameters found; can be passed straight to ``train``.
        """
        study = optuna.create_study(direction='minimize',
                                    sampler=TPESampler(seed=self.random_state))
        study.optimize(self.objective, n_trials=n_trials)

        best_params = study.best_params
        best_rmse = study.best_value

        print(f"Best RMSE: {best_rmse}")
        print(f"Best hyperparameters: {best_params}")

        return best_params

    def train(self, params):
        """Fit the final model on the training split.

        The validation split is used as the evaluation set for early stopping.
        The fitted model is stored in ``self.model``.

        Parameters
        ----------
        params : dict
            Must contain every key in ``_TUNED_KEYS`` (for example the result
            of ``hyperparameter_search``).
        """
        param = self._build_params(params)
        self.model = self._fit_model(param, self.X_train, self.y_train,
                                     self.X_valid, self.y_valid, verbose=100)

    def _require_model(self):
        """Return the fitted model or raise if ``train`` has not been run."""
        if self.model is None:
            raise RuntimeError("train() must be called before using the model")
        return self.model

    def predict(self):
        """Return the model's predictions for the submission rows."""
        return self._require_model().predict(self.sub)

    def feature(self):
        """Return the fitted model's feature importances."""
        return self._require_model().get_feature_importance()