In [4]:
#import data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os

from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_row', 50)

path = '../data/'

#train = pd.read_csv('../data/train_ratings.csv')
#test = pd.read_csv('../data/test_ratings.csv')

train = pd.read_csv(path+'ksy_train_rating_0n.csv')
test = pd.read_csv(path+'ksy_test_rating_0n.csv')

def rmse(real: list, predict: list) -> float:
    pred = np.array(predict)
    return np.sqrt(np.mean((real-pred) ** 2))

SEED = 42
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

seed_everything(SEED)

print(train.shape)
print(test.shape)
train.info()

(306795, 10)
(76699, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306795 entries, 0 to 306794
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             306795 non-null  float64
 1   isbn                306795 non-null  object 
 2   rating              306795 non-null  float64
 3   book_author         306795 non-null  object 
 4   publisher           306795 non-null  object 
 5   language            306795 non-null  object 
 6   category_high       306795 non-null  object 
 7   years               306795 non-null  int64  
 8   fix_location_state  306795 non-null  object 
 9   fix_age             306795 non-null  int64  
dtypes: float64(2), int64(2), object(6)
memory usage: 23.4+ MB


In [5]:
#n을 여러번 실험해봄
for n in range(0, 1, 1):
    tem = train['user_id'].value_counts()
    tem = list(tem[tem >= n].index)
    train['id'] = train['user_id'].copy()
    train['id'][~train['id'].isin(tem)] = -1
    test['id'] = test['user_id'].copy()
    test['id'][~test['id'].isin(tem)] = -1

    tem = train['isbn'].value_counts()
    tem = list(tem[tem >= n].index)
    train['bn'] = train['isbn'].copy()
    train['bn'][~train['bn'].isin(tem)] = 1000000000
    test['bn'] = test['isbn'].copy()
    test['bn'][~test['bn'].isin(tem)] = 1000000000

    train['years'] = train['years'].astype('str')
    train['fix_age'] = train['fix_age'].astype('str')
    train['id'] = train['id'].astype('str')
    train['bn'] = train['bn'].astype('str')

    test['years'] = test['years'].astype('str')
    test['fix_age'] = test['fix_age'].astype('str')
    test['id'] = test['id'].astype('str')
    test['bn'] = test['bn'].astype('str')

    params_cat = {
            "task_type" : "GPU",
            "devices" : '0',
            "random_state": SEED,
            "learning_rate": 0.05,
            "n_estimators": 2000,
            "verbose" : 1,
            "objective" : "RMSE",
            "max_depth": 10,#trial.suggest_int("max_depth", 1, 16),
            "colsample_bylevel": 1,#trial.suggest_float("colsample_bylevel", 0.8, 1.0),
            #"subsample": 0.8, #trial.suggest_float("subsample", 0.3, 1.0), GPU 사용시 안될수도.
            "min_child_samples": 50, #trial.suggest_int("min_child_samples", 5, 100),
            "max_bin": 300, #trial.suggest_int("max_bin", 200, 500),
            "cat_features" : ['book_author', 'publisher', 'language', 'category_high', 'years', 'fix_location_state', 'fix_age','id','bn']
    }

    X_tr, X_val, y_tr, y_val = train_test_split(train.drop(['user_id', 'isbn', 'rating'],axis = 1), train['rating'], test_size=0.2)

    model = CatBoostRegressor(**params_cat)
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_val, y_val)],
        #early_stopping_rounds=10,
        verbose=False,
    )

    cat_pred = model.predict(X_val)
    log_score = rmse(y_val, cat_pred)

    print(n, log_score)

0 2.14485303233334


In [6]:
model = CatBoostRegressor(**params_cat)
model.fit(
    train.drop(['user_id', 'isbn', 'rating'],axis = 1),
    train['rating'],
    verbose=False,
)
pred = model.predict(test.drop(['user_id', 'isbn', 'rating'],axis = 1))
test['rating'] = pred
test.sample(3)

Unnamed: 0,user_id,isbn,rating,book_author,publisher,language,category_high,years,fix_location_state,fix_age,id,bn
23402,125519.0,393046974,8.02995,Andre Dubus III,W.W. Norton & Company,en,fiction,2000,california,50,125519.0,393046974
53048,192176.0,553099957,7.958506,Connie Willis,Bantam,en,fiction,2000,oregon,100,192176.0,553099957
9088,145218.0,380727501,7.496244,Bill Bryson,Perennial,en,travel,2000,kildare,50,145218.0,380727501


In [9]:
test = test[['user_id', 'isbn', 'rating']]
test.to_csv('../submit/20221028_Catboost_n_0.csv', index = False)

In [39]:
model.get_feature_importance()

array([11.55273067,  2.58375524, 11.81268072, 19.06237711,  1.89119746,
        9.13091546, 11.13630988, 18.17682839, 14.65320508])