In [14]:
#import data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os

import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas warnings about ambiguous assignments. Consider narrowing
# the filter to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

# Use the full option name: pandas resolves abbreviated names by pattern
# matching, which can break if a future release adds a similarly named option.
pd.set_option('display.max_rows', 50)

# Directory that holds the competition CSV files (relative to this notebook).
path = '../data/'

# Pre-processed rating tables and the submission template.
train = pd.read_csv(path+'ksy_train_rating_10n.csv')
test = pd.read_csv(path+'ksy_test_rating_10n.csv')
submit = pd.read_csv(path + 'sample_submission.csv')

def rmse(real: list, predict: list) -> float:
    """Return the root-mean-squared error between observed and predicted values.

    Args:
        real: Observed target values (any array-like: list, ndarray, Series).
        predict: Predicted values, with the same shape as ``real``.

    Returns:
        The RMSE as a plain Python float.

    Raises:
        ValueError: If the two inputs do not have the same shape. Without this
            check NumPy would broadcast e.g. (n, 1) against (n,) into an
            (n, n) matrix and silently return a meaningless score.
    """
    actual = np.asarray(real, dtype=float)
    pred = np.asarray(predict, dtype=float)
    if actual.shape != pred.shape:
        raise ValueError(
            f"shape mismatch: real has shape {actual.shape}, "
            f"predict has shape {pred.shape}"
        )
    return float(np.sqrt(np.mean((actual - pred) ** 2)))

# Single seed shared by every stochastic step in this notebook.
SEED = 42


def seed_everything(seed: int = 42):
    """Seed Python's and NumPy's global RNGs and export PYTHONHASHSEED.

    Args:
        seed: Integer used for ``random``, ``numpy.random`` and the
            ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(SEED)

# Sanity check: (rows, columns) of the training frame, then the test frame.
for frame in (train, test):
    print(frame.shape)

(306795, 10)
(76699, 10)


In [15]:
# Build a coarse user feature `id`: users with at least MIN_USER_RATINGS rows
# in train keep their own id, everyone else is pooled into the bucket -1.
# The frequent-user set is derived from train only and reused for test.
MIN_USER_RATINGS = 10

user_counts = train['user_id'].value_counts()
frequent_users = user_counts[user_counts >= MIN_USER_RATINGS].index

# Series.where keeps values where the mask is True and writes the fallback
# elsewhere, returning a new Series. This replaces the chained assignment
# `train['id'][mask] = -1`, which writes through an intermediate object and
# is not guaranteed to update the frame (it is a no-op under copy-on-write).
train['id'] = train['user_id'].where(train['user_id'].isin(frequent_users), -1)
test['id'] = test['user_id'].where(test['user_id'].isin(frequent_users), -1)

In [16]:
# Build a coarse book feature `bn`: ISBNs with at least MIN_ISBN_RATINGS rows
# in train keep their own value, all others share one placeholder bucket.
# The frequent-ISBN set is derived from train only and reused for test.
MIN_ISBN_RATINGS = 10
# NOTE(review): assumes no real ISBN in the data equals this placeholder - confirm.
RARE_ISBN_BUCKET = 1000000000

isbn_counts = train['isbn'].value_counts()
frequent_isbns = isbn_counts[isbn_counts >= MIN_ISBN_RATINGS].index

# Series.where returns a new Series instead of relying on the chained
# assignment `train['bn'][mask] = ...`, which is not guaranteed to write back
# to the frame (it is a no-op under copy-on-write).
train['bn'] = train['isbn'].where(train['isbn'].isin(frequent_isbns), RARE_ISBN_BUCKET)
test['bn'] = test['isbn'].where(test['isbn'].isin(frequent_isbns), RARE_ISBN_BUCKET)

In [17]:
# These columns are listed in `cat_features` for CatBoost, so store them as
# strings in both frames (train first, then test).
STRING_COLUMNS = ('years', 'fix_age', 'id', 'bn')

for frame in (train, test):
    for column in STRING_COLUMNS:
        frame[column] = frame[column].astype('str')

In [18]:
# Preview the first four test rows, including the derived `id` / `bn` columns.
test.head(4)

Unnamed: 0,user_id,isbn,rating,book_author,publisher,language,category_high,years,fix_location_state,fix_age,id,bn
0,11676.0,2005018,0.0,others,Flamingo,en,actresses,2020,california,10,11676.0,1000000000
1,116866.0,2005018,0.0,others,Flamingo,en,actresses,2020,ontario,10,116866.0,1000000000
2,152827.0,60973129,0.0,others,Perennial,en,others,2000,ontario,50,-1.0,1000000000
3,157969.0,374157065,0.0,others,Farrar Straus Giroux,en,medical,2000,colorado,35,-1.0,1000000000


In [19]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

# Keyword arguments unpacked into CatBoostRegressor(**params_cat) below.
# The trailing `trial.suggest_*` comments appear to record the search ranges
# from an earlier hyperparameter-tuning run (the `trial` object is not defined
# in this notebook).
# NOTE(review): "task_type": "GPU" means this cell presumably needs a CUDA
# device - confirm before running on a CPU-only machine.
params_cat = {
        "task_type" : "GPU",
        "devices" : '0',
        "random_state": SEED,
        "learning_rate": 0.005,
        "n_estimators": 2000,
        "verbose" : 1,
        "objective" : "RMSE",
        "max_depth": 10,#trial.suggest_int("max_depth", 1, 16),
        "colsample_bylevel": 1,#trial.suggest_float("colsample_bylevel", 0.8, 1.0),
        #"subsample": 0.8, #trial.suggest_float("subsample", 0.3, 1.0), may not work when running on GPU.
        "min_child_samples": 50, #trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": 300, #trial.suggest_int("max_bin", 200, 500),
        # Every model input column is treated as categorical.
        "cat_features" : ['book_author', 'publisher', 'language', 'category_high', 'years', 'fix_location_state', 'fix_age','id','bn']
}

# Model inputs: every column except the raw identifiers and the target.
train_features = train.drop(['user_id', 'isbn', 'rating'], axis=1)

# 80/20 hold-out split. The seed is passed explicitly so the split no longer
# depends on the state of NumPy's global RNG, i.e. on which cells happened to
# run before this one.
X_tr, X_val, y_tr, y_val = train_test_split(
    train_features,
    train['rating'],
    test_size=0.2,
    random_state=SEED,
)

model = CatBoostRegressor(**params_cat)
model.fit(
    X_tr,
    y_tr,
    eval_set=[(X_val, y_val)],
    #early_stopping_rounds=10,  # early stopping intentionally left disabled
    verbose=False,
)

# Validation RMSE on the held-out 20 %.
# NOTE: despite its name, `log_score` holds a plain RMSE (no log transform).
cat_pred = model.predict(X_val)
log_score = rmse(y_val, cat_pred)

In [20]:
# Display the hold-out RMSE computed by rmse(y_val, cat_pred) in the previous
# cell (the variable name says "log", but no log transform is applied).
log_score

2.18521831168444

In [21]:
# Refit on the complete training set with the same hyperparameters, then score
# the test rows. `model` is rebound here, replacing the hold-out model.
non_feature_cols = ['user_id', 'isbn', 'rating']

model = CatBoostRegressor(**params_cat)
model.fit(train.drop(columns=non_feature_cols), train['rating'], verbose=False)

# Overwrite the test frame's `rating` column with the model's predictions.
pred = model.predict(test.drop(columns=non_feature_cols))
test['rating'] = pred

# Spot-check three random rows of the scored test frame.
test.sample(3)

Unnamed: 0,user_id,isbn,rating,book_author,publisher,language,category_high,years,fix_location_state,fix_age,id,bn
23402,125519.0,393046974,8.366275,others,W.W. Norton & Company,en,fiction,2000,california,50,125519.0,393046974
53048,192176.0,553099957,7.156848,Connie Willis,Bantam,en,fiction,2000,oregon,100,-1.0,1000000000
9088,145218.0,380727501,7.425812,Bill Bryson,Perennial,en,travel,2000,kildare,50,-1.0,380727501


In [22]:
# Copy predictions into the submission template by position.
# Assigning a Series aligns on index labels, so a row-count mismatch would
# silently produce NaN (or drop predictions); fail loudly instead.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as
# the test file - confirm.
if len(submit) != len(test):
    raise ValueError(
        f"submission template has {len(submit)} rows but test has {len(test)}"
    )
submit['rating'] = test['rating'].to_numpy()
submit.to_csv('../data/20221027_Catboost_Plueid_real.csv', index = False)

In [None]:
``

In [23]:
# Mean of the predicted ratings (test['rating'] was overwritten with the
# model's predictions in the refit cell).
test['rating'].mean()

7.065673214939921

In [24]:
# Importance scores from the model refit on the full training set.
# NOTE(review): the array is unlabelled; presumably one value per feature
# column in the order passed to fit() - confirm against the CatBoost docs.
model.get_feature_importance()

array([12.30924967,  8.69239693,  0.81356884,  2.9380092 ,  4.64537524,
        5.83521268,  5.84544522, 47.93507372, 10.9856685 ])

In [25]:
# Number of predictions above 10. Uses the vectorized Series reduction instead
# of the builtin sum(), which iterates element by element in Python;
# int() keeps the displayed value a plain integer.
int((test['rating'] > 10).sum())

0