In [2]:
#import data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_row', 50)

path = '../data/'

train = pd.read_csv(path+'ksy_train_rating_10n.csv')
test = pd.read_csv(path+'ksy_test_rating_10n.csv')
submit = pd.read_csv(path + 'sample_submission.csv')

def rmse(real: list, predict: list) -> float:
    pred = np.array(predict)
    return np.sqrt(np.mean((real-pred) ** 2))

SEED = 42
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

seed_everything(SEED)

print(train.shape)
print(test.shape)

(306795, 10)
(76699, 10)


In [3]:
tem = train['user_id'].value_counts()
tem = list(tem[tem >= 0].index)
train['id'] = train['user_id'].copy()
train['id'][~train['id'].isin(tem)] = -1
test['id'] = test['user_id'].copy()
test['id'][~test['id'].isin(tem)] = -1

In [4]:
tem = train['isbn'].value_counts()
tem = list(tem[tem >= 0].index)
train['bn'] = train['isbn'].copy()
train['bn'][~train['bn'].isin(tem)] = '0500000000'
test['bn'] = test['isbn'].copy()
test['bn'][~test['bn'].isin(tem)] =  '0500000000'


In [5]:
train['bn'].value_counts()

0316666343    566
0971880107    465
0385504209    390
0312195516    307
0060928336    256
             ... 
0451210239      1
0451209788      1
0312982291      1
0451208439      1
0670528951      1
Name: bn, Length: 129777, dtype: int64

In [6]:
train['years'] = train['years'].astype('str')
train['fix_age'] = train['fix_age'].astype('str')
train['id'] = train['id'].astype('str')
train['bn'] = train['bn'].astype('str')

test['years'] = test['years'].astype('str')
test['fix_age'] = test['fix_age'].astype('str')
test['id'] = test['id'].astype('str')
test['bn'] = test['bn'].astype('str')

In [7]:
train['id']

0              8.0
1              8.0
2              8.0
3              8.0
4              8.0
            ...   
306790    278376.0
306791    278621.0
306792    278636.0
306793    278659.0
306794    278713.0
Name: id, Length: 306795, dtype: object

In [8]:
train['user_id']

0              8.0
1              8.0
2              8.0
3              8.0
4              8.0
            ...   
306790    278376.0
306791    278621.0
306792    278636.0
306793    278659.0
306794    278713.0
Name: user_id, Length: 306795, dtype: float64

In [9]:
for col in train.columns:
    print(train[col].nunique())

59803
129777
10
2102
1217
11
209
5
234
7
59803
129777


In [10]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76699 entries, 0 to 76698
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             76699 non-null  float64
 1   isbn                76699 non-null  object 
 2   rating              76699 non-null  float64
 3   book_author         76699 non-null  object 
 4   publisher           76699 non-null  object 
 5   language            76699 non-null  object 
 6   category_high       76699 non-null  object 
 7   years               76699 non-null  object 
 8   fix_location_state  76699 non-null  object 
 9   fix_age             76699 non-null  object 
 10  id                  76699 non-null  object 
 11  bn                  76699 non-null  object 
dtypes: float64(2), object(10)
memory usage: 7.0+ MB


In [122]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

params_cat = {
        "task_type" : "GPU",
        "devices" : '0',
        "random_state": SEED,
        "learning_rate": 0.05,
        "n_estimators": 2000,
        "verbose" : 1,
        "objective" : "RMSE",
        "max_depth": 8,#trial.suggest_int("max_depth", 1, 16),
        "colsample_bylevel": 1,#trial.suggest_float("colsample_bylevel", 0.8, 1.0),
        #"subsample": 0.8, #trial.suggest_float("subsample", 0.3, 1.0), GPU 사용시 안될수도.
        "min_child_samples": 50, #trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": 300, #trial.suggest_int("max_bin", 200, 500),
        "cat_features" : ['book_author', 'publisher', 'language', 'category_high', 'years', 'fix_location_state', 'fix_age', 'id','bn'],
        "one_hot_max_size" : 256
}

X_tr, X_val, y_tr, y_val = train_test_split(train.drop(['user_id', 'isbn', 'rating'],axis = 1), train['rating'], test_size=0.2)

model = CatBoostRegressor(**params_cat)
model.fit(
    X_tr,
    y_tr,
    eval_set=[(X_val, y_val)],
    #early_stopping_rounds=10,
    verbose=False,
)

cat_pred = model.predict(X_val)
log_score = rmse(y_val, cat_pred)

In [123]:
log_score

2.1492861762158175

In [124]:
model = CatBoostRegressor(**params_cat)
model.fit(
    train.drop(['user_id', 'isbn', 'rating'],axis = 1),
    train['rating'],
    verbose=False,
)
pred = model.predict(test.drop(['user_id', 'isbn', 'rating'],axis = 1))
test['rating'] = pred
test.sample(3)

KeyboardInterrupt: 

In [None]:
submit['rating'] = test['rating']
submit.to_csv('../data/20221027_Catboost_Plueid_real.csv', index = False)

In [None]:
submit['rating']

0        6.504841
1        7.539897
2        7.715839
3        8.038805
4        7.576549
           ...   
76694    6.230977
76695    6.227105
76696    6.699028
76697    4.897236
76698    5.817564
Name: rating, Length: 76699, dtype: float64

In [None]:
test['rating'].mean()

7.058322251464207

In [None]:
model.get_feature_importance()

array([12.82086349, 12.34843084,  1.76719337,  4.56322283,  5.85829562,
       11.96193687,  8.95466609, 35.96012476,  5.76526613])

In [None]:
sum(test['rating'] > 10)

48