# XGBoost와 Hyper Parameter Tuning

## 기본 데이터

### 0. 데이터 불러오기

In [3]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb



In [6]:
base_path = os.path.join(os.curdir, 'data')
userv2_path = os.path.join(base_path, 'users_v2.csv') # age를 random 추출
bookv4_path = os.path.join(base_path, 'books_v4.csv') # 증원님까지 합친 데이터
rating_path = os.path.join(base_path, 'train_ratings.csv')

In [7]:
# 파일 불러와서 차원 파악
usersv2 = pd.read_csv(userv2_path, encoding='utf-8')
bookv4 = pd.read_csv(bookv4_path, encoding='utf-8')
ratings = pd.read_csv(rating_path, encoding='utf-8')

merge_ = ratings.merge(bookv4, how='left', on='isbn')
data = merge_.merge(usersv2, how='inner', on='user_id')
data.nunique()

user_id                 59803
isbn                   129777
rating                     10
book_title             115473
book_author             54716
year_of_publication        92
publisher                1408
img_url                129777
language                   24
summary                 69758
img_path               129777
category                   15
location                13888
age                        91
dtype: int64

In [5]:
import re
data['book_author'] = data['book_author'].apply(lambda x: re.sub(r'[^\w\s]', '', x).lower())
data['publisher'] = data['publisher'].apply(lambda x: re.sub(r'[^\w\s]', '', x).lower())

In [6]:
data.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,img_url,language,summary,img_path,category,location,age
0,8,0002005018,4,clara callan,richard bruce wright,2001.0,collins,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,fiction1,"timmins,ontario,canada",32.0
1,8,074322678X,4,where you ll find me and other stories,ann beattie,2002.0,pocket,http://images.amazon.com/images/P/074322678X.0...,en,now back in print ann beattie 39 s finest shor...,images/074322678X.01.THUMBZZZ.jpg,fiction1,"timmins,ontario,canada",32.0
2,8,0887841740,2,the middle stories,sheila heti,2004.0,harperbusiness,http://images.amazon.com/images/P/0887841740.0...,en,,images/0887841740.01.THUMBZZZ.jpg,fiction1,"timmins,ontario,canada",32.0
3,8,1552041778,2,jane doe,r j kaiser,1999.0,firefly books ltd,http://images.amazon.com/images/P/1552041778.0...,en,,images/1552041778.01.THUMBZZZ.jpg,fiction1,"timmins,ontario,canada",32.0
4,8,1567407781,6,the witchfinder amos walker mystery series,loren d estleman,1998.0,llewellyn publications,http://images.amazon.com/images/P/1567407781.0...,en,,images/1567407781.01.THUMBZZZ.jpg,fiction1,"timmins,ontario,canada",32.0


### 1. 간단한 테스트
- category, book_author, publisher, age 만 feature로 선택

In [7]:
# 기타 카테고리로 채움 
data['category'].fillna('etc', inplace=True)

In [8]:
# 피처 선택과 타겟 선택 
X_cat = data['category']
X_author = data['book_author']
X_publisher = data['publisher']
X_age = data['age']
y = data['rating']

In [9]:
# 라벨 인코딩
le_cat = LabelEncoder()
le_author = LabelEncoder()
le_publisher = LabelEncoder()
X_cat = le_cat.fit_transform(X_cat)
X_author = le_author.fit_transform(X_author)
X_publisher = le_publisher.fit_transform(X_author)

In [10]:
# age는 vectorize
vectorizer = TfidfVectorizer()
X_age = vectorizer.fit_transform(X_age.astype(str))

In [11]:
# concat 후 X로 넘기기
X = pd.concat([pd.Series(X_cat), pd.Series(X_author), pd.Series(X_publisher), pd.DataFrame(X_age.toarray())], axis=1)

In [12]:
# 중복되는 column 확인 
X.columns[X.columns.duplicated()]

Int64Index([0, 1, 2], dtype='int64')

In [13]:
# 중복 column 을 unique하게 설정 
new_columns = []
for i, col in enumerate(X.columns):
    if X.columns.duplicated()[i]:
        new_columns.append(f"{col}_{i}")
    else:
        new_columns.append(col)
X.columns = new_columns

In [14]:
# train, test dataset 나누기 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# XGBoost 모델 학습 및 기본 파라미터 설정 
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
param = {                                       # 파라미터 설정
    'max_depth': 10,                            # 트리의 최대 깊이
    'eta': 0.4,                                 # 학습률
    'objective': 'reg:squarederror',            # 회귀 문제인 경우 reg:squarederror 사용
}
num_round = 100                                 # 트리 개수
bst = xgb.train(param, dtrain, num_round)

In [26]:
# 위 파라미터로 설정 시 RMSE 확인 
preds = bst.predict(dtest)
rmse = ((preds - y_test) ** 2).mean() ** 0.5
print("RMSE:", rmse)

RMSE: 2.389406015593661


#### rounds

In [27]:
# num_boost_round 설정 후 best model 저장 
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
param = {
    'max_depth': 10, 
    'eta': 0.4,
    'objective': 'reg:squarederror',
}
num_round = 100
bst = xgb.train(param, dtrain, num_round, num_boost_round=999,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10)

[0]	Test-rmse:4.61843
[1]	Test-rmse:3.37601
[2]	Test-rmse:2.79667
[3]	Test-rmse:2.55763
[4]	Test-rmse:2.46010
[5]	Test-rmse:2.42671
[6]	Test-rmse:2.41415
[7]	Test-rmse:2.40965
[8]	Test-rmse:2.40665
[9]	Test-rmse:2.40469
[10]	Test-rmse:2.40283
[11]	Test-rmse:2.40273
[12]	Test-rmse:2.40280
[13]	Test-rmse:2.40254
[14]	Test-rmse:2.40231
[15]	Test-rmse:2.40228
[16]	Test-rmse:2.39945
[17]	Test-rmse:2.39952
[18]	Test-rmse:2.39968
[19]	Test-rmse:2.39958
[20]	Test-rmse:2.39925
[21]	Test-rmse:2.39919
[22]	Test-rmse:2.39845
[23]	Test-rmse:2.39829
[24]	Test-rmse:2.39698
[25]	Test-rmse:2.39698
[26]	Test-rmse:2.39660
[27]	Test-rmse:2.39674
[28]	Test-rmse:2.39677
[29]	Test-rmse:2.39596
[30]	Test-rmse:2.39539
[31]	Test-rmse:2.39479
[32]	Test-rmse:2.39482
[33]	Test-rmse:2.39401
[34]	Test-rmse:2.39387
[35]	Test-rmse:2.39344
[36]	Test-rmse:2.39334
[37]	Test-rmse:2.39322
[38]	Test-rmse:2.39329
[39]	Test-rmse:2.39334
[40]	Test-rmse:2.39281
[41]	Test-rmse:2.39221
[42]	Test-rmse:2.39220
[43]	Test-rmse:2.3915

In [29]:
# best RMSE 의 rounds 확인 
print("Best RMSE: {:.2f} with {} rounds".format(
                 bst.best_score,
                 bst.best_iteration+1))

Best RMSE: 2.39 with 55 rounds


In [30]:
# cross validation 
cv_results = xgb.cv( 
    param, 
    dtrain, 
    num_boost_round=999, 
    seed=42, 
    nfold=5, 
    metrics={'rmse'}, 
    early_stopping_rounds=10 
)

cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,4.627793,0.001134,4.628926,0.004887
1,3.380854,0.001983,3.384252,0.005551
2,2.794511,0.001792,2.802341,0.007884
3,2.547568,0.004111,2.559324,0.008233
4,2.448483,0.00382,2.46401,0.009607
5,2.410624,0.003532,2.428613,0.009741
6,2.393455,0.003321,2.415037,0.009735
7,2.384015,0.003802,2.409734,0.010582
8,2.378465,0.002448,2.40721,0.011413
9,2.375259,0.002824,2.406123,0.01173


In [32]:
# Cross validation 결과 확인 
cv_results['test-rmse-mean'].min()

2.397117829900066

#### max_depth, min_child_weight

In [33]:
# 하아퍼 파라미터 grid search 수행 
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

In [35]:
# 기본 파라미터 설정 
param = {
    'max_depth': 6,
    'eta': 0.3,
    'objective': 'reg:squarederror', 
}

In [36]:
# initialize
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # 파라미터 업데이트 
    param['max_depth'] = max_depth
    param['min_child_weight'] = min_child_weight
    # Cross validation 
    cv_results = xgb.cv(
        param,
        dtrain,
        num_boost_round=999,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # best RMSE에 대해 파라미터 업데이트 
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))

CV with max_depth=9, min_child_weight=5
	RMSE 2.3909665950694583 for 103 rounds
CV with max_depth=9, min_child_weight=6
	RMSE 2.3909543619489884 for 104 rounds
CV with max_depth=9, min_child_weight=7
	RMSE 2.3909585006885523 for 89 rounds
CV with max_depth=10, min_child_weight=5
	RMSE 2.3912565038170017 for 96 rounds
CV with max_depth=10, min_child_weight=6
	RMSE 2.3908701617133876 for 73 rounds
CV with max_depth=10, min_child_weight=7
	RMSE 2.3898018431797765 for 95 rounds
CV with max_depth=11, min_child_weight=5
	RMSE 2.391966742751217 for 67 rounds
CV with max_depth=11, min_child_weight=6
	RMSE 2.390831684064169 for 71 rounds
CV with max_depth=11, min_child_weight=7
	RMSE 2.3914210533135587 for 72 rounds
Best params: 10, 7, RMSE: 2.3898018431797765


In [37]:
# 위 결과로 max_depth, min_child_weight 설정 
param['max_depth'] = 10
param['min_child_weight'] = 7

#### Subsampling, Colsampling 

In [38]:
# subsampling, colsampling grid search 정의 
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [39]:
min_rmse = float("Inf")
best_params = None
# 큰 값 -> 작은 값으로 최적의 파라미터 탐색 
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # 파라미터 업데이트 
    param['subsample'] = subsample
    param['colsample_bytree'] = colsample
    # Cross validation
    cv_results = xgb.cv(
        param,
        dtrain,
        num_boost_round=999,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # best score 업데이트  
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (subsample,colsample)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))


CV with subsample=1.0, colsample=1.0
	RMSE 2.3898018431797765 for 95 rounds
CV with subsample=1.0, colsample=0.9
	RMSE 2.3913196958983347 for 119 rounds
CV with subsample=1.0, colsample=0.8
	RMSE 2.3905695508774256 for 76 rounds
CV with subsample=1.0, colsample=0.7
	RMSE 2.389008337944726 for 111 rounds
CV with subsample=0.9, colsample=1.0
	RMSE 2.3913417759826623 for 92 rounds
CV with subsample=0.9, colsample=0.9
	RMSE 2.3913535744288033 for 92 rounds
CV with subsample=0.9, colsample=0.8
	RMSE 2.3900899859381726 for 90 rounds
CV with subsample=0.9, colsample=0.7
	RMSE 2.3888946411354928 for 83 rounds
CV with subsample=0.8, colsample=1.0
	RMSE 2.3923349221431005 for 71 rounds
CV with subsample=0.8, colsample=0.9
	RMSE 2.3918959684289938 for 84 rounds
CV with subsample=0.8, colsample=0.8
	RMSE 2.3922983963566846 for 71 rounds
CV with subsample=0.8, colsample=0.7
	RMSE 2.3915865734197483 for 80 rounds
CV with subsample=0.7, colsample=1.0
	RMSE 2.395132599930482 for 67 rounds
CV with subs

In [40]:
# 위 결과를 통해 최적의 subsample, colsample 찾음
param['subsample'] = 0.9
param['colsample_bytree'] = 0.7

#### eta

In [None]:
%time
min_rmse = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # 파라미터 업데이트
    param['eta'] = eta
    # cross validation 
    %time cv_results = xgb.cv(param,dtrain,num_boost_round=999,seed=42,nfold=5,metrics=['rmse'],early_stopping_rounds=10)
    # best score 업데이트 
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds\n".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = eta
print("Best params: {}, RMSE: {}".format(best_params, min_rmse))

In [42]:
# 위 결과를 통해 best eta 설정
param['eta'] = 0.05

In [43]:
# 최종 Param 확인 
param

{'max_depth': 10,
 'eta': 0.05,
 'objective': 'reg:squarederror',
 'min_child_weight': 7,
 'subsample': 0.9,
 'colsample_bytree': 0.7}

In [44]:
# 최종 Test
model = xgb.train(
    param,
    dtrain,
    num_boost_round=999,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-rmse:6.68867
[1]	Test-rmse:6.39872
[2]	Test-rmse:6.12505
[3]	Test-rmse:5.86729
[4]	Test-rmse:5.62437
[5]	Test-rmse:5.39572
[6]	Test-rmse:5.18068
[7]	Test-rmse:4.97858
[8]	Test-rmse:4.78880
[9]	Test-rmse:4.61124
[10]	Test-rmse:4.44461
[11]	Test-rmse:4.28877
[12]	Test-rmse:4.14299
[13]	Test-rmse:4.00706
[14]	Test-rmse:3.88026
[15]	Test-rmse:3.76231
[16]	Test-rmse:3.65256
[17]	Test-rmse:3.55008
[18]	Test-rmse:3.45529
[19]	Test-rmse:3.36711
[20]	Test-rmse:3.28592
[21]	Test-rmse:3.21086
[22]	Test-rmse:3.14132
[23]	Test-rmse:3.07732
[24]	Test-rmse:3.01854
[25]	Test-rmse:2.96462
[26]	Test-rmse:2.91491
[27]	Test-rmse:2.86908
[28]	Test-rmse:2.82737
[29]	Test-rmse:2.78916
[30]	Test-rmse:2.75431
[31]	Test-rmse:2.72246
[32]	Test-rmse:2.69341
[33]	Test-rmse:2.66690
[34]	Test-rmse:2.64264
[35]	Test-rmse:2.62063
[36]	Test-rmse:2.60057
[37]	Test-rmse:2.58226
[38]	Test-rmse:2.56581
[39]	Test-rmse:2.55085
[40]	Test-rmse:2.53714
[41]	Test-rmse:2.52480
[42]	Test-rmse:2.51373
[43]	Test-rmse:2.5036

In [45]:
print("Best RMSE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))

Best RMSE: 2.37 in 749 rounds


## 주의님 데이터

In [8]:
topic_path = os.path.join(base_path, 'books_topic_modeling_v2.csv')
topic_book = pd.read_csv(topic_path, encoding='utf-8')

# books 합치기 
topic_book = topic_book[['isbn', 'summary_topic', 'category_topic']]
merge_book = bookv4.merge(topic_book, how='left', on='isbn')
merge_book.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 149570 entries, 0 to 149569
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   isbn                 149570 non-null  object 
 1   book_title           149570 non-null  object 
 2   book_author          149570 non-null  object 
 3   year_of_publication  149570 non-null  float64
 4   publisher            149570 non-null  object 
 5   img_url              149570 non-null  object 
 6   language             149570 non-null  object 
 7   summary              82340 non-null   object 
 8   img_path             149570 non-null  object 
 9   category             148582 non-null  object 
 10  summary_topic        82343 non-null   float64
 11  category_topic       80719 non-null   float64
dtypes: float64(3), object(9)
memory usage: 14.8+ MB


In [9]:
merge_book.nunique()

isbn                   149570
book_title             132713
book_author             62059
year_of_publication        95
publisher                1523
img_url                149570
language                   26
summary                 79521
img_path               149570
category                   15
summary_topic             350
category_topic            112
dtype: int64

In [71]:
merge_book.isnull().sum()

isbn                       0
book_title                 0
book_author                0
year_of_publication        0
publisher                  0
img_url                    0
language                   0
summary                67230
img_path                   0
category                 988
summary_topic          67227
category_topic         68851
dtype: int64

In [72]:
merge_book.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_url,language,summary,img_path,category,summary_topic,category_topic
0,2005018,clara callan,Richard Bruce Wright,2001.0,Collins,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,fiction1,-1.0,108.0
1,60973129,decision in normandy,Carlo D'Este,1991.0,Perennial,http://images.amazon.com/images/P/0060973129.0...,en,here for the first time in paperback is an out...,images/0060973129.01.THUMBZZZ.jpg,fiction1,-1.0,68.0
2,374157065,flu the story of the great influenza pandemic ...,Gina Bari Kolata,1999.0,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,en,describes the great flu epidemic of 1918 an ou...,images/0374157065.01.THUMBZZZ.jpg,fiction1,131.0,6.0
3,399135782,the kitchen god s wife,Amy Tan,1991.0,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,en,a chinese immigrant who is convinced she is dy...,images/0399135782.01.THUMBZZZ.jpg,fiction1,31.0,33.0
4,425176428,what if the world s foremost military historia...,Robert Cowley,2000.0,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,en,essays by respected military historians includ...,images/0425176428.01.THUMBZZZ.jpg,fiction1,-1.0,21.0


In [10]:
# 특수문자 제거 및 소문자 변환
import re
merge_book['book_author'] = merge_book['book_author'].apply(lambda x: re.sub(r'[^\w\s]', '', x).lower())
merge_book['publisher'] = merge_book['publisher'].apply(lambda x: re.sub(r'[^\w\s]', '', x).lower())

In [74]:
merge_book.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_url,language,summary,img_path,category,summary_topic,category_topic
0,2005018,clara callan,richard bruce wright,2001.0,collins,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,fiction1,-1.0,108.0
1,60973129,decision in normandy,carlo deste,1991.0,perennial,http://images.amazon.com/images/P/0060973129.0...,en,here for the first time in paperback is an out...,images/0060973129.01.THUMBZZZ.jpg,fiction1,-1.0,68.0
2,374157065,flu the story of the great influenza pandemic ...,gina bari kolata,1999.0,farrar straus giroux,http://images.amazon.com/images/P/0374157065.0...,en,describes the great flu epidemic of 1918 an ou...,images/0374157065.01.THUMBZZZ.jpg,fiction1,131.0,6.0
3,399135782,the kitchen god s wife,amy tan,1991.0,putnam pub group,http://images.amazon.com/images/P/0399135782.0...,en,a chinese immigrant who is convinced she is dy...,images/0399135782.01.THUMBZZZ.jpg,fiction1,31.0,33.0
4,425176428,what if the world s foremost military historia...,robert cowley,2000.0,berkley publishing group,http://images.amazon.com/images/P/0425176428.0...,en,essays by respected military historians includ...,images/0425176428.01.THUMBZZZ.jpg,fiction1,-1.0,21.0


In [75]:
# 계속 사용할 것 같아서 따로 저장 
# merge_book.to_csv('books_v5_merge.csv', index=False)

In [80]:
merge_book.size

1794840

In [11]:
merge_ = ratings.merge(merge_book, how='left', on='isbn')
df = merge_.merge(usersv2, how='inner', on='user_id')
df.nunique()

user_id                 59803
isbn                   129777
rating                     10
book_title             115473
book_author             52679
year_of_publication        92
publisher                1402
img_url                129777
language                   24
summary                 69758
img_path               129777
category                   15
summary_topic             350
category_topic            112
location                13888
age                        91
dtype: int64

In [77]:
df.isna().sum()

user_id                     0
isbn                        0
rating                      0
book_title                  0
book_author                 0
year_of_publication         0
publisher                   0
img_url                     0
language                    0
summary                119086
img_path                    0
category                  894
summary_topic          119084
category_topic         121221
location                    0
age                         0
dtype: int64

In [81]:
df.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,img_url,language,summary,img_path,category,summary_topic,category_topic,location,age
0,8,0002005018,4,clara callan,richard bruce wright,2001.0,collins,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,fiction1,-1.0,108.0,"timmins,ontario,canada",32.0
1,8,074322678X,4,where you ll find me and other stories,ann beattie,2002.0,pocket,http://images.amazon.com/images/P/074322678X.0...,en,now back in print ann beattie 39 s finest shor...,images/074322678X.01.THUMBZZZ.jpg,fiction1,-1.0,33.0,"timmins,ontario,canada",32.0
2,8,0887841740,2,the middle stories,sheila heti,2004.0,harperbusiness,http://images.amazon.com/images/P/0887841740.0...,en,,images/0887841740.01.THUMBZZZ.jpg,fiction1,,,"timmins,ontario,canada",32.0
3,8,1552041778,2,jane doe,r j kaiser,1999.0,firefly books ltd,http://images.amazon.com/images/P/1552041778.0...,en,,images/1552041778.01.THUMBZZZ.jpg,fiction1,,,"timmins,ontario,canada",32.0
4,8,1567407781,6,the witchfinder amos walker mystery series,loren d estleman,1998.0,llewellyn publications,http://images.amazon.com/images/P/1567407781.0...,en,,images/1567407781.01.THUMBZZZ.jpg,fiction1,,,"timmins,ontario,canada",32.0


In [83]:
df['category'].fillna('etc', inplace=True)

In [84]:
X_cat = df['category']
X_cat_topic = df['category_topic']
X_sum_topic = df['summary_topic']
X_author = df['book_author']
X_publisher = df['publisher']
X_age = df['age']
y = df['rating']

In [85]:
le_cat = LabelEncoder()
le_cat_topic = LabelEncoder()
le_sum_topic = LabelEncoder()
le_author = LabelEncoder()
le_publisher = LabelEncoder()
X_cat = le_cat.fit_transform(X_cat)
X_cat_topic = le_cat_topic.fit_transform(X_cat_topic)
X_sum_topic = le_sum_topic.fit_transform(X_sum_topic)
X_author = le_author.fit_transform(X_author)
X_publisher = le_publisher.fit_transform(X_author)

In [86]:
vectorizer = TfidfVectorizer()
X_age = vectorizer.fit_transform(X_age.astype(str))

In [117]:
X = pd.concat([ # pd.Series(X_cat), 
               pd.Series(X_cat_topic), 
               pd.Series(X_sum_topic), 
               pd.Series(X_author), 
               pd.Series(X_publisher), 
               pd.DataFrame(X_age.toarray())], axis=1)

In [110]:
X.columns[X.columns.duplicated()]

Int64Index([0, 1, 2, 3], dtype='int64')

In [111]:
new_columns = []
for i, col in enumerate(X.columns):
    if X.columns.duplicated()[i]:
        new_columns.append(f"{col}_{i}")
    else:
        new_columns.append(col)
X.columns = new_columns

In [112]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [113]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [114]:
param = {
    'max_depth': 10,
    'eta': 0.5,
    'objective': 'reg:squarederror', 
    'min_child_weight': 7,
    'subsample': 0.9,
    'colsample_bytree': 0.7
}

In [115]:
num_round = 100 
bst = xgb.train(param, dtrain, num_round,num_boost_round=999,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10)

[0]	Test-rmse:4.07818
[1]	Test-rmse:2.92106
[2]	Test-rmse:2.55219
[3]	Test-rmse:2.44712
[4]	Test-rmse:2.41776
[5]	Test-rmse:2.41208
[6]	Test-rmse:2.41009
[7]	Test-rmse:2.40457
[8]	Test-rmse:2.40403
[9]	Test-rmse:2.40226
[10]	Test-rmse:2.40188
[11]	Test-rmse:2.40154
[12]	Test-rmse:2.40130
[13]	Test-rmse:2.40109
[14]	Test-rmse:2.39953
[15]	Test-rmse:2.39618
[16]	Test-rmse:2.39620
[17]	Test-rmse:2.39610
[18]	Test-rmse:2.39638
[19]	Test-rmse:2.39625
[20]	Test-rmse:2.39577
[21]	Test-rmse:2.39562
[22]	Test-rmse:2.39466
[23]	Test-rmse:2.39465
[24]	Test-rmse:2.39440
[25]	Test-rmse:2.39459
[26]	Test-rmse:2.39481
[27]	Test-rmse:2.39427
[28]	Test-rmse:2.39453
[29]	Test-rmse:2.39433
[30]	Test-rmse:2.39457
[31]	Test-rmse:2.39409
[32]	Test-rmse:2.39420
[33]	Test-rmse:2.39458
[34]	Test-rmse:2.39432
[35]	Test-rmse:2.39476
[36]	Test-rmse:2.39488
[37]	Test-rmse:2.39497
[38]	Test-rmse:2.39488
[39]	Test-rmse:2.39490
[40]	Test-rmse:2.39520
[41]	Test-rmse:2.39553


In [116]:
print("Best RMSE: {:.2f} with {} rounds".format(
                 bst.best_score,
                 bst.best_iteration+1))

Best RMSE: 2.39 with 32 rounds


In [108]:
cv_results = xgb.cv( 
    param, 
    dtrain, 
    num_boost_round=999, 
    seed=42, 
    nfold=5, 
    metrics={'rmse'}, 
    early_stopping_rounds=10 
)
cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,6.700390,0.001136,6.700476,0.004881
1,6.409855,0.001072,6.410040,0.004894
2,6.135848,0.001041,6.136182,0.004921
3,5.877569,0.001010,5.878074,0.004925
4,5.634294,0.000916,5.634971,0.005003
...,...,...,...,...
666,2.262000,0.002034,2.385621,0.010174
667,2.261831,0.002002,2.385628,0.010179
668,2.261709,0.001999,2.385607,0.010173
669,2.261568,0.002045,2.385619,0.010167


In [96]:
cv_results['test-rmse-mean'].min()

2.4001076951416223

In [97]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

In [98]:
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    
    param['max_depth'] = max_depth
    param['min_child_weight'] = min_child_weight
    
    cv_results = xgb.cv(
        param,
        dtrain,
        num_boost_round=999,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))

CV with max_depth=9, min_child_weight=5
	RMSE 2.400714287310458 for 41 rounds
CV with max_depth=9, min_child_weight=6
	RMSE 2.398915658307208 for 26 rounds
CV with max_depth=9, min_child_weight=7
	RMSE 2.399430269799994 for 29 rounds
CV with max_depth=10, min_child_weight=5
	RMSE 2.401736702828961 for 25 rounds
CV with max_depth=10, min_child_weight=6
	RMSE 2.401832768722494 for 25 rounds
CV with max_depth=10, min_child_weight=7
	RMSE 2.4001076951416223 for 25 rounds
CV with max_depth=11, min_child_weight=5
	RMSE 2.4032844912250533 for 16 rounds
CV with max_depth=11, min_child_weight=6
	RMSE 2.4027562809294203 for 19 rounds
CV with max_depth=11, min_child_weight=7
	RMSE 2.402036434340466 for 18 rounds
Best params: 9, 6, RMSE: 2.398915658307208


In [99]:
param['max_depth'] = 9
param['min_child_weight'] = 6

In [100]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [101]:
min_rmse = float("Inf")
best_params = None

for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    
    param['subsample'] = subsample
    param['colsample_bytree'] = colsample
    
    cv_results = xgb.cv(
        param,
        dtrain,
        num_boost_round=999,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (subsample,colsample)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))

CV with subsample=1.0, colsample=1.0
	RMSE 2.3987346142483372 for 29 rounds
CV with subsample=1.0, colsample=0.9
	RMSE 2.3980004720439445 for 34 rounds
CV with subsample=1.0, colsample=0.8
	RMSE 2.398648439188441 for 28 rounds
CV with subsample=1.0, colsample=0.7
	RMSE 2.3984340876802386 for 38 rounds
CV with subsample=0.9, colsample=1.0
	RMSE 2.4026657408524206 for 31 rounds
CV with subsample=0.9, colsample=0.9
	RMSE 2.401828322292724 for 25 rounds
CV with subsample=0.9, colsample=0.8
	RMSE 2.401538407865804 for 24 rounds
CV with subsample=0.9, colsample=0.7
	RMSE 2.398915658307208 for 26 rounds
CV with subsample=0.8, colsample=1.0
	RMSE 2.403521513675182 for 20 rounds
CV with subsample=0.8, colsample=0.9
	RMSE 2.4035721185896004 for 21 rounds
CV with subsample=0.8, colsample=0.8
	RMSE 2.403287458382646 for 24 rounds
CV with subsample=0.8, colsample=0.7
	RMSE 2.402512106712691 for 31 rounds
CV with subsample=0.7, colsample=1.0
	RMSE 2.4062311137810966 for 21 rounds
CV with subsample=0

In [102]:
param['subsample'] = 1.0
param['colsample_bytree'] = 0.9

In [None]:
%time
min_rmse = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    
    param['eta'] = eta
    
    %time cv_results = xgb.cv(param,dtrain,num_boost_round=999,seed=42,nfold=5,metrics=['rmse'],early_stopping_rounds=10)
    
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds\n".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = eta
print("Best params: {}, RMSE: {}".format(best_params, min_rmse))

In [104]:
param['eta'] = 0.05

In [105]:
# 최종 Test
model = xgb.train(
    param,
    dtrain,
    num_boost_round=999,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-rmse:6.68856
[1]	Test-rmse:6.39816
[2]	Test-rmse:6.12433
[3]	Test-rmse:5.86626
[4]	Test-rmse:5.62321
[5]	Test-rmse:5.39451
[6]	Test-rmse:5.17934
[7]	Test-rmse:4.97718
[8]	Test-rmse:4.78758
[9]	Test-rmse:4.60966
[10]	Test-rmse:4.44279
[11]	Test-rmse:4.28687
[12]	Test-rmse:4.14111
[13]	Test-rmse:4.00509
[14]	Test-rmse:3.87817
[15]	Test-rmse:3.75998
[16]	Test-rmse:3.65012
[17]	Test-rmse:3.54777
[18]	Test-rmse:3.45291
[19]	Test-rmse:3.36516
[20]	Test-rmse:3.28395
[21]	Test-rmse:3.20883
[22]	Test-rmse:3.13942
[23]	Test-rmse:3.07556
[24]	Test-rmse:3.01648
[25]	Test-rmse:2.96242
[26]	Test-rmse:2.91279
[27]	Test-rmse:2.86727
[28]	Test-rmse:2.82554
[29]	Test-rmse:2.78736
[30]	Test-rmse:2.75243
[31]	Test-rmse:2.72038
[32]	Test-rmse:2.69133
[33]	Test-rmse:2.66483
[34]	Test-rmse:2.64066
[35]	Test-rmse:2.61847
[36]	Test-rmse:2.59848
[37]	Test-rmse:2.58020
[38]	Test-rmse:2.56369
[39]	Test-rmse:2.54870
[40]	Test-rmse:2.53513
[41]	Test-rmse:2.52273
[42]	Test-rmse:2.51155
[43]	Test-rmse:2.5013

In [106]:
print("Best RMSE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))

Best RMSE: 2.38 in 628 rounds


In [107]:
param

{'max_depth': 9,
 'eta': 0.05,
 'objective': 'reg:squarederror',
 'min_child_weight': 6,
 'subsample': 1.0,
 'colsample_bytree': 0.9}

In [None]:
cv_results = xgb.cv( 
    param, 
    dtrain, 
    num_boost_round=999, 
    seed=42, 
    nfold=5, 
    metrics={'rmse'}, 
    early_stopping_rounds=10 
)
cv_results

### 주의님 데이터에서 dropna로 성능만 확인 

In [129]:
merge_book.isnull().sum()

isbn                       0
book_title                 0
book_author                0
year_of_publication        0
publisher                  0
img_url                    0
language                   0
summary                67230
img_path                   0
category                 988
summary_topic          67227
category_topic         68851
dtype: int64

In [130]:
# merge 후 최종 데이터 
drop_merge = ratings.merge(merge_book, how='left', on='isbn')
drop_df = drop_merge.merge(usersv2, how='inner', on='user_id')
drop_df = drop_df.dropna()
drop_df.nunique()

user_id                43364
isbn                   70680
rating                    10
book_title             65404
book_author            32999
year_of_publication       82
publisher                745
img_url                70680
language                  24
summary                68365
img_path               70680
category                  15
summary_topic            350
category_topic           112
location               10765
age                       88
dtype: int64

In [131]:
drop_df.isna().sum()

user_id                0
isbn                   0
rating                 0
book_title             0
book_author            0
year_of_publication    0
publisher              0
img_url                0
language               0
summary                0
img_path               0
category               0
summary_topic          0
category_topic         0
location               0
age                    0
dtype: int64

In [132]:
X_cat = drop_df['category']
X_cat_topic = drop_df['category_topic']
X_sum_topic = drop_df['summary_topic']
X_author = drop_df['book_author']
X_publisher = drop_df['publisher']
X_age = drop_df['age']
y = drop_df['rating']

In [133]:
le_cat = LabelEncoder()
le_cat_topic = LabelEncoder()
le_sum_topic = LabelEncoder()
le_author = LabelEncoder()
le_publisher = LabelEncoder()
X_cat = le_cat.fit_transform(X_cat)
X_cat_topic = le_cat_topic.fit_transform(X_cat_topic)
X_sum_topic = le_sum_topic.fit_transform(X_sum_topic)
X_author = le_author.fit_transform(X_author)
X_publisher = le_publisher.fit_transform(X_author)

In [134]:
vectorizer = TfidfVectorizer()
X_age = vectorizer.fit_transform(X_age.astype(str))

In [135]:
X = pd.concat([pd.Series(X_cat), 
               pd.Series(X_cat_topic), 
               pd.Series(X_sum_topic), 
               pd.Series(X_author), 
               pd.Series(X_publisher), 
               pd.DataFrame(X_age.toarray())], axis=1)

In [136]:
X.columns[X.columns.duplicated()]

Int64Index([0, 1, 2, 3, 4], dtype='int64')

In [137]:
new_columns = []
for i, col in enumerate(X.columns):
    if X.columns.duplicated()[i]:
        new_columns.append(f"{col}_{i}")
    else:
        new_columns.append(col)
X.columns = new_columns

In [138]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [139]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [140]:
param = {
    'max_depth': 10, 
    'eta': 0.5, 
    'objective': 'reg:squarederror', 
    'min_child_weight': 7,
    'subsample': 0.9,
    'colsample_bytree': 0.7
}

In [141]:
num_round = 100  # 트리 개수
bst = xgb.train(param, dtrain, num_round,num_boost_round=999,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10)

[0]	Test-rmse:4.08375
[1]	Test-rmse:2.92138
[2]	Test-rmse:2.55056
[3]	Test-rmse:2.44858
[4]	Test-rmse:2.42247
[5]	Test-rmse:2.41692
[6]	Test-rmse:2.41472
[7]	Test-rmse:2.41099
[8]	Test-rmse:2.41161
[9]	Test-rmse:2.41159
[10]	Test-rmse:2.41218
[11]	Test-rmse:2.41212
[12]	Test-rmse:2.41131
[13]	Test-rmse:2.41147
[14]	Test-rmse:2.41125
[15]	Test-rmse:2.41079
[16]	Test-rmse:2.40942
[17]	Test-rmse:2.40954
[18]	Test-rmse:2.40985
[19]	Test-rmse:2.40954
[20]	Test-rmse:2.40949
[21]	Test-rmse:2.40938
[22]	Test-rmse:2.40923
[23]	Test-rmse:2.40826
[24]	Test-rmse:2.40819
[25]	Test-rmse:2.40932
[26]	Test-rmse:2.40987
[27]	Test-rmse:2.40985
[28]	Test-rmse:2.40995
[29]	Test-rmse:2.40997
[30]	Test-rmse:2.40963
[31]	Test-rmse:2.41032
[32]	Test-rmse:2.41063
[33]	Test-rmse:2.41054


In [142]:
print("Best RMSE: {:.2f} with {} rounds".format(
                 bst.best_score,
                 bst.best_iteration+1))

Best RMSE: 2.41 with 25 rounds


In [143]:
cv_results = xgb.cv( 
    param, 
    dtrain, 
    num_boost_round=999, 
    seed=42, 
    nfold=5, 
    metrics={'rmse'}, 
    early_stopping_rounds=10 
)
cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,4.086951,0.003645,4.088941,0.010058
1,2.9102,0.006031,2.919388,0.009609
2,2.519414,0.003764,2.54018,0.010582
3,2.407134,0.006159,2.434927,0.009739
4,2.370668,0.004691,2.405858,0.011499
5,2.357177,0.005737,2.397989,0.011983
6,2.350372,0.005488,2.395885,0.011778
7,2.344192,0.004606,2.394348,0.011644
8,2.339313,0.004691,2.394449,0.01153
9,2.333945,0.004911,2.394616,0.012249


In [144]:
cv_results['test-rmse-mean'].min()

2.3928867293254497

In [145]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

In [146]:
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    param['max_depth'] = max_depth
    param['min_child_weight'] = min_child_weight

    cv_results = xgb.cv(
        param,
        dtrain,
        num_boost_round=999,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )

    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))

CV with max_depth=9, min_child_weight=5
	RMSE 2.3941712781561506 for 17 rounds
CV with max_depth=9, min_child_weight=6
	RMSE 2.394871836186971 for 12 rounds
CV with max_depth=9, min_child_weight=7
	RMSE 2.3932704441115584 for 14 rounds
CV with max_depth=10, min_child_weight=5
	RMSE 2.3955605532204958 for 12 rounds
CV with max_depth=10, min_child_weight=6
	RMSE 2.3941059703430634 for 12 rounds
CV with max_depth=10, min_child_weight=7
	RMSE 2.3928867293254497 for 13 rounds
CV with max_depth=11, min_child_weight=5
	RMSE 2.3973484470485493 for 13 rounds
CV with max_depth=11, min_child_weight=6
	RMSE 2.3983611301867227 for 7 rounds
CV with max_depth=11, min_child_weight=7
	RMSE 2.3946816586872575 for 8 rounds
Best params: 10, 7, RMSE: 2.3928867293254497


In [147]:
param['max_depth'] = 10
param['min_child_weight'] = 7

In [148]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [149]:
min_rmse = float("Inf")
best_params = None

for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))

    param['subsample'] = subsample
    param['colsample_bytree'] = colsample

    cv_results = xgb.cv(
        param,
        dtrain,
        num_boost_round=999,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )

    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (subsample,colsample)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))

CV with subsample=1.0, colsample=1.0
	RMSE 2.395106292139295 for 8 rounds
CV with subsample=1.0, colsample=0.9
	RMSE 2.395765336233732 for 14 rounds
CV with subsample=1.0, colsample=0.8
	RMSE 2.3941049840679867 for 20 rounds
CV with subsample=1.0, colsample=0.7
	RMSE 2.3932044980489104 for 12 rounds
CV with subsample=0.9, colsample=1.0
	RMSE 2.3966264966413235 for 8 rounds
CV with subsample=0.9, colsample=0.9
	RMSE 2.3948105353706097 for 11 rounds
CV with subsample=0.9, colsample=0.8
	RMSE 2.395609800658901 for 13 rounds
CV with subsample=0.9, colsample=0.7
	RMSE 2.3928867293254497 for 13 rounds
CV with subsample=0.8, colsample=1.0
	RMSE 2.3982658962760377 for 11 rounds
CV with subsample=0.8, colsample=0.9
	RMSE 2.398234544671859 for 8 rounds
CV with subsample=0.8, colsample=0.8
	RMSE 2.3972249819225393 for 8 rounds
CV with subsample=0.8, colsample=0.7
	RMSE 2.3971540008724226 for 16 rounds
CV with subsample=0.7, colsample=1.0
	RMSE 2.399448286484978 for 7 rounds
CV with subsample=0.7,

In [150]:
param['subsample'] = 0.9
param['colsample_bytree'] = 0.7

In [151]:
%time

min_rmse = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))

    param['eta'] = eta

    %time cv_results = xgb.cv(param,dtrain,num_boost_round=999,seed=42,nfold=5,metrics=['rmse'],early_stopping_rounds=10)

    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds\n".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = eta
print("Best params: {}, RMSE: {}".format(best_params, min_rmse))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.15 µs
CV with eta=0.3
CPU times: user 3min 26s, sys: 632 ms, total: 3min 26s
Wall time: 27.5 s
	RMSE 2.3858414736242652 for 38 rounds

CV with eta=0.2
CPU times: user 6min 23s, sys: 856 ms, total: 6min 24s
Wall time: 49.8 s
	RMSE 2.381420386752217 for 86 rounds

CV with eta=0.1
CPU times: user 13min 54s, sys: 1.55 s, total: 13min 56s
Wall time: 1min 46s
	RMSE 2.3754456830275634 for 200 rounds

CV with eta=0.05
CPU times: user 26min 38s, sys: 2.41 s, total: 26min 40s
Wall time: 3min 24s
	RMSE 2.373027096400185 for 392 rounds

CV with eta=0.01
CPU times: user 1h 6min 12s, sys: 6.24 s, total: 1h 6min 18s
Wall time: 8min 26s
	RMSE 2.3766976720868422 for 998 rounds

CV with eta=0.005
CPU times: user 1h 7min 56s, sys: 8.15 s, total: 1h 8min 4s
Wall time: 8min 43s
	RMSE 2.3847519984919336 for 998 rounds

Best params: 0.05, RMSE: 2.373027096400185


In [152]:
param['eta'] = 0.05

In [153]:
param

{'max_depth': 10,
 'eta': 0.05,
 'objective': 'reg:squarederror',
 'min_child_weight': 7,
 'subsample': 0.9,
 'colsample_bytree': 0.7}

In [154]:
# 최종 Test
model = xgb.train(
    param,
    dtrain,
    num_boost_round=999,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-rmse:6.70338
[1]	Test-rmse:6.41247
[2]	Test-rmse:6.13798
[3]	Test-rmse:5.87912
[4]	Test-rmse:5.63529
[5]	Test-rmse:5.40588
[6]	Test-rmse:5.19000
[7]	Test-rmse:4.98740
[8]	Test-rmse:4.79725
[9]	Test-rmse:4.61887
[10]	Test-rmse:4.45208
[11]	Test-rmse:4.29563
[12]	Test-rmse:4.14970
[13]	Test-rmse:4.01325
[14]	Test-rmse:3.88566
[15]	Test-rmse:3.76712
[16]	Test-rmse:3.65669
[17]	Test-rmse:3.55444
[18]	Test-rmse:3.45956
[19]	Test-rmse:3.37147
[20]	Test-rmse:3.28998
[21]	Test-rmse:3.21462
[22]	Test-rmse:3.14504
[23]	Test-rmse:3.08096
[24]	Test-rmse:3.02178
[25]	Test-rmse:2.96751
[26]	Test-rmse:2.91760
[27]	Test-rmse:2.87203
[28]	Test-rmse:2.83018
[29]	Test-rmse:2.79182
[30]	Test-rmse:2.75674
[31]	Test-rmse:2.72441
[32]	Test-rmse:2.69512
[33]	Test-rmse:2.66793
[34]	Test-rmse:2.64313
[35]	Test-rmse:2.62092
[36]	Test-rmse:2.60071
[37]	Test-rmse:2.58255
[38]	Test-rmse:2.56603
[39]	Test-rmse:2.55096
[40]	Test-rmse:2.53728
[41]	Test-rmse:2.52475
[42]	Test-rmse:2.51343
[43]	Test-rmse:2.5032

In [155]:
print("Best RMSE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))

Best RMSE: 2.39 in 365 rounds


# user 정보도 같이 파악하기

- 가정: 유저 별 평균 등 사용하면 성능이 올라갈 것
- 검증 방법: 평균 활용은 biased SVD 를 사용해서 stacking 해보자
- 후기: stacking 하다가 중간에 모델을 바꿨는데, feature로 추가하는게 어렵지 않아서 빠르게 해볼걸 그랬다. 다음에 시도하자.

In [12]:
df.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,img_url,language,summary,img_path,category,summary_topic,category_topic,location,age
0,8,0002005018,4,clara callan,richard bruce wright,2001.0,collins,http://images.amazon.com/images/P/0002005018.0...,en,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,fiction1,-1.0,108.0,"timmins,ontario,canada",32.0
1,8,074322678X,4,where you ll find me and other stories,ann beattie,2002.0,pocket,http://images.amazon.com/images/P/074322678X.0...,en,now back in print ann beattie 39 s finest shor...,images/074322678X.01.THUMBZZZ.jpg,fiction1,-1.0,33.0,"timmins,ontario,canada",32.0
2,8,0887841740,2,the middle stories,sheila heti,2004.0,harperbusiness,http://images.amazon.com/images/P/0887841740.0...,en,,images/0887841740.01.THUMBZZZ.jpg,fiction1,,,"timmins,ontario,canada",32.0
3,8,1552041778,2,jane doe,r j kaiser,1999.0,firefly books ltd,http://images.amazon.com/images/P/1552041778.0...,en,,images/1552041778.01.THUMBZZZ.jpg,fiction1,,,"timmins,ontario,canada",32.0
4,8,1567407781,6,the witchfinder amos walker mystery series,loren d estleman,1998.0,llewellyn publications,http://images.amazon.com/images/P/1567407781.0...,en,,images/1567407781.01.THUMBZZZ.jpg,fiction1,,,"timmins,ontario,canada",32.0


In [13]:
df.nunique()

user_id                 59803
isbn                   129777
rating                     10
book_title             115473
book_author             52679
year_of_publication        92
publisher                1402
img_url                129777
language                   24
summary                 69758
img_path               129777
category                   15
summary_topic             350
category_topic            112
location                13888
age                        91
dtype: int64