## Library

In [1]:
import sys
sys.path.append("~e/book/code")
from src.data.context_data import process_context_data, str2list, split_location
import wandb
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
import optuna
import regex
import json

In [2]:
# Ask interactively for the name of this W&B run (the prompt text is shown to the user as-is).
run = str(input("run 이름을 입력하세요 :"))
# selected_model = str(input("model 명을 입력하세요 (xgb/rf) :"))
# opt = bool(input("Optuna 사용 여부를 입력하세요 (뭐라도 입력 시 사용) :"))

# Start the W&B run that the metrics at the end of the notebook are logged to.
wandb.init(
    settings=wandb.Settings(start_method="thread"),
    dir=None,  # NOTE(review): the original comment claimed this disables local log storage — confirm against the wandb docs
    entity="remember-us", # team name,
    project="active", # project name
    name=run, # run name
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mkyk709[0m ([33mremember-us[0m). Use [1m`wandb login --relogin`[0m to force relogin


## Data Load

In [3]:
# Load every CSV from the shared data directory, in the same order as before.
data_path: str = "/data/ephemeral/home/book/data/"
users, books, train, test, sub = (
    pd.read_csv(data_path + csv_name)
    for csv_name in (
        "users.csv",
        "books.csv",
        "train_ratings.csv",
        "test_ratings.csv",
        "sample_submission.csv",
    )
)

## Data Preprocessing

In [4]:
def str2list(x: str) -> list:
    '''
    Convert the string form of a list (e.g. "['A', 'B']") into a list of strings.

    The two leading and two trailing characters (bracket + quote) are dropped,
    the remainder is split on ", ", and any quote characters left at the ends of
    an element by that split are stripped, so "['A', 'B']" yields ['A', 'B']
    rather than ["A'", "'B"].

    Note: an element that itself contains ", " is still split into several
    items, exactly as before.
    '''
    return [item.strip('\'"') for item in x[2:-2].split(', ')]

In [5]:
def split_location(x: str) -> list:
    '''
    Split a raw location string into cleaned components, broadest first.

    Parameters
    ----------
    x : str
        Raw location text made of comma-separated parts. Non-string input
        (for example NaN) is treated as a single missing component.

    Returns
    -------
    res : list
        Cleaned, lower-cased components in reversed order, i.e.
        country, state, city, ... for "city, state, country" input.
        Parts that are empty or 'n/a' after cleaning become NaN, and a
        non-missing value that repeats an earlier one is dropped.
    '''
    if not isinstance(x, str):
        # Nothing to split; keep the "one entry per component" shape callers index into.
        return [np.nan]

    cleaned = []
    for part in x.split(','):
        # Keep only letters, '/' and spaces.
        token = regex.sub(r'[^a-zA-Z/ ]', '', part.strip().lower())
        # Removing characters can leave stray or doubled spaces ("ny 10001" -> "ny "),
        # so normalise the whitespace again before comparing tokens.
        token = ' '.join(token.split())
        cleaned.append(np.nan if token in ('n/a', '') else token)
    cleaned.reverse()  # country, state, city, ... order

    res = []
    seen = set()
    for token in cleaned:
        if pd.isna(token):
            res.append(token)  # missing parts are kept so positions stay meaningful
        elif token not in seen:
            seen.add(token)
            res.append(token)
    return res

In [6]:
def text_preprocessing(summary: str) -> str:
    """
    Normalize a free-text field.

    Steps:
    1. Remove every character that is not an ASCII letter, a digit or whitespace
       (this also covers punctuation such as . , ' " ! ?).
    2. Collapse runs of whitespace into a single space.
    3. Lower-case the text.
    4. Strip leading and trailing whitespace.

    Args:
        summary (str): Text to clean.

    Returns:
        str: The cleaned text, or "unknown" when the input is NaN/None.
    """
    if pd.isna(summary):
        return 'unknown'

    # Raw strings: "\s" in a normal string literal is an invalid escape sequence.
    # A separate punctuation-removal pass is unnecessary because the first
    # substitution already removes every non-alphanumeric, non-space character.
    summary = regex.sub(r'[^0-9a-zA-Z\s]', '', summary)
    summary = regex.sub(r'\s+', ' ', summary)
    return summary.lower().strip()

In [7]:
def categorize_publication(x: int, a: int) -> int:
    """
    Bucket a publication year into a coarse category.

    Parameters
    ----------
    x : int
        Publication year of the book.
    a : int
        Bucket width used for years inside the middle range (e.g. 5 for 5-year buckets).

    Returns
    -------
    int
        The bucketed year:
        - years up to and including 1970 map to 1970,
        - years after 2000 map to 2006,
        - every other year is rounded down to a multiple of ``a``.

    Example
    -------
    books['years'] = books['year_of_publication'].apply(lambda x: categorize_publication(x, 5))
    print(books['years'].value_counts())
    """
    oldest_bucket = 1970
    if x <= oldest_bucket:
        return oldest_bucket
    if x > 2000:
        return 2006
    bucket_index = x // a
    return bucket_index * a

In [8]:
# ISBN registration-group prefix -> language code. Built once instead of on every call.
_ISBN_LANGUAGE_MAP = {
    '0': 'en', '1': 'en', '2': 'fr', '3': 'de', '4': 'ja',
    '5': 'ru', '7': 'zh-CN', '82': 'no', '84': 'es', '87': 'da',
    '88': 'it', '89': 'ko', '94': 'nl', '600': 'fa', '602': 'ms',
    '606': 'ro', '604': 'vi', '618': 'el', '967': 'ms', '974': 'th',
    '989': 'pt'
}


def extract_language_from_isbn(isbn):
    """
    Guess a language code from the leading digits of an ISBN.

    Parameters
    ----------
    isbn : str
        ISBN of the book. Surrounding whitespace, hyphens and inner spaces are
        ignored, and a trailing 'X' check digit is accepted.

    Returns
    -------
    str
        The language mapped to the first matching prefix in ``_ISBN_LANGUAGE_MAP``.
        Falls back to 'en' when the value is not a string, is empty, contains
        non-digit characters (other than a final 'X'), or matches no prefix.
    """
    if not isinstance(isbn, str):
        return 'en'  # NaN / None / numeric input: use the default

    normalized = isbn.strip().replace('-', '').replace(' ', '').upper()
    # ISBN-10 may end with the check digit 'X'; everything before it must be digits.
    digits = normalized[:-1] if normalized.endswith('X') else normalized
    if not digits.isdigit():
        return 'en'  # empty or malformed ISBN

    for prefix, language in _ISBN_LANGUAGE_MAP.items():
        if normalized.startswith(prefix):
            return language
    return 'en'  # no known prefix

In [9]:
def replace_language_using_isbn(books):
    """
    Fill missing values of the 'language' column using the ISBN prefix.

    Parameters
    ----------
    books : pd.DataFrame
        Book table; must contain the 'isbn' and 'language' columns.
        The frame is modified in place.

    Returns
    -------
    pd.DataFrame
        The same DataFrame, with every missing 'language' replaced by the value
        ``extract_language_from_isbn`` returns for that row's ISBN. Existing
        languages are left unchanged.

    Example
    -------
    books = replace_language_using_isbn(books)
    """
    # Column-wise fill instead of a row-wise DataFrame.apply: same result without
    # creating (and then dropping) a temporary 'extracted_language' column.
    extracted_language = books['isbn'].apply(extract_language_from_isbn)
    books['language'] = books['language'].fillna(extracted_language)
    return books

In [10]:
def categorize_age(x: int, a: int) -> int:
    """
    Bucket a user's age into a coarse category.

    Parameters
    ----------
    x : int
        Age of the user.
    a : int
        Bucket width used for ages inside the middle range (e.g. 10 for decades).

    Returns
    -------
    int
        The bucketed age:
        - ages below 20 map to 10,
        - ages of 60 and above map to 60,
        - every other age is rounded down to a multiple of ``a``.
    """
    if x < 20:
        return 10
    if x >= 60:
        return 60
    bucket_index = x // a
    return bucket_index * a

In [11]:
# Work on copies so the `users` / `books` frames themselves are not modified.
users_, books_ = users.copy(), books.copy()

In [12]:
# Data preprocessing
##################### books
# Clean the free-text columns with the shared text normalizer.
for text_column in ('book_title', 'book_author', 'publisher'):
    books_[text_column] = books_[text_column].apply(text_preprocessing)

# Bucket publication years into 5-year ranges.
books_['publication_range'] = books_['year_of_publication'].apply(
    lambda year: categorize_publication(year, 5)
)

# Fill missing languages from the ISBN prefix.
books_ = replace_language_using_isbn(books_)

# Keep only the first listed category, then clean it like the other text columns.
books_['category'] = books_['category'].apply(
    lambda raw: np.nan if pd.isna(raw) else str2list(raw)[0]
)
books_['category'] = books_['category'].apply(text_preprocessing)

high_categories = [
    'fiction', 'biography', 'history', 'religion', 'nonfiction', 'social', 'science', 'humor', 'body',
    'business', 'economics', 'cook', 'health', 'fitness', 'famil', 'relationship',
    'computer', 'travel', 'selfhelp', 'psychology', 'poetry', 'art', 'critic', 'nature', 'philosophy',
    'reference', 'drama', 'sports', 'politic', 'comic', 'novel', 'craft', 'language', 'education',
    'crime', 'music', 'pet',
    'child', 'collection', 'mystery', 'garden', 'medical', 'author', 'house', 'technology',
    'engineering', 'animal', 'photography',
    'adventure', 'game', 'science fiction', 'architecture', 'law', 'fantasy', 'antique', 'friend',
    'brother', 'sister', 'cat',
    'math', 'christ', 'bible', 'fairy', 'horror', 'design', 'adolescence', 'actor', 'dog',
    'transportation', 'murder', 'adultery', 'short', 'bear',
]

# Start with no high-level category assigned.
books_['high_category'] = None
# Tag every row whose category contains the keyword; a later keyword in the
# list overwrites an earlier match on the same row.
for keyword in high_categories:
    keyword_hits = books_['category'].str.contains(keyword, case=False, na=False)
    books_.loc[keyword_hits, 'high_category'] = keyword
# Rows that matched no keyword fall back to 'others'.
books_['high_category'] = books_['high_category'].fillna('others')

##################### users
# Replace missing ages with the mean age, then bucket ages into 10-year ranges.
mean_age = users_['age'].mean()
users_['age'] = users_['age'].fillna(mean_age)
users_['age_range'] = users_['age'].apply(lambda age: categorize_age(age, 10))

# Parse the raw location once, then pick its components by position.
users_['location_list'] = users_['location'].apply(split_location)
users_['location_country'] = users_['location_list'].apply(lambda parts: parts[0])
for position, column_name in enumerate(('location_state', 'location_city'), start=1):
    # Shorter lists have no entry at this position -> NaN.
    users_[column_name] = users_['location_list'].apply(
        lambda parts, position=position: parts[position] if len(parts) > position else np.nan
    )
# Impute missing country/state values from other users that share the same state or city.
# NOTE(review): every row runs full-frame boolean filters, and users_ is updated while it is
# being iterated, so the result may depend on row order — confirm before restructuring.
for idx, row in users_.iterrows():
    # Case 1: state is known but country is missing -> most common country recorded for that state.
    if (not pd.isna(row['location_state'])) and pd.isna(row['location_country']):
        fill_country = users_[users_['location_state'] == row['location_state']]['location_country'].mode()
        # mode() is empty when no matching row has a country; keep NaN in that case.
        fill_country = fill_country[0] if len(fill_country) > 0 else np.nan
        users_.loc[idx, 'location_country'] = fill_country
    # Case 2: city is known but state is missing.
    elif (not pd.isna(row['location_city'])) and pd.isna(row['location_state']):
        if not pd.isna(row['location_country']):
            # Country is known: most common state among users with the same country and city.
            fill_state = users_[(users_['location_country'] == row['location_country']) 
                                & (users_['location_city'] == row['location_city'])]['location_state'].mode()
            fill_state = fill_state[0] if len(fill_state) > 0 else np.nan
            users_.loc[idx, 'location_state'] = fill_state
        else:
            # Country is missing too: take the most common state and country among users with the same city.
            fill_state = users_[users_['location_city'] == row['location_city']]['location_state'].mode()
            fill_state = fill_state[0] if len(fill_state) > 0 else np.nan
            fill_country = users_[users_['location_city'] == row['location_city']]['location_country'].mode()
            fill_country = fill_country[0] if len(fill_country) > 0 else np.nan
            users_.loc[idx, 'location_country'] = fill_country
            users_.loc[idx, 'location_state'] = fill_state

In [13]:
# Fill the remaining missing countries with the most frequent country.
country_column = users_['location_country']
most_frequent_country = country_column.mode()[0]
users_['location_country'] = country_column.fillna(most_frequent_country)

In [14]:
# Inspect column dtypes and non-null counts of the preprocessed users frame.
users_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68092 entries, 0 to 68091
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           68092 non-null  int64  
 1   location          68092 non-null  object 
 2   age               68092 non-null  float64
 3   age_range         68092 non-null  float64
 4   location_list     68092 non-null  object 
 5   location_country  68092 non-null  object 
 6   location_state    67368 non-null  object 
 7   location_city     65305 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 4.2+ MB


In [15]:
# Inspect column dtypes and non-null counts of the preprocessed books frame.
books_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149570 entries, 0 to 149569
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   isbn                 149570 non-null  object 
 1   book_title           149570 non-null  object 
 2   book_author          149570 non-null  object 
 3   year_of_publication  149570 non-null  float64
 4   publisher            149570 non-null  object 
 5   img_url              149570 non-null  object 
 6   language             149570 non-null  object 
 7   category             149570 non-null  object 
 8   summary              82343 non-null   object 
 9   img_path             149570 non-null  object 
 10  publication_range    149570 non-null  float64
 11  high_category        149570 non-null  object 
dtypes: float64(2), object(10)
memory usage: 13.7+ MB


In [16]:
# # 성택 
# user_rating_counts = train['user_id'].value_counts()
# active_user_threshold = 4  # 평점 수 기준
# users['is_active_user'] = users['user_id'].apply(
#     lambda x: 1 if user_rating_counts.get(x, 0) >= active_user_threshold else 0
# )
            
# users = users.drop(['location'], axis=1)

In [17]:
# Print the (rows, columns) shape of each frame before merging.
print(books_.shape, users_.shape, train.shape, test.shape)

(149570, 12) (68092, 8) (306795, 3) (76699, 3)


In [None]:
# Keep only the columns that are merged into the rating tables.
user_feature_cols = ['user_id', 'age_range', 'location_country']
book_feature_cols = ['isbn', 'book_title', 'book_author', 'publisher', 'language', 'high_category', 'publication_range']
users_final = users_[user_feature_cols]
books_final = books_[book_feature_cols]

# Modeling starts here

In [19]:
# Merge user and book features into the rating tables.
train = train.merge(users_final, on='user_id').merge(books_final, on='isbn')
# Left joins keep every test row, in its original order, even when a user or book
# has no feature row; predictions are later written back to the sample submission
# by position, so no test row may be dropped here.
test = (
    test.merge(users_final, on='user_id', how='left')
        .merge(books_final, on='isbn', how='left')
)

In [20]:
# Show the column names of the merged training frame.
train.columns

Index(['user_id', 'isbn', 'rating', 'age_range', 'location_country',
       'book_title', 'book_author', 'publisher', 'language', 'high_category',
       'publication_range'],
      dtype='object')

In [21]:
# Inspect column dtypes and non-null counts of the merged training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306795 entries, 0 to 306794
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   user_id            306795 non-null  int64  
 1   isbn               306795 non-null  object 
 2   rating             306795 non-null  int64  
 3   age_range          306795 non-null  float64
 4   location_country   306795 non-null  object 
 5   book_title         306795 non-null  object 
 6   book_author        306795 non-null  object 
 7   publisher          306795 non-null  object 
 8   language           306795 non-null  object 
 9   high_category      306795 non-null  object 
 10  publication_range  306795 non-null  float64
dtypes: float64(2), int64(2), object(7)
memory usage: 25.7+ MB


In [22]:
# Inspect column dtypes and non-null counts of the merged test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76699 entries, 0 to 76698
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   user_id            76699 non-null  int64  
 1   isbn               76699 non-null  object 
 2   rating             76699 non-null  int64  
 3   age_range          76699 non-null  float64
 4   location_country   76699 non-null  object 
 5   book_title         76699 non-null  object 
 6   book_author        76699 non-null  object 
 7   publisher          76699 non-null  object 
 8   language           76699 non-null  object 
 9   high_category      76699 non-null  object 
 10  publication_range  76699 non-null  float64
dtypes: float64(2), int64(2), object(7)
memory usage: 6.4+ MB


In [23]:
# Number of training ratings per user, attached to both frames as a feature.
user_id_counts = train['user_id'].value_counts()
train['review_counts'] = train['user_id'].map(user_id_counts)
# Users that never appear in train have no count; treat them as 0 reviews.
test['review_counts'] = test['user_id'].map(user_id_counts).fillna(0)

In [24]:
cat_col = ['isbn', 'book_title', 'book_author', 'publisher', 'language', 'high_category', 'publication_range', 'user_id', 'age_range', 'location_country']
num_col = ['rating', 'review_counts']

# Cast categorical columns to str and numeric columns to int in both frames.
target_dtypes = {**dict.fromkeys(cat_col, 'str'), **dict.fromkeys(num_col, 'int')}
for frame in (train, test):
    for column_name, dtype_name in target_dtypes.items():
        frame[column_name] = frame[column_name].astype(dtype_name)


In [25]:
from sklearn.preprocessing import LabelEncoder
import tqdm

In [26]:
# Display the test frame before the categorical columns are encoded.
test

Unnamed: 0,user_id,isbn,rating,age_range,location_country,book_title,book_author,publisher,language,high_category,publication_range,review_counts
0,11676,0002005018,0,30.0,usa,clara callan,richard bruce wright,harperflamingo canada,en,others,2006.0,5520
1,116866,0002005018,0,30.0,canada,clara callan,richard bruce wright,harperflamingo canada,en,others,2006.0,49
2,152827,0060973129,0,40.0,canada,decision in normandy,carlo deste,harperperennial,en,others,1990.0,5
3,157969,0374157065,0,30.0,usa,flu the story of the great influenza pandemic ...,gina bari kolata,farrar straus giroux,en,medical,1995.0,2
4,67958,0399135782,0,30.0,usa,the kitchen gods wife,amy tan,putnam pub group,en,fiction,1990.0,11
...,...,...,...,...,...,...,...,...,...,...,...,...
76694,278543,1576734218,0,30.0,usa,on becoming childwise,gary ezzo,multnomah,en,relationship,1995.0,3
76695,278563,3492223710,0,30.0,austria,michael khlmeiers sagen des klassischen altert...,michael khlmeier,piper,de,others,1995.0,4
76696,278633,1896095186,0,30.0,usa,poolhopping,anne fleming,polestar book publishers,en,fiction,2000.0,31
76697,278668,8408044079,0,40.0,spain,la muerte del decano,gonzalo torrrente ballester,planeta publishing corporation,es,others,2006.0,3


In [None]:
# Encode each categorical column as integer codes, using one vocabulary built
# from train and test together so both frames share the same mapping.
for col in cat_col:
    vocabulary = pd.concat([train[col], test[col]]).unique()
    for frame in (train, test):
        frame[col] = pd.Categorical(frame[col], categories=vocabulary).codes

In [28]:
# for col in cat_col :
#     le = LabelEncoder()
#     train[col] = le.fit_transform(train[col])
#     for label in tqdm.tqdm((test[col].unique())) :
#         if label not in le.classes_ : 
#             le.classes_ = np.append(le.classes_, label)
#     test[col] = le.transform(test[col])

In [29]:
# Display the test frame after the categorical columns were replaced by integer codes.
test

Unnamed: 0,user_id,isbn,rating,age_range,location_country,book_title,book_author,publisher,language,high_category,publication_range,review_counts
0,13,0,0,0,1,0,0,0,0,0,0,5520
1,13426,0,0,0,0,0,0,0,0,0,0,49
2,26761,1,0,4,0,1,1,1,0,0,1,5
3,16495,2,0,0,1,2,2,2,0,1,2,2
4,6225,3,0,0,1,3,3,3,0,2,1,11
...,...,...,...,...,...,...,...,...,...,...,...,...
76694,7728,149565,0,0,1,132671,1218,306,0,34,2,3
76695,47785,149566,0,0,17,132672,10266,108,1,0,2,4
76696,4209,149567,0,0,1,132673,59535,7573,0,2,3,31
76697,40779,149568,0,4,12,14352,59536,4914,3,0,0,3


In [30]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool
import matplotlib.pyplot as plt

In [31]:
# Metric helper
def calculate_metrics(y_true, y_pred):
    """Return the RMSE, MSE and MAE between ``y_true`` and ``y_pred`` as a dict
    with the keys 'RMSE', 'MSE' and 'MAE'."""
    rmse = root_mean_squared_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    return {'RMSE': rmse, 'MSE': mse, 'MAE': mae}

In [None]:
# Features are every column except the target.
X_data, y_data = train.drop(columns='rating'), train['rating']
# The test features are the same for every fold, so build them once.
X_test = test.drop(['rating'], axis=1)

# Positions of the categorical columns, derived from cat_col instead of a hard-coded
# range so the model stays in sync with the Pools when the feature list changes.
cat_feature_idx = [X_data.columns.get_loc(name) for name in cat_col]

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
valid_rmse = []
valid_mse = []
valid_mae = []
pred_df = pd.DataFrame()

# Best parameters; identical for every fold, so defined once outside the loop.
param = {
    'learning_rate': 0.5,
    'depth': 7,
    'colsample_bylevel': 0.5,
    'boosting_type': 'Plain',
    'bootstrap_type': 'MVS'
}

for fold, (train_idx, valid_idx) in tqdm.tqdm(enumerate(skf.split(X_data, y_data)), total=skf.n_splits):

    # Split into the training and validation sets of this fold.
    X_train, y_train = X_data.iloc[train_idx], y_data.iloc[train_idx]
    X_valid, y_valid = X_data.iloc[valid_idx], y_data.iloc[valid_idx]

    train_data = Pool(data=X_train, label=y_train, cat_features=cat_col)
    valid_data = Pool(data=X_valid, label=y_valid, cat_features=cat_col)

    # use_best_model is set once here; repeating it in fit() is unnecessary.
    cat_model = CatBoostRegressor(**param, iterations=1000,
                                  loss_function='RMSE', eval_metric='RMSE',
                                  use_best_model=True, random_state=42,
                                  cat_features=cat_feature_idx)
    cat_model.fit(train_data, eval_set=[train_data, valid_data],
                  verbose=500, early_stopping_rounds=100)

    # Validation metrics of this fold.
    valid_metrics = calculate_metrics(y_valid, cat_model.predict(X_valid))
    print(f"Fold {fold + 1} Valid RMSE: {valid_metrics['RMSE']}")
    print(f"Fold {fold + 1} Valid MSE:  {valid_metrics['MSE']}")
    print(f"Fold {fold + 1} Valid MAE:  {valid_metrics['MAE']}")
    valid_rmse.append(valid_metrics['RMSE'])
    valid_mse.append(valid_metrics['MSE'])
    valid_mae.append(valid_metrics['MAE'])

    # Test-set predictions of this fold; the folds are averaged afterwards.
    pred_df[f'pred_{fold}'] = cat_model.predict(X_test)

mean_rmse = np.array(valid_rmse).mean()
print(f'RMSE 평균 : {mean_rmse:.4f} \n')

# Log the fold-averaged metrics together with the configuration that produced them.
wandb.log({
    "Valid RMSE": mean_rmse,
    "Valid MSE": np.array(valid_mse).mean(),
    "Valid MAE": np.array(valid_mae).mean(),
    "param": json.dumps(param),
    "features": list(X_data.columns),  # plain list instead of a pandas Index
})
wandb.finish()

  0%|          | 0/10 [00:00<?, ?it/s]

0:	learn: 2.3117084	test: 2.2287168	test1: 2.2878334	best: 2.2878334 (0)	total: 429ms	remaining: 35m 46s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 2.132403812
bestIteration = 105

Shrink model to first 106 iterations.
Fold 1 Valid RMSE: 2.132403811979189
Fold 1 Valid MSE:  4.547146017343376
Fold 1 Valid MAE:  1.60265727672518


 10%|█         | 1/10 [00:56<08:25, 56.20s/it]

0:	learn: 2.3093869	test: 2.2281050	test1: 2.2858580	best: 2.2858580 (0)	total: 274ms	remaining: 22m 48s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 2.123506093
bestIteration = 144

Shrink model to first 145 iterations.
Fold 2 Valid RMSE: 2.1235060932788294
Fold 2 Valid MSE:  4.509278128192316
Fold 2 Valid MAE:  1.5971281959526327


 20%|██        | 2/10 [02:02<08:19, 62.38s/it]

0:	learn: 2.3078682	test: 2.2267903	test1: 2.2877165	best: 2.2877165 (0)	total: 297ms	remaining: 24m 46s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 2.127819481
bestIteration = 129

Shrink model to first 130 iterations.
Fold 3 Valid RMSE: 2.127819481255274
Fold 3 Valid MSE:  4.527615744809463
Fold 3 Valid MAE:  1.5931923344310357


 30%|███       | 3/10 [03:07<07:24, 63.53s/it]

0:	learn: 2.3112608	test: 2.2297081	test1: 2.2862126	best: 2.2862126 (0)	total: 296ms	remaining: 24m 41s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 2.13212785
bestIteration = 155

Shrink model to first 156 iterations.
Fold 4 Valid RMSE: 2.1321278503825893
Fold 4 Valid MSE:  4.545969170377081
Fold 4 Valid MAE:  1.5997967245067286


 40%|████      | 4/10 [04:16<06:32, 65.44s/it]

0:	learn: 2.3054697	test: 2.2275023	test1: 2.2892198	best: 2.2892198 (0)	total: 295ms	remaining: 24m 34s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 2.132494553
bestIteration = 110

Shrink model to first 111 iterations.
Fold 5 Valid RMSE: 2.1324945533712456
Fold 5 Valid MSE:  4.547533020158029
Fold 5 Valid MAE:  1.6045224542999128


 50%|█████     | 5/10 [05:14<05:15, 63.04s/it]

0:	learn: 2.3013285	test: 2.2029602	test1: 2.2735544	best: 2.2735544 (0)	total: 248ms	remaining: 20m 41s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 2.132054527
bestIteration = 163

Shrink model to first 164 iterations.
Fold 6 Valid RMSE: 2.1320545268142372
Fold 6 Valid MSE:  4.545656505309082
Fold 6 Valid MAE:  1.592737763166217


 60%|██████    | 6/10 [06:27<04:25, 66.29s/it]

0:	learn: 2.3003359	test: 2.2032385	test1: 2.2735527	best: 2.2735527 (0)	total: 277ms	remaining: 23m 7s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 2.131550487
bestIteration = 120

Shrink model to first 121 iterations.
Fold 7 Valid RMSE: 2.1315504870204043
Fold 7 Valid MSE:  4.543507478716922
Fold 7 Valid MAE:  1.597095894987623


 70%|███████   | 7/10 [07:27<03:12, 64.06s/it]

0:	learn: 2.2996061	test: 2.2006453	test1: 2.2755337	best: 2.2755337 (0)	total: 265ms	remaining: 22m 7s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 2.138021642
bestIteration = 107

Shrink model to first 108 iterations.
Fold 8 Valid RMSE: 2.1380216416459206
Fold 8 Valid MSE:  4.571136540146317
Fold 8 Valid MAE:  1.6023416823355512


 80%|████████  | 8/10 [08:24<02:03, 61.90s/it]

0:	learn: 2.3010651	test: 2.2022008	test1: 2.2743452	best: 2.2743452 (0)	total: 282ms	remaining: 23m 28s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 2.13140447
bestIteration = 136

Shrink model to first 137 iterations.
Fold 9 Valid RMSE: 2.131404469693428
Fold 9 Valid MSE:  4.542885013429123
Fold 9 Valid MAE:  1.6005258132799962


 90%|█████████ | 9/10 [09:34<01:04, 64.46s/it]

0:	learn: 2.2999251	test: 2.2021231	test1: 2.2725923	best: 2.2725923 (0)	total: 340ms	remaining: 28m 20s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 2.133026426
bestIteration = 140

Shrink model to first 141 iterations.
Fold 10 Valid RMSE: 2.1330264263756007
Fold 10 Valid MSE:  4.549801735616666
Fold 10 Valid MAE:  1.6012959600347765


100%|██████████| 10/10 [10:46<00:00, 64.64s/it]


RMSE 평균 : 2.1314 



VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Valid MAE,▁
Valid MSE,▁
Valid RMSE,▁

0,1
Valid MAE,1.59913
Valid MSE,4.54305
Valid RMSE,2.13144
features,"user_id, age_range, ..."
param,"{""learning_rate"": 0...."


In [33]:
# Display the sample submission frame before predictions are written into it.
sub

Unnamed: 0,user_id,isbn,rating
0,11676,0002005018,0
1,116866,0002005018,0
2,152827,0060973129,0
3,157969,0374157065,0
4,67958,0399135782,0
...,...,...,...
76694,278543,1576734218,0
76695,278563,3492223710,0
76696,278633,1896095186,0
76697,278668,8408044079,0


In [34]:
# Average the per-fold predictions row by row. Using the mean over all columns of
# pred_df keeps this correct for any number of folds (no hard-coded pred_0..pred_9).
sub['rating'] = pred_df.mean(axis=1)
submit = sub[['user_id', 'isbn', 'rating']]
submit

Unnamed: 0,user_id,isbn,rating
0,11676,0002005018,6.797687
1,116866,0002005018,7.107888
2,152827,0060973129,7.385636
3,157969,0374157065,7.428739
4,67958,0399135782,7.586509
...,...,...,...
76694,278543,1576734218,6.272785
76695,278563,3492223710,5.913859
76696,278633,1896095186,6.313652
76697,278668,8408044079,5.436259


In [35]:
# Write the submission file without the index column.
submission_path = 'submit.csv'
submit.to_csv(submission_path, index=False)