In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import tqdm
import pdb
from scipy.sparse import csr_matrix, linalg
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
#import lightgbm as lgb

import random
import os
import re

import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error

warnings.filterwarnings(action='ignore')

In [None]:
# Folder that holds the competition CSV files (relative to the notebook).
path = '../data/'

# Read every raw table once; the cells below derive features from these frames.
users = pd.read_csv(f'{path}users.csv')
books = pd.read_csv(f'{path}books.csv')
train_ratings = pd.read_csv(f'{path}train_ratings.csv')
test_ratings = pd.read_csv(f'{path}test_ratings.csv')
submit = pd.read_csv(f'{path}sample_submission.csv')


def rmse(real: list, predict: list) -> float:
    """Return the root-mean-squared error between `real` and `predict`.

    `predict` is converted to a NumPy array; `real` is used as given, so it
    may be a list, an array or a pandas Series of the same length.
    """
    residuals = real - np.array(predict)
    return np.sqrt(np.mean(residuals ** 2))

SEED = 42
def seed_everything(seed: int = 42):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with `seed`."""
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(SEED)

# Report the shape of each freshly loaded table.
for _table_name, _table in (('users', users), ('books', books), ('train_ratings', train_ratings)):
    print(f'{_table_name} shape: ', _table.shape)

# 데이터 전처리

## Book 테이블

In [None]:
# Preprocessing of the books table.
# Drop the image columns as well as the title and summary columns
# (they might become useful later, but are discarded for now).
# Then unify publisher names that look alike under one canonical name.
books.drop(['book_title', 'summary', 'img_url', 'img_path'], axis = 1, inplace = True)

# Book count per publisher, most frequent first; publishers with more than 20
# books act as canonical names.
books_publishers = books.groupby('publisher')['isbn'].count().sort_values(ascending=False)
for publisher_name in books_publishers[books_publishers > 20].index:
    # regex=False: the name is matched as a plain substring, so names containing
    #   regex metacharacters neither raise nor over-match (the original note said
    #   a threshold of 10 instead of 20 raised an error — presumably for this reason).
    # na=False: rows with a missing publisher never match.
    # .loc with a boolean mask replaces the chained assignment, which is not
    #   guaranteed to write back into `books`.
    same_publisher = books['publisher'].str.contains(publisher_name, regex=False, na=False)
    books.loc[same_publisher, 'publisher'] = publisher_name

In [None]:
# Category column of the books table.

# Clean categories written with brackets/punctuation: every run of non-word
# characters or underscores becomes a single space. The pattern is a raw string
# so that `\W` reaches the regex engine instead of being an invalid string escape.
has_category = books['category'].notnull()
books.loc[has_category, 'category'] = books.loc[has_category, 'category'].apply(
    lambda x: re.sub(r'[\W_]+', ' ', x).strip()
)
# Normalise everything to lower case.
books['category'] = books['category'].str.lower()

# Hand-picked keywords used to merge raw categories into high-level categories.
categories = ['garden','crafts','physics','adventure','music','fiction','nonfiction','science','science fiction','social','homicide',
 'sociology','disease','religion','christian','philosophy','psycholog','mathemat','agricult','environmental',
 'business','poetry','drama','literary','travel','motion picture','children','cook','literature','electronic',
 'humor','animal','bird','photograph','computer','house','ecology','family','architect','camp','criminal','language','india']

# A raw category containing a keyword is relabelled with that keyword; when
# several keywords match, the one appearing last in `categories` wins.
books['category_high'] = books['category'].copy()
for category in categories:
    books.loc[books['category'].str.contains(category, na=False), 'category_high'] = category

In [None]:
# Show how many books fall into each value of `category_high`.
books['category_high'].value_counts()

In [None]:
# Fill missing `language` and `category_high` values with their most frequent value.
# Rationale (authors' note): the mean rating of rows where the value is missing
# is about 7.0x, similar to the mean for language == 'en' and
# category_high == 'fiction'.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form mutates a temporary and is not guaranteed to update
# `books` (it has no effect under pandas copy-on-write).
books['language'] = books['language'].fillna('en')
books['category_high'] = books['category_high'].fillna('fiction')

In [None]:
# Bucket the publication year into the labels 1970, 1980, 1990, 2000 and 2020.
# Authors' notes: a categorical year is preferable for the deep-learning models,
# and repeated experiments (in the "develop" notebook) showed that these cut
# points separate ratings best.
publication_year = books['year_of_publication']
books['years'] = publication_year.copy()

# (inclusive lower bound, exclusive upper bound, label written into `years`).
year_bins = [
    (-np.inf, 1970, 1970),
    (1970, 1980, 1980),
    (1980, 1990, 1990),
    (1990, 2000, 2000),
    (2000, np.inf, 2020),
]
for lower, upper, label in year_bins:
    # .loc with a boolean mask replaces the chained assignment; rows with a
    # missing year match no bin and stay missing.
    books.loc[(publication_year >= lower) & (publication_year < upper), 'years'] = label

# Stored as strings so that the bucket is treated as a categorical feature.
books['years'] = books['years'].astype('str')
#books['years'] = books['years'].astype('int')
books.drop(['year_of_publication', 'category'], axis = 1, inplace = True)

## User 테이블 (미션 1 참고)

In [None]:
# Remove every character except ASCII letters, digits, ':' and ','.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently skip this cleaning step.
users['location'] = users['location'].str.replace(r'[^0-9a-zA-Z:,]', '', regex=True)

# 'city,state,country' -> three separate columns.
users['location_city'] = users['location'].apply(lambda x: x.split(',')[0].strip())
users['location_state'] = users['location'].apply(lambda x: x.split(',')[1].strip())
users['location_country'] = users['location'].apply(lambda x: x.split(',')[2].strip())

# Removing special characters turned 'n/a' into 'na'; convert it to a real missing value.
users = users.replace('na', np.nan)
# Some rows were entered as ', , ,' and are now empty strings; treat them as missing too.
users = users.replace('', np.nan)

# Cities of users whose country is missing (one entry per such user).
modify_location = users[(users['location_country'].isna())&(users['location_city'].notnull())]['location_city'].values
has_country = users['location_country'].notnull()

# For each of those cities, look up the most frequent full location string that
# contains the city name and has a known country. The lookup is cached per city
# because the same city usually appears many times.
location_by_city = {}
location_list = []
for city in modify_location:
    if city not in location_by_city:
        candidates = users.loc[
            users['location'].str.contains(city, regex=False, na=False) & has_country,
            'location',
        ]
        # No candidate at all: nothing can be inferred for this city.
        location_by_city[city] = candidates.value_counts().index[0] if len(candidates) else None
    right_location = location_by_city[city]
    if right_location is not None:
        location_list.append(right_location)

# Reduce the list to one (state, country) pair per city; as in a sequential
# pass over `location_list`, the last entry for a city wins.
state_country_by_city = {}
for right_location in location_list:
    parts = right_location.split(',')
    state_country_by_city[parts[0]] = (parts[1], parts[2])

for city, (state, country) in state_country_by_city.items():
    same_city = users['location_city'] == city
    users.loc[same_city, 'location_state'] = state
    users.loc[same_city, 'location_country'] = country

# Encode each location level as an integer id (order of first appearance).
loc_city2idx = {v:k for k,v in enumerate(users['location_city'].unique())}
loc_state2idx = {v:k for k,v in enumerate(users['location_state'].unique())}
loc_country2idx = {v:k for k,v in enumerate(users['location_country'].unique())}

users['location_city'] = users['location_city'].map(loc_city2idx)
users['location_state'] = users['location_state'].map(loc_state2idx)
users['location_country'] = users['location_country'].map(loc_country2idx)

users.info()

In [None]:
# users['location_city'] = users['location'].apply(lambda x: x.split(',')[0])
# users['location_state'] = users['location'].apply(lambda x: x.split(',')[1])
# users['location_country'] = users['location'].apply(lambda x: x.split(',')[2])
# users = users.drop(['location'], axis=1)

# loc_city2idx = {v:k for k,v in enumerate(users['location_city'].unique())}
# loc_state2idx = {v:k for k,v in enumerate(users['location_state'].unique())}
# loc_country2idx = {v:k for k,v in enumerate(users['location_country'].unique())}

# users['location_city'] = users['location_city'].map(loc_city2idx)
# users['location_state'] = users['location_state'].map(loc_state2idx)
# users['location_country'] = users['location_country'].map(loc_country2idx)

In [None]:
# # users 테이블 전처리 입니다.
# # location이 지역, 주, 국가로 되어있어 이 부분 기초 전처리 진행 과정입니다.

# users['location'] = users['location'].str.replace(r'[^0-9a-zA-Z:,]', '') # 특수문자 제거

# # 지역, 주, 국가
# users['location_city'] = users['location'].apply(lambda x: x.split(',')[0].strip())
# users['location_state'] = users['location'].apply(lambda x: x.split(',')[1].strip())
# users['location_country'] = users['location'].apply(lambda x: x.split(',')[2].strip())

# users = users.replace('na', np.nan) #특수문자 제거로 n/a가 na로 바뀌게 되었습니다. 따라서 이를 컴퓨터가 인식할 수 있는 결측값으로 변환합니다.
# users = users.replace('', np.nan) # 일부 경우 , , ,으로 입력된 경우가 있었으므로 이런 경우에도 결측값으로 변환합니다.

# # 도시는 존재하는데 나라 정보가 없는 경우 채워주는 코드
# modify_location = users[(users['location_country'].isna())&(users['location_city'].notnull())]['location_city'].values
# location = users[(users['location'].str.contains('seattle'))&(users['location_country'].notnull())]['location'].value_counts().index[0]

# location_list = []
# for location in modify_location:
#     try:
#         right_location = users[(users['location'].str.contains(location))&(users['location_country'].notnull())]['location'].value_counts().index[0]
#         location_list.append(right_location)
#     except:
#         pass

# for location in location_list:
#     users.loc[users[users['location_city']==location.split(',')[0]].index,'location_state'] = location.split(',')[1]
#     users.loc[users[users['location_city']==location.split(',')[0]].index,'location_country'] = location.split(',')[2]

In [None]:
# # 저는 도시, 주, 국가 중 주를 선택했습니다.
# # 우선 모든 변수를 다 쓰는 건 아니라고 생각했어요. 도시 < 주 < 국가로 포함관계가 있기 때문이죠.
# # 데이터 분석 결과 주와 국가 단위가 비슷한 경우가 많은 것을 확인했습니다.
# # 조그만 섬 국가 < 미국 켈리포니아 주 < 미국 같은 경우죠.
# # 미국 같은 경우 미국으로 뭉뚱그리기 보다 주 단위로 나누는 것이 맞다고 판단했습니다.
# # 실제 미국 주 별로 rating 차이가 꽤 존재합니다.
# # 다만 도시 기준으로 나누면 너무 세분화 될 것 같다는 생각이 들었습니다.
# # 결론적으로 주를 지역을 나타내는 변수로 사용하기로 하고 결측값을 도시, 나라에서 채우기로 했습니다.

# def _fillna(x):
#     if pd.isna(x['location_country']):
#         # 만약 나라가 기록 안되있는 경우        
#         if pd.isna(x['location_city']):
#             # 도시까지 없다면 모든 정보가 없음. 최빈값 california 사용.
#             return 'california'
#         else:
#             tem = users['location_state'][users['location_city'] == x['location_city']].value_counts()
#             if len(tem) == 0: 
#                 # 만약 주 이름이 없는 도시이면 도시 이름을 주 이름으로 사용.
#                 return x['location_city'] 
#             else:
#                 # 그 도시에서 가장 자주 쓰이는 주 이름 사용.
#                 return tem.index[0]

#     else:
#         tem = users['location_state'][users['location_country'] == x['location_country']].value_counts()
#         if len(tem) == 0: 
#             # 만약 주 이름이 없는 나라이면 나라이름을 주 이름으로 사용.
#             return x['location_country'] 
#         else:
#             # 그 나라에서 가장 자주 쓰이는 주 이름 사용.
#             return tem.index[0]

# users['fix_location_state'] = users.apply(lambda x : _fillna(x) if pd.isna(x['location_state']) else x['location_state'], axis = 1)

In [None]:
# Bucket `age` into coarse groups; each group is labelled with its upper bound,
# except the last (age >= 50), which is labelled 100.
age = users['age']
users['fix_age'] = age.copy()

# (inclusive lower bound, exclusive upper bound, label written into `fix_age`).
age_bins = [
    (-np.inf, 10, 10),
    (10, 20, 20),
    (20, 30, 30),
    (30, 35, 35),
    (35, 40, 40),
    (40, 50, 50),
    (50, np.inf, 100),
]
for lower, upper, label in age_bins:
    # .loc with a boolean mask replaces the chained assignment
    # users['fix_age'][mask] = ..., which is not guaranteed to write into `users`.
    users.loc[(age >= lower) & (age < upper), 'fix_age'] = label

# A missing age matches no bin; it is put into the lowest group (10).
users['fix_age'] = users['fix_age'].fillna(10)
# Stored as strings so that the bucket is treated as a categorical feature.
users['fix_age'] = users['fix_age'].astype('str') # users['fix_age'] = users['fix_age'].astype('int')

In [None]:
# users2 = pd.read_csv(path+'users.csv')
# users['age'] = users2['age']

In [None]:
# def age_map(x: int) -> int:
#     x = int(x)
#     if x < 20:
#         return 1
#     elif x >= 20 and x < 30:
#         return 2
#     elif x >= 30 and x < 40:
#         return 3
#     elif x >= 40 and x < 50:
#         return 4
#     elif x >= 50 and x < 60:
#         return 5
#     else:
#         return 6

# users['age'] = users['age'].fillna(users['age'].mean())
# users['age'] = users['age'].apply(age_map)

# users['age'].value_counts()

In [None]:
# Keep only the user key and the engineered features used for the merge below.
user_feature_columns = ['user_id', 'location_city', 'location_state', 'location_country', 'fix_age']
users = users[user_feature_columns]
users.info()

In [None]:
# n = 1

# def make_others(_column):
#     tem = pd.DataFrame(users[_column].value_counts()).reset_index()
#     tem.columns = ['names','count']
#     others_list = tem[tem['count'] <= n]['names'].values  # n은 초기에 설정함. 바꿀 수 있음.
#     users.loc[users[users[_column].isin(others_list)].index, _column]= 'others'

# def make_others2(_column):
#     tem = pd.DataFrame(books[_column].value_counts()).reset_index()
#     tem.columns = ['names','count']
#     others_list = tem[tem['count'] <= n]['names'].values  # n은 초기에 설정함. 바꿀 수 있음.
#     books.loc[books[books[_column].isin(others_list)].index, _column]= 'others'

# make_others('location_city')
# make_others('location_state')
# make_others('location_country')

# make_others2('book_author')
# make_others2('publisher')
# make_others2('category_high')

## rating 테이블과 merge

In [None]:
# Join the preprocessed books and users tables onto the rating tables.
# The rating files are re-read so that this cell can be re-run on its own.
train_ratings = pd.read_csv(path+'train_ratings.csv')
test_ratings = pd.read_csv(path+'test_ratings.csv')

# Train: keep only ratings whose book and user are both known. An inner merge
# states this directly; the former right-merge + dropna(rating) selected the
# same rows but reordered them and upcast integer columns (e.g. user_id) to
# float because of the temporary NaN rows, so astype('str') could give '8.0'.
train_ratings = (
    train_ratings
    .dropna(subset=['rating'])
    .merge(books, how='inner', on='isbn')
    .merge(users, how='inner', on='user_id')
)

# Test: a left merge keeps every test row in its original order, so the
# submission lines up with the test file and no row can be dropped silently.
test_ratings = (
    test_ratings
    .merge(books, how='left', on='isbn')
    .merge(users, how='left', on='user_id')
)

# Ids are categorical features for the model, so store them as strings.
for column in ['user_id', 'location_city', 'location_state', 'location_country']:
    train_ratings[column] = train_ratings[column].astype('str')
    test_ratings[column] = test_ratings[column].astype('str')


In [None]:
# Summarise the columns, dtypes and non-null counts of the merged training table.
train_ratings.info()

# 모델링

In [None]:
# params_cat = {
#             "task_type" : "GPU",
#             "devices" : '0',
#             "random_state": SEED,
#             "learning_rate": 0.05,
#             "n_estimators": 2000,
#             "verbose" : 1,
#             "objective" : "RMSE",
#             "max_depth": 10,#trial.suggest_int("max_depth", 1, 16),
#             "colsample_bylevel": 1,#trial.suggest_float("colsample_bylevel", 0.8, 1.0),
#             #"subsample": 0.8, #trial.suggest_float("subsample", 0.3, 1.0), GPU 사용시 안될수도.
#             "min_child_samples": 50, #trial.suggest_int("min_child_samples", 5, 100),
#             "max_bin": 300, #trial.suggest_int("max_bin", 200, 500),
#             "cat_features" : list(train_ratings.drop(['rating'],axis = 1).columns)
#     }

# Every column except the target is a feature, and every feature is categorical.
train_features = train_ratings.drop(['rating'],axis = 1)

# Fixed CatBoost hyper-parameters (presumably the result of an earlier Optuna
# search — TODO confirm where these values came from).
params_cat = {
    "task_type" : "GPU",
    "devices" : '0',
    "random_state": SEED,
    'learning_rate': 0.04574578205475402, 
    'bagging_temperature': 0.12172958098369972, 
    'n_estimators': 8459, 
    'max_depth': 8, 
    'random_strength': 28, 
    'l2_leaf_reg': 1.6285455533915874e-05, 
    'min_child_samples': 18, 
    'max_bin': 441, 
    'od_type': 'Iter',
    "cat_features" : list(train_features.columns),
}

# random_state makes the hold-out split the same on every run of this cell,
# instead of depending on the current state of NumPy's global RNG.
X_tr, X_val, y_tr, y_val = train_test_split(
    train_features, train_ratings['rating'], test_size=0.2, random_state=SEED
)

model = CatBoostRegressor(**params_cat)
model.fit(
    X_tr,
    y_tr,
    eval_set=[(X_val, y_val)],
    #early_stopping_rounds=10,
    verbose=False,
)

# RMSE on the 20% hold-out split.
cat_pred = model.predict(X_val)
val_rmse = rmse(y_val, cat_pred)

print(val_rmse)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Five shuffled folds stratified on the rating; each entry of `folds` is a
# (train_idx, valid_idx) pair as produced by `skf.split`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
folds = list(skf.split(train_ratings, train_ratings['rating']))

In [None]:
random.seed(SEED)
# Final model of every fold, keyed by fold number.
cat_models={}

# Feature columns shared by training, validation and test prediction. Fixing the
# list once keeps the `pred_<fold>` columns added to `test_ratings` below from
# being passed as features when later folds predict.
feature_columns = [column for column in train_ratings.columns if column != 'rating']

def objective(trial):
    """Optuna objective: validation RMSE of a CatBoost model for one trial.

    Reads the module-level X_train, y_train, X_valid and y_valid, which the fold
    loop below rebinds before each study, so one study scores all of its trials
    on the same fold.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        RMSE of the fitted model on (X_valid, y_valid).
    """
    param = {
        "random_state":SEED,
        "objective" : "RMSE",
        "cat_features" : feature_columns,
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'bagging_temperature' :trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
        "n_estimators":trial.suggest_int("n_estimators", 1000, 10000),
        "max_depth":trial.suggest_int("max_depth", 4, 16),
        'random_strength' :trial.suggest_int('random_strength', 0, 100),
    #   "colsample_bylevel":trial.suggest_float("colsample_bylevel", 0.4, 1.0), disabled: it breaks GPU training
        "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
    }

    model = CatBoostRegressor(**param, task_type = 'GPU', devices = '0')

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=100
    )

    cat_pred = model.predict(X_valid)
    log_score = rmse(y_valid, cat_pred)

    return log_score

for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'===================================={fold+1}============================================')
    X_train = train_ratings[feature_columns].iloc[train_idx]
    X_valid = train_ratings[feature_columns].iloc[valid_idx]
    y_train = train_ratings['rating'].iloc[train_idx]
    y_valid = train_ratings['rating'].iloc[valid_idx]

    # One independent, seeded search per fold.
    sampler = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(
        study_name = 'cat_parameter_opt',
        direction = 'minimize',
        sampler = sampler,
    )
    study.optimize(objective, n_trials=10)

    # Refit on the fold's training part with the best hyper-parameters found.
    model = CatBoostRegressor(**study.best_params, task_type = 'GPU', devices = '0', random_state = SEED, objective = 'RMSE', cat_features = feature_columns)
    model.fit(X_train, y_train, verbose=100)
    cat_models[fold] = model

    # Predict with exactly the training feature columns.
    pred = model.predict(test_ratings[feature_columns])
    test_ratings[f'pred_{fold}'] = pred
    print(f'================================================================================\n\n')

In [None]:
# Ensemble: the final rating is the plain average of the five fold predictions.
fold_prediction_columns = [f'pred_{fold_id}' for fold_id in range(5)]
test_ratings['rating'] = sum(test_ratings[column] for column in fold_prediction_columns) / 5

# Submission file (three columns) plus a full dump that keeps the per-fold predictions.
test = test_ratings[['user_id', 'isbn', 'rating']]
test.to_csv('../submit/KSY_5KFold_Optima2.csv', index = False)
test_ratings.to_csv('../data/KSY_5KFold_Optima2.csv', index = False)

In [None]:
# Write the current `test` frame (without the row index) to a second submission file.
test.to_csv('../submit/KSY_5KFold_Optima.csv', index = False)

In [None]:
def objective(trial):
    """Optuna objective: hold-out RMSE of a CatBoost model for one trial.

    The model is trained on 80% of `train_ratings` and scored on the remaining
    20%. The split is seeded with SEED so that every trial is evaluated on the
    same rows; with an unseeded split, differences between trials would partly
    reflect the split rather than the hyper-parameters.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        RMSE of the fitted model on the hold-out split.
    """
    # Every column except the target is a (categorical) feature.
    feature_columns = [column for column in train_ratings.columns if column != 'rating']

    param = {
        "random_state":SEED,
        "objective" : "RMSE",
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'bagging_temperature' :trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
        "n_estimators":trial.suggest_int("n_estimators", 1000, 10000),
        "max_depth":trial.suggest_int("max_depth", 4, 16),
        'random_strength' :trial.suggest_int('random_strength', 0, 100),
    #   "colsample_bylevel":trial.suggest_float("colsample_bylevel", 0.4, 1.0), disabled: it breaks GPU training
        "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
    }
    train_x, val_x, train_y, val_y = train_test_split(
        train_ratings[feature_columns],
        train_ratings['rating'],
        test_size=0.2,
        random_state=SEED,
    )

    model = CatBoostRegressor(**param, task_type = 'GPU')

    model.fit(
        train_x,
        train_y,
        eval_set=[(val_x, val_y)],
        cat_features = feature_columns,
        verbose=100
    )

    cat_pred = model.predict(val_x)
    log_score = rmse(val_y, cat_pred)

    return log_score

# Optuna를 이용해 최적의 Hyperparameter 찾기

In [None]:
# Run a seeded 10-trial TPE search that minimises the value returned by `objective`.
sampler = optuna.samplers.TPESampler(seed=42)
study_options = dict(
    study_name='cat_parameter_opt',
    direction='minimize',
    sampler=sampler,
)
study = optuna.create_study(**study_options)
study.optimize(objective, n_trials=10)

# Best objective value found and the hyper-parameters that produced it.
print("Best Score:", study.best_value)
print("Best trial", study.best_trial.params)

# 변수 중요도 확인

In [None]:
def plot_feature_importance(importance,names,model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Args:
        importance: importance score of each feature.
        names: feature names, in the same order as `importance`.
        model_type: label used as the prefix of the plot title.
    """
    # One row per feature, ordered from most to least important.
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    fig, ax = plt.subplots(figsize=(20,9))
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'], ax=ax)

    ax.set_title(model_type + ' Feature Importance')
    ax.set_xlabel('Feature Importance')
    ax.set_ylabel('Feature Names')

    plt.show()


# Plot the importances of `model` against the training feature columns.
# NOTE(review): `model` is whichever model the cells above bound last — confirm it is the intended one.
plot_feature_importance(model.get_feature_importance(), train_ratings.drop(['rating'],axis = 1).columns, 'CATBOOST')