In [13]:
# import packages
import pandas as pd
import numpy as np
import warnings

import re
from pandas.api.types import CategoricalDtype
from scipy import sparse
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier, LGBMRanker
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.tree import DecisionTreeRegressor

from surprise import Dataset, Reader, accuracy, SVD, KNNBasic, CoClustering
from surprise.dataset import DatasetAutoFolds

import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings(action='ignore')

# 데이터 불러오기, 전처리

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, Dataset

def age_map(x: int) -> int:
    """Bucket an age into one of six ordinal groups.

    Parameters
    ----------
    x : int
        Age in years. Anything accepted by ``int()`` works; the value is
        truncated to an integer before bucketing.

    Returns
    -------
    int
        1 for ages below 20, 2 for 20-29, 3 for 30-39, 4 for 40-49,
        5 for 50-59 and 6 for 60 and above.
    """
    decade = int(x) // 10
    # Clamp the decade index: everything under 20 (including negatives) shares
    # bucket 1, everything from 60 upwards shares bucket 6.
    return min(max(decade, 1), 6)

def process_context_data(users, books, ratings1, ratings2):
    """
    Join ratings with user/book attributes and integer-encode the categorical columns.

    Parameters
    ----------
    users : pd.DataFrame
        Indexed contents of users.csv. Must provide ``user_id`` and a
        comma-separated ``location`` string column.
    books : pd.DataFrame
        Indexed contents of books.csv. Must provide ``isbn``, ``category``,
        ``publisher``, ``language`` and ``book_author``.
    ratings1 : pd.DataFrame
        Ratings of the train data (joined on ``user_id`` and ``isbn``).
    ratings2 : pd.DataFrame
        Ratings of the test data (joined on ``user_id`` and ``isbn``).

    Returns
    -------
    idx : dict
        Value -> integer code mappings under the keys ``loc_city2idx``,
        ``loc_state2idx``, ``loc_country2idx``, ``category2idx``,
        ``publisher2idx``, ``language2idx`` and ``author2idx``. Codes are
        assigned over the union of train and test rows.
    train_df, test_df : pd.DataFrame
        ``ratings1`` / ``ratings2`` joined with the context columns, with the
        categorical columns replaced by their codes and ``age`` replaced by its
        ``age_map`` bucket.

    Notes
    -----
    * The three ``location_*`` columns are added to the caller's ``users``
      frame in place; the ``location`` column is only dropped from the local copy.
    * Missing ages are filled with the truncated mean age of the same frame
      (train and test are imputed independently). ``int()`` of that mean raises
      ``ValueError`` if a frame has no age values at all.
    """

    # Split "city,state,country" once. ``.str[i]`` yields NaN instead of raising
    # when a location is missing or has fewer than three comma-separated parts,
    # so malformed rows end up in the NaN category rather than aborting the run.
    location_parts = users['location'].str.split(',')
    users['location_city'] = location_parts.str[0]
    users['location_state'] = location_parts.str[1]
    users['location_country'] = location_parts.str[2]
    users = users.drop(['location'], axis=1)

    ratings = pd.concat([ratings1, ratings2]).reset_index(drop=True)

    book_columns = ['isbn', 'category', 'publisher', 'language', 'book_author']

    def _with_context(frame):
        # Left joins keep every rating row even when user or book info is absent.
        return (
            frame.merge(users, on='user_id', how='left')
                 .merge(books[book_columns], on='isbn', how='left')
        )

    # context_df (train + test) is only used to build the code tables so that
    # both splits share one consistent encoding.
    context_df = _with_context(ratings)
    train_df = _with_context(ratings1)
    test_df = _with_context(ratings2)

    # Fill missing ages with the frame's own mean, then bucket them.
    for frame in (train_df, test_df):
        frame['age'] = frame['age'].fillna(int(frame['age'].mean())).apply(age_map)

    # Result key -> column to encode; insertion order fixes the key order of `idx`.
    encoded_columns = {
        "loc_city2idx": 'location_city',
        "loc_state2idx": 'location_state',
        "loc_country2idx": 'location_country',
        "category2idx": 'category',
        "publisher2idx": 'publisher',
        "language2idx": 'language',
        "author2idx": 'book_author',
    }

    idx = {}
    for key, column in encoded_columns.items():
        # Codes follow first-appearance order in the combined frame.
        value2code = {value: code for code, value in enumerate(context_df[column].unique())}
        train_df[column] = train_df[column].map(value2code)
        test_df[column] = test_df[column].map(value2code)
        idx[key] = value2code

    return idx, train_df, test_df

# --- Load the raw tables -------------------------------------------------------
users = pd.read_csv('./data/' + 'users.csv')
books = pd.read_csv('./data/' + 'books.csv')
train = pd.read_csv('./data/' + 'train_ratings.csv')
test = pd.read_csv('./data/' + 'test_ratings.csv')
sub = pd.read_csv('./data/' + 'sample_submission.csv')

# --- Build contiguous integer ids for users and books ---------------------------
# The id space covers every user / isbn seen in the train ratings or the
# submission template, numbered in order of first appearance.
ids = pd.concat([train['user_id'], sub['user_id']]).unique()
isbns = pd.concat([train['isbn'], sub['isbn']]).unique()

idx2user = dict(enumerate(ids))
idx2isbn = dict(enumerate(isbns))

user2idx = {raw_user: position for position, raw_user in idx2user.items()}
isbn2idx = {raw_isbn: position for position, raw_isbn in idx2isbn.items()}

# Replace the raw keys with the integer ids in every table that carries them.
train['user_id'] = train['user_id'].map(user2idx)
train['isbn'] = train['isbn'].map(isbn2idx)

sub['user_id'] = sub['user_id'].map(user2idx)
sub['isbn'] = sub['isbn'].map(isbn2idx)

test['user_id'] = test['user_id'].map(user2idx)
test['isbn'] = test['isbn'].map(isbn2idx)

users['user_id'] = users['user_id'].map(user2idx)
books['isbn'] = books['isbn'].map(isbn2idx)

# --- Join context features and encode them --------------------------------------
idx, context_train, context_test = process_context_data(users, books, train, test)

# Cardinality of each field: user, book, the 6 age buckets produced by age_map,
# then one entry per encoded context column.
field_dims = np.array(
    [len(user2idx), len(isbn2idx), 6]
    + [
        len(idx[mapping_name])
        for mapping_name in (
            'loc_city2idx', 'loc_state2idx', 'loc_country2idx',
            'category2idx', 'publisher2idx', 'language2idx', 'author2idx',
        )
    ],
    dtype=np.uint32,
)

# Single container for everything the later cells need.
data = {
    'train': context_train,
    'test': context_test.drop(['rating'], axis=1),
    'field_dims': field_dims,
    'users': users,
    'books': books,
    'sub': sub,
    'idx2user': idx2user,
    'idx2isbn': idx2isbn,
    'user2idx': user2idx,
    'isbn2idx': isbn2idx,
}

# --- 80/20 train/validation split ------------------------------------------------
X_train, X_valid, y_train, y_valid = train_test_split(
    data['train'].drop(['rating'], axis=1),
    data['train']['rating'],
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
data['X_train'] = X_train
data['X_valid'] = X_valid
data['y_train'] = y_train
data['y_valid'] = y_valid

# --- Torch datasets / loaders ----------------------------------------------------
train_dataset = TensorDataset(
    torch.LongTensor(data['X_train'].values),
    torch.LongTensor(data['y_train'].values),
)
valid_dataset = TensorDataset(
    torch.LongTensor(data['X_valid'].values),
    torch.LongTensor(data['y_valid'].values),
)
test_dataset = TensorDataset(torch.LongTensor(data['test'].values))

# Only the test loader keeps the original row order.
train_dataloader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=1024, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

data['train_dataloader'] = train_dataloader
data['valid_dataloader'] = valid_dataloader
data['test_dataloader'] = test_dataloader


In [15]:
# Remove the two finest-grained location features from every feature frame so
# that training, validation and prediction all see the same columns.
# errors='ignore' keeps the cell idempotent: re-running it after the columns are
# already gone is a no-op instead of a KeyError.
dropped_features = ['location_city', 'location_state']
for split_name in ('X_train', 'X_valid', 'test'):
    data[split_name] = data[split_name].drop(columns=dropped_features, errors='ignore')

# CatBoostClassifier 사용

In [None]:
import tqdm

# Hyper-parameters for the CatBoost models; `param` keeps them under a
# per-model key.
params = {
    'iterations': 100,
    'learning_rate': 0.1,
    'depth': 8,
}
param = {'catboost': params}
catboost_cl = CatBoostClassifier(**params, verbose=True, random_state=42)

# Numeric feature columns only, and the same columns for the hold-out rows.
train_features = data['X_train'].select_dtypes(exclude='object')
valid_features = data['X_valid'][train_features.columns]

# early_stopping_rounds has nothing to monitor unless a validation set is
# supplied, so the hold-out split is passed as eval_set.
# NOTE(review): patience (100) equals `iterations` (100), so training can never
# actually stop early; the eval_set mainly enables best-iteration selection.
catboost_cl.fit(
    train_features,
    data['y_train'],
    eval_set=(valid_features, data['y_valid']),
    early_stopping_rounds=100,
)

In [16]:
# Predict on exactly the columns the classifier was fitted on, in fit order,
# instead of relying on the test frame happening to match.
pred = catboost_cl.predict(data['test'][catboost_cl.feature_names_])
submission = pd.read_csv('./data/sample_submission.csv')
# ravel() flattens the predictions whether they come back as (n, 1) or (n,);
# squeeze(1) would raise on a 1-D result.
submission['rating'] = np.ravel(pred)
submission.to_csv('./code/submit/first.csv', index=False)

# CatBoostRegressor 사용

In [None]:
# Regressor reusing the shared `params` (iterations / learning_rate / depth),
# with L2 leaf regularisation switched off.
catboost_r = CatBoostRegressor(**params,od_pval=0, l2_leaf_reg=0, verbose=True, random_state=42)
# early_stopping_rounds needs a validation set to monitor; the hold-out rows are
# restricted to the training columns so both frames carry the same features.
catboost_r.fit(
    data['X_train'],
    data['y_train'],
    eval_set=(data['X_valid'][data['X_train'].columns], data['y_valid']),
    early_stopping_rounds=100,
)

In [124]:
# Select the regressor's own fit-time feature columns (same names, same order)
# so prediction does not depend on which columns the test frame currently holds.
catboost_pred_r = catboost_r.predict(data['test'][catboost_r.feature_names_])

In [126]:
# Fill the submission template with the regressor's predictions and save it.
submission = pd.read_csv('./data/sample_submission.csv').assign(rating=catboost_pred_r)
submission.to_csv('./code/submit/second.csv', index=False)

# KFold 이용하여 CatBoostRegressor 사용

In [None]:
from sklearn.model_selection import KFold

is_holdout = False   # True: stop after the first fold (single hold-out run)
n_splits = 5
iterations = 100
patience = 50        # early-stopping rounds on the fold's validation RMSE

cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)

scores = []   # best validation RMSE of each fold
models = []   # fitted model of each fold

for tri, vai in cv.split(X_train):
    print("="*50)

    # Train on this fold's training rows and validate on its held-out rows.
    # (Previously the fold indices were ignored, so every iteration fitted the
    # same model on all of X_train and scored it on the same X_valid.)
    X_fold_train, y_fold_train = X_train.iloc[tri], y_train.iloc[tri]
    X_fold_valid, y_fold_valid = X_train.iloc[vai], y_train.iloc[vai]

    model = CatBoostRegressor(iterations=iterations,random_state=42,task_type="GPU",eval_metric="RMSE",one_hot_max_size=4)
    model.fit(
        X_fold_train,
        y_fold_train,
        eval_set=[(X_fold_valid, y_fold_valid)],
        early_stopping_rounds=patience,
        verbose=100,
    )

    models.append(model)
    scores.append(model.get_best_score()["validation"]["RMSE"])
    if is_holdout:
        break

# CatBoostRegressor grid_search 사용해서 하이퍼 파라미터 최적값 찾기

In [None]:
from sklearn.model_selection import GridSearchCV

# Search the 3 x 3 x 3 grid below with the model's own grid_search method.
# NOTE: this rebinds `params`, which earlier cells use as the fixed
# hyper-parameter dict, so those cells must not be re-run after this one.
model = CatBoostRegressor(verbose=True, random_state=42)
params = {
    'iterations': [50, 100, 200],
    'learning_rate': [0.1, 0.05, 0.01],
    'depth': [6, 8, 10],
}
grid_search_result = model.grid_search(params, X=data['X_train'], y=data['y_train'])

In [11]:
# Predict on the model's own fit-time feature columns (same names, same order)
# rather than on whatever columns the test frame currently holds.
pred = model.predict(data['test'][model.feature_names_])
submission = pd.read_csv('./data/sample_submission.csv')
submission['rating'] = pred
# NOTE(review): this writes to the same path as the plain CatBoostRegressor
# submission and overwrites it; confirm that is intended.
submission.to_csv('./code/submit/second.csv', index=False)

In [155]:
# Display the predictions, computed on the columns the model was fitted on.
model.predict(data['test'][model.feature_names_])

array([7.6625509 , 7.86972948, 7.59137493, ..., 7.17719288, 6.15119298,
       7.01595701])