# 1. Import Library

In [1]:
import os
print(os.listdir())

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn

import os
import re
import random

from tqdm import tqdm

from collections import defaultdict
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import category_encoders as ce

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, make_scorer

['Data_Search-total.ipynb', 'temp.txt', 'Data_Search-Final.ipynb', 'evaluation.py', '.git', 'ensemble.py', '.gitignore', 'catboost_info', 'src', 'main.py', '.ipynb_checkpoints']


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
# seed 고정
SEED = 42
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

seed_everything(SEED)

# lower case
def str_lower(x):
    try:
        return x.lower()
    except:
        return x

# 2. Data Load

In [4]:
path = '/opt/ml/data/'

books_path = path + '/books.csv'
users_path = path + '/users.csv'
train_ratings_path = path + '/train_ratings.csv'
test_ratings_path = path + '/test_ratings.csv'

In [5]:
df_books = pd.read_csv(books_path)
df_users = pd.read_csv(users_path)
df_train_ratings = pd.read_csv(train_ratings_path)
df_test_ratings = pd.read_csv(test_ratings_path)

print('users shape: ', df_users.shape)
print('books shape: ', df_books.shape)
print('train ratings shape: ', df_train_ratings.shape)
print('test ratings shape: ', df_test_ratings.shape)

df_users['user_id'] = df_users['user_id'].astype(str)
df_train_ratings['user_id'] = df_train_ratings['user_id'].astype(str)
df_test_ratings['user_id'] = df_test_ratings['user_id'].astype(str)

users shape:  (68092, 3)
books shape:  (149570, 10)
train ratings shape:  (306795, 3)
test ratings shape:  (76699, 3)


# 3. Preprocessing

## 1) Users

In [6]:
df_users['user_id'].nunique()

68092

### 1-1) location

In [7]:
df_users['location'] = df_users['location'].apply(lambda x : re.sub(r'[^a-zA-Z,]', '', x)) # 특수문자 제거(숫자도 포함해서 제거)

df_users['location_city'] = df_users['location'].apply(lambda x: x.split(',')[0].strip()) # # city
df_users['location_state'] = df_users['location'].apply(lambda x: x.split(',')[1].strip()) # state
df_users['location_country'] = df_users['location'].apply(lambda x: x.split(',')[2].strip()) # country

df_users = df_users.replace('na', np.nan)
df_users = df_users.replace('', np.nan)

In [8]:
df_users['location'] = df_users['location'].apply(lambda x: ','.join(map(str, ['na' if '' == i else i for i in x.split(',')])))

In [9]:
def fill_location(df, null_feature, using_feature):
    modify_location = set(df[(df[null_feature].isna()) & ((df[using_feature].notnull()))][using_feature].values)
    location_list = []
    for location in tqdm(modify_location): # usinf_feature값은 존재하는데 null_feature가 없는 경우의 using_feature값
        try:
            candidates = df[(df[using_feature]==location) & ((df[using_feature].notnull()))]['location'].value_counts()
            first = candidates.idxmax()
            for k, v in dict(candidates).items():
                k = k.split(',')
                if 'na' not in k:
                    right_location = ','.join(map(str, k))
                    break
            else:
                right_location = first
            location_list.append(right_location)
        except:
            pass
    
    if null_feature == 'location_state':
        j = 1
    elif null_feature == 'location_country':
        j = 2
    
    if using_feature == 'location_city':
        i = 0
    elif using_feature == 'location_state':
        i = 1
    else: # using_feature == location_country
        i = 2
        
    for location in tqdm(location_list): # [지역, 주, 국가]
        df.loc[df[(df[using_feature]==location.split(',')[i]) & (df[null_feature].isna())].index, null_feature] = location.split(',')[j]

In [10]:
# 1. state의 결측치 채우기
## 1-1) state의 결측치를 city 정보를 활용해 채우기
fill_location(df_users, 'location_state', 'location_city')
## 1-2) state의 결측치를 country 정보를 활용해 채우기
fill_location(df_users, 'location_state', 'location_country')

# 2. state의 결측치 채우기
## 2-1) country의 결측치를 city 정보를 활용해 채우기
fill_location(df_users, 'location_country', 'location_city')
## 2-2) country의 결측치를 state 정보를 활용해 채우기
fill_location(df_users, 'location_country', 'location_state')

100%|██████████| 1556/1556 [00:12<00:00, 123.34it/s]
100%|██████████| 1556/1556 [00:12<00:00, 124.04it/s]
100%|██████████| 25/25 [00:00<00:00, 92.65it/s] 
100%|██████████| 25/25 [00:00<00:00, 125.87it/s]
100%|██████████| 1172/1172 [00:09<00:00, 122.79it/s]
100%|██████████| 1172/1172 [00:09<00:00, 125.33it/s]
100%|██████████| 2/2 [00:00<00:00, 87.50it/s]
100%|██████████| 2/2 [00:00<00:00, 120.62it/s]


In [11]:
# 남은 결측치는 unknown으로 채우기
df_users[['location_city', 'location_state', 'location_country']] = df_users[['location_city', 'location_state', 'location_country']].fillna('unknown')

# where가 들어간 city, state, country는 다 unknown으로 교체
df_users.loc[df_users[df_users['location_city'].str.contains('where')].index, 'location_city'] = 'unknown'
df_users.loc[df_users[df_users['location_state'].str.contains('where')].index, 'location_state'] = 'unknown'
df_users.loc[df_users[df_users['location_country'].str.contains('where')].index, 'location_country'] = 'unknown'

In [12]:
temp = df_users.groupby('location_city')['location_country'].value_counts().groupby('location_city').idxmax().apply(lambda x : x[-1]).reset_index().rename(columns = {'count':'location_country'})
city2country = dict(zip(temp['location_city'].values, temp['location_country'].values))

In [13]:
df_users['location_country'] = df_users['location_city'].map(city2country)

In [14]:
df_users = df_users.drop(columns='location', axis=1)

### 1-2) age

In [18]:
def binning_age(x):
    if np.isnan(x):
        return np.nan
    elif x < 20:
        x = 10
    elif 20 <= x < 30:
        x = 20
    elif 30 <= x < 40:
        x = 30
    elif 40 <= x < 50:
        x = 40
    elif 50 <= x < 60:
        x = 60 
    elif x >= 60:
        x = 60
    return x

In [19]:
temp = df_train_ratings.merge(df_users, on = 'user_id')
gia = temp.groupby('isbn')['age'].mean()
temp['gia'] = temp['isbn'].map(gia)
grouped_isbn_age_grouped_uid_gia = temp.groupby('user_id')['gia'].mean()
temp['giagua'] = temp['user_id'].map(grouped_isbn_age_grouped_uid_gia)
# temp['giagua'].isna().mean()

In [20]:
feature2index = dict(zip(temp.columns, range(len(temp.columns))))
temp['final'] = temp.apply(lambda x : x[feature2index['giagua']] if type(x[feature2index['age']]) == float and np.isnan(x[feature2index['age']]) else x[feature2index['age']], axis = 1)
uid2age = dict(zip(temp['user_id'].values, temp['final'].values))

In [21]:
df_users['age'] = df_users['user_id'].map(uid2age)
df_users['binning_age'] = df_users['age'].apply(binning_age)
df_users['binning_age'] = df_users['binning_age'].astype('str')

In [22]:
df_users.isna().sum()

user_id                 0
age                 13061
location_city           0
location_state          0
location_country        0
binning_age             0
dtype: int64

In [23]:
# 남은 결측치는 국가의 평균 나이로 fill
df_users['age'] = df_users.groupby('location_country')['age'].transform(lambda x: x.fillna(x.mean())) 

In [24]:
# 국가로 처리한 이후에 남은 결측값은 전체 평균으로 fill
df_users.loc[df_users[df_users['age'].isna()].index, 'age'] = df_users['age'].mean()

In [25]:
df_users.isna().sum()

user_id             0
age                 0
location_city       0
location_state      0
location_country    0
binning_age         0
dtype: int64

In [27]:
# users file
df_users.to_csv(path + 'users_preprocessed.csv', index=False)

## 2) Books

In [28]:
for column in df_books.columns:
    if column == 'isbn':
        continue
    if df_books[column].dtype == object:
        print(column)
        df_books[column] = df_books[column].apply(str_lower)

book_title
book_author
publisher
img_url
language
category
summary
img_path


In [29]:
df_books.isna().mean()

isbn                   0.000000
book_title             0.000000
book_author            0.000007
year_of_publication    0.000000
publisher              0.000000
img_url                0.000000
language               0.449468
category               0.460326
summary                0.449468
img_path               0.000000
dtype: float64

### 2-1) publisher

In [30]:
df_books['publisher'].nunique() # publisher의 cardinality가 높음

11428

In [31]:
df_books['pnumber'] = df_books['isbn'].apply(lambda x : x[:3]) # 출판사 cardinality 줄이기 위한 isbn으로 대략적인 group identifier + publisher identifier 추출

### 2-2) category

In [32]:
# 카테고리 줄이기
df_books.loc[df_books[df_books['category'].notnull()].index, 'category'] = df_books[df_books['category'].notnull()]['category'].apply(lambda x: re.sub('[\W_]+',' ',x).lower().strip())

In [33]:
# 작가별 category 순위 집계 -> category를 book_author를 이용해서 채움
temp = df_books.groupby('book_author')['category'].value_counts().groupby('book_author').idxmax().apply(lambda x : x[-1])
temp = defaultdict(lambda : np.nan, temp.to_dict())

In [34]:
feature2index = dict(zip(df_books.columns, range(len(df_books.columns))))
df_books['category'] = df_books.apply(lambda x : temp[x[feature2index['book_author']]] if type(x[feature2index['category']]) == float and np.isnan(x[feature2index['category']]) else x[feature2index['category']], axis = 1)

In [35]:
df_books['category'].isna().mean()

0.17736177040850437

In [36]:
# 출판사별 category 순위 집계 -> category를 publisher(pnumber)를 이용해서 채움
temp = df_books.groupby('pnumber')['category'].value_counts().groupby('pnumber').idxmax().apply(lambda x : x[-1])
temp = defaultdict(lambda : np.nan, temp.to_dict())

In [37]:
feature2index = dict(zip(df_books.columns, range(len(df_books.columns))))
df_books['category'] = df_books.apply(lambda x : temp[x[feature2index['pnumber']]] if type(x[feature2index['category']]) == float and np.isnan(x[feature2index['category']]) else x[feature2index['category']], axis = 1)

In [38]:
df_books['category'].isna().mean()

0.0009360165808651468

In [39]:
words = defaultdict(int)
for value in df_books['category'].values:
    try:
        if len(value.split()) == 1:
            words[value] += 1
    except:
        pass
print(len(words))
categories = [(value, key) for key,value in words.items()]
categories.sort(reverse = True)

1270


In [40]:
# df_books['category'].value_counts()
for _, category in tqdm(categories):
    df_books.loc[df_books[df_books['category'].str.contains(category,na=False)].index,'category_high'] = category # 상위 카테고리 column

100%|██████████| 1270/1270 [01:07<00:00, 18.76it/s]


### 2-3) language

In [41]:
# language 결측치
print(df_books['language'].isna().mean())

0.449468476298723


In [42]:
# language를 book_author로 채우기
temp = df_books.groupby('book_author')['language'].value_counts().groupby('book_author').idxmax().apply(lambda x : x[-1])
temp = defaultdict(lambda : np.nan, temp.to_dict())
feature2index = dict(zip(df_books.columns, range(len(df_books.columns))))
df_books['language'] = df_books.apply(lambda x : temp[x[feature2index['book_author']]] if type(x[feature2index['language']]) == float and np.isnan(x[feature2index['language']]) else x[feature2index['language']], axis = 1)

In [43]:
# language를 publisher로 채우기
temp = df_books.groupby('pnumber')['language'].value_counts().groupby('pnumber').idxmax().apply(lambda x : x[-1])
temp = defaultdict(lambda : np.nan, temp.to_dict())
feature2index = dict(zip(df_books.columns, range(len(df_books.columns))))
df_books['language'] = df_books.apply(lambda x : temp[x[feature2index['pnumber']]] if type(x[feature2index['language']]) == float and np.isnan(x[feature2index['language']]) else x[feature2index['language']], axis = 1)

In [44]:
print(df_books['language'].isna().mean())

0.000822357424617236


In [45]:
def binning_year(x):
    if x < 1970:
        return '1970'
    elif 1970 <= x < 1980:
        return '1980'
    elif 1980 <= x < 1990:
        return '1990'
    elif 1990 <= x < 2000:
        return '2000'
    else:
        return 'Early'
    
df_books['binning_year'] = df_books['year_of_publication'].apply(binning_year)

In [49]:
df_books.isnull().mean()

isbn                   0.000000
book_title             0.000000
book_author            0.000007
year_of_publication    0.000000
publisher              0.000000
img_url                0.000000
language               0.000822
category               0.000936
summary                0.449468
img_path               0.000000
pnumber                0.000000
category_high          0.032486
binning_year           0.000000
dtype: float64

In [50]:
# books file
df_books.to_csv(path + 'books_preprocessed.csv', index=False)

---

In [46]:
temp = df_train_ratings.merge(df_books, how='left', on='isbn')
data = temp.merge(df_users, how='left', on='user_id')
print('merge 결과 shape: ', data.shape)

merge 결과 shape:  (306795, 20)


In [51]:
data = data[['rating', 'user_id', 'isbn', 'book_author', 'language', 'category_high','pnumber', 'binning_year','binning_age', 'location_city', 'location_state', 'location_country', 'book_title']]

data['book_author'] = data['book_author'].fillna(value = 'unknown')
data['language'] = data['language'].fillna(value = 'unknown')
data['binning_year'] = data['binning_year'].fillna(value = 'unknown')
data['category_high'] = data['category_high'].fillna(value = 'unknown')
data['binning_age'] = data['binning_age'].fillna(value = 'unknown')
data['location_city'] = data['location_city'].fillna(value = 'unknown')
data['location_state'] = data['location_state'].fillna(value = 'unknown')
data['location_country'] = data['location_country'].fillna(value = 'unknown')
data['book_title'] = data['book_title'].fillna(value = 'unknown')

In [52]:
data.to_csv(path + 'train_data.csv', index=False)

# 4. Modeling

In [53]:
X = data.drop('rating', axis = 1)
categorical_features_indices = np.where(X.dtypes == object)[0]
print(categorical_features_indices)
y = data['rating']

[ 0  1  2  3  4  5  6  7  8  9 10 11]


In [54]:
def rmse(y, y_pred):
    return mean_squared_error(y, y_pred, squared = False)

n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
parms = {'verbose' : 0, 'cat_features' : categorical_features_indices}
# parms = {'verbose' : 0, 'cat_features' : categorical_features_indices, 'task_type' : 'GPU'}
# parms = {'verbose' : 0, 'cat_features' : categorical_features_indices, 'task_type' : 'GPU', 'learning_rate': 0.04574578205475402, 'bagging_temperature': 0.12172958098369972, 'n_estimators': 8459, 'max_depth': 8, 'random_strength': 28, 'l2_leaf_reg': 1.6285455533915874e-05, 'min_child_samples': 18, 'max_bin': 441, 'od_type': 'Iter'}
# parms = {'verbose' : 0, 'cat_features' : categorical_features_indices, 'task_type' : 'GPU', 'learning_rate': 0.10952662748632554, 'bagging_temperature': 0.03613894271216528, 'n_estimators': 3629, 'max_depth': 8, 'random_strength': 46, 'l2_leaf_reg': 2.355742708217648e-05, 'min_child_samples': 24, 'max_bin': 354, 'od_type': 'IncToDec'}
creg = CatBoostRegressor(**parms)
creg.fit(X, y)
y_pred = creg.predict(X)
print(creg.__class__.__name__)
print(rmse(y, y_pred))

creg_val = cross_val_score(creg, X, y, cv = cv, scoring = make_scorer(rmse))
print('val acc :', creg_val.mean())

result = list(zip(creg.get_feature_importance(), X.columns))
result.sort(reverse = True)
print(result)

CatBoostRegressor
1.6934133598073444
val acc : 2.138551699166874
[(45.22383812371767, 'user_id'), (18.583104701753218, 'book_author'), (6.894390704544789, 'book_title'), (5.832228802104024, 'pnumber'), (3.96727341949621, 'location_city'), (3.74048363564615, 'binning_year'), (3.5496261413283925, 'binning_age'), (3.327729652609854, 'category_high'), (3.2722527207044814, 'location_state'), (3.147683263668848, 'location_country'), (1.6316698822030438, 'language'), (0.8297189522232724, 'isbn')]


In [55]:
y_pred = creg.predict(X)
print(creg.__class__.__name__)
print(rmse(y, y_pred))

CatBoostRegressor
1.6934133598073444


In [56]:
cat_encoder = ce.cat_boost.CatBoostEncoder()
cat_encoder.fit(X, y)
X = cat_encoder.transform(X)

In [57]:
lreg = LGBMRegressor()
lreg.fit(X, y)
y_pred = lreg.predict(X)
print(lreg.__class__.__name__)
print(rmse(y, y_pred))

lreg_val = cross_val_score(lreg, X, y, cv = cv, scoring = make_scorer(rmse))
print('val acc :', lreg_val.mean())

# result = list(zip(lreg.get_feature_importance(), X.columns))
# result.sort(reverse = True)
# print(result)

LGBMRegressor
1.7964382240638457
val acc : 1.8096937325127374


In [58]:
xreg = XGBRegressor(objective='reg:squarederror')
xreg.fit(X, y)
y_pred = xreg.predict(X)
print(xreg.__class__.__name__)
print(rmse(y, y_pred))

xreg_val = cross_val_score(xreg, X, y, cv = cv, scoring = make_scorer(rmse))
print('val acc :', xreg_val.mean())

# result = list(zip(xreg.get_feature_importance(), X.columns))
# result.sort(reverse = True)
# print(result)

XGBRegressor
1.756781562927977
val acc : 1.8115133472048666


---

In [59]:
path = '/opt/ml/data'
sample_submission_path = path + '/sample_submission.csv'

In [60]:
df_sample_submission = pd.read_csv(sample_submission_path)
df_sample_submission['user_id'] = df_sample_submission['user_id'].astype(str)

In [61]:
temp = df_sample_submission.merge(df_books, how='left', on='isbn')
test_data = temp.merge(df_users, how='left', on='user_id')
print('merge 결과 shape: ', test_data.shape)
test_data.head(3)

merge 결과 shape:  (76699, 20)


Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path,pnumber,category_high,binning_year,age,location_city,location_state,location_country,binning_age
0,11676,2005018,0,clara callan,richard bruce wright,2001.0,harperflamingo canada,http://images.amazon.com/images/p/0002005018.0...,en,actresses,"in a small town in canada, clara callan reluct...",images/0002005018.01.thumbzzz.jpg,0,actresses,Early,37.347285,unknown,unknown,usa,30.0
1,116866,2005018,0,clara callan,richard bruce wright,2001.0,harperflamingo canada,http://images.amazon.com/images/p/0002005018.0...,en,actresses,"in a small town in canada, clara callan reluct...",images/0002005018.01.thumbzzz.jpg,0,actresses,Early,40.833523,ottawa,ontario,canada,40.0
2,152827,60973129,0,decision in normandy,carlo d'este,1991.0,harperperennial,http://images.amazon.com/images/p/0060973129.0...,en,1940 1949,"here, for the first time in paperback, is an o...",images/0060973129.01.thumbzzz.jpg,6,0,2000,40.0,ottawa,ontario,canada,40.0


In [62]:
test_data = test_data[['rating', 'user_id', 'isbn', 'book_author', 'language', 'category_high','pnumber', 'binning_year','binning_age', 'location_city', 'location_state', 'location_country', 'book_title']]

test_data['book_author'] = test_data['book_author'].fillna(value = 'unknown')
test_data['language'] = test_data['language'].fillna(value = 'unknown')
test_data['binning_year'] = test_data['binning_year'].fillna(value = 'unknown')
test_data['category_high'] = test_data['category_high'].fillna(value = 'unknown')
test_data['binning_age'] = test_data['binning_age'].fillna(value = 'unknown')
test_data['location_city'] = test_data['location_city'].fillna(value = 'unknown')
test_data['location_state'] = test_data['location_state'].fillna(value = 'unknown')
test_data['location_country'] = test_data['location_country'].fillna(value = 'unknown')
test_data['book_title'] = test_data['book_title'].fillna(value = 'unknown')

In [65]:
X = test_data.drop('rating', axis = 1)
y = test_data['rating']

y_pred = creg.predict(X)

In [66]:
# df_sample_submission['rating'] = y_pred
df_sample_submission['rating'] = y_pred

In [67]:
df_sample_submission.head()

Unnamed: 0,user_id,isbn,rating
0,11676,2005018,6.650621
1,116866,2005018,7.118094
2,152827,60973129,7.623937
3,157969,374157065,7.902285
4,67958,399135782,7.762003


In [68]:
df_sample_submission['rating'].isna().sum()

0

In [69]:
df_sample_submission.to_csv('/opt/ml/submit' + '/cat_boost_submission.csv', index=False)