In [47]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import re
from tqdm import tqdm
from collections import defaultdict

In [110]:
# Input CSV locations, all resolved relative to the parent directory.
path = '..'
data_path = f'{path}/data'
books_path = f'{data_path}/books.csv'
users_path = f'{data_path}/users.csv'
train_ratings_path = f'{data_path}/train_ratings.csv'
test_ratings_path = f'{data_path}/test_ratings.csv'

In [111]:
# Load the four raw tables in one pass.
df_books, df_users, df_train_ratings, df_test_ratings = (
    pd.read_csv(csv_path)
    for csv_path in (books_path, users_path, train_ratings_path, test_ratings_path)
)

# Report the size of each table.
for label, frame in (
    ('users', df_users),
    ('books', df_books),
    ('train ratings', df_train_ratings),
    ('test ratings', df_test_ratings),
):
    print(f'{label} shape: ', frame.shape)

# user_id is left with the dtype read_csv inferred, in all three frames, so the
# later merges on user_id compare like with like.

users shape:  (68092, 3)
books shape:  (149570, 10)
train ratings shape:  (306795, 3)
test ratings shape:  (76699, 3)


In [112]:
def str_lower(x):
    """Return ``x.lower()`` when ``x`` supports it, otherwise return ``x`` unchanged.

    Values without a ``lower`` method (NaN floats, numbers, None) pass through
    untouched, so the function can be applied to columns with missing values.
    Only the missing-method case is tolerated; any other error raised by
    ``lower()`` propagates instead of being silently swallowed.
    """
    try:
        return x.lower()
    except AttributeError:
        return x

## Users

In [51]:
# Lower-case every text (object-dtype) column of the users table, leaving user_id alone.
text_columns = [
    name for name in df_users.columns
    if name != 'user_id' and df_users[name].dtype == object
]
for column in text_columns:
    df_users[column] = df_users[column].apply(str_lower)

In [52]:
# Inspect dtypes and non-null counts to see which columns (age) have missing values
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68092 entries, 0 to 68091
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   user_id   68092 non-null  int64  
 1   location  68092 non-null  object 
 2   age       40259 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.6+ MB


In [7]:
df_users.isna().mean()

user_id     0.000000
location    0.000000
age         0.408756
dtype: float64

### (1) location 결측치 해결

In [8]:
# Keep only letters and commas, so "city, state, country" becomes "city,state,country".
cleaned_location = df_users['location'].str.replace(r'[^a-zA-Z,]', '', regex=True)
# Replace empty components with the placeholder 'na' so positions stay aligned.
cleaned_location = cleaned_location.apply(
    lambda x: ','.join('na' if part == '' else part for part in x.split(','))
)
df_users['location'] = cleaned_location

# Split into city / state / country. `.str[position]` yields NaN when a location
# has fewer components than expected, instead of raising IndexError.
location_parts = cleaned_location.str.split(',')
for position, part_column in enumerate(['location_city', 'location_state', 'location_country']):
    df_users[part_column] = location_parts.str[position].str.strip()

# Placeholders and empty strings become real missing values.
df_users = df_users.replace('na', np.nan)
df_users = df_users.replace('', np.nan)

**state 결측치는 city,country 정보, country 결측치는 city,state 정보 이용해서 해결**

In [9]:
def fillna_location(to_fill:int,to_use:int):
    """Fill missing values of one location column using another location column.

    Positions are 0 = city, 1 = state, 2 = country. For every value of the
    ``to_use`` column that appears on a row whose ``to_fill`` column is missing,
    a reference ``location`` string is chosen among rows sharing that value:
    the most frequent string with no 'na' component, or the most frequent
    string overall when none is complete. The ``to_fill`` component of that
    reference is then written into the rows that are still missing it.

    Mutates the module-level ``df_users`` frame in place; returns None.

    Args:
        to_fill: position of the column whose missing values are filled.
        to_use: position of the column used as the lookup key.
    """
    column_name = {0:'location_city',1:'location_state',2:'location_country'}
    to_fill_colname,to_use_colname = column_name[to_fill],column_name[to_use]

    needs_fill = df_users[to_fill_colname].isna() & df_users[to_use_colname].notnull()
    modify_location = set(df_users.loc[needs_fill, to_use_colname].values)

    location_list = []
    for location in tqdm(modify_location):
        # value_counts() orders candidates from most to least frequent
        candidates = df_users.loc[df_users[to_use_colname] == location, 'location'].value_counts()
        if candidates.empty:
            continue
        right_location = next(
            (candidate for candidate in candidates.index if 'na' not in candidate.split(',')),
            candidates.idxmax(),
        )
        location_list.append(right_location)

    for location in tqdm(location_list): # [city, state, country]
        parts = location.split(',')
        # Skip references that cannot supply a value: too few components, or the
        # placeholder 'na', which must not be stored as if it were a real value.
        if max(to_fill, to_use) >= len(parts) or parts[to_fill] == 'na':
            continue
        still_missing = (df_users[to_use_colname] == parts[to_use]) & df_users[to_fill_colname].isna()
        df_users.loc[still_missing, to_fill_colname] = parts[to_fill]

# Fill state from city, state from country, country from city, country from state — in that order.
for to_fill, to_use in [(1, 0), (1, 2), (2, 0), (2, 1)]:
    fillna_location(to_fill, to_use)

location_columns = ['location_city', 'location_state', 'location_country']

# Whatever is still missing becomes 'unknown'
df_users[location_columns] = df_users[location_columns].fillna('unknown')

# Any city, state or country containing 'where' is replaced with 'unknown'
for location_column in location_columns:
    contains_where = df_users[location_column].str.contains('where')
    df_users.loc[contains_where, location_column] = 'unknown'

100%|██████████| 1556/1556 [00:12<00:00, 124.83it/s]
100%|██████████| 1556/1556 [00:12<00:00, 126.00it/s]
100%|██████████| 25/25 [00:00<00:00, 95.69it/s] 
100%|██████████| 25/25 [00:00<00:00, 127.93it/s]
100%|██████████| 1172/1172 [00:09<00:00, 125.17it/s]
100%|██████████| 1172/1172 [00:09<00:00, 125.75it/s]
100%|██████████| 2/2 [00:00<00:00, 89.64it/s]
100%|██████████| 2/2 [00:00<00:00, 122.65it/s]


**location city에 대해 location_country가 max인 나라로 채워서 noise 해결**

In [10]:
# Count how often each country appears for every city ...
country_counts = df_users.groupby('location_city')['location_country'].value_counts()
# ... and keep, per city, the country with the highest count.
temp = (
    country_counts
    .groupby('location_city')
    .idxmax()
    .apply(lambda city_country: city_country[-1])
    .reset_index()
    .rename(columns={'count': 'location_country'})
)
city2country = dict(zip(temp['location_city'].values, temp['location_country'].values))

# Overwrite each user's country with the dominant country of their city to remove noise
df_users['location_country'] = df_users['location_city'].map(city2country)

### (2) age 결측치 해결

In [11]:
def binning_age(x):
    """Bucket an age into its decade: 10, 20, 30, 40, 50 or 60.

    Ages below 20 fall into the 10 bucket and ages of 60 or more into the 60
    bucket. NaN is returned unchanged so missing ages stay missing.
    """
    if np.isnan(x):
        return np.nan
    if x < 20:
        return 10
    if x >= 60:
        return 60
    # 20 <= x < 60: round down to the decade (50-59 -> 50)
    return int(x // 10) * 10

# df_users['binning_age'] = df_users['age'].apply(binning_age)

**같은 책을 읽은 사람 나이의 평균의 평균(유저당 여러권 책을 읽기 때문)**

In [12]:
# One row per training rating, enriched with the rater's profile.
temp = pd.merge(df_train_ratings, df_users, on='user_id')

# gia: mean age of the users who rated each book
gia = temp['age'].groupby(temp['isbn']).mean()
temp['gia'] = temp['isbn'].map(gia)

# giagua: per user, the mean of `gia` over the books that user rated
grouped_isbn_age_grouped_uid_gia = temp['gia'].groupby(temp['user_id']).mean()
temp['giagua'] = temp['user_id'].map(grouped_isbn_age_grouped_uid_gia)

# Rows where no co-reader of any of the user's books has a known age
temp['giagua'].isnull().sum()

5540

In [13]:
# Keep the recorded age when present; otherwise fall back to giagua.
temp['final'] = temp['age'].fillna(temp['giagua'])

# user_id -> age (recorded age if it exists, giagua otherwise)
uid2age = dict(zip(temp['user_id'].values, temp['final'].values))

# Look the age up by each user's own user_id. Mapping from `temp['user_id']`
# would align on the row index of the ratings frame and hand users the age of
# an unrelated rating row. Users without training ratings keep their recorded age.
df_users['age'] = df_users['user_id'].map(uid2age).fillna(df_users['age'])

**(1) 나이 결측치가 안채워진 경우, user가 사는 나라의 평균 age로 결측치 채우기**    
**(2) 그래도 결측치 있으면 전체 user의 나이 평균으로 결측치 채우기**

In [14]:
def _fill_with_group_mean(ages):
    """Replace missing ages in one country group with that group's mean age."""
    return ages.fillna(ages.mean())

# (1) fill with the mean age of the user's country
df_users['age'] = df_users.groupby('location_country')['age'].transform(_fill_with_group_mean)

# (2) anything still missing gets the mean age over `temp`
#     (temp has one row per training rating, so frequent raters weigh more)
age_still_missing = df_users['age'].isna()
df_users.loc[age_still_missing, 'age'] = temp['age'].mean()

In [15]:
# Bucket ages into decades and store the bucket as a string so it is treated as a category.
df_users['binning_age'] = df_users['age'].apply(binning_age).astype('str')

In [16]:
df_users.isna().sum()

user_id             0
location            0
age                 0
location_city       0
location_state      0
location_country    0
binning_age         0
dtype: int64

In [17]:
# The raw location string and numeric age are replaced by the derived columns.
df_users = df_users.drop(columns=['location', 'age'])

## Books

In [53]:
# Lower-case every text (object-dtype) column of the books table except the isbn key,
# printing the name of each column that is processed.
for column in df_books.columns:
    if column != 'isbn' and df_books[column].dtype == object:
        print(column)
        df_books[column] = df_books[column].apply(str_lower)

book_title
book_author
publisher
img_url
language
category
summary
img_path


In [19]:
df_books.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path
0,2005018,clara callan,richard bruce wright,2001.0,harperflamingo canada,http://images.amazon.com/images/p/0002005018.0...,en,['actresses'],"in a small town in canada, clara callan reluct...",images/0002005018.01.thumbzzz.jpg
1,60973129,decision in normandy,carlo d'este,1991.0,harperperennial,http://images.amazon.com/images/p/0060973129.0...,en,['1940-1949'],"here, for the first time in paperback, is an o...",images/0060973129.01.thumbzzz.jpg
2,374157065,flu: the story of the great influenza pandemic...,gina bari kolata,1999.0,farrar straus giroux,http://images.amazon.com/images/p/0374157065.0...,en,['medical'],"describes the great flu epidemic of 1918, an o...",images/0374157065.01.thumbzzz.jpg
3,399135782,the kitchen god's wife,amy tan,1991.0,putnam pub group,http://images.amazon.com/images/p/0399135782.0...,en,['fiction'],a chinese immigrant who is convinced she is dy...,images/0399135782.01.thumbzzz.jpg
4,425176428,what if?: the world's foremost military histor...,robert cowley,2000.0,berkley publishing group,http://images.amazon.com/images/p/0425176428.0...,en,['history'],"essays by respected military historians, inclu...",images/0425176428.01.thumbzzz.jpg


In [54]:
df_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149570 entries, 0 to 149569
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   isbn                 149570 non-null  object 
 1   book_title           149570 non-null  object 
 2   book_author          149569 non-null  object 
 3   year_of_publication  149570 non-null  float64
 4   publisher            149570 non-null  object 
 5   img_url              149570 non-null  object 
 6   language             82343 non-null   object 
 7   category             80719 non-null   object 
 8   summary              82343 non-null   object 
 9   img_path             149570 non-null  object 
dtypes: float64(1), object(9)
memory usage: 11.4+ MB


In [55]:
df_books.isna().mean()

isbn                   0.000000
book_title             0.000000
book_author            0.000007
year_of_publication    0.000000
publisher              0.000000
img_url                0.000000
language               0.449468
category               0.460326
summary                0.449468
img_path               0.000000
dtype: float64

In [56]:
df_books['publisher'].nunique()

11428

### (1) Publisher Number(pnumber) feature 새로 생성 : isbn 활용

In [23]:
# pnumber = the first three characters of the ISBN string
df_books['pnumber'] = df_books['isbn'].str[:3]

### (2) category 결측치 해결

- book author 이용해서 해결 (같은 작가가 쓴 책의 카테고리는 동일할 것으로 예상)

In [24]:
# Normalise existing categories: collapse runs of non-word characters/underscores
# into a single space, lower-case and trim. The pattern is a raw string so `\W`
# is not parsed as an (invalid) string escape.
has_category = df_books['category'].notnull()
df_books.loc[has_category, 'category'] = df_books.loc[has_category, 'category'].apply(
    lambda x: re.sub(r'[\W_]+', ' ', x).lower().strip()
)

# Most frequent category per author; idxmax returns an (author, category) pair.
author_top_category = (
    df_books.groupby('book_author')['category'].value_counts()
    .groupby('book_author').idxmax()
    .apply(lambda key: key[-1])
)

# Fill missing categories from the author's most frequent category. Authors with
# no known category (or a missing author) map to NaN and stay missing.
df_books['category'] = df_books['category'].fillna(df_books['book_author'].map(author_top_category))

In [25]:
df_books['category'].isna().mean()

0.17736177040850437

- pnumber 이용해서 해결 (같은 출판사에서 나온 책의 카테고리는 동일할 것으로 예상)

In [26]:
# Most frequent category per pnumber; idxmax returns a (pnumber, category) pair.
pnumber_top_category = (
    df_books.groupby('pnumber')['category'].value_counts()
    .groupby('pnumber').idxmax()
    .apply(lambda key: key[-1])
)

# Fill the categories that are still missing from the pnumber's most frequent
# category. A vectorised fillna/map replaces the row-wise apply, which relied on
# deprecated positional indexing of each row Series.
df_books['category'] = df_books['category'].fillna(df_books['pnumber'].map(pnumber_top_category))

In [27]:
df_books['category'].isna().mean()

0.0009360165808651468

- category가 한 단어로 기재된 경우로 통일

In [28]:
# Count how often each single-word category occurs.
words = defaultdict(int)
for value in df_books['category'].dropna().values:  # missing categories are skipped explicitly
    if len(value.split()) == 1:  # category written as exactly one word
        words[value] += 1
print(len(words))

# (count, category) pairs, most frequent first
categories = [(value, key) for key,value in words.items()]
categories.sort(reverse = True)

1270


In [29]:
df_books['category'].value_counts()

category
fiction                                            82556
juvenile fiction                                   10587
biography autobiography                             4534
history                                             2467
religion                                            2389
                                                   ...  
coasts                                                 1
electronic journals                                    1
romania                                                1
aeronautics military                                   1
authors canadian english 20th century biography        1
Name: count, Length: 4105, dtype: int64

In [30]:
# Tag every book whose category string contains a single-word category.
# `categories` is ordered most-frequent first, so when several single-word
# categories match the same book, the one assigned last (the least frequent) remains.
for _, category in categories:
    matches = df_books['category'].str.contains(category, na=False)
    df_books.loc[matches, 'category_high'] = category

### (3) language 결측치 해결

In [113]:
# ISBN prefixes grouped by length, built once instead of on every call.
_ISBN_PREFIXES_1 = ('0','1','2','3','4','5','7')
_ISBN_PREFIXES_2 = tuple(map(str,range(80,94)))
_ISBN_PREFIXES_3 = tuple(
    list(map(str,range(950,960)))
    + list(map(str,range(961,969)))
    + list(map(str,range(970,985)))
    + ['986','987']
)

def country_code(isbn:str):
    """Return the leading group prefix of an ISBN string, or NaN when unrecognised.

    The prefix is the first 1, 2 or 3 characters, depending on which prefix
    table the ISBN starts with. It is used to group books so that missing
    languages can be filled with the most frequent language of the group.
    """
    if isbn.startswith(_ISBN_PREFIXES_1):
        return isbn[0]
    elif isbn.startswith(_ISBN_PREFIXES_2):
        return isbn[:2]
    elif isbn.startswith(_ISBN_PREFIXES_3):
        return isbn[:3]
    else:
        # np.nan: the `np.NaN` alias no longer exists in NumPy 2.0
        return np.nan

# Derive the ISBN group prefix for every book.
df_books['country_code'] = df_books['isbn'].apply(country_code)


In [114]:
# Build a dict holding the most frequent language for each country code.
# The loop variable is named `code` so it does not overwrite the
# `country_code` function, which would break re-running the cell that applies it.
top_languages = {}
for code, group in df_books.groupby('country_code'):
    language_counts = group['language'].value_counts()
    if language_counts.empty:
        # no book with a known language in this group: nothing to learn from it
        continue
    top_languages[code] = language_counts.idxmax()
df_books['most_used_language'] = df_books['country_code'].map(top_languages)

In [115]:
# Fill missing languages with the most frequent language of the book's country code
language_missing = df_books['language'].isnull()
df_books.loc[language_missing, 'language'] = df_books.loc[language_missing, 'most_used_language']

In [116]:
# The helper columns are no longer needed.
df_books = df_books.drop(columns=['country_code', 'most_used_language'])
# Country codes that could not supply a language fall back to 'en', the most frequent language
df_books['language'] = df_books['language'].fillna('en')

- book author 이용해서 해결(같은 작가는 동일한 언어의 책을 낼 것으로 예상)

In [31]:
# Most frequent language per author; idxmax returns an (author, language) pair.
author_top_language = (
    df_books.groupby('book_author')['language'].value_counts()
    .groupby('book_author').idxmax()
    .apply(lambda key: key[-1])
)

# Fill missing languages from the author's most frequent language. The missing
# check is made on `language` itself (the previous row-wise test inspected
# `book_author` by mistake).
df_books['language'] = df_books['language'].fillna(df_books['book_author'].map(author_top_language))

- pnumber 이용해서 해결(같은 출판사는 동일한 언어의 책을 낼 것으로 예상)

In [32]:
# Most frequent language per pnumber; idxmax returns a (pnumber, language) pair.
pnumber_top_language = (
    df_books.groupby('pnumber')['language'].value_counts()
    .groupby('pnumber').idxmax()
    .apply(lambda key: key[-1])
)

# Fill the languages that are still missing from the pnumber's most frequent
# language. A vectorised fillna/map replaces the row-wise apply, which relied on
# deprecated positional indexing of each row Series.
df_books['language'] = df_books['language'].fillna(df_books['pnumber'].map(pnumber_top_language))

In [33]:
df_books['language'].isna().mean()

0.001785117336364244

### (4) year_of_publication binning

In [34]:
def binning_year(x):
    """Map a publication year to a bucket label.

    Years below 1970 get '1970', 1970-1979 get '1980', 1980-1989 get '1990'
    and 1990-1999 get '2000'. Everything else, including NaN, gets 'Early'.
    """
    upper_bounds = ((1970, '1970'), (1980, '1980'), (1990, '1990'), (2000, '2000'))
    for upper, label in upper_bounds:
        if x < upper:
            return label
    return 'Early'
    
# Bucket every publication year.
df_books['binning_year'] = df_books['year_of_publication'].map(binning_year)

### (5) book_author cleaning

In [35]:
# Missing authors become 'unknown', then everything except ASCII letters and
# digits is stripped. The result is assigned back to the column: an in-place
# fillna on a column selection is chained assignment and does not reliably
# update the frame.
df_books['book_author'] = (
    df_books['book_author']
    .fillna('unknown')
    .apply(lambda x : re.sub(r'[^a-zA-Z0-9]', '', x))
)

In [36]:
# Drop the raw columns that are either unused or replaced by derived features.
df_books = df_books.drop(
    columns=['summary', 'img_path', 'img_url', 'year_of_publication', 'publisher', 'category']
)

In [37]:
# Any value still missing in the books table becomes 'unknown'.
df_books = df_books.fillna('unknown')

### 최종 데이터 처리 확인

In [39]:
df_users.isna().sum()

user_id             0
location_city       0
location_state      0
location_country    0
binning_age         0
dtype: int64

In [40]:
df_books.isna().sum()

isbn             0
book_title       0
book_author      0
language         0
pnumber          0
category_high    0
binning_year     0
dtype: int64

In [41]:
# Persist the preprocessed users table next to the raw data (index column omitted)
df_users.to_csv(data_path+'/users_preprocessed.csv',index=False)

In [42]:
# Persist the preprocessed books table next to the raw data (index column omitted)
df_books.to_csv(data_path+'/books_preprocessed.csv',index=False)

### train_ratings와 파일 병합

In [43]:
# Attach book features, then user features, to every training rating.
# Left joins keep every rating even when the book or user has no match.
df_train_temp = pd.merge(df_train_ratings, df_books, on='isbn', how='left')
df_train = pd.merge(df_train_temp, df_users, on='user_id', how='left')
print('merge 결과 shape: ', df_train.shape)

merge 결과 shape:  (306795, 13)


In [44]:
df_train.columns

Index(['user_id', 'isbn', 'rating', 'book_title', 'book_author', 'language',
       'pnumber', 'category_high', 'binning_year', 'location_city',
       'location_state', 'location_country', 'binning_age'],
      dtype='object')

In [45]:
# Feature columns that can be empty after the left joins; fill them with 'unknown'.
columns = ['book_author','language','binning_year','category_high','binning_age','location_city','location_state','location_country','book_title']
df_train[columns] = df_train[columns].fillna(value='unknown')

In [46]:
# Persist the merged training table next to the raw data (index column omitted)
df_train.to_csv(data_path+'/train_preprocessed.csv',index=False)

### 모델에 data feeding 하기 위한 부분

In [None]:
# Feature matrix: every merged column except the target.
X = df_train.drop('rating', axis = 1)
# Positions of the object-dtype columns, passed to CatBoost as categorical features.
categorical_features_indices = np.where(X.dtypes == object)[0]
print(categorical_features_indices)
# Target vector, taken from the same frame as X (`data` is not defined anywhere in this notebook).
y = df_train['rating']

In [None]:
def rmse(y, y_pred):
    """Root mean squared error between true values ``y`` and predictions ``y_pred``.

    Both arguments are array-likes of the same shape. Implemented with numpy so
    it needs no extra import and does not depend on the ``squared`` argument of
    scikit-learn's ``mean_squared_error``.
    """
    errors = np.asarray(y, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(errors ** 2))

### catboost

In [None]:
# Fit CatBoost on the full training set, report the training RMSE, a 5-fold
# cross-validated RMSE, and the feature importances.
# NOTE(review): StratifiedKFold, CatBoostRegressor, cross_val_score and make_scorer
# are not imported in the import cell of this notebook — presumably from
# scikit-learn / catboost; add the imports before running on a fresh kernel.
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
parms = {'verbose' : 0, 'cat_features' : categorical_features_indices}
# Alternative parameter sets that were tried (GPU training / tuned hyper-parameters):
# parms = {'verbose' : 0, 'cat_features' : categorical_features_indices, 'task_type' : 'GPU'}
# parms = {'verbose' : 0, 'cat_features' : categorical_features_indices, 'task_type' : 'GPU', 'learning_rate': 0.04574578205475402, 'bagging_temperature': 0.12172958098369972, 'n_estimators': 8459, 'max_depth': 8, 'random_strength': 28, 'l2_leaf_reg': 1.6285455533915874e-05, 'min_child_samples': 18, 'max_bin': 441, 'od_type': 'Iter'}
# parms = {'verbose' : 0, 'cat_features' : categorical_features_indices, 'task_type' : 'GPU', 'learning_rate': 0.10952662748632554, 'bagging_temperature': 0.03613894271216528, 'n_estimators': 3629, 'max_depth': 8, 'random_strength': 46, 'l2_leaf_reg': 2.355742708217648e-05, 'min_child_samples': 24, 'max_bin': 354, 'od_type': 'IncToDec'}
creg = CatBoostRegressor(**parms)
creg.fit(X, y)
# Predicting on the data the model was fitted on: this is the training error.
y_pred = creg.predict(X)
print(creg.__class__.__name__)
print(rmse(y, y_pred))

# NOTE(review): the value printed as 'val acc' is the mean of the per-fold rmse scores, not an accuracy.
creg_val = cross_val_score(creg, X, y, cv = cv, scoring = make_scorer(rmse))
print('val acc :', creg_val.mean())

# (importance, feature name) pairs, most important first
result = list(zip(creg.get_feature_importance(), X.columns))
result.sort(reverse = True)
print(result)

In [None]:
# Training RMSE of the fitted CatBoost model (same computation as in the previous cell)
y_pred = creg.predict(X)
print(creg.__class__.__name__)
print(rmse(y, y_pred))

### LGBM

In [None]:
# Fit LightGBM with default parameters, report training RMSE and cross-validated RMSE.
# NOTE(review): LGBMRegressor is not imported in this notebook, and X still holds
# object-dtype columns at this point — presumably an encoding step is missing; confirm.
lreg = LGBMRegressor()
lreg.fit(X, y)
# Predicting on the data the model was fitted on: this is the training error.
y_pred = lreg.predict(X)
print(lreg.__class__.__name__)
print(rmse(y, y_pred))

# NOTE(review): 'val acc' is the mean of the per-fold rmse scores, not an accuracy.
lreg_val = cross_val_score(lreg, X, y, cv = cv, scoring = make_scorer(rmse))
print('val acc :', lreg_val.mean())

# result = list(zip(lreg.get_feature_importance(), X.columns))
# result.sort(reverse = True)
# print(result)

### XGBoost

In [None]:
# Fit XGBoost with a squared-error objective, report training RMSE and cross-validated RMSE.
# NOTE(review): XGBRegressor is not imported in this notebook, and X still holds
# object-dtype columns at this point — presumably an encoding step is missing; confirm.
xreg = XGBRegressor(objective='reg:squarederror')
xreg.fit(X, y)
# Predicting on the data the model was fitted on: this is the training error.
y_pred = xreg.predict(X)
print(xreg.__class__.__name__)
print(rmse(y, y_pred))

# NOTE(review): 'val acc' is the mean of the per-fold rmse scores, not an accuracy.
xreg_val = cross_val_score(xreg, X, y, cv = cv, scoring = make_scorer(rmse))
print('val acc :', xreg_val.mean())

# result = list(zip(xreg.get_feature_importance(), X.columns))
# result.sort(reverse = True)
# print(result)

In [None]:
# Keep the merge key in the same dtype as df_users['user_id']. Casting it to str
# while the users table keeps its original dtype makes the merge on user_id fail.
df_test_ratings['user_id'] = df_test_ratings['user_id'].astype(df_users['user_id'].dtype)

In [None]:
df_test_ratings.head()

In [None]:
# Attach book features, then user features, to every test rating (left joins keep all rows).
df_test = pd.merge(df_test_ratings, df_books, on='isbn', how='left')
df_test = pd.merge(df_test, df_users, on='user_id', how='left')
print('merge 결과 shape: ', df_test.shape)
df_test.head()

In [None]:
# Use exactly the training columns, in the training order, so the test features
# line up with the matrix the models were fitted on. `.copy()` makes the result an
# independent frame, so the assignment below does not write into a slice.
df_test = df_test[df_train.columns].copy()

# Same placeholder as the training set ('unknown'); a different token would be a
# category value the models never saw during fitting.
columns = ['book_author','language','binning_year','category_high','binning_age','location_city','location_state','location_country','book_title']
df_test[columns] = df_test[columns].fillna(value='unknown')

In [None]:
# Test feature matrix (target column removed) and the CatBoost predictions for it.
X_test = df_test.drop(columns='rating')
c_pred = creg.predict(X_test)

In [None]:
# NOTE(review): `cat_encoder` is not defined in any cell of this notebook — confirm
# where it is created and fitted before running this cell.
X_test = cat_encoder.transform(X_test)
l_pred = lreg.predict(X_test)
x_pred = xreg.predict(X_test)

# Weighted blend of the LightGBM, CatBoost and XGBoost predictions
y_pred = 0.4 * l_pred + 0.3 * c_pred + 0.3 * x_pred

In [None]:
# NOTE(review): this stores the CatBoost-only predictions (`c_pred`), not the blended
# `y_pred` — confirm this is intended. Assumes the rows of c_pred are still in
# df_test_ratings order after the left merges — TODO confirm.
df_test_ratings['rating'] = c_pred

In [None]:
# Write the submission file next to the raw data (index column omitted)
df_test_ratings.to_csv(data_path + '/submission.csv', index=False)