In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import sys

from tqdm import tqdm
import re

from scipy.stats import mode

import warnings
warnings.filterwarnings('ignore')

In [2]:
def age_map(x: int) -> int:
    x = int(x)
    if x < 20:
        return 1
    elif x >= 20 and x < 30:
        return 2
    elif x >= 30 and x < 40:
        return 3
    elif x >= 40 and x < 50:
        return 4
    elif x >= 50 and x < 60:
        return 5
    else:
        return 6

In [3]:
data_path = "/opt/ml/data/"

In [4]:
users = pd.read_csv(data_path + 'users.csv')
books = pd.read_csv(data_path + 'books.csv')
train = pd.read_csv(data_path + 'train_ratings.csv')
test = pd.read_csv(data_path + 'test_ratings.csv')

In [5]:
def mission_1_EDA(users, books):
    print('-'*20, 'Mission1 EDA Start', '-'*20)
    # user preprocessing
    users['location_city'] = users['location'].str.replace(r'[^0-9a-zA-Z:,]', '') # 특수문자 제거
    users['location_city'] = users['location'].apply(lambda x: x.split(',')[0])
    users['location_state'] = users['location'].apply(lambda x: x.split(',')[1])
    users['location_country'] = users['location'].apply(lambda x: x.split(',')[2])
    users = users.replace('na', np.nan) #특수문자 제거로 n/a가 na로 바뀌게 되었습니다. 따라서 이를 컴퓨터가 인식할 수 있는 결측값으로 변환합니다.
    users = users.replace('', np.nan) # 일부 경우 , , ,으로 입력된 경우가 있었으므로 이런 경우에도 결측값으로 변환합니다.
    

    # city는 있는데 country 없는 경우 채우기
    modify_location = users[(users['location_country'].isna())&(users['location_city'].notnull())]['location_city'].values

    location_list = []
    for location in tqdm(modify_location, desc='preprocessing...(1/4)'):
        try:
            right_location = users[(users['location'].str.contains(location))&(users['location_country'].notnull())]['location'].value_counts().index[0]
            location_list.append(right_location)
        except:
            pass

    for location in tqdm(location_list, desc='preprocessing...(2/4)'):
        users.loc[users[users['location_city']==location.split(',')[0]].index,'location_state'] = location.split(',')[1]
        users.loc[users[users['location_city']==location.split(',')[0]].index,'location_country'] = location.split(',')[2]

    # book preprocessing

    # 유명 출판사 표기 오류로 그룹화되지 못하는 케이스 처리
    publisher_dict=(books['publisher'].value_counts()).to_dict()
    publisher_count_df = pd.DataFrame(list(publisher_dict.items()),columns = ['publisher','count'])
    publisher_count_df = publisher_count_df.sort_values(by=['count'], ascending = False)

    modify_list = publisher_count_df[publisher_count_df['count']>1].publisher.values

    for publisher in tqdm(modify_list, desc='preprocessing...(3/4)'):
        try:
            number = books[books['publisher']==publisher]['isbn'].apply(lambda x: x[:4]).value_counts().index[0]
            right_publisher = books[books['isbn'].apply(lambda x: x[:4])==number]['publisher'].value_counts().index[0]
            books.loc[books[books['isbn'].apply(lambda x: x[:4])==number].index,'publisher'] = right_publisher
        except: 
            pass

    # category 대괄호 제거 및 소문자 변환
    books.loc[books[books['category'].notnull()].index, 'category'] = books[books['category'].notnull()]['category'].apply(lambda x: re.sub('[\W_]+',' ',x).strip())
    books['category'] = books['category'].str.lower()

    # 43개의 high-category로 묶기
    categories = ['garden','crafts','physics','adventure','music','fiction','nonfiction','science','science fiction','social','homicide',
                  'sociology','disease','religion','christian','philosophy','psycholog','mathemat','agricult','environmental',
                  'business','poetry','drama','literary','travel','motion picture','children','cook','literature','electronic',
                  'humor','animal','bird','photograph','computer','house','ecology','family','architect','camp','criminal','language','india']

    for category in tqdm(categories, desc='preprocessing...(4/4)'):
        books.loc[books[books['category'].str.contains(category,na=False)].index,'category_high'] = category

    # 5개 이하 항목 others로 묶기
    category_high_df = pd.DataFrame(books['category_high'].value_counts()).reset_index()
    category_high_df.columns = ['category','count']
    others_list = category_high_df[category_high_df['count']<5]['category'].values
    books.loc[books[books['category_high'].isin(others_list)].index, 'category_high']='others'

    # location은 이제 필요 없음
    users = users.drop(['location'], axis=1)
    print('-'*20, 'Mission1 EDA Done', '-'*20)
    return users, books

In [6]:
users_eda, books_eda = mission_1_EDA(users, books)

-------------------- Mission1 EDA Start --------------------


preprocessing...(1/4): 100%|██████████| 2097/2097 [00:56<00:00, 37.35it/s]
preprocessing...(2/4): 100%|██████████| 1948/1948 [00:18<00:00, 103.56it/s]
preprocessing...(3/4): 100%|██████████| 5276/5276 [02:28<00:00, 35.60it/s]
preprocessing...(4/4): 100%|██████████| 43/43 [00:02<00:00, 19.31it/s]


-------------------- Mission1 EDA Done --------------------


In [7]:
ratings = pd.concat([train, test]).reset_index(drop=True)

# 인덱싱 처리된 데이터 조인
context_df = ratings.merge(users_eda, on='user_id', how='left').merge(books_eda[['isbn', 'category', 'publisher', 'language', 'book_author']], on='isbn', how='left')
train_df = train.merge(users_eda, on='user_id', how='left').merge(books_eda[['isbn', 'category', 'publisher', 'language', 'book_author']], on='isbn', how='left')
test_df = test.merge(users_eda, on='user_id', how='left').merge(books_eda[['isbn', 'category', 'publisher', 'language', 'book_author']], on='isbn', how='left')

In [8]:
# 인덱싱 처리
loc_city2idx = {v:k for k,v in enumerate(context_df['location_city'].unique())}
loc_state2idx = {v:k for k,v in enumerate(context_df['location_state'].unique())}
loc_country2idx = {v:k for k,v in enumerate(context_df['location_country'].unique())}

train_df['location_city'] = train_df['location_city'].map(loc_city2idx)
train_df['location_state'] = train_df['location_state'].map(loc_state2idx)
train_df['location_country'] = train_df['location_country'].map(loc_country2idx)
test_df['location_city'] = test_df['location_city'].map(loc_city2idx)
test_df['location_state'] = test_df['location_state'].map(loc_state2idx)
test_df['location_country'] = test_df['location_country'].map(loc_country2idx)

In [9]:
train_df.shape

(306795, 11)

In [11]:
test_df.shape

(76699, 11)

In [10]:
context_city = context_df[['location_city', 'age']].copy()
context_city['location_city'] = context_city['location_city'].map(loc_city2idx)

In [12]:
age_replace_idx = np.where(context_city.groupby('location_city')['age'].count() > 1)[0]
age_replace = context_city.groupby('location_city')['age'].median()[age_replace_idx]
age_replace_df = pd.DataFrame({'location_city':age_replace.index, 'age_fillna':age_replace.values})

train_df = train_df.merge(age_replace_df, on='location_city', how='left')
train_df = train_df.drop(columns='age').rename(columns={'age_fillna':'age'})
train_df['age'] = train_df['age'].fillna(int(train_df['age'].mean()))
train_df['age'] = train_df['age'].apply(age_map)

test_df = test_df.merge(age_replace_df, on='location_city', how='left')
test_df = test_df.drop(columns='age').rename(columns={'age_fillna':'age'})
test_df['age'] = test_df['age'].fillna(int(test_df['age'].mean()))
test_df['age'] = test_df['age'].apply(age_map)

In [13]:
train_df.shape

(306795, 11)