In [3]:
import pandas as pd
import numpy as np
import os

In [6]:
# Directory that holds the input CSV files (relative path).
base_path = "../../data"
# Derive every CSV location from the shared base directory in one pass.
users_path, books_path, train_ratings_path, test_ratings_path = (
    os.path.join(base_path, csv_name)
    for csv_name in ('users.csv', 'books.csv', 'train_ratings.csv', 'test_ratings.csv')
)

In [7]:
# Load the raw tables from the CSV paths built in the configuration cell.
raw_users = pd.read_csv(users_path)
raw_books = pd.read_csv(books_path)
raw_ratings = pd.read_csv(train_ratings_path)
# raw_ratings and train_ratings come from the same file: copy the frame that is
# already in memory instead of parsing the CSV a second time. The copy is
# independent, so modifying one frame does not affect the other.
train_ratings = raw_ratings.copy()
test_ratings = pd.read_csv(test_ratings_path)

In [13]:
# Preview the first five rows of the raw books table to inspect its columns.
raw_books.head(n=5)

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path
0,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,en,['Actresses'],"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg
1,60973129,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,en,['1940-1949'],"Here, for the first time in paperback, is an o...",images/0060973129.01.THUMBZZZ.jpg
2,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,en,['Medical'],"Describes the great flu epidemic of 1918, an o...",images/0374157065.01.THUMBZZZ.jpg
3,399135782,The Kitchen God's Wife,Amy Tan,1991.0,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,en,['Fiction'],A Chinese immigrant who is convinced she is dy...,images/0399135782.01.THUMBZZZ.jpg
4,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000.0,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,en,['History'],"Essays by respected military historians, inclu...",images/0425176428.01.THUMBZZZ.jpg


In [8]:
def _split_location(location) :
    """Split a raw location string into cleaned ``[city, state, country]`` fields.

    The string is split on commas and the first three pieces are stripped of
    surrounding whitespace. A field that is absent, empty, or the literal
    ``'n/a'`` is returned as ``'unknown'``. A non-string location (e.g. NaN)
    yields three ``'unknown'`` fields instead of raising.
    """
    pieces = location.split(',') if isinstance(location, str) else []
    fields = [piece.strip() for piece in pieces[:3]]
    # Pad short locations so that indexing state/country can never fail.
    fields += [''] * (3 - len(fields))
    return ['unknown' if field in ('', 'n/a') else field for field in fields]


def users_preprocess(raw_users) :
    """Build the preprocessed users table from the raw users frame.

    Steps:
      * ``age``: missing values are imputed with the column median.
      * ``age_bin``: ``age`` is bucketed into 8 left-closed bins labelled 0-7
        (edges 0, 10, ..., 70, 100). Ages outside ``[0, 100)`` get a missing
        ``age_bin``.
      * ``location`` is split on commas into ``city`` / ``state`` / ``country``;
        missing, empty or ``'n/a'`` fields become ``'unknown'``. The
        ``location`` column itself is dropped.
      * ``state`` and ``country`` are then replaced, for every row, by the
        value most frequently seen for that row's ``city``.

    Parameters
    ----------
    raw_users : pandas.DataFrame
        Must contain the columns ``age`` and ``location``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``age_bin``, ``city``, ``state`` and ``country`` added
        and ``location`` removed.
    """
    users = raw_users.copy()

    # age (median impute)
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: the chained in-place form does not reliably update the
    # parent frame under pandas Copy-on-Write.
    users['age'] = users['age'].fillna(users['age'].median())
    bins = [0, 10, 20, 30, 40, 50, 60, 70, 100]
    users['age_bin'] = pd.cut(x=users['age'], bins=bins, right=False, labels=range(8))

    # location
    # Only the location-derived columns are normalised to 'unknown'; a
    # frame-wide fillna('unknown') would also target the categorical age_bin
    # column, which cannot hold that value.
    parsed = [_split_location(location) for location in users['location']]
    for position, column in enumerate(['city', 'state', 'country']):
        users[column] = [fields[position] for fields in parsed]
    users = users.drop(columns=['location'])

    # Replace state/country by the most frequent value observed for each city.
    # sort_values() orders the (city, value) pairs by ascending count, so when
    # dict() consumes them the most frequent value for a city is written last
    # and wins.
    # NOTE(review): 'unknown' takes part in this vote like any other value,
    # which matches the previous behaviour - confirm that this is intended.
    for column in ('state', 'country'):
        city_value_map = dict(users.groupby('city')[column]
                              .value_counts().sort_values().index.tolist())
        users[column] = users['city'].map(city_value_map)

    return users

In [None]:
# def isbn_area(isbn) :
#     if isbn[0] in ('0', '1') : # English
#         return '1'
#     if isbn[0] in ('2', '3', '4', '5', '7', '8') : # French, German, Japan, USSR, China
#         return isbn[0]
#     # No ISBN starts with 6
#     if isbn[0] == '8' :
#         return isbn[:2]
#     if isbn[0] == '9' :
#         if int(isbn[:2]) < 95 :
#             return isbn[:2]
#         if int(isbn[:2]) < 99 :
#             return isbn[:3]
#         else :
#             return isbn[:4]
#     else :
#         return 'others'

In [9]:
def isbn_area(isbn) :
    """Return a coarse registration-area label from the first character of an ISBN.

    '0' and '1' (English) both map to '1'; '2', '3' and '8' (the major groups)
    map to themselves; every other leading character maps to 'others'.
    """
    leading = isbn[0]
    if leading == '0' or leading == '1' :  # English
        return '1'
    # No ISBN starts with 6, so it simply falls into 'others'.
    return leading if leading in ('2', '3', '8') else 'others'  # major

In [10]:
def _isbn_from_img_url(img_url, fallback) :
    """Return the (up to) 10 characters that follow 'P/' in ``img_url``.

    When ``img_url`` is not a string (e.g. NaN after a left merge) or does not
    contain 'P/', ``fallback`` is returned unchanged instead of raising.
    """
    if isinstance(img_url, str) and 'P/' in img_url:
        return img_url.split('P/')[1][:10]
    return fallback


def _most_frequent_map(frame, key, value) :
    """Map each ``key`` value to the ``value`` most frequently seen with it.

    The (key, value) pairs are sorted by ascending count, so when dict()
    consumes them the most frequent value for a key is written last and wins.
    Missing values are not counted.
    """
    counts = frame.groupby(key)[value].value_counts().sort_values()
    return dict(counts.index.tolist())


def books_ratings_preprocess(raw_books, raw_ratings) :
    """Build the preprocessed books and ratings tables.

    Steps:
      * ``isbn`` (books and ratings) is rebuilt from the 10 characters that
        follow 'P/' in ``img_url``; rows without a usable ``img_url`` keep
        their existing ``isbn``.
      * ``book_author``, ``publisher`` and ``category`` are lower-cased and
        stripped of every non-alphanumeric character.
      * ``year_of_publication`` is bucketed into 8 left-closed bins labelled
        0-7 (edges 0, 1900, 1950, ..., 2000, 2010); years outside
        ``[0, 2010)`` become missing.
      * ``category`` is replaced by the author's most frequent category, then
        by the publisher's most frequent category where still missing, then by
        ``'na'``.
      * ``major_cat`` starts as ``category`` and is overwritten by each keyword
        of the keyword list that the category contains; later keywords take
        precedence over earlier ones.
      * ``isbn_area`` is derived from the rebuilt ISBN with ``isbn_area``.

    Parameters
    ----------
    raw_books : pandas.DataFrame
        Must contain ``isbn``, ``book_title``, ``book_author``,
        ``year_of_publication``, ``publisher``, ``img_url``, ``category``,
        ``summary`` and ``img_path``. It is not modified.
    raw_ratings : pandas.DataFrame
        Must contain ``isbn``. It is not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``books`` (without ``book_title``, ``img_url``, ``category``,
        ``summary`` and ``img_path``) and ``ratings`` (with the rebuilt
        ``isbn``).
    """
    books = raw_books.copy()
    ratings = raw_ratings.merge(raw_books[['isbn', 'img_url']], how='left', on='isbn')

    # isbn
    # A rating whose isbn is absent from raw_books has no img_url after the
    # left merge; such rows keep their original isbn instead of crashing.
    ratings['isbn'] = [_isbn_from_img_url(img_url, isbn)
                       for img_url, isbn in zip(ratings['img_url'], ratings['isbn'])]
    books['isbn'] = [_isbn_from_img_url(img_url, isbn)
                     for img_url, isbn in zip(books['img_url'], books['isbn'])]

    # book_author / publisher / category: lower-case, keep only [0-9a-zA-Z]
    for column in ('book_author', 'publisher', 'category'):
        books[column] = (books[column].str.lower()
                         .str.replace('[^0-9a-zA-Z]', '', regex=True))

    # year_of_publication
    bins = [0, 1900, 1950, 1960, 1970, 1980, 1990, 2000, 2010]
    books['year_of_publication'] = pd.cut(x=books['year_of_publication'],
                                          bins=bins, right=False, labels=range(8))

    # category
    # The publisher map is built after the author-based replacement, so it
    # reflects the author-level categories.
    books['category'] = books['book_author'].map(
        _most_frequent_map(books, 'book_author', 'category'))
    publisher_cat_map = _most_frequent_map(books, 'publisher', 'category')
    # Assign the result back instead of fillna(inplace=True) on the column
    # selection, which does not reliably update the frame under Copy-on-Write.
    books['category'] = (books['category']
                         .fillna(books['publisher'].map(publisher_cat_map))
                         .fillna('na'))
    # Keyword order matters: a later match overwrites an earlier one.
    # 'languageart' is written without a space because categories have already
    # been stripped to alphanumerics; the former 'language art' never matched.
    major_cat = ['fiction', 'juvenilefiction', 'juvenilenonfiction', 'biography',
            'histor', 'religio', 'science', 'social', 'politic', 'humor',
            'spirit', 'business', 'cook', 'health', 'famil', 'computer',
            'travel', 'self', 'poet', 'language', 'art', 'languageart',
            'literary', 'criticism', 'nature', 'philosoph', 'reference', 'drama',
            'sport', 'transportation', 'comic', 'craft', 'education', 'crime',
            'music', 'animal', 'garden', 'detective', 'house', 'tech', 'photograph',
            'adventure', 'game', 'architect', 'law', 'antique', 'friend',
            'sciencefiction', 'fantasy', 'mathematic', 'design', 'actor',
            'horror', 'adultery']
    books['major_cat'] = books['category'].copy()
    for category in major_cat :
        matches = books['category'].str.contains(category, regex=False)
        books.loc[matches, 'major_cat'] = category

    # summary
    # NOTE(review): this has-summary flag is dropped again below, so it never
    # reaches the returned frame - confirm whether 'summary' should be kept.
    books['summary'] = np.where(books['summary'].notnull(), 1, 0)

    # isbn_area
    books['isbn_area'] = books['isbn'].apply(isbn_area)

    ratings = ratings.drop(columns=['img_url'])
    books = books.drop(columns=['book_title', 'img_url', 'category', 'summary', 'img_path'])

    return books, ratings

In [11]:
# Run both preprocessing pipelines on the raw tables loaded earlier.
users = users_preprocess(raw_users=raw_users)
books, ratings = books_ratings_preprocess(raw_books=raw_books, raw_ratings=raw_ratings)

In [14]:
# Write the preprocessed books table to the current working directory, without the index column.
books.to_csv(path_or_buf='books.csv', index=False)

In [15]:
# Write the preprocessed ratings table to the current working directory, without the index column.
ratings.to_csv(path_or_buf="ratings.csv", index=False)