In [8]:
from types import SimpleNamespace

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, Dataset

In [18]:
# Notebook-wide settings, kept in one place so they are easy to tune.

# Directory prefix for the CSV inputs; it is joined by plain string
# concatenation, so the trailing slash is required.
DATA_PATH = "data/"

# Reproducibility and split/batching knobs.
SEED = 42
TEST_SIZE = 0.1
BATCH_SIZE = 1024
DATA_SHUFFLE = True

In [9]:
# Load every input table from DATA_PATH (defined in the configuration cell)
# instead of repeating the directory literal on each line.
users = pd.read_csv(DATA_PATH + 'users_preprocessed.csv')
books = pd.read_csv(DATA_PATH + 'books_merged.csv')
train = pd.read_csv(DATA_PATH + 'train_ratings.csv')
test = pd.read_csv(DATA_PATH + 'test_ratings.csv')
sub = pd.read_csv(DATA_PATH + 'sample_submission.csv')

In [10]:
# Peek at one randomly chosen user row.
users.sample(n=1)

Unnamed: 0,user_id,age,location_city,location_state,location_country
59744,262258,34.0,ohio_others,ohio,usa


In [11]:
# Peek at one randomly chosen book row.
books.sample(n=1)

Unnamed: 0,isbn,book_title,year_of_publication,publisher,img_url,language,summary,img_path,category_high,book_author,category,new_language,remove_country_code,book_author_over3,book_author_over5,book_author_over10,book_author_over50,book_author_over100
22678,671754254,MY TEACHER GLOWS IN THE DARK (RACK SIZE),1991.0,Aladdin,http://images.amazon.com/images/P/0671754254.0...,,,images/0671754254.01.THUMBZZZ.jpg,,Bruce Coville,,en,671754254,Bruce Coville,Bruce Coville,Bruce Coville,others,others


In [19]:
import inspect

def preprocess_age(users:pd.DataFrame):
    """Fill missing ``age`` values with the mean age over all users.

    Parameters
    ----------
    users : pd.DataFrame
        User table; must contain an ``age`` column.

    Returns
    -------
    pd.DataFrame
        A new frame with the same columns as ``users`` in which missing ages
        are replaced by the column mean. If every age is missing the mean is
        NaN and the column is left unchanged. The input frame is not modified.

    Raises
    ------
    Exception
        If ``users`` is not a ``pd.DataFrame``.
    """
    if not isinstance(users, pd.DataFrame):
        raise Exception(f"Error at {inspect.currentframe().f_code.co_name}\nnot pd.DataFrame")

    # Return the whole frame (not just the filled column) so the result can be
    # passed straight on to other DataFrame-based preprocessing steps.
    filled_age = users['age'].fillna(users['age'].mean())
    return users.assign(age=filled_age)

def preprocess_location(users:pd.DataFrame):
    """Fill missing location columns with the most frequent matching value.

    The columns are filled coarse-to-fine:

    1. ``location_country``: the most frequent country overall.
    2. ``location_state``: the most frequent state within the row's country.
    3. ``location_city``: the most frequent city for the row's
       (country, state) pair, then for its state alone, then for its country
       alone.
    4. Any state/city still missing: the most frequent value of that column
       overall.

    A column that has no non-missing value at all is left as is instead of
    raising.

    Parameters
    ----------
    users : pd.DataFrame
        Must contain ``location_country``, ``location_state`` and
        ``location_city``.

    Returns
    -------
    pd.DataFrame
        The same ``users`` object; it is modified in place.

    Raises
    ------
    Exception
        If ``users`` is not a ``pd.DataFrame``.
    """
    if not isinstance(users, pd.DataFrame):
        raise Exception(f"Error at {inspect.currentframe().f_code.co_name}\nnot pd.DataFrame")

    def first_mode(values):
        # Most frequent non-null value, or None when there is none.
        # Series.mode returns ties in sorted order, so the first entry is stable.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else None

    def mode_table(keys, target):
        # {group key -> most frequent `target`} built only from rows where the
        # keys and the target are all present, so every group has a mode.
        known = users.dropna(subset=[*keys, target])
        if known.empty:
            return {}
        grouper = keys[0] if len(keys) == 1 else keys
        return known.groupby(grouper)[target].agg(first_mode).to_dict()

    # 1) country: global mode
    top_country = first_mode(users['location_country'])
    if top_country is not None:
        users['location_country'] = users['location_country'].fillna(top_country)

    # 2) state: mode within the row's country
    missing_state = users['location_state'].isna()
    if missing_state.any():
        state_by_country = mode_table(['location_country'], 'location_state')
        users.loc[missing_state, 'location_state'] = [
            state_by_country.get(country, np.nan)
            for country in users.loc[missing_state, 'location_country']
        ]

    # 3) city: most specific lookup first; tables are built after the state
    #    fill so that freshly filled states take part in the grouping.
    missing_city = users['location_city'].isna()
    if missing_city.any():
        city_by_country_state = mode_table(['location_country', 'location_state'], 'location_city')
        city_by_state = mode_table(['location_state'], 'location_city')
        city_by_country = mode_table(['location_country'], 'location_city')

        def resolve_city(country, state):
            candidates = (
                (city_by_country_state, (country, state)),
                (city_by_state, state),
                (city_by_country, country),
            )
            for table, key in candidates:
                if key in table:
                    return table[key]
            return np.nan

        needs_city = users.loc[missing_city, ['location_country', 'location_state']]
        users.loc[missing_city, 'location_city'] = [
            resolve_city(country, state)
            for country, state in zip(needs_city['location_country'], needs_city['location_state'])
        ]

    # 4) users from countries with no usable state/city information at all
    for column in ('location_state', 'location_city'):
        overall = first_mode(users[column])
        if overall is not None:
            users[column] = users[column].fillna(overall)

    return users

In [20]:
# 책 전처리
import inspect

def preprocess_publisher(books:pd.DataFrame):
    """Unify publisher names using the first four characters of the ISBN.

    For every publisher name that occurs more than once (most frequent first):
    find the ISBN prefix most common among its books, then relabel every book
    with that prefix to the publisher name most common under the prefix.
    Publishers are processed sequentially, so a name that an earlier step has
    already relabelled away is skipped.

    Parameters
    ----------
    books : pd.DataFrame
        Must contain ``isbn`` and ``publisher`` columns. ISBNs are converted
        to strings before slicing, so a numeric ``isbn`` column also works.

    Returns
    -------
    pd.DataFrame
        The same ``books`` object; its ``publisher`` column is updated in
        place.

    Raises
    ------
    Exception
        If ``books`` is not a ``pd.DataFrame``.
    """
    if not isinstance(books, pd.DataFrame):
        raise Exception(f"Error at {inspect.currentframe().f_code.co_name}\nnot pd.DataFrame")

    # The prefix depends only on `isbn`, which never changes, so compute it once.
    isbn_prefix = books['isbn'].astype(str).str[:4]

    publisher_counts = books['publisher'].value_counts()
    repeated_publishers = publisher_counts[publisher_counts > 1].index

    for publisher in repeated_publishers:
        owned = books['publisher'] == publisher
        if not owned.any():
            # Already merged into another name by an earlier iteration.
            continue
        dominant_prefix = isbn_prefix[owned].value_counts().index[0]
        same_prefix = isbn_prefix == dominant_prefix
        canonical_name = books.loc[same_prefix, 'publisher'].value_counts().index[0]
        books.loc[same_prefix, 'publisher'] = canonical_name

    return books

In [23]:
def users2idx(context_df, train_df, test_df, feature2idx:dict=None):
    """Bucket ages and integer-encode the location columns of train/test.

    ``age`` in ``train_df`` and ``test_df`` is filled with that frame's own
    truncated mean age and then replaced by a decade bucket (1..6).
    ``location_city``, ``location_state`` and ``location_country`` are
    replaced by integer codes; the code of a value is its position among the
    unique values of the same column in ``context_df``.

    Parameters
    ----------
    context_df : pd.DataFrame
        Frame whose location columns define the vocabularies.
    train_df, test_df : pd.DataFrame
        Frames to encode. Both are modified in place.
    feature2idx : dict, optional
        Dictionary that receives the vocabularies under the keys
        ``'loccity2idx'``, ``'locstate2idx'`` and ``'loccountry2idx'``.
        A new dictionary is created when omitted.

    Returns
    -------
    tuple
        ``(feature2idx, train_df, test_df)``.

    Raises
    ------
    Exception
        If any of the three frames is not a ``pd.DataFrame``.
    """
    def age_map(x: int) -> int:
        # Decade buckets: <20 -> 1, 20s -> 2, 30s -> 3, 40s -> 4, 50s -> 5, 60+ -> 6.
        x = int(x)
        for bucket, upper_bound in enumerate((20, 30, 40, 50, 60), start=1):
            if x < upper_bound:
                return bucket
        return 6

    if not isinstance(context_df, pd.DataFrame) or not isinstance(train_df, pd.DataFrame) or \
        not isinstance(test_df, pd.DataFrame):
        raise Exception(f"Error at {inspect.currentframe().f_code.co_name}\nnot pd.DataFrame")

    if feature2idx is None:
        feature2idx = {}

    for frame in (train_df, test_df):
        ages = frame['age']
        # An empty (or all-missing) column has a NaN mean, which int() cannot
        # convert; only fill when there is something to average.
        if ages.notna().any():
            ages = ages.fillna(int(ages.mean()))
        frame['age'] = ages.apply(age_map)

    for suffix in ['city', 'state', 'country']:
        idx_name = 'loc' + suffix + '2idx'
        column = 'location_' + suffix

        feature2idx[idx_name] = {value: code for code, value in enumerate(context_df[column].unique())}
        train_df[column] = train_df[column].map(feature2idx[idx_name])
        test_df[column] = test_df[column].map(feature2idx[idx_name])

    return feature2idx, train_df, test_df

In [24]:
def books2idx(context_df, train_df, test_df, features_name:list, feature2idx:dict):
    """Integer-encode the given feature columns of the train and test frames.

    For each name in ``features_name`` a vocabulary is built from the unique
    values of that column in ``context_df`` (code = position of first
    appearance), stored in ``feature2idx`` under ``'<name>2idx'``, and used to
    replace the column in ``train_df`` and ``test_df``. Both frames and
    ``feature2idx`` are modified in place.

    Returns ``(feature2idx, train_df, test_df)``; raises ``Exception`` when
    one of the three frames is not a ``pd.DataFrame``.
    """
    all_frames = all(
        isinstance(candidate, pd.DataFrame)
        for candidate in (context_df, train_df, test_df)
    )
    if not all_frames:
        raise Exception(f"Error at {inspect.currentframe().f_code.co_name}\nnot pd.DataFrame")

    for column in features_name:
        vocabulary_key = column + '2idx'
        distinct_values = context_df[column].unique()
        vocabulary = dict(zip(distinct_values, range(len(distinct_values))))
        feature2idx[vocabulary_key] = vocabulary
        for frame in (train_df, test_df):
            frame[column] = frame[column].map(vocabulary)

    return feature2idx, train_df, test_df

In [25]:
def process_context_data(users, books, ratings1, ratings2, features_name:list):
    """Join ratings with user/book context and integer-encode the context.

    Steps: clean ``books`` (``preprocess_publisher``) and ``users``
    (``preprocess_age`` then ``preprocess_location``), left-join both onto the
    ratings, then encode the user columns with ``users2idx`` and the book
    columns with ``books2idx``.

    Parameters
    ----------
    users : pd.DataFrame
        Needs ``user_id, age, location_city, location_state,
        location_country``; other columns are ignored.
    books : pd.DataFrame
        Needs ``isbn`` plus every column named in ``features_name``.
    ratings1, ratings2 : pd.DataFrame
        Rating tables (train and test) with ``user_id`` and ``isbn``. Their
        union defines the vocabulary of every encoded column.
    features_name : list
        Book columns to attach and encode. ``'isbn'`` is always used as the
        join key and is never encoded here, whether or not it is listed.

    Returns
    -------
    tuple
        ``(idx, train_df, test_df)``: the vocabularies keyed ``'<name>2idx'``
        and the encoded frames built from ``ratings1`` and ``ratings2``.
        Column order is the ratings columns, then ``age, location_city,
        location_state, location_country``, then the book features in the
        order given.
    """
    # Clean the publisher column.
    books = preprocess_publisher(books)

    # Fill missing ages and locations.
    users = preprocess_location(preprocess_age(users))

    # Only these user columns are context features; selecting them explicitly
    # keeps stray columns (e.g. a saved index column) out of the result.
    user_columns = ['user_id', 'age', 'location_city', 'location_state', 'location_country']
    # 'isbn' is the join key, not a feature; drop duplicates but keep order.
    book_features = [name for name in dict.fromkeys(features_name) if name != 'isbn']

    user_context = users[user_columns]
    book_context = books[['isbn'] + book_features]

    def attach_context(ratings):
        # Left joins keep every rating row, even without a user/book match.
        return (
            ratings
            .merge(user_context, on='user_id', how='left')
            .merge(book_context, on='isbn', how='left')
        )

    ratings = pd.concat([ratings1, ratings2]).reset_index(drop=True)
    context_df = attach_context(ratings)
    train_df = attach_context(ratings1)
    test_df = attach_context(ratings2)

    # Encode user features; start from an empty vocabulary dictionary.
    idx, train_df, test_df = users2idx(context_df, train_df, test_df, {})

    # Encode book features into the same dictionary.
    idx, train_df, test_df = books2idx(context_df, train_df, test_df, book_features, idx)

    return idx, train_df, test_df

In [13]:
def dl_data_load(args):
    """Read the CSV inputs and build the encoded train/test tables.

    Parameters
    ----------
    args : object
        Only ``args.ADD_CONTEXT`` is read: the list of ``books`` columns that
        ``process_context_data`` attaches as context features.

    Returns
    -------
    dict
        ``train`` / ``test``: context-enriched, integer-encoded rating frames
        (``test`` without ``rating``); ``field_dims``: ``uint32`` array with
        the number of users, the number of ISBNs, 6 for age, then the size of
        every vocabulary returned by ``process_context_data``; ``users``,
        ``books``, ``sub``; and the id mappings ``idx2user``, ``idx2isbn``,
        ``user2idx``, ``isbn2idx``.
    """
    features_name = args.ADD_CONTEXT

    users = pd.read_csv(DATA_PATH + 'users_preprocessed.csv')
    books = pd.read_csv(DATA_PATH + 'books_merged.csv')
    train = pd.read_csv(DATA_PATH + 'train_ratings.csv')
    test = pd.read_csv(DATA_PATH + 'test_ratings.csv')
    sub = pd.read_csv(DATA_PATH + 'sample_submission.csv')

    # Every user / every book that appears in train or in the submission file.
    ids = pd.concat([train['user_id'], sub['user_id']]).unique()
    isbns = pd.concat([train['isbn'], sub['isbn']]).unique()
    idx2user = dict(enumerate(ids))
    idx2isbn = dict(enumerate(isbns))
    user2idx = {user_id: position for position, user_id in idx2user.items()}
    isbn2idx = {isbn: position for position, isbn in idx2isbn.items()}

    # Build the context features while user_id / isbn still hold the raw keys:
    # `users` and `books` are keyed by the raw values, so joining after the
    # ratings were re-indexed would match the wrong rows.
    idx, context_train, context_test = process_context_data(users, books, train, test, features_name)

    # Now replace the raw keys by their dense indices.
    for frame in (context_train, context_test, sub):
        frame['user_id'] = frame['user_id'].map(user2idx)
        frame['isbn'] = frame['isbn'].map(isbn2idx)

    # One entry per feature column, in column order. Built in a single call so
    # the uint32 dtype is kept (np.append would promote it).
    # NOTE(review): users2idx emits age buckets 1..6 while the age field size
    # here is 6 — confirm how the model indexes this field.
    field_dims = np.array(
        [len(user2idx), len(isbn2idx), 6] + [len(mapping) for mapping in idx.values()],
        dtype=np.uint32,
    )

    data = {
            'train':context_train,
            'test':context_test.drop(['rating'], axis=1),
            'field_dims':field_dims,
            'users':users,
            'books':books,
            'sub':sub,
            'idx2user':idx2user,
            'idx2isbn':idx2isbn,
            'user2idx':user2idx,
            'isbn2idx':isbn2idx,
            }

    return data

In [14]:
def dl_data_split(args, data):
    """Split ``data['train']`` into training and validation parts.

    The ``rating`` column is the target and all other columns are features.
    The split is shuffled, with ``args.TEST_SIZE`` as the validation share
    and ``args.SEED`` as the random state. The four pieces are stored in
    ``data`` under ``X_train``, ``X_valid``, ``y_train`` and ``y_valid``, and
    the same dictionary is returned.
    """
    ratings = data['train']
    features = ratings.drop(['rating'], axis=1)
    target = ratings['rating']

    pieces = train_test_split(
        features,
        target,
        test_size=args.TEST_SIZE,
        random_state=args.SEED,
        shuffle=True,
    )

    # train_test_split yields: features train/valid, then target train/valid.
    for key, piece in zip(('X_train', 'X_valid', 'y_train', 'y_valid'), pieces):
        data[key] = piece
    return data

In [15]:
def dl_data_loader(args, data):
    """Wrap the split tables in ``DataLoader`` objects.

    Reads ``X_train``/``y_train``, ``X_valid``/``y_valid`` and ``test`` from
    ``data``, converts their values to ``torch.LongTensor`` and stores three
    loaders back into ``data`` as ``train_dataloader``, ``valid_dataloader``
    and ``test_dataloader``. All use ``args.BATCH_SIZE``; the train and
    validation loaders shuffle according to ``args.DATA_SHUFFLE``, the test
    loader never shuffles. Returns the same ``data`` dictionary.
    """
    def as_long(table):
        return torch.LongTensor(table.values)

    train_set = TensorDataset(as_long(data['X_train']), as_long(data['y_train']))
    valid_set = TensorDataset(as_long(data['X_valid']), as_long(data['y_valid']))
    test_set = TensorDataset(as_long(data['test']))

    data.update({
        'train_dataloader': DataLoader(train_set, batch_size=args.BATCH_SIZE, shuffle=args.DATA_SHUFFLE),
        'valid_dataloader': DataLoader(valid_set, batch_size=args.BATCH_SIZE, shuffle=args.DATA_SHUFFLE),
        'test_dataloader': DataLoader(test_set, batch_size=args.BATCH_SIZE, shuffle=False),
    })

    return data

In [6]:
# Number of missing values in each column of `books`.
books.isna().sum()

isbn                       0
book_title                 0
year_of_publication        0
publisher                  0
img_url                    0
language               67227
summary                67227
img_path                   0
category_high          68851
book_author                0
category               69461
new_language            1100
remove_country_code     1100
book_author_over3          0
book_author_over5          0
book_author_over10         0
book_author_over50         0
book_author_over100        0
dtype: int64

In [7]:
# Number of missing values in each column of `users`.
users.isna().sum()

user_id             0
age                 0
location_city       0
location_state      0
location_country    0
dtype: int64

In [None]:
# dl_data_load requires an `args` object and reads only `args.ADD_CONTEXT`.
# NOTE(review): the notebook never defines ADD_CONTEXT; the columns below are
# an assumption based on the feature list in the function comments
# (category_high, publisher, language, author_10) — confirm before training.
load_args = SimpleNamespace(
    ADD_CONTEXT=['category_high', 'publisher', 'new_language', 'book_author_over10'],
)
data = dl_data_load(load_args)

In [None]:
# Both helpers take `args` first; feed them the values from the configuration cell.
train_args = SimpleNamespace(
    TEST_SIZE=TEST_SIZE,
    SEED=SEED,
    BATCH_SIZE=BATCH_SIZE,
    DATA_SHUFFLE=DATA_SHUFFLE,
)
data = dl_data_split(train_args, data)
data = dl_data_loader(train_args, data)