In [25]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import os
import time
import warnings
import random
import re

from torch.utils.data import TensorDataset, DataLoader, Dataset
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from lightgbm import LGBMRegressor
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold


In [26]:
def missing_data_imputation_mice(train_df, test_df):
    """
    Parameters
    ----------
    train_df : pd.DataFrame
    test_df : pd.DataFrame
    ----------
    """
    temp_columns = train_df.columns

    imputer_mice = IterativeImputer(random_state=9)
    train_df_imp = pd.DataFrame(imputer_mice.fit_transform(train_df)).applymap(lambda x:round(x))
    test_df_imp = pd.DataFrame(imputer_mice.fit_transform(test_df)).applymap(lambda x:round(x))
    train_df_imp.columns = temp_columns
    test_df_imp.columns = temp_columns
    return train_df_imp, test_df_imp

def missing_country_imputation_city(users):
    user_imp = users
    modify_location = users[(users['location_country'].isna())&(users['location_city'].notnull())]['location_city'].values
    location_list = []
    for location in modify_location:
        try:
            right_location = users[(users['location'].str.contains(location))&(users['location_country'].notnull())]['location'].value_counts().index[0]
            location_list.append(right_location)
        except:
            pass
    for location in location_list:
        user_imp.loc[users[users['location_city']==location.split(',')[0]].index,'location_state'] = location.split(',')[1]
        user_imp.loc[users[users['location_city']==location.split(',')[0]].index,'location_country'] = location.split(',')[2]
    return user_imp

def reduce_publisher(books):
    reduced_books = books
    publisher_dict=(books['publisher'].value_counts()).to_dict()
    publisher_count_df= pd.DataFrame(list(publisher_dict.items()),columns = ['publisher','count'])
    publisher_count_df = publisher_count_df.sort_values(by=['count'], ascending = False)
    modify_list = publisher_count_df[publisher_count_df['count']>1].publisher.values
    for publisher in modify_list:
        try:
            number = books[books['publisher']==publisher]['isbn'].apply(lambda x: x[:4]).value_counts().index[0]
            right_publisher = books[books['isbn'].apply(lambda x: x[:4])==number]['publisher'].value_counts().index[0]
            reduced_books.loc[books[books['isbn'].apply(lambda x: x[:4])==number].index,'publisher'] = right_publisher
        except:
            pass
    return reduced_books


def reduce_category(books):
    reduced_books = books
    reduced_books.loc[books[books['category'].notnull()].index, 'category'] = books[books['category'].notnull()]['category'].apply(lambda x: re.sub('[\W_]+',' ',x).strip())
    reduced_books['category'] = reduced_books['category'].str.lower()

    categories = ['history','garden','crafts','physics','adventure','music','fiction','nonfiction','science','science fiction','social','homicide',
                'sociology','disease','religion','christian','philosophy','psycholog','mathemat','agricult','environmental',
                'business','poetry','drama','literary','travel','motion picture','children','cook','literature','electronic',
                'humor','animal','bird','photograph','computer','house','ecology','family','architect','camp','criminal','language','india']

    for category in categories:
        reduced_books.loc[reduced_books[reduced_books['category'].str.contains(category,na=False)].index,'category_high'] = category

    # others
    category_high_df = pd.DataFrame(reduced_books['category_high'].value_counts()).reset_index()
    category_high_df.columns = ['category','count']
    others_list = category_high_df[category_high_df['count']<5]['category'].values
    reduced_books.loc[reduced_books[reduced_books['category_high'].isin(others_list)].index, 'category_high']='others'

    return reduced_books


In [20]:
def data_processing(train,test,sub,book,user):
    ids = pd.concat([train['user_id'], sub['user_id']]).unique()
    isbns = pd.concat([train['isbn'], sub['isbn']]).unique()

    idx2user = {idx:id for idx, id in enumerate(ids)}
    idx2isbn = {idx:isbn for idx, isbn in enumerate(isbns)}

    user2idx = {id:idx for idx, id in idx2user.items()}
    isbn2idx = {isbn:idx for idx, isbn in idx2isbn.items()}

    train['user_id'] = train['user_id'].map(user2idx)
    sub['user_id'] = sub['user_id'].map(user2idx)
    test['user_id'] = test['user_id'].map(user2idx)
    user['user_id'] = user['user_id'].map(user2idx)

    train['isbn'] = train['isbn'].map(isbn2idx)
    sub['isbn'] = sub['isbn'].map(isbn2idx)
    test['isbn'] = test['isbn'].map(isbn2idx)
    book['isbn'] = book['isbn'].map(isbn2idx)


    user['location_city'] = user['location'].apply(lambda x: x.split(',')[0])
    user['location_state'] = user['location'].apply(lambda x: x.split(',')[1])
    user['location_country'] = user['location'].apply(lambda x: x.split(',')[2])

    user = missing_country_imputation_city(user)
    user = user.drop(['location'], axis=1)
    user = user.replace(['n/a','na',''], np.nan)

    book = reduce_publisher(book)
    book = reduce_category(book)

    ratings = pd.concat([train, test]).reset_index(drop=True)

    # 인덱싱 처리된 데이터 조인
    context_df = ratings.merge(user, on='user_id', how='left').merge(book[['isbn', 'category', 'publisher', 'language', 'book_author']], on='isbn', how='left')
    train_df = train.merge(user, on='user_id', how='left').merge(book[['isbn', 'category', 'publisher', 'language', 'book_author']], on='isbn', how='left')
    test_df = test.merge(user, on='user_id', how='left').merge(book[['isbn', 'category', 'publisher', 'language', 'book_author']], on='isbn', how='left')

    # 인덱싱 처리
    loc_city2idx = {v:k for k,v in enumerate(context_df['location_city'].unique())}
    loc_state2idx = {v:k for k,v in enumerate(context_df['location_state'].unique())}
    loc_country2idx = {v:k for k,v in enumerate(context_df['location_country'].unique())}
    # book 파트 인덱싱
    category2idx = {v:k for k,v in enumerate(context_df['category'].unique())}
    publisher2idx = {v:k for k,v in enumerate(context_df['publisher'].unique())}
    language2idx = {v:k for k,v in enumerate(context_df['language'].unique())}
    author2idx = {v:k for k,v in enumerate(context_df['book_author'].unique())}
    
    # imputation을 진행하기 위해 결측치는 유지시킨다.
    loc_city2idx[np.nan] = np.nan
    loc_state2idx[np.nan] = np.nan
    loc_country2idx[np.nan] = np.nan
    category2idx[np.nan] = np.nan
    publisher2idx[np.nan] = np.nan
    language2idx[np.nan] = np.nan
    author2idx[np.nan] = np.nan


    train_df['location_city'] = train_df['location_city'].map(loc_city2idx)
    train_df['location_state'] = train_df['location_state'].map(loc_state2idx)
    train_df['location_country'] = train_df['location_country'].map(loc_country2idx)
    test_df['location_city'] = test_df['location_city'].map(loc_city2idx)
    test_df['location_state'] = test_df['location_state'].map(loc_state2idx)
    test_df['location_country'] = test_df['location_country'].map(loc_country2idx)

    train_df['category'] = train_df['category'].map(category2idx)
    train_df['publisher'] = train_df['publisher'].map(publisher2idx)
    train_df['language'] = train_df['language'].map(language2idx)
    train_df['book_author'] = train_df['book_author'].map(author2idx)
    test_df['category'] = test_df['category'].map(category2idx)
    test_df['publisher'] = test_df['publisher'].map(publisher2idx)
    test_df['language'] = test_df['language'].map(language2idx)
    test_df['book_author'] = test_df['book_author'].map(author2idx)

    # 결측치 처리
    train_df, test_df = missing_data_imputation_mice(train_df, test_df)

    idx = {
        "loc_city2idx":loc_city2idx,
        "loc_state2idx":loc_state2idx,
        "loc_country2idx":loc_country2idx,
        "category2idx":category2idx,
        "publisher2idx":publisher2idx,
        "language2idx":language2idx,
        "author2idx":author2idx,
    }

    
    # 필드 차원 수 정해주기
    field_dim = np.array([len(user2idx), len(isbn2idx),
                            6, len('loc_city2idx'), len('loc_state2idx'), len('loc_country2idx'),
                            len('category2idx'), len('publisher2idx'), len('language2idx'), len('author2idx')], dtype=np.uint32)

        
    # 나중에 인덱싱한거 다시 되돌리기 용 및 기타 데이터 다 저장해서 넘기기 ~ data['train'] 이런식으로 조회 및 타 데이터 추가 가능하게
    data = {
            'train' : train_df,
            'test' : test_df.drop(['rating'], axis=1),
            'user':user,
            'book':book,
            'sub':sub,
            'idx2user':idx2user,
            'idx2isbn':idx2isbn,
            'user2idx':user2idx,
            'isbn2idx':isbn2idx,  
            'field_dim' : field_dim   
            }

    return data 

In [21]:
# 시드값 고정
seed = 9
random.seed(seed)
np.random.seed(seed)

In [22]:
def age_map(x: int) -> int:
    x = int(x)
    if x < 20:
        return 1
    elif x >= 20 and x < 30:
        return 2
    elif x >= 30 and x < 40:
        return 3
    elif x >= 40 and x < 50:
        return 4
    elif x >= 50 and x < 60:
        return 5
    else:
        return 6

In [23]:
def context_data_split(data):
    X_train, X_valid, y_train, y_valid = train_test_split(
                                                        data['train'].drop(['rating'], axis=1),
                                                        data['train']['rating'],
                                                        test_size=0.2,
                                                        random_state=seed,
                                                        shuffle=True
                                                        )
    data['X_train'], data['X_valid'], data['y_train'], data['y_valid'] = X_train, X_valid, y_train, y_valid

    return data

In [24]:
warnings.filterwarnings(action='ignore')

book = pd.read_csv('../../data/books.csv')
user = pd.read_csv('../../data/users.csv')
train = pd.read_csv('../../data/train_ratings.csv')
test = pd.read_csv('../../data/test_ratings.csv')
sub = pd.read_csv('../../data/sample_submission.csv')

def seed_all(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

def rmse(real: list, predict: list) -> float:
    pred = np.array(predict)
    return np.sqrt(np.mean((real-pred) ** 2))

In [10]:
seed_all(seed)
model = LGBMRegressor(n_estimators=5000, random_state=seed,learning_rate=0.1)

data = context_data_split(data_processing(train,test,sub,book,user))
evals = [(data['X_valid'],data['y_valid'])]
model.fit(data['X_train'], data['y_train'],early_stopping_rounds=30, eval_metric="rmse", eval_set=evals, verbose=True)
LGBM_preds = model.predict(data['X_valid'])
rmse_score = rmse(data['y_valid'].tolist(),LGBM_preds.tolist())
print(rmse_score)
sub['rating'] = model.predict(data['test'])

now = time.localtime()
now_date = time.strftime('%Y%m%d', now)
now_hour = time.strftime('%X', now)
save_time = now_date + '_' + now_hour.replace(':', '')
sub.to_csv('{}_{}_{}.csv'.format(save_time,"LGBM",rmse_score.round(5)), index=False)

[1]	valid_0's rmse: 2.41813	valid_0's l2: 5.84734
[2]	valid_0's rmse: 2.40524	valid_0's l2: 5.78517
[3]	valid_0's rmse: 2.39466	valid_0's l2: 5.73441
[4]	valid_0's rmse: 2.38648	valid_0's l2: 5.69529
[5]	valid_0's rmse: 2.37712	valid_0's l2: 5.65069
[6]	valid_0's rmse: 2.37028	valid_0's l2: 5.61824
[7]	valid_0's rmse: 2.36355	valid_0's l2: 5.58635
[8]	valid_0's rmse: 2.35764	valid_0's l2: 5.55848
[9]	valid_0's rmse: 2.35306	valid_0's l2: 5.53688
[10]	valid_0's rmse: 2.34803	valid_0's l2: 5.51325
[11]	valid_0's rmse: 2.34353	valid_0's l2: 5.49212
[12]	valid_0's rmse: 2.34001	valid_0's l2: 5.47566
[13]	valid_0's rmse: 2.33752	valid_0's l2: 5.46399
[14]	valid_0's rmse: 2.33474	valid_0's l2: 5.45103
[15]	valid_0's rmse: 2.33269	valid_0's l2: 5.44146
[16]	valid_0's rmse: 2.32976	valid_0's l2: 5.42777
[17]	valid_0's rmse: 2.32649	valid_0's l2: 5.41257
[18]	valid_0's rmse: 2.32383	valid_0's l2: 5.40018
[19]	valid_0's rmse: 2.32113	valid_0's l2: 5.38764
[20]	valid_0's rmse: 2.31957	valid_0's l

In [61]:
from sklearn.model_selection import StratifiedKFold

In [27]:
def stratified_kfold(data,n):
    skf = StratifiedKFold(n_splits= 5, shuffle=True, random_state=seed)
    counts = 0
    for train_index, valid_index in skf.split(data['train'].drop(['rating'], axis=1),data['train']['rating']):
        if counts == n:
            data['X_train'], data['y_train'] = data['train'].drop(['rating'], axis=1).loc[train_index], data['train']['rating'].loc[train_index]
            data['X_valid'], data['y_valid'] = data['train'].drop(['rating'], axis=1).loc[valid_index], data['train']['rating'].loc[valid_index]
            break
        else:
            counts += 1
    return data

In [28]:
### k-fold
data = data_processing(train,test,sub,book,user)

predicts_list = []
rmse_list = []
for i in range(5):
    data = stratified_kfold(data,i)
    evals = [(data['X_valid'],data['y_valid'])]
    model = LGBMRegressor(n_estimators=4000, random_state=seed,learning_rate=0.1)
    model.fit(data['X_train'], data['y_train'],early_stopping_rounds=30, eval_metric="rmse", eval_set=evals, verbose=False)
    lgbm_preds = model.predict(data['X_valid'])
    rmse_score = rmse(data['y_valid'].tolist(),lgbm_preds.tolist())
    print(rmse_score)
    rmse_list.append(rmse_score)
    predicts_list.append(model.predict(data['test']))

sub['rating'] = np.mean(predicts_list, axis=0)

now = time.localtime()
now_date = time.strftime('%Y%m%d', now)
now_hour = time.strftime('%X', now)
save_time = now_date + '_' + now_hour.replace(':', '')
sub.to_csv('{}_{}_{}.csv'.format(save_time,"lgbm",(sum(rmse_list)/len(rmse_list)).round(5)), index=False)

2.0923590542029085
2.0927372920379
2.1023457731116184
2.0952457954530406
2.1092584442338413
