In [40]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import tqdm
import pdb
from scipy.sparse import csr_matrix, linalg
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import lightgbm as lgb

import random
import os

# Silence every warning for the rest of the session: the filter is installed
# with no category/module restriction, so it applies to all warnings.
# NOTE(review): this also hides deprecation and pandas assignment warnings
# that may point at real problems -- consider narrowing the filter.
warnings.filterwarnings(action='ignore')

In [41]:
# Directory holding the competition CSV files. The trailing slash matters
# because file names are appended with plain string concatenation.
path = '/opt/ml/level1_bookratingprediction_recsys-level1-recsys-06/data/'

# Read the five CSV files in one pass; the order of the file names matches the
# order of the variables on the left-hand side.
users, books, train_ratings, test_ratings, submission = [
    pd.read_csv(path + file_name)
    for file_name in (
        'users.csv',
        'books.csv',
        'train_ratings.csv',
        'test_ratings.csv',
        'sample_submission.csv',
    )
]

# Report the (rows, columns) shape of the three main input tables.
for label, frame in (
    ('users shape: ', users),
    ('books shape: ', books),
    ('train_ratings shape: ', train_ratings),
):
    print(label, frame.shape)

users shape:  (68092, 3)
books shape:  (149570, 10)
train_ratings shape:  (306795, 3)


In [42]:
def rmse(real: list, predict: list) -> float:
    """Return the root-mean-squared error between `real` and `predict`.

    Args:
        real: Observed values. Used as-is on the left-hand side of the
            subtraction, so it must broadcast against the predictions.
        predict: Predicted values; converted to a NumPy array first.

    Returns:
        The square root of the mean of the squared element-wise differences.
    """
    residuals = real - np.array(predict)
    mean_squared_error_value = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error_value)

# Seed shared by the notebook's random number generators.
SEED = 42


def seed_everything(seed: int = 42):
    """Seed the random number generators this notebook relies on.

    Sets the `PYTHONHASHSEED` environment variable and seeds both NumPy's
    global generator and the standard-library `random` module.

    Args:
        seed: Integer seed applied to every generator.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so setting it here likely only affects child processes -- confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


seed_everything(SEED)


In [43]:
# Collapse infrequent users into a single placeholder id.
# Users with at least 10 rows in `train` keep their own `user_id` in the new
# `id` column; every other user (in both `train` and `test`) gets -1.
#
# NOTE(review): `train` and `test` are not defined in any cell visible in this
# notebook (the loaded frames are `train_ratings` / `test_ratings`), so this
# cell appears to rely on kernel state from elsewhere -- confirm the intended
# frames before a Restart & Run All.
tem = train['user_id'].value_counts()
tem = list(tem[tem >= 10].index)

# `Series.where` builds the column in one step. The previous chained form
# (`frame['id'][mask] = -1`) assigns through an intermediate object, which
# triggers SettingWithCopyWarning and is a no-op under pandas copy-on-write.
train['id'] = train['user_id'].where(train['user_id'].isin(tem), -1)
test['id'] = test['user_id'].where(test['user_id'].isin(tem), -1)

In [44]:
# All rating rows (train followed by test) with a fresh 0..n-1 index.
ratings = pd.concat([train_ratings, test_ratings]).reset_index(drop=True)

# Left-join user attributes (on `user_id`) and a subset of book attributes
# (on `isbn`) onto each ratings table. `context_df` covers train + test and is
# used to build the label vocabularies; `train_df` / `test_df` are the model inputs.
context_df, train_df, test_df = [
    ratings_frame
    .merge(users, on='user_id', how='left')
    .merge(
        books[['isbn', 'category', 'publisher', 'language', 'book_author']],
        on='isbn',
        how='left',
    )
    for ratings_frame in (ratings, train_ratings, test_ratings)
]

In [45]:
# Value -> integer index lookups for the three location columns. Indices follow
# the order of first appearance in `context_df` (train + test rows).
city_values = context_df['location_city'].unique()
loc_city2idx = dict(zip(city_values, range(len(city_values))))

state_values = context_df['location_state'].unique()
loc_state2idx = dict(zip(state_values, range(len(state_values))))

country_values = context_df['location_country'].unique()
loc_country2idx = dict(zip(country_values, range(len(country_values))))

In [46]:
# Replace each location value with its integer index, first in the training
# frame and then in the test frame, using the shared lookup tables.
location_encoders = {
    'location_city': loc_city2idx,
    'location_state': loc_state2idx,
    'location_country': loc_country2idx,
}
for frame in (train_df, test_df):
    for column, encoder in location_encoders.items():
        frame[column] = frame[column].map(encoder)

In [47]:
def age_map(x: int) -> int:
    """Map an age to a decade bucket numbered 1 through 6.

    Args:
        x: Age; converted with `int()` before bucketing.

    Returns:
        1 for ages below 20, 2 for 20-29, 3 for 30-39, 4 for 40-49,
        5 for 50-59 and 6 for 60 and above.
    """
    age = int(x)
    # Exclusive upper bounds of buckets 1..5, checked in ascending order.
    for bucket, upper_bound in enumerate((20, 30, 40, 50, 60), start=1):
        if age < upper_bound:
            return bucket
    # At or above the last bound.
    return 6

In [48]:
# Impute missing ages, then convert every age to its bucket via `age_map`.
#
# The fill value is the truncated mean age of the *training* rows and is used
# for both frames, so train and test go through identical preprocessing
# (previously the test frame was filled with its own mean).
#
# NOTE: this cell is not idempotent. Once ages are bucketed (values 1..6),
# running it again would send every row through `age_map` a second time and
# map them all to bucket 1. Re-run the merge cell first if you need to repeat it.
age_fill_value = int(train_df['age'].mean())
train_df['age'] = train_df['age'].fillna(age_fill_value).apply(age_map)
test_df['age'] = test_df['age'].fillna(age_fill_value).apply(age_map)

In [49]:
def _value_to_index(column):
    """Return a {value: index} lookup for `context_df[column]`, indexed by order of first appearance."""
    return {value: position for position, value in enumerate(context_df[column].unique())}


# Lookup tables for the book attributes, built over train + test rows.
category2idx = _value_to_index('category')
publisher2idx = _value_to_index('publisher')
language2idx = _value_to_index('language')
author2idx = _value_to_index('book_author')

In [50]:
# Replace each book attribute with its integer index, first in the training
# frame and then in the test frame, using the shared lookup tables.
book_encoders = {
    'category': category2idx,
    'publisher': publisher2idx,
    'language': language2idx,
    'book_author': author2idx,
}
for frame in (train_df, test_df):
    for column, encoder in book_encoders.items():
        frame[column] = frame[column].map(encoder)

In [51]:
# Store `isbn` with pandas' categorical dtype in both frames. Each frame is
# converted independently, so the category sets are derived per frame.
for frame in (train_df, test_df):
    frame['isbn'] = frame['isbn'].astype('category')

In [52]:
# Sanity check after encoding: show each column's dtype and non-null count.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306795 entries, 0 to 306794
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   user_id           306795 non-null  int64   
 1   isbn              306795 non-null  category
 2   rating            306795 non-null  int64   
 3   age               306795 non-null  int64   
 4   location_city     306795 non-null  int64   
 5   location_state    306795 non-null  int64   
 6   location_country  306795 non-null  int64   
 7   category          306795 non-null  int64   
 8   publisher         306795 non-null  int64   
 9   language          306795 non-null  int64   
 10  book_author       306795 non-null  int64   
dtypes: category(1), int64(10)
memory usage: 31.9 MB


In [53]:
# Separate the model inputs from the `rating` target.
features = train_df.drop(columns=['rating'])
target = train_df['rating']

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible.
train_x, val_x, train_y, val_y = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Wrap both partitions in LightGBM datasets.
train_ds = lgb.Dataset(data=train_x, label=train_y)
val_ds = lgb.Dataset(data=val_x, label=val_y)

In [56]:
# LightGBM training configuration passed to `lgb.train`.
params = dict(
    learning_rate=0.01,
    max_depth=16,
    boosting='gbdt',
    objective='regression',
    metric='rmse',
    is_training_metric=True,
    num_leaves=144,
    feature_fraction=0.9,
    bagging_fraction=0.7,
    bagging_freq=5,
    seed=42,
)

In [58]:
# Train for at most 2000 boosting rounds, evaluating on `val_ds` each round.
# Early stopping and periodic logging are configured through callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments were deprecated in
# LightGBM 3.3 and removed in 4.0, where passing them raises a TypeError.
model = lgb.train(
    params,
    train_ds,
    num_boost_round=2000,
    valid_sets=[val_ds],
    callbacks=[
        # Stop once the validation metric has not improved for 100 rounds.
        lgb.early_stopping(stopping_rounds=100),
        # Print the validation metric every 100 rounds.
        lgb.log_evaluation(period=100),
    ],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 15800
[LightGBM] [Info] Number of data points in the train set: 245436, number of used features: 10
[LightGBM] [Info] Start training from score 7.068747
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 2.35352
[200]	valid_0's rmse: 2.3377
[300]	valid_0's rmse: 2.32986
[400]	valid_0's rmse: 2.32636
[500]	valid_0's rmse: 2.3213
[600]	valid_0's rmse: 2.3167
[700]	valid_0's rmse: 2.31337
[800]	valid_0's rmse: 2.31196
[900]	valid_0's rmse: 2.30973
[1000]	valid_0's rmse: 2.30709
[1100]	valid_0's rmse: 2.30593
[1200]	valid_0's rmse: 2.30512
[1300]	valid_0's rmse: 2.30456
[1400]	valid_0's rmse: 2.30326
[1500]	valid_0's rmse: 2.30175
[1600]	valid_0's rmse: 2.30064
[1700]	valid_0's rmse: 2.29969
[1800]	valid_0's rmse: 2.29941
[1900]	valid_0's rmse: 2.2985
[2000]	valid_0's rmse: 2.2978
Did not meet early stopping.