# Module

In [None]:
import pandas as pd
import numpy as np
from itertools import product
import lightgbm as lgb
from lightgbm.callback import early_stopping, log_evaluation

# Data

In [2]:
data_path = "./data/"

sales_train = pd.read_csv(data_path + 'sales_train.csv')
items = pd.read_csv(data_path + 'items.csv')
item_categories = pd.read_csv(data_path + 'item_categories.csv')
shops = pd.read_csv(data_path + 'shops.csv')

test = pd.read_csv(data_path + 'test.csv')
sub = pd.read_csv(data_path + 'sample_submission.csv')

## Downcasting

- Downcasting은 데이터의 정보 손실 없이 변수의 자료형을 더 작은 크기로 변환하여 메모리 사용량을 줄임
    - `int64` $\rightarrow$ `int8` // `float64` $\rightarrow$ `float32`

In [3]:
def downcast(df, verbose=True):
    start_memory = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col]%1==0).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif dtype_name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_memory = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print(f"Memory usage reduced from {start_memory:.2f} MB to {end_memory:.2f} MB")
        
    return df

In [4]:
data_files = [sales_train, items, shops, item_categories, test]
for file in data_files:
    file = downcast(file)

Memory usage reduced from 134.39 MB to 61.60 MB
Memory usage reduced from 0.51 MB to 0.23 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 4.90 MB to 1.43 MB


## Feature Engineering

In [6]:
train = []

for i in sales_train['date_block_num'].unique():
    all_shop = sales_train.loc[sales_train['date_block_num'] == i, 'shop_id'].unique()
    all_item = sales_train.loc[sales_train['date_block_num'] == i, 'item_id'].unique()
    train.append(np.array(list(product([i], all_shop, all_item))))
    
idx_features = ['date_block_num', 'shop_id', 'item_id']
train = pd.DataFrame(np.vstack(train), columns=idx_features)

train.head()

Unnamed: 0,date_block_num,shop_id,item_id
0,0,59,22154
1,0,59,2552
2,0,59,2554
3,0,59,2555
4,0,59,2564


In [7]:
group = sales_train.groupby(idx_features).agg({'item_cnt_day':'sum'}).reset_index().rename(columns={'item_cnt_day':'month_sales'})
group.head()

Unnamed: 0,date_block_num,shop_id,item_id,month_sales
0,0,0,32,6
1,0,0,33,3
2,0,0,35,1
3,0,0,43,1
4,0,0,51,2


In [8]:
train = train.merge(group, on=idx_features, how='left')
train.head(10)

Unnamed: 0,date_block_num,shop_id,item_id,month_sales
0,0,59,22154,1.0
1,0,59,2552,
2,0,59,2554,
3,0,59,2555,
4,0,59,2564,
5,0,59,2565,
6,0,59,2572,
7,0,59,2573,
8,0,59,2574,2.0
9,0,59,2593,


In [11]:
test['date_block_num'] = 34

In [17]:
all_data = pd.concat([train, test.drop(['ID'], axis=1)], ignore_index=True, keys=idx_features)
all_data.tail()

  all_data = pd.concat([train, test.drop(['ID'], axis=1)], ignore_index=True, keys=idx_features)


Unnamed: 0,date_block_num,shop_id,item_id,month_sales
11128045,34,45,18454,
11128046,34,45,16188,
11128047,34,45,15757,
11128048,34,45,19648,
11128049,34,45,969,


In [18]:
all_data['month_sales'] = all_data['month_sales'].fillna(0)
all_data.isnull().sum()

date_block_num    0
shop_id           0
item_id           0
month_sales       0
dtype: int64

In [19]:
all_data = all_data.merge(shops, on='shop_id', how='left')
all_data = all_data.merge(items, on='item_id', how='left')
all_data = all_data.merge(item_categories, on='item_category_id', how='left')
all_data = downcast(all_data)
all_data.head()

Memory usage reduced from 477.56 MB to 328.99 MB


Unnamed: 0,date_block_num,shop_id,item_id,month_sales,shop_name,item_name,item_category_id,item_category_name
0,0,59,22154,1,"Ярославль ТЦ ""Альтаир""",ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray
1,0,59,2552,0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил
2,0,59,2554,0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил
3,0,59,2555,0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства
4,0,59,2564,0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE Perihelion: Live In Concert DVD (К...,59,Музыка - Музыкальное видео


In [20]:
all_data = all_data.drop(['shop_name', 'item_name', 'item_category_name'], axis=1)
all_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,month_sales,item_category_id
0,0,59,22154,1,37
1,0,59,2552,0,58
2,0,59,2554,0,58
3,0,59,2555,0,56
4,0,59,2564,0,59


In [21]:
del shops, items, item_categories

# Model

In [22]:
x_train = all_data[all_data['date_block_num'] < 33]
x_train = x_train.drop(['month_sales'], axis=1)

x_val = all_data[all_data['date_block_num'] == 33]
x_val = x_val.drop(['month_sales'], axis=1)

x_test = all_data[all_data['date_block_num'] == 34]
x_test = x_test.drop(['month_sales'], axis=1)

y_train = all_data[all_data['date_block_num'] < 33]['month_sales'].clip(0, 20)
y_val = all_data[all_data['date_block_num'] == 33]['month_sales'].clip(0, 20)

- 데이터 제공시 제공된 설명에 따라 타겟값은 0~20 범위 내로 제한

In [23]:
del all_data

In [32]:
params = {
    'metric': 'rmse',
    'num_boost_round': 500,
    'num_leaves': 255,
    'learning_rate': 1e-2,
    'force_col_wise': True,
    'random_state': 2025
}

cat_features = ['shop_id', 'item_category_id']

dtrain = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
dval = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)

In [33]:
callbacks = [early_stopping(stopping_rounds=100), log_evaluation(period=50)]

lgb_model = lgb.train(
    params=params,
    train_set=dtrain,
    valid_sets=[dtrain, dval],
    callbacks=callbacks,
)

[LightGBM] [Info] Total Bins 427
[LightGBM] [Info] Number of data points in the train set: 10675678, number of used features: 4
[LightGBM] [Info] Start training from score 0.299125
Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 1.14688	valid_1's rmse: 1.0671
[100]	training's rmse: 1.11231	valid_1's rmse: 1.03723
[150]	training's rmse: 1.09453	valid_1's rmse: 1.02351
[200]	training's rmse: 1.08277	valid_1's rmse: 1.01617
[250]	training's rmse: 1.07401	valid_1's rmse: 1.01112
[300]	training's rmse: 1.0667	valid_1's rmse: 1.00718
[350]	training's rmse: 1.06081	valid_1's rmse: 1.00429
[400]	training's rmse: 1.05454	valid_1's rmse: 1.00199
[450]	training's rmse: 1.04913	valid_1's rmse: 1.00031
[500]	training's rmse: 1.04466	valid_1's rmse: 0.999524
Did not meet early stopping. Best iteration is:
[500]	training's rmse: 1.04466	valid_1's rmse: 0.999524


val's rmse : 0.999524