In [37]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings(action='ignore')

data_path = '../data/'
sales_train = pd.read_csv(data_path + 'sales_train.csv')
shops = pd.read_csv(data_path+'shops.csv')
items = pd.read_csv(data_path+'items.csv')
item_categories = pd.read_csv(data_path+'item_categories.csv')
test = pd.read_csv(data_path + 'test.csv')
submission = pd.read_csv(data_path + 'sample_submission.csv')

In [38]:
sales_train = sales_train.rename(columns={'date': '날짜', 'date_block_num': '월ID',
                                          'shop_id': '상점ID', 'item_id': '상품ID',
                                          'item_price': '판매가', 'item_cnt_day': '판매량'})
sales_train.head()

Unnamed: 0,날짜,월ID,상점ID,상품ID,판매가,판매량
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [39]:
shops = shops.rename(columns = {'shop_name': '상점명', 'shop_id': '상점ID'})
items = items.rename(columns={'item_name': '상품명', 'item_id':'상품ID', 'item_category_id': '상품분류ID'})
item_categories = item_categories.rename(columns={'item_category_name': '상품분류명','item_category_id':'상품분류ID'})
test = test.rename(columns={'shop_id':'상점ID', 'item_id': '상품ID'})
test.head()

Unnamed: 0,ID,상점ID,상품ID
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [40]:
def downcast(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast ='integer')
        else:
            df[col] = pd.to_numeric(df[col],downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print(f'{100*(start_mem-end_mem)/start_mem:.1f}% 압축됨')

    return df

all_df = [sales_train, shops, items, item_categories, test]
for df in all_df:
    df = downcast(df)

62.5% 압축됨
38.6% 압축됨
54.2% 압축됨
39.9% 압축됨
70.8% 압축됨


In [41]:
from itertools import product

train = []

for i in sales_train['월ID'].unique():
    all_shop = sales_train.loc[sales_train['월ID']==i, '상점ID'].unique()
    all_item = sales_train.loc[sales_train['월ID']==i, '상품ID'].unique()
    train.append(np.array(list(product([i],all_shop,all_item))))

idx_features = ['월ID', '상점ID', '상품ID']

train = pd.DataFrame(np.vstack(train), columns = idx_features)
train

Unnamed: 0,월ID,상점ID,상품ID
0,0,59,22154
1,0,59,2552
2,0,59,2554
3,0,59,2555
4,0,59,2564
...,...,...,...
10913845,33,21,7635
10913846,33,21,7638
10913847,33,21,7640
10913848,33,21,7632


In [42]:
group = sales_train.groupby(idx_features).agg({'판매량':'sum'})
group = group.reset_index()

group = group.rename(columns = {'판매량':'월간 판매량'})
group

Unnamed: 0,월ID,상점ID,상품ID,월간 판매량
0,0,0,32,6
1,0,0,33,3
2,0,0,35,1
3,0,0,43,1
4,0,0,51,2
...,...,...,...,...
1609119,33,59,22087,6
1609120,33,59,22088,2
1609121,33,59,22091,1
1609122,33,59,22100,1


In [43]:
train = train.merge(group, on=idx_features, how='left')
train

Unnamed: 0,월ID,상점ID,상품ID,월간 판매량
0,0,59,22154,1.0
1,0,59,2552,
2,0,59,2554,
3,0,59,2555,
4,0,59,2564,
...,...,...,...,...
10913845,33,21,7635,
10913846,33,21,7638,
10913847,33,21,7640,
10913848,33,21,7632,


In [44]:
test['월ID'] = 34

In [45]:
all_data = pd.concat([train, test.drop('ID',axis = 1)],
                     ignore_index = True, keys = idx_features)

In [46]:
all_data = all_data.fillna(0)
all_data

Unnamed: 0,월ID,상점ID,상품ID,월간 판매량
0,0,59,22154,1.0
1,0,59,2552,0.0
2,0,59,2554,0.0
3,0,59,2555,0.0
4,0,59,2564,0.0
...,...,...,...,...
11128045,34,45,18454,0.0
11128046,34,45,16188,0.0
11128047,34,45,15757,0.0
11128048,34,45,19648,0.0


In [47]:
import gc

del group
gc.collect()

0

In [48]:
all_data = all_data.merge(shops, on = '상점ID', how='left')
all_data = all_data.merge(items, on='상품ID', how = 'left')
all_data = all_data.merge(item_categories, on = '상품분류ID', how = 'left')

all_data = downcast(all_data)

del shops, items, item_categories
gc.collect();

26.4% 압축됨


In [49]:
all_data = all_data.drop(['상점명', '상품명', '상품분류명'], axis=1)
all_data

Unnamed: 0,월ID,상점ID,상품ID,월간 판매량,상품분류ID
0,0,59,22154,1,37
1,0,59,2552,0,58
2,0,59,2554,0,58
3,0,59,2555,0,56
4,0,59,2564,0,59
...,...,...,...,...,...
11128045,34,45,18454,0,55
11128046,34,45,16188,0,64
11128047,34,45,15757,0,55
11128048,34,45,19648,0,40


In [50]:
X_train = all_data[all_data['월ID']< 33]
X_train = all_data.drop(['월간 판매량'], axis =1)

X_valid = all_data[all_data['월ID'] == 33]
X_valid = X_valid.drop(['월간 판매량'], axis=1)

X_test = all_data[all_data['월ID']==34]
X_test = X_test.drop(['월간 판매량'], axis = 1)


y_train = all_data[all_data['월ID']<33]['월간 판매량']
y_train = y_train.clip(0,20)

y_valid = all_data[all_data['월ID']==33]['월간 판매량']
y_valid = y_valid.clip(0,20)

del all_data
gc.collect()

0

In [51]:
import lightgbm as lgb

params = {
    'metric': 'rmse',
    'num_leaves': 255,
    'learning_rate': 0.01,
    'force_col_wise': True,
    'random_state': 10
}

cat_features = ['상점ID', '상품분류ID']

dtrain = lgb.Dataset(X_train, y_train)
dvalid = lgb.Dataset(X_valid, y_valid)

In [52]:
callbacks = [lgb.log_evaluation(period=50)]
lgb_model = lgb.train(params = params, train_set=dtrain, num_boost_round=500, 
                      valid_sets=(dtrain, dvalid), categorical_feature=cat_features,
                      callbacks=callbacks)

LightGBMError: Length of labels differs from the length of #data