In [1]:
import os
import sys

import math
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 70)
pd.options.mode.chained_assignment = None  # default='warn'
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import log_loss
import lightgbm as lgb
from bayes_opt import BayesianOptimization #From https://github.com/fmfn/BayesianOptimization

%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
sales = pd.read_csv('./data/sales_train.csv', parse_dates=['date'], dtype={'date': 'str', 'date_block_num': 'int32', 'shop_id': 'int32', 
                          'item_id': 'int32', 'item_price': 'float32', 'item_cnt_day': 'int32'})
test = pd.read_csv('./data/test.csv', dtype={'ID': 'int32', 'shop_id': 'int32','item_id': 'int32'})
items = pd.read_csv('./data/items.csv', dtype={'item_name': 'str', 'item_id': 'int32', 'item_category_id': 'int32'})
item_categories = pd.read_csv('./data/item_categories.csv', dtype={'item_category_name': 'str', 'item_category_id': 'int32'})
shops = pd.read_csv('./data/shops.csv', dtype={'shop_name': 'str', 'shop_id': 'int32'})

In [3]:
train = sales.join(items, on='item_id', rsuffix='_').join(shops, on='shop_id', rsuffix='_').join(item_categories, on='item_category_id', rsuffix='_').drop(['item_id_', 'shop_id_', 'item_category_id_'], axis=1)

In [4]:
train.head(1)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,shop_name,item_category_name
0,2013-02-01,0,59,22154,999.0,1,ЯВЛЕНИЕ 2012 (BD),37,"Ярославль ТЦ ""Альтаир""",Кино - Blu-Ray


In [5]:
train_monthly = train[['date', 'date_block_num', 'shop_id', 'item_category_id', 'item_id', 'item_price', 'item_cnt_day']]

In [6]:
train_monthly = train_monthly.sort_values('date').groupby(['date_block_num', 'shop_id', 'item_category_id', 'item_id'], as_index=False)
train_monthly = train_monthly.agg({'item_price':['sum', 'mean'], 'item_cnt_day':['sum', 'mean','count']})
# Rename features.
train_monthly.columns = ['date_block_num', 'shop_id', 'item_category_id', 'item_id', 'item_price', 'mean_item_price', 'item_cnt', 'mean_item_cnt', 'transactions']

In [7]:
train_monthly

Unnamed: 0,date_block_num,shop_id,item_category_id,item_id,item_price,mean_item_price,item_cnt,mean_item_cnt,transactions
0,0,0,2,5572,7932.0,1322.000000,10,1.666667,6
1,0,0,2,5573,560.0,560.000000,1,1.000000,1
2,0,0,2,5575,2418.0,806.000000,4,1.333333,3
3,0,0,2,5576,11155.0,2231.000000,5,1.000000,5
4,0,0,2,5609,2381.0,2381.000000,1,1.000000,1
5,0,0,2,5612,3623.0,3623.000000,1,1.000000,1
6,0,0,2,5623,294.0,294.000000,1,1.000000,1
7,0,0,2,5627,4120.0,2060.000000,2,1.000000,2
8,0,0,2,5629,15400.0,1925.000000,9,1.125000,8
9,0,0,2,5630,2060.0,2060.000000,1,1.000000,1


In [8]:
#SLOW
shop_ids = train_monthly['shop_id'].unique()
item_ids = train_monthly['item_id'].unique()
empty_df = []
for i in range(34):
    for shop in shop_ids:
        for item in item_ids:
            empty_df.append([i, shop, item])
    
empty_df = pd.DataFrame(empty_df, columns=['date_block_num','shop_id','item_id'])

In [10]:
print(empty_df.shape)

(44486280, 3)


In [13]:
#FASTER
monthsdf = pd.DataFrame({'date_block_num':train_monthly.date_block_num.unique()}).sort_values('date_block_num').reset_index(drop=True)
monthsdf['key'] = 0
shopidsdf = pd.DataFrame({'shop_id':train_monthly.shop_id.unique()}).sort_values('shop_id').reset_index(drop=True)
shopidsdf['key'] = 0
itemidsdf = pd.DataFrame({'item_id':train_monthly.item_id.unique()}).sort_values('item_id').reset_index(drop=True)
itemidsdf['key'] = 0
empty_df = pd.merge(monthsdf,shopidsdf,how='outer',on='key')
empty_df = pd.merge(empty_df,itemidsdf,how='outer',on='key')
empty_df = empty_df.drop(columns='key')
print(empty_df.shape)

(44486280, 3)


In [14]:
train_monthly = pd.merge(empty_df, train_monthly, on=['date_block_num','shop_id','item_id'], how='left')

In [15]:
train_monthly.head(1)

Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_price,mean_item_price,item_cnt,mean_item_cnt,transactions
0,0,0,0,,,,,,


In [16]:
train_monthly.fillna(0, inplace=True)

In [17]:
train_monthly.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_price,mean_item_price,item_cnt,mean_item_cnt,transactions
0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,2,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,3,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,4,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
from itertools import product
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = train[train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
matrix    
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)
matrix.sort_values(cols,inplace=True)
print(matrix.shape)
matrix

(10913850, 3)


Unnamed: 0,date_block_num,shop_id,item_id
139255,0,0,19
141495,0,0,27
144968,0,0,28
142661,0,0,29
138947,0,0,32
138948,0,0,33
138949,0,0,34
139247,0,0,35
142672,0,0,40
142065,0,0,41
