# Import libraries

In [100]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# edited by Mayur

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Load necessary files

In [101]:
items = pd.read_csv("items.csv.zip")
item_categories = pd.read_csv("item_categories.csv")
sales_train = pd.read_csv("sales_train.csv.zip", parse_dates=['date'])
shops = pd.read_csv("shops.csv")
sales_test = pd.read_csv("test.csv.zip")
submission = pd.read_csv("sample_submission.csv.zip")

# Add data from files to train and test datasets

Train dataset

In [102]:
sales_train = sales_train.join(items.set_index('item_id'), on='item_id')
sales_train = sales_train.join(item_categories.set_index('item_category_id'), on='item_category_id')
sales_train = sales_train.join(shops.set_index('shop_id'), on='shop_id')
# sales_train = sales_train.drop('date_block_num', axis=1)
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,item_category_name,shop_name
0,2013-02-01,0,59,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир"""
1,2013-03-01,0,25,2552,899.0,1.0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,"Москва ТРК ""Атриум"""
2,2013-05-01,0,25,2552,899.0,-1.0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,"Москва ТРК ""Атриум"""
3,2013-06-01,0,25,2554,1709.05,1.0,DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил,"Москва ТРК ""Атриум"""
4,2013-01-15,0,25,2555,1099.0,1.0,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства,"Москва ТРК ""Атриум"""


Test dataset

In [103]:
sales_test['date_block_num'] = 34
sales_test = sales_test.join(items.set_index('item_id'), on='item_id')
sales_test = sales_test.join(item_categories.set_index('item_category_id'), on='item_category_id')
sales_test = sales_test.join(shops.set_index('shop_id'), on='shop_id')

sales_test.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_name,item_category_id,item_category_name,shop_name
0,0,5,5037,34,"NHL 15 [PS3, русские субтитры]",19,Игры - PS3,"Вологда ТРЦ ""Мармелад"""
1,1,5,5320,34,ONE DIRECTION Made In The A.M.,55,Музыка - CD локального производства,"Вологда ТРЦ ""Мармелад"""
2,2,5,5233,34,"Need for Speed Rivals (Essentials) [PS3, русск...",19,Игры - PS3,"Вологда ТРЦ ""Мармелад"""
3,3,5,5232,34,"Need for Speed Rivals (Classics) [Xbox 360, ру...",23,Игры - XBOX 360,"Вологда ТРЦ ""Мармелад"""
4,4,5,5268,34,"Need for Speed [PS4, русская версия]",20,Игры - PS4,"Вологда ТРЦ ""Мармелад"""


Before adding *item_price* feature let's inspect price for each item and find wrong prices.

In [104]:
# for each item get price mode which more than 5
def amode(col):
    i = 0
    res = 0
    count = col.value_counts().index
    while res <= 5 and i < len(count):
        res = count[i]
        i += 1
    return res

def alast(col):
    return res
    
# group train dataset by prices, aggregate by price mode
item_prices = sales_train[['item_id', 'item_price', 
                           'item_name']].groupby('item_id').agg({'item_price': [np.min, 
                                                                                np.max, 
                                                                                amode],
                                                                 'item_name': 'first'})
# add feature for difference between min and max prices
item_prices['price_diff'] = abs(item_prices.item_price.amax/item_prices.item_price.amin)
# get all prices with the difference between min and max prices more than 15 and min price less than 5
# save indexes of these prices
wrong_prices = item_prices[(item_prices.item_price.amin <= 5) & 
                           (item_prices.price_diff >= 15)].sort_values('price_diff', ascending=False)

Fix wrong prices in train dataset.

In [105]:
for i_id in list(wrong_prices.index):
    sales_train.loc[(sales_train.item_id == i_id) & 
                    (sales_train.item_price <= 5), 'item_price'] = wrong_prices.loc[i_id, 'item_price'].amode
    
shops_last_prices = sales_train[['shop_id', 'item_id', 'item_price']].groupby(['shop_id', 
                                                                               'item_id']).agg('last')

sales_test = sales_test.join(shops_last_prices, on=['shop_id', 'item_id'])

Add day, month and year features

In [106]:
sales_train['month'] = sales_train['date'].dt.month
sales_train['year'] = sales_train['date'].dt.year

sales_test['month'] = 10
sales_test['year'] = 2015

Make columns in train and test in the same order.

In [107]:
cols = [col for col in sales_train.columns if col not in ['item_cnt_day', 'date']]
sales_train = sales_train[cols + ['item_cnt_day']]
sales_test = sales_test[cols]

Group train dataset by shop_id, item_id and date_block_num

In [108]:
sales_train_gr = sales_train.groupby(['date_block_num', 
                                      'shop_id', 'item_id'], as_index=False).agg({'item_price': 'mean', 
                                                                                  'item_name': 'first', 
                                                                                  'item_category_id': 'first', 
                                                                                  'item_category_name': 'first', 
                                                                                  'shop_name': 'first', 
                                                                                  'month': 'first', 
                                                                                  'year': 'first', 
                                                                                  'item_cnt_day': 'sum'})

Expert features

In [109]:
# add city feature
def add_city(col):
    city = col.split(' ')
    if city[0][0] == '!':
        return city[0][1:]
    if city[0] == 'Цифровой':
        return 'Цифровой склад'
    if city[0] == 'Выездная':
        return 'Выездная Торговля'
    return city[0]

# add global category
def add_global_category(col):
    cat = col.split(' ')
    if cat[0] == 'Игровые':
        return 'Игровые консоли'
    if cat[0] == 'Карты':
        return 'Карты оплаты'
    if cat[0] == 'Чистые':
        return 'Чистые носители'
    if cat[0] == 'Элементы':
        return 'Элементы питания'
    if cat[0] == 'Доставка':
        return 'Доставка товара'
    return cat[0]

# define if item is material thing
def is_material(col):
    material_list = ['Развитие', 'Аксессуары', 'Гаджеты', 'консоли', 'Открытки', 'Cувениры', 
                     'Настольные игры', 'Мягкие игрушки', 'Гарнитуры', 'Элементы','Комиксы',
                     'Фигурки', 'Атрибутика', 'Сумки', 'литература', 'Путеводители', 'Артбуки']
    for i in material_list:
        if i in col:
            return 1
    return 0
        
 # define if item is service
def is_service(col):
    service_list = ['Доставка товара', 'Служебные', 'Подарки - Сертификаты, услуги']
    if col in service_list:
        return 1
    return 0   
    
month_dict = {1: 31, 2:28, 3:31, 4:30, 5:31, 6:30, 7:31, 8:31, 9:30, 10:31, 11:30, 12:31}

    
def create_expert_features(data):
    # add expert features
    data['city'] = data.shop_name.apply(add_city) 
    data['global_category'] = data.item_category_name.apply(add_global_category)
    data['is_material'] = data.item_category_name.apply(is_material)
    data['is_service'] = data.item_category_name.apply(is_service)
    # add number of days in each month
    data['days_in_month'] = None
    for m in month_dict.keys():
        data.loc[data.month == m, 'days_in_month'] = month_dict[m]
    data['description'] = data.shop_name + ' ' + data.item_category_name + ' ' + data.item_name

Additional features

In [110]:
def create_col_with_min_freq(data, col, min_freq=10, test=False):
    # replace rare values (less than min_freq rows) in feature by RARE_VALUE
    data[col + '_fixed'] = data[col].astype(str)
    if test is False or col != 'item_name':
        data.loc[sales_train_gr[col].value_counts()[data[col + '_fixed']].values < min_freq, col + '_fixed'] = "RARE_VALUE"
    
def create_gr_feats(data, test=False):
    # create aggregation feats for numeric features based on categorical ones
    for cat_col in ['shop_name', 'item_category_name', 'city', 'global_category', 'item_name']:
        create_col_with_min_freq(data, cat_col, 15, test)
        for num_col in ['item_price']:
            for n, f in [('mean', np.mean), ('min', np.nanmin), ('max', np.nanmax)]:
                data['FIXED_' + n + '_' + num_col + 
                     '_by_' + cat_col] = sales_train_gr.groupby(cat_col + '_fixed')[num_col].transform(f)
                
    # create features with counts
    for col in ['shop_name', 'item_name', 'item_category_name', 'city', 'global_category', 'item_price']:
        data[col + '_cnt'] = data[col].map(sales_train_gr[col].value_counts(dropna = False))
        
    # count number of NAN in each row
    data['NANs_cnt'] = data.isnull().sum(axis = 1)

Add expert and additional features

In [111]:
%%time

create_expert_features(sales_train_gr)
create_expert_features(sales_test)

create_gr_feats(sales_train_gr)
create_gr_feats(sales_test, test=True)

# some values of item_name feature in test dataset can't be met in train dataset
# so we need to add them in values_count for test dataset to prevent error
train_vc = pd.concat([sales_train_gr['item_name_fixed'].value_counts(), 
           sales_test.loc[~sales_test['item_name_fixed'].isin(sales_train_gr['item_name_fixed']), 
                          'item_name_fixed'].value_counts()])
train_vc[len(sales_train_gr['item_name_fixed'].value_counts()):] = 0

sales_test.loc[train_vc[sales_test['item_name_fixed']].values < 15, 'item_name_fixed'] = 'RARE_VALUE'

CPU times: user 26.9 s, sys: 1.5 s, total: 28.4 s
Wall time: 28.4 s


Rename feature item_cnt_day to item_cnt_month 

In [125]:
sales_train_gr.rename({'item_cnt_day': 'item_cnt_month'}, axis=1, inplace=True)

# Use LightAutoML

import LAMA libraries

In [None]:
import os
import logging

from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML
from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.tasks import Task
import multiprocessing as mp

logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

In [None]:
N_THREADS = mp.cpu_count() - 1
N_FOLDS = 10
RANDOM_STATE = 42
TIMEOUT = 600
TARGET_NAME = 'item_cnt_month'


task = Task('reg', loss='mse', metric='mse')
roles = {
    'target': TARGET_NAME,
    'drop': ['shop_id', 'item_id', 'item_category_id', 'description']
}

In [None]:
automl = TabularUtilizedAutoML(task=task,
                               timeout=TIMEOUT,
                               cpu_limit=N_THREADS,
                               general_params={'use_algos':
                                                  [['lgb', 'cb'],
                                                   ['lgb_tuned'],
                                                   ]},
                               reader_params={'cv': N_FOLDS,
                                             'random_state': RANDOM_STATE,
                                             'n_jobs': N_THREADS}
                          ) 


oof_pred = automl.fit_predict(sales_train, roles=roles)
logging.info('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

In [None]:
test_pred = automl.predict(sales_test)
logging.info('Prediction for test data:\n{}\nShape = {}'
             .format(test_pred, test_pred.shape))

In [None]:
submission[TARGET_NAME] = test_pred.data[:, 0]
submission.head()

# Futher ideas

