In [None]:
import numpy as np
import pandas as pd
import scipy
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from scipy.optimize import curve_fit
from fbprophet import Prophet
from fbprophet import Prophet
from scipy.stats import boxcox
from multiprocessing import Pool, cpu_count
import tqdm
from xgboost import XGBRegressor
from xgboost import plot_importance
from gensim.models import Word2Vec 
sns.set(style="darkgrid")

%matplotlib inline

In [None]:
# Read the five input CSV files (paths relative to the working directory)
# into DataFrames, one per variable, in the order listed.
items, item_cat, shops, sales_tr, test_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'items.csv',
        'item_categories.csv',
        'shops.csv',
        'sales_train_v2.csv',
        'test.csv',
    )
)

In [None]:
# Report shop ids and item ids that occur in only one of train / test.
train_shop_ids = set(sales_tr['shop_id'])
test_shop_ids = set(test_df['shop_id'])
print('shops in train not in test is', train_shop_ids - test_shop_ids)
print('shops in test not in train is ', test_shop_ids - train_shop_ids)

train_item_ids = set(sales_tr['item_id'])
test_item_ids = set(test_df['item_id'])
val1 = train_item_ids - test_item_ids   # items sold in train but absent from test
val2 = test_item_ids - train_item_ids   # items in test that never appear in train
# Only a 10-element sample of the train-only items is printed; the counts cover all of them.
print('items in train not in test is', list(val1)[:10])
print('count of items in train not in test is', len(val1))
print('count of items in test not in train is', len(val2))


In [None]:
# Number of days in each calendar month, keyed by zero-based month index
# (0 = January ... 11 = December). February is fixed at 28 days (no leap-year handling).
# Index 11 (December) has 31 days.
days_per_month = pd.Series({
    0: 31, 1: 28, 2: 31, 3: 30, 4: 31, 5: 30,
    6: 31, 7: 31, 8: 30, 9: 31, 10: 30, 11: 31,
})

In [None]:
# Discard outlier rows: keep only records whose unit price is below 10000
# and whose daily sold count is below 600.
price_ok = sales_tr['item_price'] < 10000
count_ok = sales_tr['item_cnt_day'] < 600
sales_tr = sales_tr[price_ok & count_ok]

In [None]:
# Rewrite selected shop ids to another id, identically in train and test
# (presumably these are duplicate listings of the same shop - confirm against shops.csv).
shop_id_remap = (
    (0, 57),
    (1, 58),    # Yakutsk, "Tsentralny" shopping center
    (10, 11),   # Zhukovsky, Chkalova St. 39 m²
)
for old_shop_id, new_shop_id in shop_id_remap:
    for frame in (sales_tr, test_df):
        frame.loc[frame['shop_id'] == old_shop_id, 'shop_id'] = new_shop_id

In [None]:
# Aggregate daily sales to one row per (month block, item, shop):
#   mean_price     - mean of item_price over the month's records
#   count_prices   - number of price records in the month
#   item_cnt_month - sum of item_cnt_day
# A list of functions is used instead of a nested {new_name: func} dict: nested
# renaming dicts were removed from groupby.agg in pandas 1.0 and raise
# SpecificationError there. The resulting (MultiIndex) columns come out in the
# same order - group keys, price mean, price count, count sum - and are
# flattened by the positional assignment below.
matrix = sales_tr.groupby(['date_block_num', 'item_id', 'shop_id'], as_index=False).agg(
    {'item_price': ['mean', 'count'], 'item_cnt_day': 'sum'})
matrix.columns = ['date_block_num', 'item_id', 'shop_id', 'mean_price', 'count_prices', 'item_cnt_month']
# Negative monthly totals are floored at zero.
matrix.loc[matrix['item_cnt_month'] < 0, 'item_cnt_month'] = 0
# Train rows get the placeholder ID -1.
matrix['ID'] = -1


In [None]:
matrix.shape

In [None]:
# Give the test rows a placeholder target of 0 and assign them to month block 34
# so they can be stacked under the monthly training matrix.
test_df = test_df.assign(item_cnt_month=0, date_block_num=34)

In [None]:
test_df.shape

In [None]:
# Stack the test rows beneath the monthly training rows. sort=True sorts the column
# axis of the result; a column present in only one of the two frames is NaN for
# the other frame's rows. The original row labels are kept (no index reset).
frames_to_stack = [matrix, test_df]
matrix = pd.concat(frames_to_stack, axis='index', sort=True)

In [None]:
matrix.shape

In [None]:
# Derive an integer city code per shop from the first space-separated word of shop_name.
shops = shops.sort_values(by=['shop_name'])
first_word = shops['shop_name'].str.split(' ').str.get(0)
# One spelling carries a leading '!' - map it onto the plain city name before encoding.
city_names = first_word.where(first_word != u'!Якутск', u'Якутск')
# factorize numbers the cities in order of first appearance (i.e. in shop_name sort order).
shops['city'] = city_names.factorize()[0]

In [None]:
# Split each category name on '-' and integer-encode the first and second parts
# as two category levels; then replace the name itself by an integer code.
item_cat = item_cat.sort_values(by=['item_category_name'])
name_parts = item_cat['item_category_name'].str.split('-')

for level_column, part_position in (('item_cat_level1', 0), ('item_cat_level2', 1)):
    # Names without a part at this position yield NaN, which factorize encodes as -1.
    item_cat[level_column] = name_parts.str.get(part_position).factorize()[0]

item_cat['item_category_name'] = item_cat['item_category_name'].factorize()[0]

In [None]:
matrix.shape

In [None]:
# Left-join the shop, item and item-category lookup tables onto the matrix,
# printing the shape before the first join and after every join so that an
# unexpected change in row count is visible.
print(matrix.shape)

lookup_joins = (
    (shops, 'shop_id'),
    (items, 'item_id'),
    (item_cat, 'item_category_id'),
)
for lookup_table, join_key in lookup_joins:
    matrix = matrix.merge(lookup_table, on=join_key, how='left')
    print(matrix.shape)

# Zero-based month index derived from the running month block, and the
# number of days looked up for that month.
matrix['month'] = matrix['date_block_num'] % 12
matrix['days_per_month'] = matrix['month'].map(days_per_month)

In [None]:
matrix.info()

In [None]:

# Cast columns to explicit (mostly narrower) dtypes, one column at a time.
# NOTE(review): item_category_id is cast to float16 and later used as a groupby /
# merge key; confirm the installed pandas version handles float16 keys.
column_dtypes = {
    'ID': 'int64',
    'date_block_num': 'uint8',
    'item_cnt_month': 'int16',
    'item_id': 'int16',
    'shop_id': 'int16',
    'item_category_id': 'float16',
    'days_per_month': 'float16',
    'item_cat_level1': 'float16',
    'item_cat_level2': 'float16',
}
for column_name, target_dtype in column_dtypes.items():
    matrix[column_name] = matrix[column_name].astype(target_dtype)


In [None]:
# Mean-encode shop_id and item_id from the total sales of every (item, shop) pair.
#
# Only training months (date_block_num < 33, the same cutoff the notebook uses for
# X_train) feed the statistics. Computing them over the whole matrix would fold the
# validation month's targets into a feature (target leakage, optimistic validation
# RMSE / early stopping) and would also count the test rows' placeholder target of 0
# as real observations. Ids that never occur in training map to NaN.
train_rows = matrix[matrix['date_block_num'] < 33]

# construct item_id x shop_id sales matrix (rows: item_id, columns: shop_id)
sales_matrix = train_rows.groupby(['item_id', 'shop_id'])['item_cnt_month'].sum().unstack('shop_id')

# "_na" variants average only over pairs that exist (NaN cells are skipped);
# "_z" variants treat a missing pair as zero sales.
shop_id_sales_me_na = sales_matrix.mean(axis=0)
shop_id_sales_me_z = sales_matrix.fillna(0).mean(axis=0)
item_id_sales_me_na = sales_matrix.mean(axis=1)
item_id_sales_me_z = sales_matrix.fillna(0).mean(axis=1)


# mean encoding for shop id based on sales
matrix['m_shop_id_sales_me_na'] = matrix['shop_id'].map(shop_id_sales_me_na)
matrix['m_shop_id_sales_me_z'] = matrix['shop_id'].map(shop_id_sales_me_z)

# mean encoding for item_id based on sales
matrix['m_item_id_sales_me_na'] = matrix['item_id'].map(item_id_sales_me_na)
matrix['m_item_id_sales_me_z'] = matrix['item_id'].map(item_id_sales_me_z)

# Release the intermediates; they are not needed once the features are mapped.
del train_rows
del sales_matrix
del shop_id_sales_me_na
del shop_id_sales_me_z
del item_id_sales_me_na
del item_id_sales_me_z

In [None]:
def lag_feature(df, lags, col):
    """Append lagged copies of ``col`` to ``df``.

    For every lag ``k`` in ``lags`` a column ``<col>_lag_<k>`` is added that
    holds, for each (date_block_num, shop_id, item_id) row, the value ``col``
    had ``k`` month blocks earlier for the same shop and item. Rows without a
    matching earlier record get NaN.

    Parameters
    ----------
    df : DataFrame with columns date_block_num, shop_id, item_id and ``col``.
    lags : iterable of int month offsets.
    col : name of the column to lag.

    Returns
    -------
    A new DataFrame (left-merge result) with one extra column per lag.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    base = df[keys + [col]]
    for lag in lags:
        # Moving the month block forward by `lag` makes the record from month t
        # line up with the row for month t + lag in the merge below.
        lagged = base.assign(date_block_num=base['date_block_num'] + lag)
        lagged.columns = keys + [col + '_lag_' + str(lag)]
        df = df.merge(lagged, on=keys, how='left')
    return df

In [None]:
# Add the monthly target lagged by 1, 2, 3, 6 and 12 month blocks per (shop, item).
target_lags = [1, 2, 3, 6, 12]
matrix = lag_feature(matrix, target_lags, 'item_cnt_month')

In [None]:
matrix.info()

In [None]:
# Mean monthly target per (month block, shop), joined back onto every row and then lagged.
# NOTE(review): the un-lagged mean_shop_sales column is built from the same month's
# target - make sure it is not fed to the model as a feature.
group_keys = ['date_block_num', 'shop_id']
group = (
    matrix.groupby(group_keys, as_index=False)['item_cnt_month']
    .mean()
    .rename(columns={'item_cnt_month': 'mean_shop_sales'})
)
matrix = matrix.merge(group, on=group_keys, how='left')
print(matrix.shape)

matrix = lag_feature(matrix, [1, 2, 3, 6], 'mean_shop_sales')
print(matrix.shape)

In [None]:
# Mean monthly target per (month block, item category), joined back onto every row and then lagged.
# NOTE(review): the un-lagged mean_item_cat_sales column is built from the same month's
# target - make sure it is not fed to the model as a feature.
group_keys = ['date_block_num', 'item_category_id']
group = (
    matrix.groupby(group_keys, as_index=False)['item_cnt_month']
    .mean()
    .rename(columns={'item_cnt_month': 'mean_item_cat_sales'})
)
matrix = matrix.merge(group, on=group_keys, how='left')
print(matrix.shape)

matrix = lag_feature(matrix, [1, 2, 3, 6], 'mean_item_cat_sales')
print(matrix.shape)

In [None]:
# Mean monthly target per (month block, item), joined back onto every row and then lagged.
# NOTE(review): the un-lagged mean_item_sales column is built from the same month's
# target - make sure it is not fed to the model as a feature.
group_keys = ['date_block_num', 'item_id']
group = (
    matrix.groupby(group_keys, as_index=False)['item_cnt_month']
    .mean()
    .rename(columns={'item_cnt_month': 'mean_item_sales'})
)
matrix = matrix.merge(group, on=group_keys, how='left')
print(matrix.shape)

matrix = lag_feature(matrix, [1, 2, 3, 6], 'mean_item_sales')
print(matrix.shape)

In [None]:
# Mean monthly target per (month block, item, city), joined back onto every row and then lagged.
# NOTE(review): the un-lagged mean_item_city_sales column is built from the same month's
# target - make sure it is not fed to the model as a feature.
group_keys = ['date_block_num', 'item_id', 'city']
group = (
    matrix.groupby(group_keys, as_index=False)['item_cnt_month']
    .mean()
    .rename(columns={'item_cnt_month': 'mean_item_city_sales'})
)
matrix = matrix.merge(group, on=group_keys, how='left')
print(matrix.shape)

matrix = lag_feature(matrix, [1, 2, 3, 6], 'mean_item_city_sales')
print(matrix.shape)

In [None]:
# Mean monthly target per (month block, shop, city), joined back onto every row and then lagged.
# NOTE(review): city is joined in from the shops table by shop_id, so if every shop has
# exactly one city this grouping yields the same values as the per-(month, shop) mean;
# the name mean_item_shop_sales suggests a different key was intended - confirm.
# NOTE(review): the un-lagged column is built from the same month's target - make sure
# it is not fed to the model as a feature.
group_keys = ['date_block_num', 'shop_id', 'city']
group = (
    matrix.groupby(group_keys, as_index=False)['item_cnt_month']
    .mean()
    .rename(columns={'item_cnt_month': 'mean_item_shop_sales'})
)
matrix = matrix.merge(group, on=group_keys, how='left')
print(matrix.shape)

matrix = lag_feature(matrix, [1, 2, 3, 6], 'mean_item_shop_sales')
print(matrix.shape)

In [None]:
# Remove the name columns carried in by the lookup joins; only their derived
# numeric encodings are kept.
name_columns = ['shop_name', 'item_name', 'item_category_name']
matrix = matrix.drop(columns=name_columns)

In [None]:
def fill_na(df):
    """Replace NaN with 0 in every lagged item-count column of ``df``.

    A column is treated as a lagged item-count column when its name contains
    both ``'_lag_'`` and ``'item_cnt'``. All other columns, including other
    lag features, are left untouched.

    Parameters
    ----------
    df : DataFrame whose column names are strings.

    Returns
    -------
    The same DataFrame object, modified in place.
    """
    for col in df.columns:
        # Cheap name checks first; the null scan only runs for matching columns.
        is_item_cnt_lag = '_lag_' in col and 'item_cnt' in col
        if is_item_cnt_lag and df[col].isnull().any():
            # Assign the filled column back instead of calling
            # df[col].fillna(..., inplace=True): that chained in-place call acts on
            # a temporary and does not update df under pandas Copy-on-Write.
            df[col] = df[col].fillna(0)
    return df

# Replace NaN with 0 in the lagged item_cnt columns of the feature matrix.
matrix = fill_na(matrix)

In [None]:
# Time-based split: month blocks < 33 train, block 33 validates, block 34 is the test month.
#
# Besides the target itself, same-month columns are excluded from the features:
#   * mean_*_sales are averages of item_cnt_month over the row's own month, i.e. they
#     are computed from the target being predicted (and from the placeholder 0 for
#     the test month). Their *_lag_* versions are kept.
#   * mean_price / count_prices are aggregated from the training sales records only,
#     so they are not available for the test rows (assuming test.csv carries no such
#     columns - confirm), and count_prices counts the month's sale records.
same_month_columns = [
    'mean_price', 'count_prices',
    'mean_shop_sales', 'mean_item_cat_sales', 'mean_item_sales',
    'mean_item_city_sales', 'mean_item_shop_sales',
]
# Only drop what is actually present so the cell keeps working if a feature cell is skipped.
non_feature_columns = ['item_cnt_month'] + [c for c in same_month_columns if c in matrix.columns]

is_train = matrix['date_block_num'] < 33
is_valid = matrix['date_block_num'] == 33
is_test = matrix['date_block_num'] == 34

X_train = matrix[is_train].drop(columns=non_feature_columns)
Y_train = matrix.loc[is_train, 'item_cnt_month']
X_valid = matrix[is_valid].drop(columns=non_feature_columns)
Y_valid = matrix.loc[is_valid, 'item_cnt_month']
X_test = matrix[is_test].drop(columns=non_feature_columns)

In [None]:
# Gradient-boosted regressor: up to 1000 trees of depth 8, with row and column
# subsampling at 0.8 and a fixed seed.
xgb_params = dict(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42,
)
model = XGBRegressor(**xgb_params)

# RMSE is tracked on both the training set and the month-33 validation set;
# early stopping is set to 10 rounds.
# NOTE(review): recent xgboost releases expect eval_metric / early_stopping_rounds on
# the constructor rather than fit() - confirm against the installed version.
evaluation_sets = [(X_train, Y_train), (X_valid, Y_valid)]
model.fit(
    X_train,
    Y_train,
    eval_metric="rmse",
    eval_set=evaluation_sets,
    verbose=True,
    early_stopping_rounds=10,
)
