In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
import itertools
import lightgbm as lgb

In [66]:
# Folder holding the input CSVs. Paths below are built by plain string
# concatenation, so the trailing slash is required.
DIRECTORY = './data/'

# Tables read with default parsing.
shops = pd.read_csv(DIRECTORY + 'shops.csv')
items = pd.read_csv(DIRECTORY + 'items.csv')
categories = pd.read_csv(DIRECTORY + 'item_categories.csv')
test = pd.read_csv(DIRECTORY + 'test.csv')

# Sales history: `date` is parsed with an explicit day.month.year format.
train = pd.read_csv(
    DIRECTORY + 'sales_train.csv',
    parse_dates=['date'],
    date_format='%d.%m.%Y',
)
# Monthly Period derived from `date`; later cells filter and group on it.
train['month'] = train['date'].dt.to_period('M')

In [67]:
# Final submission
# Work on a copy of the test rows and collect the distinct shop and item ids
# that appear in it (in order of first appearance).
submission = test.copy()
submission_stores = pd.unique(submission['shop_id'])
submission_items = pd.unique(submission['item_id'])

In [68]:
# 1. New Items - predict that category's avg in the first month
# Items present in the test set but never seen in the training data.
new_items = np.setdiff1d(test['item_id'].unique(), train['item_id'].unique())

# Earliest month in which each item appears, attached to every train row.
first_month_map = train.groupby(['item_id'])['month'].min()
train['first_month'] = train['item_id'].map(first_month_map)
# NOTE: this overwrites `train['item_cnt_day']` in place, so every later cell
# that reads `train` sees daily counts limited to the range [0, 20].
train['item_cnt_day'] = train['item_cnt_day'].clip(0, 20)

# Per (item, shop): total units recorded during the item's first month.
in_first_month = train['month'] == train['first_month']
first_month_sales = (
    train.loc[in_first_month]
    .groupby(['item_id', 'shop_id'], as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'first_month_sales'})
)
# Attach the item's category (the second merge is an inner join on category id).
first_month_sales = (
    first_month_sales
    .merge(items, on='item_id', how='left')
    .merge(categories, on='item_category_id')
)
first_month_sales['first_month_sales'] = first_month_sales['first_month_sales'].clip(0, 20)

# Average first-month total per (category, shop), limited to [0, 20].
by_category = (
    first_month_sales
    .groupby(['item_category_id', 'shop_id'], as_index=False)['first_month_sales']
    .mean()
    .rename(columns={'first_month_sales': 'prediction_category'})
)
by_category['prediction_category'] = by_category['prediction_category'].clip(0, 20)

# put into submission
# One row per (new item, submission shop); pairs whose (category, shop) has no
# average available end up with 0 through the fillna.
new_item_shop_pairs = pd.DataFrame(
    data=itertools.product(new_items, submission_stores),
    columns=['item_id', 'shop_id'],
)
new_items_result = (
    new_item_shop_pairs
    .merge(items[['item_id', 'item_category_id']], on='item_id', how='left')
    .merge(by_category, on=['item_category_id', 'shop_id'], how='left')
    .fillna(0)
    .loc[:, ['item_id', 'shop_id', 'prediction_category']]
    .rename(columns={'prediction_category': 'item_cnt_month'})
)
new_items_result

Unnamed: 0,item_id,shop_id,item_cnt_month
0,83,5,2.525735
1,83,4,1.571195
2,83,6,2.592036
3,83,3,1.694639
4,83,2,1.363636
...,...,...,...
15241,22137,46,2.483843
15242,22137,41,1.743869
15243,22137,44,2.268916
15244,22137,39,1.307692


In [69]:
# 2. Old Items - predict the same shop/item combo the previous month

# Submission items that also occur in the training data.
old_items = np.intersect1d(submission_items, train['item_id'].unique())

# Rows from October 2015, summed per (shop, item) into a Series with a
# (shop_id, item_id) index.
last_month = train.loc[train['month'] == pd.Period('2015-10', freq='M')]
last_month_aggregated = last_month.groupby(['shop_id', 'item_id'])['item_cnt_day'].sum()

# One row per (old item, submission shop); pairs with no October 2015 rows
# get 0 through the fillna.
old_items_result = (
    pd.DataFrame(
        data=itertools.product(old_items, submission_stores),
        columns=['item_id', 'shop_id'],
    )
    .merge(last_month_aggregated, on=['shop_id', 'item_id'], how='left')
    .fillna(0)
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
# Limit the monthly prediction to the range [0, 20].
old_items_result['item_cnt_month'] = old_items_result['item_cnt_month'].clip(0, 20)
old_items_result

Unnamed: 0,item_id,shop_id,item_cnt_month
0,30,5,0.0
1,30,4,0.0
2,30,6,0.0
3,30,3,0.0
4,30,2,0.0
...,...,...,...
198949,22167,46,0.0
198950,22167,41,0.0
198951,22167,44,0.0
198952,22167,39,0.0


In [70]:
# Stack the new-item and old-item predictions into one table keyed by
# (item_id, shop_id).
submission = pd.concat([new_items_result, old_items_result])

# Left-join onto `test` so every test row is kept, in test order.
# `validate='many_to_one'` raises pandas.errors.MergeError if a
# (shop_id, item_id) pair carries more than one prediction; with the default
# inner merge such duplicates would silently repeat test IDs in the output.
submission_rows = test.merge(
    submission,
    on=['shop_id', 'item_id'],
    how='left',
    validate='many_to_one',
)

# Fail loudly instead of writing a file with missing rows: the default inner
# merge would have dropped these test rows without any error.
missing_predictions = int(submission_rows['item_cnt_month'].isna().sum())
if missing_predictions:
    raise ValueError(
        f'{missing_predictions} test rows have no prediction; submission.csv was not written'
    )

submission_rows[['ID', 'item_cnt_month']].to_csv('submission.csv', index=False)