In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import itertools


In [3]:
DIRECTORY = './data/'
categories = pd.read_csv(DIRECTORY + 'item_categories.csv')
items = pd.read_csv(DIRECTORY + 'items.csv')
shops = pd.read_csv(DIRECTORY + 'shops.csv')
train = pd.read_csv(DIRECTORY + 'sales_train.csv', parse_dates=['date'], date_format="%d.%m.%Y")
test = pd.read_csv(DIRECTORY + 'test.csv')
train['year-month'] = train['date'].dt.to_period("M")

# clean code

In [6]:
# get dfs
date = train['year-month'].unique()[0]
train_data = train[train['year-month'] == date]
test_data = train[train['year-month'] == (date+1)]

aggregated_train = train_data.groupby(by=['shop_id', 'item_id'])['item_cnt_day'].sum().to_frame('train')
aggregated_test = test_data.groupby(by=['shop_id', 'item_id'])['item_cnt_day'].sum().to_frame('test')

alive_shops = np.intersect1d(train_data['shop_id'].unique(), test_data['shop_id'].unique())
train_items = train_data[train_data['shop_id'].isin(alive_shops)]['item_id'].unique()
test_items = test_data[test_data['shop_id'].isin(alive_shops)]['item_id'].unique()

alive_items = np.intersect1d(train_items, test_items)
all_combinations = pd.DataFrame(
    list(itertools.product(alive_shops, alive_items)),
    columns=['shop_id', 'item_id']
)

train_final = all_combinations.merge(aggregated_train, on=['shop_id', 'item_id'], how='left').fillna(0)
test_final = all_combinations.merge(aggregated_test, on=['shop_id', 'item_id'], how='left').fillna(0)

In [7]:
together = pd.merge(train_final, test_final, on=['shop_id', 'item_id'])
np.mean((together['train'] - together['test'])**2)

1.786957916345907

# reproduce 2

In [110]:
# get dfs
testing_df = train.set_index("year-month")
date = testing_df.index.unique()[0]
train_data = testing_df.loc[date]
test_data = testing_df.loc[date+1]

aggregated_train = train_data.groupby(by=[train_final.index, 'shop_id', 'item_id'], dropna=False)['item_cnt_day'].sum().reset_index()
aggregated_test = test_data.groupby(by=[train_final.index, 'shop_id', 'item_id'], dropna=False)['item_cnt_day'].sum().reset_index()

alive_shops = np.intersect1d(train_data['shop_id'].unique(), test_data['shop_id'].unique())

train_items = train_data['item_id'].unique()
test_items = test_data['item_id'].unique()
alive_items = np.intersect1d(train_items, test_items)

all_combinations = pd.DataFrame(
    list(itertools.product(alive_shops, alive_items)),
    columns=['shop_id', 'item_id']
)

train_combined = all_combinations.merge(train_data.reset_index(), on=['shop_id', 'item_id'], how='left').set_index('year-month')
test_combined = all_combinations.merge(test_data.reset_index(), on=['shop_id', 'item_id'], how='left').set_index('year-month')

In [119]:
train_combined

Unnamed: 0_level_0,shop_id,item_id,date,date_block_num,item_price,item_cnt_day
year-month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NaT,0,27,NaT,,,
NaT,0,28,NaT,,,
2013-01,0,32,2013-01-03,0.0,221.0,2.0
2013-01,0,32,2013-01-21,0.0,221.0,2.0
2013-01,0,32,2013-01-25,0.0,221.0,1.0
...,...,...,...,...,...,...
2013-01,59,22151,2013-01-10,0.0,399.0,1.0
2013-01,59,22151,2013-01-02,0.0,399.0,1.0
2013-01,59,22154,2013-01-02,0.0,999.0,1.0
NaT,59,22160,NaT,,,


In [111]:
# fit
train_final = train_combined.drop(columns=["item_price", 'date', 'date_block_num'])
train_final = train_final.groupby(by=[train_final.index, 'shop_id', 'item_id'], dropna=False)['item_cnt_day'].sum().reset_index()

test = test_combined.drop(columns=["item_price", 'date', 'date_block_num'])
test = test.groupby(by=[test.index, 'shop_id', 'item_id'], dropna=False)['item_cnt_day'].sum().reset_index()

In [112]:
together = pd.merge(train_final, test, on=['shop_id', 'item_id'])
np.mean((together['item_cnt_day_x'] - together['item_cnt_day_y'])**2)

1.7862678144888504