In [1]:
import pandas as pd

In [80]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [81]:
def rmse(a, b):
    """Return the root-mean-squared error between ``a`` and ``b``.

    ``a`` is passed to ``mean_squared_error`` as the first (true values)
    argument and ``b`` as the second (predicted values); the square root of
    that mean squared error is returned.
    """
    mse = mean_squared_error(a, b)
    return np.sqrt(mse)

In [3]:
# Read the three CSV inputs (in this order) into `train`, `validation` and `test`.
train, validation, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/train/sales.csv",
        "../data/validation/sales.csv",
        "../data/test/test.csv",
    )
)

In [4]:
# Display the raw training frame; the rendered output elides the middle rows.
train

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-02-01,0,59,22154,999.00,1.0
1,2013-03-01,0,25,2552,899.00,1.0
2,2013-05-01,0,25,2552,899.00,-1.0
3,2013-06-01,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.00,1.0
...,...,...,...,...,...,...
2797945,2015-01-10,33,25,7640,4040.00,1.0
2797946,2015-05-10,33,25,7780,248.00,1.0
2797947,2015-04-10,33,25,7233,599.00,1.0
2797948,2015-03-10,33,25,7233,599.00,1.0


In [31]:
# Exploration only (result is displayed, not stored): keep rows with a positive
# daily count, parse `date`, and overwrite `date_block_num` with a "<year>-<month>"
# string built from the parsed date.
(
    train.query("item_cnt_day > 0")
    .assign(date=lambda sales: pd.to_datetime(sales["date"]))
    .assign(
        date_block_num=lambda sales: (
            sales["date"].dt.year.astype(str)
            + "-"
            + sales["date"].dt.month.astype(str)
        )
    )
)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-02-01,2013-2,59,22154,999.00,1.0
1,2013-03-01,2013-3,25,2552,899.00,1.0
3,2013-06-01,2013-6,25,2554,1709.05,1.0
4,2013-01-15,2013-1,25,2555,1099.00,1.0
5,2013-10-01,2013-10,25,2564,349.00,1.0
...,...,...,...,...,...,...
2797945,2015-01-10,2015-1,25,7640,4040.00,1.0
2797946,2015-05-10,2015-5,25,7780,248.00,1.0
2797947,2015-04-10,2015-4,25,7233,599.00,1.0
2797948,2015-03-10,2015-3,25,7233,599.00,1.0


In [59]:
# Smallest value of the `date_year` column in the aggregated training frame.
# NOTE(review): `train_df` is first assigned in a later cell
# (`train_df = aggregate_months(train)`), so this cell raises NameError when the
# notebook is run top-to-bottom on a fresh kernel.
train_df.date_year.min()

2013

In [62]:
def aggregate_months(df, base_year=2013, max_count=20):
    """Aggregate daily sales rows into monthly totals per shop and item.

    Rows whose ``item_cnt_day`` is not strictly positive are dropped, ``date``
    is parsed with ``pd.to_datetime``, and a month index is derived from the
    parsed date as ``(year - base_year) * 12 + month`` (January of
    ``base_year`` therefore maps to 1). Counts are summed per
    ``(shop_id, item_id, date_block_num)`` and the totals are clipped to
    ``[0, max_count]``.

    NOTE(review): the month index is recomputed from ``date`` and replaces any
    ``date_block_num`` column already present in ``df``. In the sample rows
    displayed earlier in this notebook the two disagree (e.g. a row dated
    2013-02-01 carries block 0) -- confirm that ``date`` was parsed with the
    right day/month order upstream.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date``, ``shop_id``, ``item_id`` and ``item_cnt_day``.
    base_year : int, default 2013
        Year whose January is assigned month index 1.
    max_count : int or float, default 20
        Upper bound applied to the monthly totals.

    Returns
    -------
    pandas.DataFrame
        Columns ``shop_id``, ``item_id``, ``date_block_num``, ``item_cnt_day``
        (monthly total, clipped), ``date_year`` and ``date_month``, sorted by
        ``date_block_num``, ``shop_id``, ``item_id``. The index is not reset
        after sorting.
    """
    monthly = (
        df.query("item_cnt_day > 0")
        .assign(date=lambda d: pd.to_datetime(d["date"]))
        .assign(
            date_month=lambda d: d["date"].dt.month,
            date_year=lambda d: d["date"].dt.year,
        )
        .assign(
            date_block_num=lambda d: (
                (d["date_year"] - base_year) * 12 + d["date_month"]
            ).astype(int),
        )
        .groupby(["shop_id", "item_id", "date_block_num"])
        # String aliases use pandas' built-in reductions: passing the builtin
        # `sum` is deprecated, and per-group Python lambdas are slow. Year and
        # month are constant within a group, so "first" is enough.
        .agg({"item_cnt_day": "sum", "date_year": "first", "date_month": "first"})
        .reset_index()
        .sort_values(["date_block_num", "shop_id", "item_id"])
    )
    return monthly.assign(item_cnt_day=monthly["item_cnt_day"].clip(0, max_count))

In [63]:
# Monthly-aggregated training data; the target is the clipped monthly count and
# the features are every remaining column.
train_df = aggregate_months(train)
y_train = train_df["item_cnt_day"]
X_train = train_df.drop(columns=["item_cnt_day"])

In [64]:
# Preview the first rows of the aggregated training frame.
train_df.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_day,date_year,date_month
2,0,32,1,4.0,2013,1
8,0,33,1,2.0,2013,1
12,0,35,1,2.0,2013,1
22,0,43,1,1.0,2013,1
25,0,51,1,2.0,2013,1


In [65]:
# Same aggregation and feature/target split as for training, applied to the
# validation sales.
val_df = aggregate_months(validation)
y_val = val_df["item_cnt_day"]
X_val = val_df.drop(columns=["item_cnt_day"])

In [66]:
# Disabled alternative model (random forest); not executed, the gradient-boosting
# regressor defined in a later cell is the one that is fitted.
# reg = RandomForestRegressor(n_jobs=-1, n_estimators=200)
# reg.fit(X_train, y_train)

In [67]:
# Gradient-boosting regressor with 200 boosting stages. `random_state` is pinned
# so that refitting on the same data gives the same model.
gb_reg = GradientBoostingRegressor(n_estimators=200, random_state=0)

In [68]:
# Fit the gradient-boosting model on the aggregated monthly training features.
gb_reg.fit(X_train, y_train)

GradientBoostingRegressor(n_estimators=200)

In [75]:
# Predict monthly counts for the validation features and clip them to [0, 20],
# the same range aggregate_months applies to the targets.
prediction = np.clip(gb_reg.predict(X_val), 0, 20)

In [77]:
# Build the evaluation frame: one row per (shop_id, item_id) pair listed in the
# test file, left-joined with the validation actuals and predictions. Pairs that
# have no validation row get 0 for both the actual count and the prediction.
# NOTE(review): an earlier cell loads "../data/test/test.csv" into `test`, while
# this cell reads "../data/test.csv" -- confirm which path is intended.
test_range = pd.read_csv("../data/test.csv", index_col=["shop_id", "item_id"])
target_df = (
    test_range
    .join(
        val_df.assign(prediction=prediction).set_index(["shop_id", "item_id"]),
        how="left",
    )
    .assign(
        # Calendar fields are overwritten with fixed values for every row.
        date_year=2015,
        date_month=9,
        date_block_num=(24 + 9),
        item_cnt_day=lambda joined: joined["item_cnt_day"].fillna(0),
        prediction=lambda joined: joined["prediction"].fillna(0),
    )
)

In [82]:
# RMSE between the zero-filled actual monthly counts and the zero-filled
# predictions over all rows of target_df.
rmse(target_df.item_cnt_day, target_df.prediction)

0.6759055848292221