In [3]:
import datetime
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import matplotlib

In [4]:
# Default size (width, height) applied to every figure created in this notebook.
matplotlib.rcParams.update({'figure.figsize': [20, 10]})

In [5]:
# Load the raw daily sales records.
sales = pd.read_csv("data/competitive-data-science-predict-future-sales/sales_train.csv.gz")
# Dates are stored as day.month.year text; parse the whole column in one
# vectorized call instead of running strptime row by row through .apply().
sales['date'] = pd.to_datetime(sales['date'], format='%d.%m.%Y')

In [63]:
# Total units sold per (date, item_id) pair, returned as a flat frame with
# columns date, item_id, item_cnt_day.
total_daily_item_sales = (
    sales
    .groupby(['date', 'item_id'])['item_cnt_day']
    .sum()
    .reset_index()
)

In [120]:
# Roll the daily per-item totals up to calendar months.
# - The input is `total_daily_item_sales` (defined above); the previous name,
#   `total_daily_shop_sales`, is not defined anywhere in this notebook and
#   fails on a fresh kernel.
# - The grouping keys are passed as a list: a tuple `by` is deprecated and is
#   treated as one single key by newer pandas versions.
# - freq="M" labels each bucket with the month-end date, which is what the
#   later train/dev/test filters compare against (e.g. '2015-10-31').
total_monthly_item_sales = (
    total_daily_item_sales
    .groupby([pd.Grouper(key='date', freq="M"), pd.Grouper(key='item_id')])[["item_cnt_day"]]
    .sum()
    .reset_index()
    .rename(columns={"item_cnt_day": "item_cnt_month"})
)
total_monthly_item_sales.shape

(233912, 3)

In [124]:
# Build the complete (month, item) grid so that item/month combinations with
# no recorded sales still get a row.
dates = total_monthly_item_sales['date'].unique()
item_ids = total_monthly_item_sales['item_id'].unique()

# Vectorized Cartesian product: the first level (date) varies slowest, so the
# row order matches a nested "for date ... for item_id ..." loop without
# materializing a Python list of several hundred thousand pairs.
complete_df = (
    pd.MultiIndex
    .from_product([dates, item_ids], names=["date", "item_id"])
    .to_frame(index=False)
)
complete_df.shape

(741438, 2)

In [126]:
# Left-join the observed monthly totals onto the full grid (the join uses the
# columns the two frames share); grid rows without a match come back as NaN
# and are replaced by 0.
complete_df = pd.merge(complete_df, total_monthly_item_sales, how="left")
complete_df = complete_df.fillna(0)

In [129]:
def add_lag(df, lag, col_name='item_cnt_month'):
    """Attach one lagged copy of ``col_name`` to ``df``, computed per item.

    The new column is named ``"<col_name>-<lag>"`` and holds, for every row,
    the value of ``col_name`` found ``lag`` rows earlier within the same
    ``item_id`` group (NaN where no such row exists).

    The shift is positional, so rows of each item must already be in
    chronological order. ``df`` is modified in place and also returned.
    """
    lag_col = "-".join([col_name, str(lag)])
    per_item_values = df.groupby('item_id')[col_name]
    df[lag_col] = per_item_values.shift(lag)
    return df

def add_lags(df, lags, col_name='item_cnt_month'):
    """Apply ``add_lag`` once for every offset in ``lags``.

    Each call adds one ``"<col_name>-<lag>"`` column; the frame returned by
    the last call is handed back to the caller (``df`` itself is returned
    unchanged when ``lags`` is empty).
    """
    lagged = df
    for offset in lags:
        lagged = add_lag(lagged, offset, col_name)
    return lagged

# Lag features for the previous 1..12 rows of each item:
# columns item_cnt_month-1 ... item_cnt_month-12.
complete_df = add_lags(complete_df, lags=range(1, 13))

In [133]:
# Keep only rows dated after 2013-12-31.
# NOTE(review): presumably this drops the first year because its 12-month lag
# columns are incomplete — confirm that the data starts in January 2013.
Xy = complete_df.loc[lambda frame: frame['date'] > '2013-12-31']

def separate_X_y(Xy, sales_col='item_cnt_month'):
    """Split a combined frame into features and target.

    Returns ``(X, y)`` where ``y`` is the ``sales_col`` column and ``X`` is
    every other column except ``date`` (dropped only when it is present).
    The input frame is not modified.
    """
    target = Xy[sales_col]
    features = Xy.drop(columns=sales_col)
    # 'date' is only used to pick rows for the split; it is not a feature.
    features = features.drop(columns='date', errors='ignore')
    return features, target
        
def train_dev_test_split(Xy, test_date='2015-10-31', dev_date='2015-09-30',
                         train_end='2015-08-31'):
    """Split ``Xy`` chronologically into train, dev and test sets.

    Parameters
    ----------
    Xy : DataFrame
        Combined features and target with a ``date`` column.
    test_date : str, default '2015-10-31'
        Rows whose ``date`` equals this value form the test set.
    dev_date : str, default '2015-09-30'
        Rows whose ``date`` equals this value form the dev set.
    train_end : str, default '2015-08-31'
        Rows whose ``date`` is on or before this value form the training set.

    Returns
    -------
    tuple
        ``(X_train, X_dev, X_test, y_train, y_dev, y_test)``, each pair
        produced by ``separate_X_y``.

    The defaults reproduce the previously hard-coded split, so existing
    ``train_dev_test_split(Xy)`` calls behave exactly as before.
    """
    # '@name' lets DataFrame.query read the local variables defined above.
    X_test, y_test = separate_X_y(Xy.query("date == @test_date"))
    X_dev, y_dev = separate_X_y(Xy.query("date == @dev_date"))
    X_train, y_train = separate_X_y(Xy.query("date <= @train_end"))
    return X_train, X_dev, X_test, y_train, y_dev, y_test

# Materialize the three chronological splits used by the models below.
(
    X_train, X_dev, X_test,
    y_train, y_dev, y_test,
) = train_dev_test_split(Xy)

In [141]:
from sklearn.linear_model import LinearRegression

# Linear-regression baseline. item_id is dropped, so only the remaining
# columns of X (the lag columns) are used as features.
reg = LinearRegression().fit(X_train.drop(columns="item_id"), y_train)
y_dev_hat = reg.predict(X_dev.drop(columns="item_id"))
# RMSE on the dev month.
np.sqrt(mean_squared_error(y_dev, y_dev_hat))

28.36079762371046

In [138]:
import warnings
# Silence every FutureWarning for the rest of the session.
# NOTE(review): presumably aimed at deprecation notices triggered by the
# default RandomForestRegressor settings — confirm, and prefer setting the
# affected parameters explicitly over hiding the warning.
warnings.filterwarnings(action="ignore", category=FutureWarning)


# Random forest on the same inputs as the linear model (item_id dropped);
# the fixed random_state makes the fit repeatable.
reg_rf = RandomForestRegressor(random_state=667).fit(X_train.drop(columns="item_id"), y_train)
y_dev_hat = reg_rf.predict(X_dev.drop(columns="item_id"))
# RMSE on the dev month.
np.sqrt(mean_squared_error(y_dev, y_dev_hat))

29.333380817161505