In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split


In [2]:
from sklearn.metrics import mean_squared_error

def rmse(y_te, y_p):
    """Return the root mean squared error between `y_te` and `y_p`.

    Both arguments are passed to `mean_squared_error` unchanged and in that
    order; the square root of its result is returned.
    """
    mse = mean_squared_error(y_te, y_p)
    return np.sqrt(mse)

In [3]:
# Load the raw sales records. Later cells reference the columns date,
# date_block_num, shop_id, item_id, item_price and item_cnt_day of this frame.
# NOTE(review): the path is relative, so this presumably expects the notebook
# to be run from the project root — confirm.
train = pd.read_csv('data/technical/sales_train.csv')

In [4]:
# Keys that identify one row of the aggregated table.
group_on = ['date_block_num', 'shop_id', 'item_id']

# Total of item_cnt_day within every key combination (indexed by the keys).
summed = train.groupby(group_on)[['item_cnt_day']].sum()

# The raw rows without their own item_cnt_day, so the merge below can bring in
# the remaining columns without clashing with the summed column.
other_cols = train.drop(columns='item_cnt_day')

train = (
    summed
    # attach the remaining raw columns to each total
    .merge(other_cols, left_index=True, right_on=group_on, how='left')
    # the merge yields one row per raw record; keep only the first per key
    .drop_duplicates(subset=group_on)
    # the summed column is now the per-date_block_num count
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
    .drop(columns='date')
    # cap the target to the range [0, 20]
    .assign(item_cnt_month=lambda monthly: monthly['item_cnt_month'].clip(0, 20))
)

# Leave only `group_on` and the rebuilt `train` behind.
del summed, other_cols

In [5]:
from itertools import product

# Build one grid per month: every shop seen in that month crossed with every
# item seen in that month. The months are taken from the data itself rather
# than from a hard-coded range(34): a month with no rows would otherwise
# produce an empty array that np.vstack cannot stack, and months beyond the
# hard-coded range would be silently skipped. groupby yields the months in
# ascending order, so for data covering months 0..33 the result is unchanged.
monthly_grids = [
    np.array(list(product(
        [month], this_month['shop_id'].unique(), this_month['item_id'].unique()
    )))
    for month, this_month in train.groupby('date_block_num')
]

# vstack == stack arrays row-wise
matrix = pd.DataFrame(np.vstack(monthly_grids), columns=group_on)
del monthly_grids

# Attach the known monthly counts; shop/item pairs with no sales row in a
# month come back as NaN from the left join and are counted as 0.
matrix = pd.merge(matrix, train, how='left', on=group_on)
matrix['item_cnt_month'] = matrix['item_cnt_month'].fillna(0)

# item_price is not used as a feature.
matrix = matrix.drop(columns='item_price')

In [6]:
# Label: the item_cnt_month column. Features: every remaining column of matrix.
Y = matrix['item_cnt_month']
X = matrix.drop(columns='item_cnt_month')

In [7]:
from sklearn.linear_model import LinearRegression

# Keep 65% of the rows for fitting and hold out the rest, with a fixed seed.
# NOTE(review): the split is random across all rows, so held-out rows are not
# restricted to later date_block_num values than the training rows — confirm
# this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.65,
    random_state=42,
)

# Fit a linear regression, then clamp its predictions to [0, 20].
LR = LinearRegression().fit(X_train, y_train)
prediction = LR.predict(X_test).clip(min=0, max=20)

print(f"RMSE: {rmse(y_test, prediction)}")

RMSE: 1.2233041172027828
