In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split


In [2]:
from sklearn.metrics import mean_squared_error

def rmse(y_te, y_p):
    """Root-mean-squared error between true values and predictions.

    Parameters
    ----------
    y_te : array-like
        Ground-truth values; forwarded as the first argument of
        ``mean_squared_error``.
    y_p : array-like
        Predicted values; forwarded as the second argument of
        ``mean_squared_error``.

    Returns
    -------
    The square root of ``mean_squared_error(y_te, y_p)``.
    """
    mse = mean_squared_error(y_te, y_p)
    return np.sqrt(mse)

In [3]:
# Load the raw sales table; the path is relative to the working directory.
train = pd.read_csv(filepath_or_buffer='data/technical/sales_train.csv')

In [4]:
# Collapse the sales rows into one row per (date_block_num, shop_id, item_id).
#
# Everything below is built from non-mutating expressions and `train` is only
# rebound on the last step, so a failure part-way through leaves the raw frame
# intact instead of stripped of `item_cnt_day`.
group_on = ['date_block_num', 'shop_id', 'item_id']

# Sum the daily counts per group; the total becomes the monthly label.
monthly_cnt = (
    train.groupby(group_on)[['item_cnt_day']]
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

# The remaining columns come from the first row seen for each group.
# `date` is dropped as redundant once rows are aggregated, and
# `item_cnt_day` is replaced by the total computed above.
# De-duplicating *before* the merge keeps the join at one row per group
# rather than materialising every daily row and discarding most of them.
first_rows = (
    train.drop(columns=['item_cnt_day', 'date'])
    .drop_duplicates(group_on)
)

# Attach those columns to the totals; the left join keeps the group order
# produced by the groupby.
monthly = monthly_cnt.merge(
    first_rows, left_index=True, right_on=group_on, how='left'
)

# Sanity check: exactly one output row per group.
assert len(monthly) == len(monthly_cnt), "expected one row per group"

# Publish the result under the name the following cells use and tidy up.
train = monthly
del monthly_cnt, first_rows, monthly

In [5]:
# Label: the aggregated count column.
Y = train['item_cnt_month']
# Features: every other column of the aggregated frame.
X = train.drop(columns='item_cnt_month')

In [6]:
from sklearn.linear_model import LinearRegression

# Hold out 35% of the rows for evaluation; the fixed seed makes the split
# repeatable.
# NOTE(review): the split is random although the features include
# `date_block_num` -- confirm a random (rather than time-ordered) hold-out
# is what is wanted here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.65,
    random_state=42,
)

# Fit an ordinary least-squares baseline on the training portion.
LR = LinearRegression().fit(X_train, y_train)

# Predictions are limited to the range [0, 20] before scoring.
# NOTE(review): `y_test` is scored unclipped while the predictions are
# clipped -- confirm this asymmetry is intended.
prediction = np.clip(LR.predict(X_test), 0, 20)

print(f"RMSE: {rmse(y_test, prediction)}")

RMSE: 7.86341511326255
