# Rossmann Sales Prediction - Part 2

# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Settings and Auxiliary Functions

In [2]:
# Default figure size (width, height in inches) for every plot in the notebook.
plt.rc("figure", figsize=(9, 6))
# Apply seaborn's default theme to all matplotlib figures.
sns.set_theme()

In [3]:
def calc_error(model_name, y_true, y_pred):
    """Summarise regression errors for one model as a single-row DataFrame.

    Parameters
    ----------
    model_name : str
        Label stored in the "Model" column.
    y_true : array-like
        Observed target values (intended for a single, 1-D target).
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the columns "Model", "MAE", "MAPE" and "RMSE".
    """
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas sqrt(MSE) works on every release.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    return pd.DataFrame({"Model": model_name, "MAE": mae, "MAPE": mape, "RMSE": rmse}, index=[0])

# Data Loading

In [17]:
# Read the training and validation tables; the first CSV column is used as the
# row index of each frame.
X_train, X_test = (
    pd.read_csv(csv_path, low_memory=False, index_col=0)
    for csv_path in ("training_set.csv", "validation_set.csv")
)

In [18]:
# Pull the target column out of each frame first ...
y_train, y_test = X_train["Sales"], X_test["Sales"]

# ... then keep only the remaining columns as model features.
X_train, X_test = X_train.drop(columns="Sales"), X_test.drop(columns="Sales")

# Model Training

## Baseline (Average) Model

In [19]:
# Baseline: predict every validation row with the mean training target of its store.
# Only the "Store" column is needed, so the full feature frames are not copied.
store_mean_sales = (
    X_train[["Store"]]
    .assign(Sales=y_train)
    .groupby("Store")["Sales"]
    .mean()
)

# Look the per-store mean up row by row. Unlike a merge, `map` keeps the row
# order and index of X_test by construction, so predictions line up with y_test.
# Stores that never appear in the training set fall back to the global training
# mean instead of producing NaN (which would make the metric functions raise).
y_pred = X_test["Store"].map(store_mean_sales).fillna(y_train.mean())

# np.exp maps targets and predictions back before scoring.
# NOTE(review): this assumes "Sales" was transformed with np.log upstream; if
# np.log1p was used, np.expm1 is the matching inverse - confirm against Part 1.
baseline_error = calc_error("Baseline", np.exp(y_test), np.exp(y_pred))
baseline_error

Unnamed: 0,Model,MAE,MAPE,RMSE
0,Baseline,1429.764812,0.216813,1939.336071


## Linear Regression

In [20]:
# Ordinary least squares on all features, fitted on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Score on the validation split; np.exp is applied to both the targets and the
# predictions before the error metrics are computed.
y_pred = lr.predict(X_test)
linear_error = calc_error(
    model_name="Linear Regression",
    y_true=np.exp(y_test),
    y_pred=np.exp(y_pred),
)
linear_error

Unnamed: 0,Model,MAE,MAPE,RMSE
0,Linear Regression,1872.234082,0.295008,2676.976699


## Random Forest Regression

In [None]:
# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable, so a Restart & Run All reproduces the same scores.
# n_jobs=-1 builds the trees on all available cores; the fitted model is unaffected.
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)

# Score on the validation split; np.exp is applied to both the targets and the
# predictions before the error metrics are computed.
y_pred = rf.predict(X_test)
rf_error = calc_error("Random Forest", np.exp(y_test), np.exp(y_pred))
# Display the result table, as the baseline and linear-regression cells do.
rf_error