In [1]:
# Standard library
import importlib
import logging
import os
import sys

# Third-party
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# Import custom module
sys.path.insert(0,os.path.abspath("../scripts"))
import prediction as pre
importlib.reload(pre)


In [2]:
# Load a small sample to inspect the schema before reading the full file
TRAIN_CSV = '../data/train.csv'
train_sample = pd.read_csv(TRAIN_CSV, nrows=10)

# Check the data types pandas infers from the sample
print(train_sample.dtypes)

# Inspect StateHoliday by name rather than by position (it is column 7 in the
# dtype listing), so the check keeps working if the column order changes.
print(train_sample['StateHoliday'].unique())

# Load the full dataset, forcing StateHoliday to be read as a string column.
# NOTE(review): the 10-row sample only shows integer values for this column;
# presumably the full file mixes in non-numeric codes - confirm.
train = pd.read_csv(TRAIN_CSV, dtype={'StateHoliday': 'str'})


Store             int64
DayOfWeek         int64
Date             object
Sales             int64
Customers         int64
Open              int64
Promo             int64
StateHoliday      int64
SchoolHoliday     int64
dtype: object
[0]


In [3]:
# Route INFO-and-above log records to a file; each record carries its
# timestamp and level name ahead of the message.
logging.basicConfig(
    filename='rossmann_sales_forecast.log',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Read the test set through the project's own loader
test = pre.load_data('../data/test.csv')

Pre-processing data

In [4]:
# Run the project's feature-engineering step over both frames
# (train is processed first, then test).
train, test = (pre.feature_engineering(frame) for frame in (train, test))

In [5]:
# Apply the project's missing-value handling to both frames
# (train is processed first, then test).
train, test = map(pre.handle_missing, (train, test))

In [6]:
# Prepare features and target.
# Sales is the prediction target, so it is removed from the feature frame;
# Customers, Date and Store are excluded from the features as well.
X_train = train.drop(columns=['Sales', 'Customers', 'Date', 'Store'])
y_train = train['Sales']

# The test frame additionally carries an 'Id' column (it is written out next
# to the predictions later on). It is a row identifier with no counterpart in
# the training features, so drop it here too. errors='ignore' makes this a
# no-op if the column is absent.
X_test = (
    test
    .drop(columns=['Date', 'Store'])
    .drop(columns=['Id'], errors='ignore')
)

In [7]:
# Encode the categorical columns of the train and test feature frames in one
# call, so both go through the same project helper.
(X_train_encoded, X_test_encoded) = pre.encode_categorical(
    X_train,
    X_test,
)

Building models with sklearn pipelines

In [8]:
# Carve a validation set out of the encoded training data.
# NOTE(review): pre.train_test_split is presumably scikit-learn's function
# re-exported by the project module (test_size=0.2 would then hold out 20% of
# the rows, random_state=42 would fix the shuffle) - confirm.
(X_train_split, X_val_split, y_train_split, y_val_split) = pre.train_test_split(
    X_train_encoded,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Fit the model on the training portion of the split only; the validation
# portion is kept aside for scoring.
model = pre.train_model(
    X_train_split,
    y_train_split,
)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [10]:
# Score the fitted model against the held-out validation split
pre.evaluate_model(
    model,
    X_val_split,
    y_val_split,
)

RMSE: 2444.1386896772933


In [11]:
# Predict on the encoded test features and attach the result to the test frame
predicted_sales = model.predict(X_test_encoded)
test['PredictedSales'] = predicted_sales

In [12]:
# Write only the row identifier and its prediction to CSV, without the index
submission = test[['Id', 'PredictedSales']]
submission.to_csv('../data/predicted_sales.csv', index=False)
print("Predictions saved.")

Predictions saved.


Choose a loss function

Root Mean Squared Error (RMSE):

RMSE squares each error before averaging, so large errors count for more than small ones, and the result is expressed in the same units as Sales. This suits sales forecasting: a few large misses can be more damaging to business planning than many small ones, so RMSE is a reasonable choice here.
Pros: Sensitive to large errors (e.g., over-predicting sales for a store by a large margin).
Cons: May overly penalize outliers, so a handful of unusual days can dominate the score.

In [13]:
# Make predictions for the validation set
y_pred = model.predict(X_val_split)

# Calculate RMSE as the square root of the mean squared error.
# np and mean_squared_error are imported in the notebook's first cell.
rmse = np.sqrt(mean_squared_error(y_val_split, y_pred))
print(f"Root Mean Squared Error: {rmse}")


Root Mean Squared Error: 2444.1386896772933
