In [1]:
import os
import sys
# Make the helper modules in ../scripts importable from this notebook.
# The relative path is resolved against the current working directory, so the
# notebook is expected to be launched from its own folder.
scripts_dir = os.path.abspath("../scripts")
# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [9]:
# Import necessary modules
from pathlib import Path
from data_loader import load_data, summarize_data, identify_column_types
from preprocessing import handle_missing_values, encode_categorical_columns, scale_numerical_columns
from feature_engineering import add_date_features, add_rolling_features
from model_training import train_random_forest, evaluate_model
from evaluation import save_model, load_model, calculate_confidence_intervals

In [8]:
# Define the dataset paths.
# The processed-data folder is addressed relative to the notebook instead of
# through a machine-specific absolute drive path. This assumes the notebook
# sits one level below the repository root, as the ../scripts and ../models
# paths used elsewhere in this notebook imply -- TODO confirm.
# Set the ROSSMANN_DATA_DIR environment variable to point at another location.
data_path = Path(os.environ.get("ROSSMANN_DATA_DIR", "../data/processed"))
train_file = data_path / "train_data.csv"
test_file = data_path / "test_data.csv"
store_file = data_path / "store_data.csv"

In [10]:
# Load the train, test and store datasets, in that order
print("Loading datasets...")
train_data, test_data, store_data = [
    load_data(csv_file) for csv_file in (train_file, test_file, store_file)
]

Loading datasets...
Loaded data from D:\Kifya_training\Week 4\Rossmann-Pharmaceuticals-Sales-Forecasting\data\processed\train_data.csv
Loaded data from D:\Kifya_training\Week 4\Rossmann-Pharmaceuticals-Sales-Forecasting\data\processed\test_data.csv
Loaded data from D:\Kifya_training\Week 4\Rossmann-Pharmaceuticals-Sales-Forecasting\data\processed\store_data.csv


In [11]:
# Summarize the training set, then split its columns by type
print("Summarizing train data...")
summarize_data(train_data)

print("\nIdentifying column types...")
# identify_column_types yields the numeric group first, the categorical second
column_groups = identify_column_types(train_data)
numeric_cols, categorical_cols = column_groups
print(f"Numeric Columns: {numeric_cols}")
print(f"Categorical Columns: {categorical_cols}")

Summarizing train data...
Summary of the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 12 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   Store               1017209 non-null  float64
 1   DayOfWeek           1017209 non-null  int64  
 2   Date                1017209 non-null  object 
 3   Sales               1017209 non-null  float64
 4   Customers           1017209 non-null  float64
 5   Open                1017209 non-null  float64
 6   Promo               1017209 non-null  float64
 7   StateHoliday        1017209 non-null  float64
 8   SchoolHoliday       1017209 non-null  float64
 9   Month               1017209 non-null  int64  
 10  Year                1017209 non-null  int64  
 11  DaysSinceLastPromo  1017209 non-null  int64  
dtypes: float64(7), int64(4), object(1)
memory usage: 93.1+ MB
None

Missing Values:
Store                 0
DayOfWeek     

In [None]:
# Preprocessing Pipeline
# Runs the training frame through the three project helpers in sequence:
# missing-value handling, categorical encoding, numeric scaling.
print("Handling missing values...")
train_data = handle_missing_values(train_data, strategy='mean')

print("Encoding categorical columns...")
# `encoders` is kept alongside the transformed frame; presumably it is what
# would be needed to encode other data the same way -- verify in preprocessing.
train_data, encoders = encode_categorical_columns(train_data)

print("Scaling numerical columns...")
# 'Sales' is the prediction target (it becomes y in the modeling cell), so it
# is left out of scaling: the model should learn and report sales in their
# original units rather than in scaled units.
feature_numeric_cols = [col for col in numeric_cols if col != 'Sales']
train_data = scale_numerical_columns(train_data, feature_numeric_cols)

Handling missing values...


In [None]:
# Feature Engineering
# Each step runs only when the column it is derived from is present.
print("Adding date features...")
date_available = 'Date' in train_data.columns
if date_available:
    train_data = add_date_features(train_data, 'Date')

print("Adding rolling average features...")
sales_available = 'Sales' in train_data.columns
if sales_available:
    # 7-row window passed to the rolling helper
    train_data = add_rolling_features(train_data, 'Sales', window_size=7)


In [None]:
# Prepare Data for Modeling
# Split the training frame into the target column and everything else.
print("Preparing data for modeling...")
y = train_data['Sales']  # target
X = train_data.drop(columns=['Sales'])  # features: every column except the target

In [None]:
# Train Machine Learning Model
print("Training Random Forest model...")
# Settings handed to the project's training helper; random_state is pinned
# so repeated runs use the same seed.
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
}
model = train_random_forest(X, y, params=rf_params)

In [None]:
# Evaluate the Model
# Scores the trained model on the held-out test set when that set carries the
# 'Sales' target; otherwise evaluation is skipped with an explicit message.
print("Evaluating the model...")
has_target = 'Sales' in test_data.columns
X_test = test_data.drop(['Sales'], axis=1) if has_target else test_data
y_test = test_data['Sales'] if has_target else None

rmse = None  # stays None when there is no target to score against
if y_test is not None:
    # The model was fitted on X, so the test features must cover X's columns.
    # NOTE(review): test_data is not passed through the preprocessing or
    # feature-engineering cells in this notebook, so a mismatch is likely;
    # fail here with a clear message instead of inside the model call.
    missing_features = [col for col in X.columns if col not in X_test.columns]
    if missing_features:
        raise ValueError(
            "test_data is missing features the model was trained on: "
            f"{missing_features}. Apply the same preprocessing and feature "
            "engineering to test_data before evaluating."
        )
    # Keep only the training features, in the training column order.
    X_test = X_test[list(X.columns)]
    rmse = evaluate_model(model, X_test, y_test)
else:
    print("Skipping evaluation: test_data has no 'Sales' column.")

In [None]:
# Serialize the Model
# Saves the trained model to disk, then reads it back so the later cells work
# with the persisted copy.
print("Saving the trained model...")
model_path = "../models/sales_model.pkl"
# Create the target folder first so saving also works when ../models does not
# exist yet (e.g. on a fresh checkout).
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
save_model(model, model_path)

print("Loading the model for validation...")
loaded_model = load_model(model_path)

In [None]:

# Post-Prediction Analysis
# Predictions come from the reloaded model and are only produced when the
# test set has a target column (y_test was set in the evaluation cell).
print("Calculating confidence intervals...")
if y_test is not None:
    predictions = loaded_model.predict(X_test)
    interval_bounds = calculate_confidence_intervals(predictions)
    confidence_lower, confidence_upper = interval_bounds
    print(f"95% Confidence Interval: ({confidence_lower}, {confidence_upper})")

In [None]:
# Deep Learning Model (Optional)
# Not implemented in this notebook: if an LSTM or another deep learning model
# is used, its workflow will be added in a separate module/script.

# Final marker printed once every cell above has run.
print("Task 2 completed!")