In [2]:
import pandas as pd
import numpy as np
import joblib

# Deserialize the fitted estimator and the imputer that was saved with it
model_path = '../models/random_forest_model.joblib'
imputer_path = '../models/mean_imputer.joblib'

model = joblib.load(model_path)
imputer = joblib.load(imputer_path)

print("Model and imputer loaded successfully")


Model and imputer loaded successfully


In [4]:
# Read the new observations; both date columns are parsed into datetimes
date_columns = ['date', 'expiry_date']
raw_df = pd.read_csv('../data/new_data.csv', parse_dates=date_columns)

row_count = len(raw_df)
print(f"Loaded raw data: {row_count} rows")
raw_df.head()


Loaded raw data: 10000 rows


Unnamed: 0,date,store_id,prescription_id,prescription_flag,product_id,product_category,manufacturer,units_sold,price,discount_percent,...,lead_time_days,stock_out_flag,holiday_flag,special_event_flag,delivery_time_hours,delivery_delay_flag,competitor_price_index,customer_segment_id,day_of_week,month_of_year
0,2024-09-13,store_4,e2e42f85-059d-4c23-b5b3-eaa077a33970,1,prod_2,OTC,Sun Pharma,5,64.85,0,...,9,0,1,0,21.75,0,0.97,wholesale,5,9
1,2024-07-30,store_4,7e3726d5-6f6d-4d9e-90ca-7c0e057ea1ff,1,prod_17,Supplement,Novartis,6,36.6,0,...,3,1,0,0,27.68,0,1.11,retail,2,7
2,2024-04-12,store_2,8b262d5f-ea43-4090-98f2-3ec3b7518a4a,1,prod_15,OTC,Novartis,6,13.57,30,...,7,0,1,0,28.64,0,1.17,retail,5,4
3,2024-08-17,store_1,ada10ed1-c590-4833-bcd9-195031b6d613,1,prod_13,Supplement,Novartis,2,31.53,20,...,9,0,1,0,26.34,0,0.86,retail,6,8
4,2024-02-16,store_2,89ac7340-1449-4081-8c3e-62eb37a69d2b,0,prod_11,Prescription,Pfizer,5,85.03,30,...,12,1,1,0,17.06,0,1.1,wholesale,5,2


In [5]:
# Build the model's engineered features from the raw rows.
# Works on a copy so `raw_df` stays untouched and this cell can be re-run safely.
df = raw_df.copy()

# Lag/rolling features are positional within each (store, product) series,
# so rows must be in chronological order inside every group.
group_keys = ['store_id', 'product_id']
df = df.sort_values(group_keys + ['date']).reset_index(drop=True)
sales_by_item = df.groupby(group_keys)['units_sold']

# Lag features.
# NOTE(review): shift(k) steps back k *rows* within the group, not k calendar
# days; dates inside a group can have gaps -- confirm this matches training.
df['units_sold_lag_1'] = sales_by_item.shift(1)
df['units_sold_lag_7'] = sales_by_item.shift(7)

# Rolling statistics over the previous (up to) 7 rows; the shift(1) keeps the
# current row's own units_sold out of its window.
df['units_sold_roll_mean_7'] = sales_by_item.transform(
    lambda s: s.shift(1).rolling(7, min_periods=1).mean()
)
df['units_sold_roll_std_7'] = sales_by_item.transform(
    lambda s: s.shift(1).rolling(7, min_periods=1).std()
)

# Time-to-expiry (computed here, but not part of the model's feature list)
df['days_to_expiry'] = (df['expiry_date'] - df['date']).dt.days

# Time since last sale: day gap to the previous row of the same store/product
df['days_since_last_sale'] = df.groupby(group_keys)['date'] \
                               .transform(lambda x: x.diff().dt.days)

# Categorical encodings.
# The category sets are pinned so the dummy columns do not depend on which
# labels happen to occur in this batch. With plain get_dummies(drop_first=True)
# a batch without 'OTC' would drop 'Prescription' instead, and a batch missing
# a label would lack its column entirely. The levels below are the ones implied
# by the dummy columns the model consumes; the first entry is the dropped
# baseline. Labels outside these sets encode as all-zero dummies.
expected_categories = {
    'product_category': ['OTC', 'Prescription', 'Supplement'],
    'customer_segment_id': ['retail', 'wholesale'],
}
for cat_col, levels in expected_categories.items():
    df[cat_col] = pd.Categorical(df[cat_col], categories=levels)
df = pd.get_dummies(df, columns=list(expected_categories), drop_first=True)

# Date-derived variables
df['is_weekend']   = df['date'].dt.weekday.isin([5,6]).astype(int)
df['day_of_month'] = df['date'].dt.day
df['quarter_of_year'] = df['date'].dt.quarter

# Promotion intensity: discount expressed as a 0-1 fraction
df['promotion_intensity'] = df['discount_percent'] / 100.0

# Supply chain ratios
# NOTE(review): order_to_sale_ratio is derived from units_sold, the quantity
# being forecast -- confirm it is really available at prediction time.
df['order_to_sale_ratio'] = df['order_quantity'] / (df['units_sold'] + 1)
df['lead_time_adjusted']  = df['lead_time_days'] * df['delivery_delay_flag']

print("Feature engineering applied")


Feature engineering applied


In [7]:
# Columns fed to the model (same as training)
feature_cols = [
    # raw pricing / inventory / calendar fields
    'prescription_flag',
    'price',
    'discount_percent',
    'promotion_flag',
    'inventory_level',
    'safety_stock',
    'reorder_point',
    'order_quantity',
    'lead_time_days',
    'stock_out_flag',
    'holiday_flag',
    'special_event_flag',
    'delivery_time_hours',
    'delivery_delay_flag',
    'competitor_price_index',
    'day_of_week',
    'month_of_year',
    # sales-history features
    'units_sold_lag_1',
    'units_sold_lag_7',
    'units_sold_roll_mean_7',
    'units_sold_roll_std_7',
    'days_since_last_sale',
    # one-hot encoded categoricals
    'product_category_Prescription',
    'product_category_Supplement',
    'customer_segment_id_wholesale',
    # date-derived and ratio features
    'is_weekend',
    'day_of_month',
    'quarter_of_year',
    'promotion_intensity',
    'order_to_sale_ratio',
    'lead_time_adjusted',
]

X_new = df.loc[:, feature_cols]

# Fill missing values with the previously fitted imputer, then restore the
# column labels and row index on the result
imputed_values = imputer.transform(X_new)
X_imputed_new = pd.DataFrame(imputed_values, columns=feature_cols, index=X_new.index)

has_missing = X_imputed_new.isna().any().any()
print("Imputation complete. Any NaNs left? ", has_missing)


Imputation complete. Any NaNs left?  False


In [8]:
# Score the imputed feature matrix and attach the forecasts to the feature frame
predicted_units = model.predict(X_imputed_new)
df['forecast_units_sold'] = predicted_units

# Show the first few forecasts next to their identifying keys
preview_cols = ['store_id', 'product_id', 'date', 'forecast_units_sold']
df[preview_cols].head()


Unnamed: 0,store_id,product_id,date,forecast_units_sold
0,store_1,prod_1,2024-01-02,8.01
1,store_1,prod_1,2024-01-05,5.0
2,store_1,prod_1,2024-01-06,4.0
3,store_1,prod_1,2024-01-07,7.0
4,store_1,prod_1,2024-01-21,6.0


In [10]:
import os

# Create the output folder on first run; a no-op when it already exists
output_dir = '../predictions'
os.makedirs(output_dir, exist_ok=True)

# Write the identifying keys plus the forecast, leaving out the row index
output_path = '../predictions/forecast.csv'
export_cols = ['store_id', 'product_id', 'date', 'forecast_units_sold']
forecast_table = df[export_cols]
forecast_table.to_csv(output_path, index=False)
print(f"Forecasts saved to {output_path}")


Forecasts saved to ../predictions/forecast.csv
