In [1]:
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler
import os

In [2]:
# Mount Google Drive at /content/drive; every path this notebook reads or writes
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# The ensemble metadata file records where each saved artifact lives.
ensemble_dir = "/content/drive/MyDrive/Sales Forecast/ensemble_xgb_cnn"
ensemble_meta = joblib.load(f"{ensemble_dir}/ensemble_meta.pkl")

# Resolve the three artifact locations up front so the loads below read cleanly.
xgb_model_path = ensemble_meta['model_xgb_path']
cnnlstm_model_path = ensemble_meta['model_cnnlstm_path']
saved_scaler_path = ensemble_meta['scaler_path']

# Gradient-boosted model: create an empty Booster, then restore its saved state.
model_xgb = xgb.Booster()
model_xgb.load_model(xgb_model_path)

# Keras CNN-LSTM model, plus the scaler later used to inverse-transform its outputs.
model_cnnlstm = load_model(cnnlstm_model_path)
scaler = joblib.load(saved_scaler_path)

In [4]:
# Input tables on Drive: the preprocessed test set (Date parsed to datetime)
# and the sample submission file.
test_csv_path = "/content/drive/MyDrive/Sales Forecast/test_processed1.csv"
sample_submission_csv_path = "/content/drive/MyDrive/Sales Forecast/sample_submission.csv"

test = pd.read_csv(test_csv_path, parse_dates=['Date'])
sample_submission = pd.read_csv(sample_submission_csv_path)

In [5]:
# Display the column names of the loaded test table.
list(test.columns)

['Id',
 'Store',
 'DayOfWeek',
 'Date',
 'Open',
 'Promo',
 'StateHoliday',
 'SchoolHoliday',
 'StoreType',
 'Assortment',
 'CompetitionDistance',
 'CompetitionOpenSinceMonth',
 'CompetitionOpenSinceYear',
 'Promo2',
 'Promo2SinceWeek',
 'Promo2SinceYear',
 'PromoInterval']

In [6]:
def apply_feature_engineering(df):
    """Derive calendar, competition and Promo2 features from a raw sales table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date``, ``DayOfWeek``, ``CompetitionOpenSinceYear``,
        ``CompetitionOpenSinceMonth``, ``Promo2SinceYear``, ``Promo2SinceWeek``
        and ``PromoInterval``. ``Date`` may hold datetimes or parseable date
        strings. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy restricted to the columns named in ``selected_columns`` (those
        that are present), in that order. Any other column, e.g. ``Id``, is
        dropped.
    """
    df = df.copy()

    # Accept string dates too; an already-parsed datetime column passes through.
    df['Date'] = pd.to_datetime(df['Date'])
    dates = df['Date']

    # Whole seconds since the Unix epoch. Dividing a timedelta by one second is
    # independent of the column's storage unit, whereas casting to int64 and
    # dividing by 10**9 is only correct for nanosecond-resolution datetimes.
    epoch = pd.Timestamp(0, tz=dates.dt.tz)
    df['DateInt'] = (dates - epoch) // pd.Timedelta(seconds=1)

    # Plain calendar decompositions of the date.
    df['Year'] = dates.dt.year
    df['Month'] = dates.dt.month
    df['Day'] = dates.dt.day
    df['DayOfYear'] = dates.dt.dayofyear
    df['WeekOfYear'] = dates.dt.isocalendar().week.astype(int)
    # Codes 6 and 7 are treated as the weekend
    # (assumes DayOfWeek runs 1=Monday .. 7=Sunday -- TODO confirm).
    df['IsWeekend'] = df['DayOfWeek'].isin([6, 7]).astype(int)
    df['Quarter'] = dates.dt.quarter
    df['IsMonthStart'] = dates.dt.is_month_start.astype(int)
    df['IsMonthEnd'] = dates.dt.is_month_end.astype(int)

    # Months elapsed since the competitor opened; negative spans are floored at 0.
    df['CompetitionOpenSince'] = (
        12 * (df['Year'] - df['CompetitionOpenSinceYear']) +
        (df['Month'] - df['CompetitionOpenSinceMonth'])
    ).clip(lower=0)

    # Weeks elapsed since Promo2 started (52 weeks per year); floored at 0.
    df['Promo2Since'] = (
        52 * (df['Year'] - df['Promo2SinceYear']) +
        (df['WeekOfYear'] - df['Promo2SinceWeek'])
    ).clip(lower=0)

    # PromoInterval is matched against integer codes 1-3, each standing for a
    # quarterly cycle of months (presumably encoded upstream -- verify).
    promo_month_map = {
        1: [1, 4, 7, 10],
        2: [2, 5, 8, 11],
        3: [3, 6, 9, 12]
    }
    in_promo_month = pd.Series(False, index=df.index)
    for interval_code, months in promo_month_map.items():
        in_promo_month |= (df['PromoInterval'] == interval_code) & df['Month'].isin(months)
    df['IsPromo2Month'] = in_promo_month.astype(int)

    selected_columns = [
        'Store', 'DayOfWeek', 'Date', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
        'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth',
        'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear',
        'PromoInterval', 'DateInt', 'Year', 'Month', 'Day', 'DayOfYear', 'WeekOfYear',
        'IsWeekend', 'Quarter', 'IsMonthStart', 'IsMonthEnd', 'CompetitionOpenSince',
        'Promo2Since', 'IsPromo2Month'
    ]
    # Keep only the model columns that exist in this frame, in the fixed order above.
    return df[[col for col in selected_columns if col in df.columns]]

In [7]:
# Engineer the features, back-fill missing values from the next row down,
# and renumber the rows from zero.
test_fe = (
    apply_feature_engineering(test)
    .bfill()
    .reset_index(drop=True)
)

In [8]:
# Tabular XGBoost input: the engineered features without the raw Date column,
# and without Sales if that column happens to be present.
non_feature_columns = ['Date']
if 'Sales' in test_fe.columns:
    non_feature_columns.append('Sales')
test_xgb = test_fe.drop(columns=non_feature_columns)

In [9]:
def prepare_cnn_input(df, window=14, scaler=None):
    """Scale a feature table and cut it into overlapping fixed-length sequences.

    Sequence ``k`` holds scaled rows ``k .. k + window - 1``; the final row of
    the table never appears as the start of a sequence, so a table of ``n`` rows
    yields ``n - window`` sequences.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing a ``Date`` column (dropped) and optionally a
        ``Sales`` column (dropped). Missing values are back-filled.
    window : int, default 14
        Number of consecutive rows per sequence.
    scaler : object with ``transform``, optional
        An already-fitted scaler applied to the features. When omitted, a new
        ``StandardScaler`` is fitted on ``df`` itself.
        NOTE(review): fitting on the data being scored may give different
        statistics from those used in training -- confirm, and pass the
        training-time feature scaler here if one was saved.

    Returns
    -------
    numpy.ndarray
        Shape ``(max(n - window, 0), window, n_features)``. With too few rows
        the result is an empty array that still has three dimensions.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    features = df.drop(columns=['Date']).bfill()
    features = features.drop(columns=['Sales'], errors='ignore')

    if scaler is None:
        X_scaled = StandardScaler().fit_transform(features)
    else:
        X_scaled = scaler.transform(features)
    X_scaled = np.asarray(X_scaled)

    n_rows, n_features = X_scaled.shape
    n_windows = n_rows - window
    if n_windows <= 0:
        # Too few rows for a single sequence: keep the rank so callers can still
        # rely on shape[1:] == (window, n_features).
        return np.empty((0, window, n_features), dtype=X_scaled.dtype)

    # sliding_window_view yields (n_rows - window + 1, n_features, window); the
    # last window is dropped to keep the original n_rows - window count, and the
    # axes are swapped into (sequence, time step, feature) order.
    windows = np.lib.stride_tricks.sliding_window_view(X_scaled, window, axis=0)
    return np.ascontiguousarray(windows[:n_windows].transpose(0, 2, 1))

X_cnn = prepare_cnn_input(test_fe)

# Keep the trailing tabular rows, one per CNN sequence, so both models score the
# same rows. An explicit start offset is used instead of iloc[-len(X_cnn):]
# because a zero count would turn that slice into iloc[0:], i.e. the whole frame.
X_xgb = test_xgb.iloc[len(test_xgb) - len(X_cnn):].copy()

In [10]:
# Align XGBoost input columns with the feature list stored in the booster.
expected_features = model_xgb.feature_names
if expected_features is not None:
    # Report any zero-filled feature: a silent fill would hide a mismatch
    # between the training and test schemas.
    missing_features = [col for col in expected_features if col not in X_xgb.columns]
    if missing_features:
        print(f"Zero-filling features missing from the test data: {missing_features}")
    # reindex puts the columns in booster order, adds absent ones as 0 and drops
    # extras in a single step.
    X_xgb = X_xgb.reindex(columns=expected_features, fill_value=0)
# A booster saved without feature names gives nothing to align against, so the
# frame is left unchanged in that case.

In [11]:
# XGBoost scores the tabular rows directly.
xgb_input = xgb.DMatrix(X_xgb)
xgb_preds = model_xgb.predict(xgb_input)

# The CNN-LSTM output is flattened to 1-D, then passed back through the saved
# scaler's inverse transform as a single column.
cnn_raw_output = model_cnnlstm.predict(X_cnn)
cnn_scaled_preds = cnn_raw_output.flatten()
cnn_column = cnn_scaled_preds.reshape(-1, 1)
cnn_preds = scaler.inverse_transform(cnn_column).flatten()

# Equal-weight blend of the two models.
ensemble_preds = 0.5 * xgb_preds + 0.5 * cnn_preds

[1m1284/1284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step


In [13]:
# ensemble_preds covers only the trailing test rows (those with a full CNN
# window behind them). Writing just those rows would leave the leading Ids out
# of the submission, so the gap is filled before the file is built.
n_head = len(test) - len(ensemble_preds)

if n_head > 0:
    # NOTE(review): the leading rows get XGBoost-only predictions because no CNN
    # sequence exists for them -- confirm this fallback is acceptable.
    head_features = test_xgb.iloc[:n_head]
    if model_xgb.feature_names is not None:
        # Same alignment as the ensemble rows: booster column order, absent
        # features filled with 0.
        head_features = head_features.reindex(columns=model_xgb.feature_names, fill_value=0)
    head_preds = model_xgb.predict(xgb.DMatrix(head_features))
    all_preds = np.concatenate([head_preds, ensemble_preds])
else:
    all_preds = ensemble_preds

assert len(all_preds) == len(test), "prediction count does not match the number of test rows"

final_ids = test['Id'].values
submission = pd.DataFrame({
    'Id': final_ids,
    'Sales': np.expm1(all_preds)  # inverse of log1p
})

submission.to_csv("/content/drive/MyDrive/Sales Forecast/final_submission.csv", index=False)
print("✅ Submission saved as final_submission.csv")

✅ Submission saved as final_submission.csv
