# BigMart Sales Prediction: End-to-End Pipeline

This notebook contains the complete workflow for predicting BigMart sales, ensuring a reproducible flow from raw data to final submission. 
It uses a **Neural Network (Keras)** which was found to be the best performing model (RMSE ~1015).
**Configuration**: No L2 Regularization, No Target Scaling.

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

# Pin the NumPy and TensorFlow random generators to one shared seed
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Suppress every warning category for the rest of the session
import warnings
warnings.simplefilter('ignore')

## 1. Data Loading

In [None]:
# Locations of the raw train / test CSV files
RAW_DIR = 'dataset/raw'
TRAIN_PATH, TEST_PATH = (
    os.path.join(RAW_DIR, csv_name)
    for csv_name in ('train_v9rqX0R.csv', 'test_AbJTz2l.csv')
)

# Read both files into memory and report their dimensions
print("Loading datasets...")
train_raw, test_raw = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)
print(f"Train Shape: {train_raw.shape}")
print(f"Test Shape: {test_raw.shape}")

## 2. Data Cleaning
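- **Item_Weight**: Impute missing values with the median weight of the same `Item_Identifier`, falling back to the overall median for items that have no recorded weight.
- **Outlet_Size**: Impute missing values with a Random Forest classifier trained on outlet type, location type, and establishment year.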
- **Item_Fat_Content**: Standardize categories.

In [None]:
def clean_data(train_df, test_df):
    """Stack train and test rows and repair known data-quality problems.

    The two frames are concatenated (train rows first, then test rows) and the
    following fixes are applied to the stacked frame, so every statistic below
    is computed over train and test rows together:

    1. ``Item_Weight``: a missing value takes the median weight of the same
       ``Item_Identifier``; anything still missing takes the overall median.
    2. ``Outlet_Size``: missing values are predicted by a random forest fitted
       on the rows whose size is known, using label-encoded ``Outlet_Type``,
       ``Outlet_Location_Type`` and ``Outlet_Establishment_Year``.
    3. ``Item_Fat_Content``: the aliases ``LF`` / ``low fat`` / ``reg`` are
       folded into ``Low Fat`` / ``Regular``.

    Args:
        train_df: Training rows.
        test_df: Test rows.

    Returns:
        A new DataFrame holding all rows with a fresh ``0..n-1`` index.
    """
    print("--- Starting Data Cleaning ---")
    combined = pd.concat([train_df, test_df], ignore_index=True)

    # 1. Impute Item_Weight (Median by Item_Identifier)
    item_weight_median = combined.groupby('Item_Identifier')['Item_Weight'].median()
    per_item_weight = combined['Item_Identifier'].map(item_weight_median)
    weight = combined['Item_Weight'].fillna(per_item_weight)
    # Fallback for items that never have a weight. The result is assigned back
    # explicitly: an in-place fillna on a column selection is a chained
    # assignment and does not update the frame under pandas Copy-on-Write.
    combined['Item_Weight'] = weight.fillna(weight.median())

    # 2. Impute Outlet_Size (Random Forest)
    target = 'Outlet_Size'
    missing_size = combined[target].isnull()

    if missing_size.any():
        features = ['Outlet_Type', 'Outlet_Location_Type', 'Outlet_Establishment_Year']
        # Each predictor gets its own integer encoding of its string form;
        # the encoded values live in a scratch frame, not in `combined`.
        encoded = pd.DataFrame(
            {col: LabelEncoder().fit_transform(combined[col].astype(str)) for col in features},
            index=combined.index,
        )
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(encoded[~missing_size], combined.loc[~missing_size, target])
        # Predictions come back in row order of the masked selection, so they
        # line up with the rows selected by the same mask on the left.
        combined.loc[missing_size, target] = rf.predict(encoded[missing_size])
        print("Outlet_Size imputed using Random Forest.")

    # 3. Standardize Item_Fat_Content
    mapping = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
    combined['Item_Fat_Content'] = combined['Item_Fat_Content'].replace(mapping)

    print("--- Data Cleaning Completed ---")
    return combined

# Clean the train and test rows together so both receive the same imputations
cleaned_df = clean_data(train_raw, test_raw)

## 3. Feature Engineering
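- **Item_Visibility_MeanRatio**: A row's visibility divided by the mean visibility of the same item across all rows (zero visibilities are first replaced by the item's mean).
- **Item_Visibility_Ratio_OutletSize**: A row's visibility divided by the mean visibility across all rows with the same `Outlet_Size`.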
- **Outlet_Years**: 2013 - Establishment Year.
- **Encoding**: Manual Ordinal for sizes/locations, One-Hot for others.

In [None]:
def engineer_features(df, reference_year=2013):
    """Derive model features from the cleaned train+test frame.

    The input frame is left untouched; all work happens on a copy, so the
    function can be called repeatedly on the same input with the same result.

    Features added / transformed:

    * ``Item_Visibility``: zeros are replaced by the item's mean visibility
      (the mean is taken over all rows of the item, zeros included).
    * ``Item_Visibility_MeanRatio``: visibility divided by the item's mean
      visibility, recomputed after the zero replacement.
    * ``Item_Visibility_Ratio_OutletSize``: visibility divided by the mean
      visibility of rows with the same ``Outlet_Size``; 0 when that mean is 0
      and 1.0 when no mean is available for the row's size.
    * ``Outlet_Years``: ``reference_year - Outlet_Establishment_Year``.
    * ``Item_Type_Combined``: Food / Non-Consumable / Drinks, taken from the
      first two characters of ``Item_Identifier`` (FD / NC / DR).
    * ``Outlet_Size`` and ``Outlet_Location_Type`` become ordinal integers;
      values outside the mappings become NaN.
    * ``Item_Fat_Content``, ``Outlet_Type`` and ``Item_Type_Combined`` are
      one-hot encoded; ``Outlet_Establishment_Year`` is dropped.

    Args:
        df: Cleaned frame containing the columns named above.
        reference_year: Year used to turn the establishment year into an age.

    Returns:
        A new DataFrame with the engineered columns.
    """
    print("--- Starting Feature Engineering ---")

    # Work on a private copy. Writing into the caller's frame made this step
    # non-idempotent: a second call would see already-encoded Outlet_Size /
    # Outlet_Location_Type values and map them to NaN.
    df = df.copy()

    # 1. Item_Visibility_MeanRatio
    # First impute 0 visibility with the per-item mean
    visibility = df['Item_Visibility']
    item_mean = visibility.groupby(df['Item_Identifier']).transform('mean')
    df['Item_Visibility'] = visibility.mask(visibility == 0, item_mean)

    # Re-calc average and create ratio
    item_mean = df.groupby('Item_Identifier')['Item_Visibility'].transform('mean')
    df['Item_Visibility_MeanRatio'] = df['Item_Visibility'] / item_mean

    # 2. Item_Visibility_Ratio_OutletSize
    size_mean = df.groupby('Outlet_Size')['Item_Visibility'].transform('mean')
    size_ratio = df['Item_Visibility'] / size_mean
    size_ratio = size_ratio.mask(size_mean == 0, 0.0)    # avoid dividing by a zero mean
    size_ratio = size_ratio.mask(size_mean.isna(), 1.0)  # no mean for this size -> neutral ratio
    df['Item_Visibility_Ratio_OutletSize'] = size_ratio

    # 3. Outlet_Years
    df['Outlet_Years'] = reference_year - df['Outlet_Establishment_Year']

    # 4. Item_Type_Combined
    df['Item_Type_Combined'] = df['Item_Identifier'].str[:2].map(
        {'FD': 'Food', 'NC': 'Non-Consumable', 'DR': 'Drinks'}
    )

    # 5. Encoding
    # Ordinal
    size_map = {'Small': 0, 'Medium': 1, 'High': 2}
    df['Outlet_Size'] = df['Outlet_Size'].map(size_map)

    loc_map = {'Tier 1': 0, 'Tier 2': 1, 'Tier 3': 2}
    df['Outlet_Location_Type'] = df['Outlet_Location_Type'].map(loc_map)

    # One-Hot
    df = pd.get_dummies(df, columns=['Item_Fat_Content', 'Outlet_Type', 'Item_Type_Combined'])

    # 6. Drop Redundant (superseded by Outlet_Years)
    df = df.drop(columns=['Outlet_Establishment_Year'])

    print("--- Feature Engineering Completed ---")
    return df

# Build the model-ready feature frame from the cleaned train+test rows
processed_df = engineer_features(cleaned_df)

## 4. Preprocessing
- Split the combined frame back into its train and test rows, then hold out 20% of the train rows for validation.
- Scale Features (RobustScaler).
- **NO Target Scaling** (Model trained on raw sales).

In [None]:
# Undo the earlier concatenation: the leading len(train_raw) rows are the train rows
train_len = train_raw.shape[0]
train_final = processed_df.iloc[:train_len].copy()
test_final = processed_df.iloc[train_len:].copy()

# Target vector and feature frames
y = train_final['Item_Outlet_Sales']
X = train_final.drop(columns=['Item_Outlet_Sales'])
X_test = test_final.drop(columns=['Item_Outlet_Sales'], errors='ignore')

# Identifier columns and the raw Item_Type column are not fed to the network
cols_to_drop = ['Item_Identifier', 'Outlet_Identifier', 'Item_Type']
X_train_data, X_test_data = (
    frame.drop(columns=cols_to_drop) for frame in (X, X_test)
)

# Fit the scaler on the training features, then apply it to train and test alike
scaler_x = RobustScaler()
X_scaled = scaler_x.fit(X_train_data).transform(X_train_data)
X_test_scaled = scaler_x.transform(X_test_data)

# The target stays on its original scale (no scaler is applied to sales)
y_train = y.to_numpy()

# Hold out 20% of the scaled training rows for validation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_scaled,
    y_train,
    test_size=0.2,
    random_state=42,
)

print(f"Training Data Shape: {X_train_split.shape}")

## 5. Neural Network Model

In [None]:
def build_model(input_dim, learning_rate=0.001):
    """Build and compile the feed-forward regression network.

    Architecture: Dense(256, relu) -> Dropout(0.4) -> Dense(128, relu) ->
    Dropout(0.3) -> Dense(64, relu) -> Dense(1) with a linear output.
    The model is compiled with Adam and mean-squared-error loss.

    Args:
        input_dim: Number of input features per sample.
        learning_rate: Initial Adam learning rate.

    Returns:
        A compiled ``keras.Sequential`` model.
    """
    model = keras.Sequential([
        # Declare the input explicitly instead of passing `input_shape` to the
        # first Dense layer, which current Keras versions warn against.
        keras.Input(shape=(input_dim,)),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.4),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='mean_squared_error')
    return model

# Size the network from the number of feature columns in the training split
n_features = X_train_split.shape[1]
model = build_model(n_features)

# Stop once validation loss has not improved for 15 epochs (restoring the best
# weights) and halve the learning rate after 5 epochs without improvement
training_callbacks = [
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5),
]

history = model.fit(
    X_train_split,
    y_train_split,
    validation_data=(X_val_split, y_val_split),
    batch_size=32,
    epochs=150,
    verbose=1,
    callbacks=training_callbacks,
)

## 6. Evaluation

In [None]:
# Plot the per-epoch training and validation loss recorded by model.fit
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(history.history['loss'], label='Train Loss')
ax.plot(history.history['val_loss'], label='Val Loss')
ax.set_title('Model Training Loss')
# Label the axes so the figure can be read on its own
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

# RMSE on the held-out validation split
y_pred = model.predict(X_val_split).flatten()
# No inverse transform needed for target
rmse = np.sqrt(mean_squared_error(y_val_split, y_pred))
print(f"Validation RMSE: {rmse}")

## 7. Submission
Retrain on the full training set, then generate predictions for the test set.

In [None]:
print("Retraining on full dataset...")
# NOTE(review): this run always trains for the full 150 epochs with no
# validation-based stopping, so the submitted model is not trained the same
# way as the early-stopped model evaluated above — confirm this is intended.
final_model = build_model(X_scaled.shape[1])
final_model.fit(
    X_scaled, y_train,
    batch_size=32,
    epochs=150,
    verbose=1,
    callbacks=[keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5)]
)

predictions = final_model.predict(X_test_scaled).flatten()
# Clip negative predictions to zero
predictions = np.maximum(predictions, 0)

# Build the submission as a new frame. Assigning a column onto a column slice
# of test_raw is a chained assignment (SettingWithCopy), so use .assign instead.
submission = test_raw[['Item_Identifier', 'Outlet_Identifier']].assign(
    Item_Outlet_Sales=predictions
)
submission.to_csv('final_submission.csv', index=False)
print("Submission saved to final_submission.csv")