# BigMart Sales Prediction - Feature Engineering

## 1. Import Libraries & Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'whitegrid' style for the plots that follow.
sns.set_style('whitegrid')

In [None]:
# Read the cleaned train/test CSVs, then report each frame's (rows, columns).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../dataset/processed/cleaned_train.csv',
        '../dataset/processed/cleaned_test.csv',
    )
)

for label, frame in (("Train Shape:", train_df), ("Test Shape:", test_df)):
    print(label, frame.shape)

## 2. Feature Generation

### Outlet_Years
The dataset is from 2013. We can create a new feature `Outlet_Years` giving each outlet's age in years as of 2013, computed as `2013 - Outlet_Establishment_Year`.

In [None]:
# Outlet age in years, measured relative to the dataset's reference year.
DATASET_YEAR = 2013

for frame in (train_df, test_df):
    frame['Outlet_Years'] = DATASET_YEAR - frame['Outlet_Establishment_Year']

# Quick sanity check: establishment year next to the derived age.
year_preview = train_df[['Outlet_Establishment_Year', 'Outlet_Years']]
print(year_preview.head())

### Item_Type_Combined
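Simplifying `Item_Type` into broader categories might help. The first two characters of `Item_Identifier` (`FD`, `NC`, `DR`) are mapped to three broad categories: Food, Non-Consumable, and Drinks.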

In [None]:
# Map the two-character prefix of `Item_Identifier` to a broad item category.
# The mapping is defined once so train and test cannot drift apart.
ITEM_CATEGORY_BY_PREFIX = {'FD': 'Food', 'NC': 'Non-Consumable', 'DR': 'Drinks'}

for frame in (train_df, test_df):
    # `.str[:2]` is the vectorized equivalent of slicing each identifier;
    # a prefix that is not a key of the mapping yields NaN.
    frame['Item_Type_Combined'] = (
        frame['Item_Identifier'].str[:2].map(ITEM_CATEGORY_BY_PREFIX)
    )

print(train_df['Item_Type_Combined'].value_counts())

## 3. Categorical Encoding

We use Label Encoding for ordinal variables and One-Hot Encoding for nominal variables.
This is a simple baseline: the best encoding depends on the model, and One-Hot Encoding everything is a valid alternative.
Here we Label Encode `Outlet_Size` and `Outlet_Location_Type`, and One-Hot Encode `Item_Fat_Content`, `Outlet_Type`, and `Item_Type_Combined`.

In [None]:
le = LabelEncoder()

cols_to_encode = ['Outlet_Size', 'Outlet_Location_Type']

# Label Encoding: fit on the training data only and reuse the fitted encoder
# for the test data, so a given category gets the same integer code in both
# frames. Refitting on the test frame could assign different codes.
for col in cols_to_encode:
    train_df[col] = le.fit_transform(train_df[col])
    # transform() raises ValueError if the test data contains a category that
    # was not seen in training, instead of silently mis-encoding it.
    test_df[col] = le.transform(test_df[col])

# One-Hot Encoding
onehot_cols = ['Item_Fat_Content', 'Outlet_Type', 'Item_Type_Combined']

# Pin both frames to the category set observed in the training data, so that
# get_dummies emits the same dummy columns in the same order for train and test.
# A category missing from the test frame becomes an all-zero column; a test
# value unseen in training becomes NaN and gets zeros in every dummy column.
for col in onehot_cols:
    train_categories = sorted(train_df[col].dropna().unique())
    train_df[col] = pd.Categorical(train_df[col], categories=train_categories)
    test_df[col] = pd.Categorical(test_df[col], categories=train_categories)

train_df = pd.get_dummies(train_df, columns=onehot_cols)
test_df = pd.get_dummies(test_df, columns=onehot_cols)

print("Train Shape after encoding:", train_df.shape)
print("Test Shape after encoding:", test_df.shape)

## 4. Drop Unnecessary Columns
Dropping `Item_Identifier`, `Outlet_Identifier`, `Item_Type` (replaced by `Item_Type_Combined`), and `Outlet_Establishment_Year` (replaced by `Outlet_Years`).

In [None]:
# Identifier columns plus the raw columns listed for removal.
cols_to_drop = ['Item_Identifier', 'Outlet_Identifier', 'Item_Type', 'Outlet_Establishment_Year']

# errors='ignore' skips any listed column that is not present in a frame
# (for example, one that was already dropped or transformed).
train_df = train_df.drop(columns=cols_to_drop, errors='ignore')
test_df = test_df.drop(columns=cols_to_drop, errors='ignore')

# NOTE: the identifier columns are discarded here because they are not used for
# training. If they are needed to match rows in a submission file, copy them
# into a separate dataframe before running this cell.

In [None]:
# Write the feature-engineered train and test frames to CSV (row index omitted).
outputs = (
    (train_df, '../dataset/processed/feat_eng_train.csv'),
    (test_df, '../dataset/processed/feat_eng_test.csv'),
)
for frame, csv_path in outputs:
    frame.to_csv(csv_path, index=False)

print("Feature Engineered data saved.")