In [1]:
import pandas as pd
import numpy as np
from typing import Dict, Tuple, Any
import warnings
warnings.filterwarnings('ignore')

In [None]:
data_path = "/Users/whysocurious/Documents/MLDSAIProjects/BigMartSalesPred_Hackathon/data"

train_clean = pd.read_csv(f"{data_path}/processed/train_cleaned.csv")
test_clean = pd.read_csv(f"{data_path}/processed/test_cleaned.csv")

print (train_clean.shape, test_clean.shape)

(8523, 14) (5681, 12)


In [None]:
# Continuing from Phase 1 - Feature Engineering
print("=== Phase 2: Feature Engineering ===")

# 2.1 Basic Features
print("\n2.1 Creating basic derived features...")

# Outlet Age - key feature as older stores might have established customer base
train_clean['Outlet_Age'] = 2013 - train_clean['Outlet_Establishment_Year']
train_clean['Outlet_Age_Squared'] = train_clean['Outlet_Age'] ** 2

print(f"Outlet age range: {train_clean['Outlet_Age'].min()} to {train_clean['Outlet_Age'].max()} years")

# Item MRP bins - price segments often drive different buying behaviors
train_clean['Item_MRP_Bins'] = pd.cut(
    train_clean['Item_MRP'], 
    bins=[0, 50, 100, 150, 200, 300], 
    labels=['Very_Low', 'Low', 'Medium', 'High', 'Premium'],
    include_lowest=True
)

print("MRP distribution across bins:")
print(train_clean['Item_MRP_Bins'].value_counts())

# Price per unit visibility - efficiency metric
train_clean['Price_Per_Unit_Visibility'] = train_clean['Item_MRP'] / (train_clean['Item_Visibility'] + 0.001)


=== Phase 2: Feature Engineering ===

2.1 Creating basic derived features...
Outlet age range: 4 to 28 years
MRP distribution across bins:
Item_MRP_Bins
High        2434
Medium      2210
Low         1682
Premium     1440
Very_Low     757
Name: count, dtype: int64


In [4]:
# 2.2 Category-Level Statistics (only calculate on train data to prevent leakage)
print("\n2.2 Creating category-level statistical features...")

# train_mask = train_clean['source'] == 'train'
train_data = train_clean.copy()

# Item-Type and Outlet combination stats
item_outlet_stats = train_data.groupby(['Item_Type', 'Outlet_Identifier'])['Item_Outlet_Sales'].agg(['mean', 'count']).reset_index()
item_outlet_stats.columns = ['Item_Type', 'Outlet_Identifier', 'Item_Type_Outlet_Avg_Sales', 'Item_Type_Outlet_Count']

# Merge back to train_clean dataset
train_clean = train_clean.merge(item_outlet_stats, on=['Item_Type', 'Outlet_Identifier'], how='left')

# Fill missing values for test data with overall means
train_clean['Item_Type_Outlet_Avg_Sales'].fillna(train_data['Item_Outlet_Sales'].mean(), inplace=True)
train_clean['Item_Type_Outlet_Count'].fillna(1, inplace=True)

# Item-level statistics across all outlets
item_stats = train_data.groupby('Item_Identifier')['Item_Outlet_Sales'].agg(['mean', 'std']).reset_index()
item_stats.columns = ['Item_Identifier', 'Item_Avg_Sales', 'Item_Sales_Std']
item_stats['Item_Sales_Std'].fillna(0, inplace=True)  # New items might not have std

train_clean = train_clean.merge(item_stats, on='Item_Identifier', how='left')
train_clean['Item_Avg_Sales'].fillna(train_data['Item_Outlet_Sales'].mean(), inplace=True)
train_clean['Item_Sales_Std'].fillna(train_data['Item_Outlet_Sales'].std(), inplace=True)

# Outlet-level statistics
outlet_stats = train_data.groupby('Outlet_Identifier').agg({
    'Item_Outlet_Sales': 'mean',
    'Item_Visibility': 'sum'
}).reset_index()
outlet_stats.columns = ['Outlet_Identifier', 'Outlet_Avg_Sales', 'Outlet_Total_Visibility']

train_clean = train_clean.merge(outlet_stats, on='Outlet_Identifier', how='left')
train_clean['Outlet_Avg_Sales'].fillna(train_data['Item_Outlet_Sales'].mean(), inplace=True)
train_clean['Outlet_Total_Visibility'].fillna(train_clean['Item_Visibility'].mean(), inplace=True)

print("Statistical features created successfully")



2.2 Creating category-level statistical features...
Statistical features created successfully


In [5]:
# 2.3 Competition & Cross-Product Features
print("\n2.3 Creating competition and positioning features...")

# Within category competition
train_clean['Items_In_Same_Category'] = train_clean.groupby(['Outlet_Identifier', 'Item_Type'])['Item_Identifier'].transform('count')

# Category visibility share - how much shelf space does this item get in its category
category_visibility = train_clean.groupby(['Outlet_Identifier', 'Item_Type'])['Item_Visibility'].transform('sum')
train_clean['Category_Visibility_Share'] = train_clean['Item_Visibility'] / (category_visibility + 0.001)

# Price ranking within category and outlet
train_clean['Item_Price_Rank_In_Category'] = train_clean.groupby(['Outlet_Identifier', 'Item_Type'])['Item_MRP'].rank(method='dense')

# Count of cheaper alternatives in same category
def count_cheaper_alternatives(group):
    result = []
    for idx, row in group.iterrows():
        cheaper_count = (group['Item_MRP'] < row['Item_MRP']).sum()
        result.append(cheaper_count)
    return pd.Series(result, index=group.index)

cheaper_counts = train_clean.groupby(['Outlet_Identifier', 'Item_Type']).apply(count_cheaper_alternatives)
train_clean['Cheaper_Alternatives_Count'] = cheaper_counts.values

# Relative price positioning
category_avg_price = train_clean.groupby(['Outlet_Identifier', 'Item_Type'])['Item_MRP'].transform('mean')
train_clean['Price_Ratio_To_Category_Avg'] = train_clean['Item_MRP'] / category_avg_price

# Relative visibility positioning
category_avg_visibility = train_clean.groupby(['Outlet_Identifier', 'Item_Type'])['Item_Visibility'].transform('mean')
train_clean['Visibility_Ratio_To_Category_Avg'] = train_clean['Item_Visibility'] / (category_avg_visibility + 0.001)

print("Competition features created")



2.3 Creating competition and positioning features...
Competition features created


In [6]:
# 2.4 Outlet Assortment Features
print("\n2.4 Creating outlet assortment features...")

# Diversity metrics
train_clean['Unique_Categories_In_Outlet'] = train_clean.groupby('Outlet_Identifier')['Item_Type'].transform('nunique')

# Premium product ratio per outlet
outlet_premium_ratio = train_clean.groupby('Outlet_Identifier').apply(
    lambda x: (x['Item_MRP'] > 150).mean()
).reset_index()
outlet_premium_ratio.columns = ['Outlet_Identifier', 'Outlet_Premium_Ratio']

train_clean = train_clean.merge(outlet_premium_ratio, on='Outlet_Identifier', how='left')

# Low fat ratio per outlet
train_clean['Outlet_Low_Fat_Ratio'] = train_clean.groupby('Outlet_Identifier')['Low_Fat_Flag'].transform('mean')

print("Outlet assortment features created")


2.4 Creating outlet assortment features...
Outlet assortment features created


In [7]:
# 2.5 Advanced Interaction Features
print("\n2.5 Creating interaction features...")

# Location-Type interactions
train_clean['Is_Tier1_Supermarket'] = (
    (train_clean['Outlet_Location_Type'] == 'Tier 1') & 
    (train_clean['Outlet_Type'].str.contains('Supermarket'))
).astype(int)

# Price-Size interactions
train_clean['Large_Outlet_Premium_Item'] = (
    (train_clean['Outlet_Size'] == 'Large') & 
    (train_clean['Item_MRP_Bins'] == 'Premium')
).astype(int)

# Category-Outlet type interactions
drinks_categories = ['Soft Drinks', 'Dairy']
train_clean['Drinks_In_Grocery'] = (
    (train_clean['Item_Type'].isin(drinks_categories)) & 
    (train_clean['Outlet_Type'] == 'Grocery Store')
).astype(int)

print("Interaction features created")


2.5 Creating interaction features...
Interaction features created


In [8]:
# 2.6 Domain-Based Complementary Product Features
print("\n2.6 Creating complementary product features...")

# Define complement groups based on shopping patterns
complement_groups = {
    'Breakfast': ['Dairy', 'Breads', 'Breakfast'],
    'Snacks': ['Snack Foods', 'Soft Drinks'],
    'Household': ['Household', 'Health and Hygiene'],
    'Cooking': ['Fruits and Vegetables', 'Meat', 'Seafood']
}

# Create complement group mapping
item_to_group = {}
for group, items in complement_groups.items():
    for item in items:
        item_to_group[item] = group

train_clean['Complement_Group'] = train_clean['Item_Type'].map(item_to_group).fillna('Other')

# Count of items from same complement group in outlet
train_clean['Complement_Group_Items_Count'] = train_clean.groupby(['Outlet_Identifier', 'Complement_Group'])['Item_Identifier'].transform('count')

# Total visibility of complement group in outlet
train_clean['Complement_Group_Visibility'] = train_clean.groupby(['Outlet_Identifier', 'Complement_Group'])['Item_Visibility'].transform('sum')

print("Complementary product features created")



2.6 Creating complementary product features...
Complementary product features created


In [9]:
# Additional useful features based on domain knowledge
print("\n2.7 Additional domain-specific features...")

# Item establishment ratio - how long has this item type been in the outlet relative to outlet age
# This captures if certain categories were introduced later
train_clean['Item_Weight_To_MRP_Ratio'] = train_clean['Item_Weight'] / (train_clean['Item_MRP'] + 1)

# Visibility per dollar - marketing efficiency
train_clean['Visibility_Per_Dollar'] = train_clean['Item_Visibility'] / (train_clean['Item_MRP'] + 1)

# High visibility flag - items with above average visibility in their category
train_clean['Above_Avg_Visibility_In_Category'] = (
    train_clean['Item_Visibility'] > train_clean.groupby(['Outlet_Identifier', 'Item_Type'])['Item_Visibility'].transform('mean')
).astype(int)

# Outlet market penetration - how well established is this outlet type in this location
outlet_location_counts = train_clean.groupby(['Outlet_Location_Type', 'Outlet_Type']).size().reset_index(name='Outlet_Type_Count_In_Location')
train_clean = train_clean.merge(outlet_location_counts, on=['Outlet_Location_Type', 'Outlet_Type'], how='left')


2.7 Additional domain-specific features...


In [11]:
print("=== Feature Engineering Summary ===")
print(f"Total features created: {train_clean.shape[1]}")
print(f"New features: {train_clean.shape[1] - 12}")  # Original dataset had 12 columns

# Check for any remaining missing values
missing_check = train_clean.isnull().sum()
if missing_check.sum() > 0:
    print("\nRemaining missing values:")
    print(missing_check[missing_check > 0])
else:
    print("\nNo missing values remaining!")

# Display feature correlation with target (for train data only)
if 'Item_Outlet_Sales' in train_clean.columns:
    print("\nTop 15 features correlated with sales:")
    train_correlations = train_clean.select_dtypes(include=[np.number]).corr()['Item_Outlet_Sales'].abs().sort_values(ascending=False)
    print(train_correlations.head(15))

print("\nFeature engineering completed successfully!")
print("Ready for encoding and model development...")

=== Feature Engineering Summary ===
Total features created: 42
New features: 30

No missing values remaining!

Top 15 features correlated with sales:
Item_Outlet_Sales               1.000000
log_sales                       0.861762
Item_Avg_Sales                  0.653482
Item_MRP                        0.567574
Price_Ratio_To_Category_Avg     0.562509
Item_Price_Rank_In_Category     0.525312
Item_Sales_Std                  0.505950
Item_Type_Outlet_Avg_Sales      0.504331
Outlet_Avg_Sales                0.490755
Item_Weight_To_MRP_Ratio        0.408235
Visibility_Per_Dollar           0.362340
Outlet_Total_Visibility         0.219985
Items_In_Same_Category          0.168149
Item_Type_Outlet_Count          0.168149
Complement_Group_Items_Count    0.151363
Name: Item_Outlet_Sales, dtype: float64

Feature engineering completed successfully!
Ready for encoding and model development...
