# BigMart Sales Prediction - Data Cleaning & EDA

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# IPython magic selecting matplotlib's inline backend for this notebook.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the figures drawn below.
sns.set_style('whitegrid')

## 2. Load Datasets

In [None]:
# Raw CSV locations, relative to the notebook's working directory.
train_path = '../dataset/raw/train_v9rqX0R.csv'
test_path = '../dataset/raw/test_AbJTz2l.csv'

# Read both files with the same reader call, train first.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Report (rows, columns) of each frame as a quick sanity check.
print("Train Shape:", train_df.shape)
print("Test Shape:", test_df.shape)

## 3. Data Inspection

In [None]:
# Print pandas' summary of the training frame (DataFrame.info).
train_df.info()

In [None]:
# Number of missing values in each column of the training frame.
train_df.isna().sum()

## 4. Handling Missing Information

### Item_Weight Analysis
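1. Identify Item_Weight mismatches: recorded weights that differ from the median weight of the same Item_Identifier by more than 0.1.
2. Impute missing Item_Weight using the median weight of the same Item_Identifier, falling back to the global median when an item has no recorded weight at all.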

In [None]:
# Stack train and test rows into one frame (fresh 0..n-1 index) so the
# per-item statistics below see every recorded weight.
combined = pd.concat((train_df, test_df), ignore_index=True)

# Median Item_Weight for each Item_Identifier, as a Series indexed by identifier.
weights_by_item = combined.groupby('Item_Identifier')['Item_Weight']
item_weight_median = weights_by_item.median()
# Check for mismatches (values that exist but differ from median)
def check_mismatch(row, median_map):
    """Tell whether a recorded Item_Weight disagrees with its item's median.

    Parameters
    ----------
    row : mapping-like
        Must provide 'Item_Weight' and 'Item_Identifier' entries.
    median_map : object with a ``get(key)`` method
        Looks up the median weight for an item identifier.

    Returns
    -------
    bool
        True only when the weight is present, a median is available for the
        item, and the two differ by more than 0.1. False otherwise.
    """
    weight = row['Item_Weight']
    if pd.isnull(weight):
        # Missing weights are not mismatches; they are imputed later.
        return False

    median = median_map.get(row['Item_Identifier'])
    if pd.isnull(median):
        # No reference median for this item, so nothing to compare against.
        return False

    # Differences of at most 0.1 are tolerated as small float noise.
    return bool(abs(weight - median) > 0.1)

# Row-wise boolean flag: True where the recorded weight deviates from the item median.
mismatches = combined.apply(check_mismatch, axis=1, args=(item_weight_median,))
print(f"Number of Item_Weight records deviating from item median: {mismatches.sum()}")

# Show a few offending rows only when at least one was found.
if mismatches.any():
    print("Example Mismatches:")
    print(combined.loc[mismatches, ['Item_Identifier', 'Item_Weight']].head())

In [None]:
# Impute ONLY missing values using the median
def impute_weight(row, median_map):
    """Return the row's Item_Weight, substituting the item median when it is missing.

    Parameters
    ----------
    row : mapping-like
        Must provide 'Item_Weight' and 'Item_Identifier' entries.
    median_map : object with a ``get(key, default)`` method
        Looks up the median weight for an item identifier.

    Returns
    -------
    The existing weight when it is not null; otherwise the looked-up median,
    or NaN when the identifier is absent from ``median_map``.
    """
    current = row['Item_Weight']
    if pd.notnull(current):
        # Recorded weights are kept untouched.
        return current
    return median_map.get(row['Item_Identifier'], np.nan)

# Fallback for items whose weight is missing in every row: the median over
# all recorded weights in the combined train+test frame.
global_median = combined['Item_Weight'].median()

# Step 1: replace missing weights with the per-item median (impute_weight).
# Step 2: fill whatever is still missing with the global median.
# The result is assigned back to the column explicitly. The previous
# `df['Item_Weight'].fillna(..., inplace=True)` form is chained in-place
# assignment, which pandas deprecates and which does not update the parent
# frame once Copy-on-Write is enabled.
train_df['Item_Weight'] = (
    train_df.apply(impute_weight, axis=1, args=(item_weight_median,))
    .fillna(global_median)
)
test_df['Item_Weight'] = (
    test_df.apply(impute_weight, axis=1, args=(item_weight_median,))
    .fillna(global_median)
)

print("Missing Item_Weight in Train after imputation:", train_df['Item_Weight'].isnull().sum())
print("Missing Item_Weight in Test after imputation:", test_df['Item_Weight'].isnull().sum())

### Outlet_Size Imputation with Random Forest
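A Random Forest classifier is trained on the rows with a known `Outlet_Size`, using `Outlet_Type`, `Outlet_Location_Type`, and `Outlet_Establishment_Year` as features, and then predicts `Outlet_Size` for the rows where it is missing.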

In [None]:
# Prepare data for the imputation model.
# Train and test are stacked (fresh 0..n-1 index) so the classifier can learn
# from every row that has a known Outlet_Size.
impute_df = pd.concat([train_df, test_df], ignore_index=True)

# Predictor columns and the column to be imputed.
features = ['Outlet_Type', 'Outlet_Location_Type', 'Outlet_Establishment_Year']
target = 'Outlet_Size'

# Label-encode each predictor (cast to str first so all three are treated as
# categories). The encoder is re-fit for every column, so after the loop `le`
# only holds the mapping of the last feature.
le = LabelEncoder()
for col in features:
    impute_df[col] = le.fit_transform(impute_df[col].astype(str))

# Split into rows with a known Outlet_Size (training set for the imputer)
# and rows where it is missing (rows to predict).
known_size = impute_df[impute_df[target].notnull()]
unknown_size = impute_df[impute_df[target].isnull()]

print(f"Training Imputer on {len(known_size)} rows. Predicting for {len(unknown_size)} rows.")

# Train Random Forest Classifier (fixed seed for reproducible predictions).
rf_imputer = RandomForestClassifier(n_estimators=100, random_state=42)
rf_imputer.fit(known_size[features], known_size[target])

# Predict the missing sizes. When nothing is missing (for example this cell is
# re-run after Outlet_Size was already filled), skip predict(): scikit-learn
# rejects a feature matrix with zero rows.
if len(unknown_size) > 0:
    predicted_sizes = rf_imputer.predict(unknown_size[features])
else:
    predicted_sizes = np.array([], dtype=object)

# Fill missing values in original dataframes
def fill_size(df, model, le_encoders, features): # Helper to apply model
    """Placeholder for a per-frame Outlet_Size filler; currently a no-op.

    The intended helper would encode ``features`` of ``df`` exactly as they
    were encoded for training (ideally through a stored mapping rather than a
    re-fit) and apply ``model``. None of that is implemented: the arguments
    are ignored and the notebook fills the values by index instead.

    Returns
    -------
    None
    """
    return None

# Write the predictions into the combined frame, in the rows whose
# Outlet_Size is still missing.
impute_df.loc[impute_df[target].isnull(), target] = predicted_sizes

# Split back to train and test by position. impute_df was built with
# ignore_index=True, so its first len(train_df) rows are the train rows and
# the rest are the test rows. Both frames receive plain arrays, so the result
# does not depend on the frames' index labels lining up with impute_df's.
n_train = len(train_df)
train_df['Outlet_Size'] = impute_df['Outlet_Size'].iloc[:n_train].to_numpy()
test_df['Outlet_Size'] = impute_df['Outlet_Size'].iloc[n_train:].to_numpy()

print("Missing Outlet_Size in Train:", train_df['Outlet_Size'].isnull().sum())
print("Missing Outlet_Size in Test:", test_df['Outlet_Size'].isnull().sum())

## 5. Cleaning Categorical Inconsistencies

### Item_Fat_Content
Standardizing values: 'LF', 'low fat' -> 'Low Fat', and 'reg' -> 'Regular'.

In [None]:
# Labels present in the training data before clean-up.
print("Original Item_Fat_Content Categories:", train_df['Item_Fat_Content'].unique())

# Variant spellings and the canonical label each one collapses to.
mapping = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}

# Apply the same replacement to both frames so train and test stay consistent.
train_fat = train_df['Item_Fat_Content']
test_fat = test_df['Item_Fat_Content']
train_df['Item_Fat_Content'] = train_fat.replace(mapping)
test_df['Item_Fat_Content'] = test_fat.replace(mapping)

# Labels remaining in the training data after clean-up.
print("Standardized Item_Fat_Content Categories:", train_df['Item_Fat_Content'].unique())

## 6. Exploratory Data Analysis (EDA)

### Univariate Analysis
Analyzing the distribution of the target variable `Item_Outlet_Sales` and other independent features.

In [None]:
# Distribution of the target variable.
# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay and
# density normalisation is its closest replacement (histogram + density curve
# on a density-scaled y axis).
plt.figure(figsize=(10,6))
sns.histplot(train_df['Item_Outlet_Sales'], kde=True, stat='density')
plt.title('Distribution of Item Outlet Sales')
plt.xlabel('Item Outlet Sales')
plt.ylabel('Density')
plt.show()

In [None]:
# Number of training rows per Item_Fat_Content label.
fig, ax = plt.subplots(figsize=(10,6))
sns.countplot(x='Item_Fat_Content', data=train_df, ax=ax)
ax.set_title('Count of Item Fat Content')
plt.show()

In [None]:
# Number of training rows per Item_Type; a wider canvas and vertical tick
# labels keep the category names readable.
fig, ax = plt.subplots(figsize=(15,6))
sns.countplot(x='Item_Type', data=train_df, ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Count of Item Type')
plt.show()

In [None]:
# Number of training rows per Outlet_Size value.
fig, ax = plt.subplots(figsize=(10,6))
sns.countplot(x='Outlet_Size', data=train_df, ax=ax)
ax.set_title('Count of Outlet Size')
plt.show()

### Bivariate Analysis
Analyzing relationships between features and the target variable `Item_Outlet_Sales`.

In [None]:
# Sales distribution for each Item_Fat_Content label.
fig, ax = plt.subplots(figsize=(10,6))
sns.boxplot(x='Item_Fat_Content', y='Item_Outlet_Sales', data=train_df, ax=ax)
ax.set_title('Item Fat Content vs Item Outlet Sales')
plt.show()

In [None]:
# Sales distribution for each Item_Type; vertical tick labels keep the
# category names readable.
fig, ax = plt.subplots(figsize=(15,8))
sns.boxplot(x='Item_Type', y='Item_Outlet_Sales', data=train_df, ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Item Type vs Item Outlet Sales')
plt.show()

In [None]:
# Sales distribution for each Outlet_Size value.
fig, ax = plt.subplots(figsize=(10,6))
sns.boxplot(x='Outlet_Size', y='Item_Outlet_Sales', data=train_df, ax=ax)
ax.set_title('Outlet Size vs Item Outlet Sales')
plt.show()

In [None]:
# Item_MRP against the target, one point per training row.
fig, ax = plt.subplots(figsize=(10,6))
sns.scatterplot(x='Item_MRP', y='Item_Outlet_Sales', data=train_df, ax=ax)
ax.set_title('Item MRP vs Item Outlet Sales')
plt.show()

## 7. Save Processed Data

In [None]:
# Create the output directory if needed; an existing one is not an error.
os.makedirs('../dataset/processed', exist_ok=True)

# Write each cleaned frame to its own CSV, without the index column.
outputs = (
    (train_df, '../dataset/processed/cleaned_train.csv'),
    (test_df, '../dataset/processed/cleaned_test.csv'),
)
for cleaned_frame, destination in outputs:
    cleaned_frame.to_csv(destination, index=False)

print("Cleaned datasets saved to ../dataset/processed/")