# Walmart Sales Prediction - Exploratory Data Analysis

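This notebook explores the Walmart sales dataset: it loads the raw CSV, summarizes the structure and missing values, plots the distribution and correlation of the numeric features, checks them for outliers with box plots, and profiles the categorical columns.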

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session
warnings.filterwarnings(action='ignore')

# Plot defaults shared by all figures below: seaborn grid theme, 12x6 canvas
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

## 1. Load Data

In [None]:
# Read the raw CSV (path is relative to the notebook's working directory)
df = pd.read_csv('../data/raw/Walmart.csv')

# Report the frame's dimensions: the shape tuple, then rows and columns separately
row_count, column_count = df.shape
print(
    f"Dataset shape: {df.shape}",
    f"\nNumber of rows: {row_count}",
    f"Number of columns: {column_count}",
    sep="\n",
)

## 2. Data Overview

In [None]:
# Preview the top 10 records; as the cell's last expression it renders as a table
df.head(n=10)

In [None]:
# Print a per-column summary of dtypes and non-null counts.
# Columns whose non-null count is below the total row count contain missing values.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max).
# With default arguments pandas reports these for the numeric columns only
# when the frame mixes numeric and non-numeric dtypes.
df.describe()

## 3. Missing Values Analysis

In [None]:
# Count nulls per column and express each count as a share of all rows
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

# Keep only the columns that actually have gaps, worst offenders first
missing_df = (
    pd.DataFrame({
        'Missing Values': missing_values,
        'Percentage': missing_percentage,
    })
    .loc[lambda summary: summary['Missing Values'] > 0]
    .sort_values(by='Missing Values', ascending=False)
)

if len(missing_df) == 0:
    print("No missing values found!")
else:
    print("Missing Values Summary:")
    print(missing_df)

## 4. Data Visualization

In [None]:
# Plot the distribution of every numeric feature on a grid of histograms
numeric_columns = df.select_dtypes(include=[np.number]).columns

n_cols = 3
# Ceiling division: enough rows to give each numeric column its own panel
n_rows = (len(numeric_columns) + n_cols - 1) // n_cols

if n_rows == 0:
    # plt.subplots rejects a zero-row grid, so there is nothing to draw
    print("No numeric columns to plot")
else:
    # squeeze=False makes subplots always return a 2-D array of Axes.
    # Without it a single-row grid comes back as a 1-D array, and wrapping
    # that in a list produces a list holding an array instead of individual
    # Axes, which breaks the plotting loop for 1-3 numeric columns.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, n_rows * 4), squeeze=False)
    axes = axes.flatten()

    # zip stops at the shorter sequence, so each column gets exactly one panel
    for ax, col in zip(axes, numeric_columns):
        ax.hist(df[col].dropna(), bins=30, edgecolor='black')
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Hide the unused panels left over in the last row
    for ax in axes[len(numeric_columns):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 5. Correlation Analysis

In [None]:
# Pairwise correlation between the numeric features
correlation = df.select_dtypes(include=[np.number]).corr()

# Diverging palette centered on zero so positive and negative correlations
# read as opposite colors; each cell is annotated to two decimals
heatmap_style = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)

plt.figure(figsize=(12, 10))
sns.heatmap(correlation, **heatmap_style)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## 6. Outlier Detection

In [None]:
# Draw one box plot per numeric feature to surface potential outliers
numeric_columns = df.select_dtypes(include=[np.number]).columns

n_cols = 3
# Ceiling division: enough rows to give each numeric column its own panel
n_rows = (len(numeric_columns) + n_cols - 1) // n_cols

if n_rows == 0:
    # plt.subplots rejects a zero-row grid, so there is nothing to draw
    print("No numeric columns to plot")
else:
    # squeeze=False makes subplots always return a 2-D array of Axes.
    # Without it a single-row grid comes back as a 1-D array, and wrapping
    # that in a list produces a list holding an array instead of individual
    # Axes, which breaks the plotting loop for 1-3 numeric columns.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, n_rows * 4), squeeze=False)
    axes = axes.flatten()

    # zip stops at the shorter sequence, so each column gets exactly one panel
    for ax, col in zip(axes, numeric_columns):
        # Nulls are dropped so they do not distort the quartile computation
        ax.boxplot(df[col].dropna())
        ax.set_title(f'Boxplot of {col}')
        ax.set_ylabel(col)

    # Hide the unused panels left over in the last row
    for ax in axes[len(numeric_columns):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 7. Categorical Features Analysis

In [None]:
# Profile the object-dtype (text-like) columns: frequency table plus cardinality
categorical_columns = df.select_dtypes(include=['object']).columns

if len(categorical_columns) == 0:
    print("No categorical columns found")
else:
    for col in categorical_columns:
        series = df[col]
        print(f"\n{col} - Value Counts:")
        print(series.value_counts())
        print(f"Unique values: {series.nunique()}")

## 8. Key Insights and Observations

Document your key findings here:

1. Dataset characteristics
2. Data quality issues
3. Feature relationships
4. Potential feature engineering opportunities
5. Modeling recommendations

## Next Steps

1. Feature engineering
2. Data preprocessing
3. Model training and evaluation
4. Hyperparameter tuning