# Exploratory Data Analysis (EDA) for Sales Prediction AI

This notebook performs exploratory data analysis on the sales data for our AI prediction model.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Set style for plots.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8'
# and newer releases no longer accept the bare 'seaborn' name, so pick
# whichever of the two names this installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("deep")

## Load the Data

In [None]:
# Load the raw sales data from the project's data directory
df = pd.read_csv('../data/raw/sales_data.csv')

# Display the first few rows
print(df.head())

# Display basic information about the dataset.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() would emit a stray "None".
df.info()

## Data Cleaning and Preprocessing

In [None]:
# Report how many values are missing in each column
missing_counts = df.isnull().sum()
print(missing_counts)

# Parse the 'date' column into datetime values
df['date'] = pd.to_datetime(df['date'])

# Derive calendar features from the parsed dates
date_parts = df['date'].dt
df['day_of_week'] = date_parts.day_name()
df['month'] = date_parts.month

# Flag Saturday/Sunday rows with 1, every other day with 0
weekend_mask = df['day_of_week'].isin(['Saturday', 'Sunday'])
df['is_weekend'] = weekend_mask.astype(int)

print(df.head())

## Descriptive Statistics

In [None]:
# Summary statistics using describe()'s default column selection
default_summary = df.describe()
print(default_summary)

# Summary restricted to the object-dtype (categorical/text) columns
object_summary = df.describe(include=['object'])
print(object_summary)

## Data Visualization

In [None]:
# Histogram of total_sales with a KDE curve overlaid
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['total_sales'], kde=True, ax=hist_ax)
hist_ax.set_title('Distribution of Total Sales')
plt.show()

# Box plot of total_sales per weekday, shown in calendar order (Monday first)
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
box_fig, box_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=df,
    x='day_of_week',
    y='total_sales',
    order=weekday_order,
    ax=box_ax,
)
box_ax.set_title('Sales Distribution by Day of Week')
box_ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Bar plot of total_sales per category (seaborn's default bar height is the mean)
bar_fig, bar_ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df, x='category', y='total_sales', ax=bar_ax)
bar_ax.set_title('Average Sales by Category')
bar_ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Annotated heatmap of pairwise correlations between the numeric columns
heat_fig, heat_ax = plt.subplots(figsize=(12, 10))
numeric_cols = df.select_dtypes(include=[np.number]).columns
corr_matrix = df[numeric_cols].corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=heat_ax,
)
heat_ax.set_title('Correlation Heatmap of Numeric Features')
plt.show()

## Time Series Analysis

In [None]:
# Sum total_sales for each distinct 'date' value; 'date' stays a regular column
daily_sales = df.groupby('date', as_index=False)['total_sales'].sum()

# Line plot of the aggregated series over time
ts_fig, ts_ax = plt.subplots(figsize=(15, 6))
ts_ax.plot(daily_sales['date'], daily_sales['total_sales'])
ts_ax.set_title('Daily Total Sales Over Time')
ts_ax.set_xlabel('Date')
ts_ax.set_ylabel('Total Sales')
ts_ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Seasonal decomposition
from statsmodels.tsa.seasonal import seasonal_decompose

# seasonal_decompose needs a gap-free series and a known period. The grouped
# frame only has rows for dates present in the data, so its index carries no
# guaranteed frequency and any missing calendar day would make the call fail.
# Resampling onto a daily grid fixes the spacing.
# NOTE(review): days absent from the data are filled with 0 sales by the
# resample sum -- confirm that "no rows" really means "no sales".
decomposition_input = (
    daily_sales.set_index('date')['total_sales']
    .resample('D')
    .sum()
)

# period=7 states the weekly cycle explicitly instead of relying on the
# frequency being inferred from the index.
result = seasonal_decompose(decomposition_input, model='additive', period=7)

# One stacked panel per component: observed, trend, seasonal, residual
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(15, 20))
result.observed.plot(ax=ax1)
ax1.set_title('Observed')
result.trend.plot(ax=ax2)
ax2.set_title('Trend')
result.seasonal.plot(ax=ax3)
ax3.set_title('Seasonal')
result.resid.plot(ax=ax4)
ax4.set_title('Residual')
plt.tight_layout()
plt.show()

## Feature Importance Analysis

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Prepare the data
features = ['day_of_week', 'month', 'category', 'store_id', 'weather_condition', 'price', 'is_weekend', 'is_holiday', 'promotion_active', 'temperature']
# Take an explicit copy: the encoding loop below assigns into X, and writing
# into a bare column selection of df triggers pandas' SettingWithCopyWarning.
X = df[features].copy()
y = df['total_sales']

# Encode categorical variables.
# Every column that is neither numeric nor boolean is label-encoded, which
# covers object columns as well as category/string-dtype ones. A separate
# encoder is kept per column so the code -> label mapping stays recoverable
# through label_encoders[col].classes_.
le = LabelEncoder()
label_encoders = {}
for col in X.select_dtypes(exclude=[np.number, 'bool']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Train a Random Forest model (fixed random_state for reproducible importances)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

# Plot feature importances, most important first.
# Feature names are taken from X.columns so they line up with the order of
# rf.feature_importances_.
importances = pd.DataFrame(
    {'feature': list(X.columns), 'importance': rf.feature_importances_}
).sort_values('importance', ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=importances)
plt.title('Feature Importances')
plt.show()

## Conclusion

**TODO:** Summarize the key findings from the analysis above here, and discuss potential next steps for feature engineering and modeling.