In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Preprocessing for Walmart Demand Prediction\n",
    "\n",
    "This notebook demonstrates the data preprocessing steps for the demand prediction system."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "from datetime import datetime, timedelta\n",
    "from pathlib import Path\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Generate Sample Data\n",
    "\n",
    "For demonstration purposes, we'll create sample sales data with the following features:\n",
    "- Date\n",
    "- Region\n",
    "- Product ID\n",
    "- Sales Quantity\n",
    "- Price\n",
    "- Promotion Flag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Build a reproducible synthetic sales dataset\n",
    "np.random.seed(42)\n",
    "\n",
    "# Simulation dimensions\n",
    "n_days = 365\n",
    "regions = ['North', 'South', 'East', 'West']\n",
    "products = [f'SKU{i:03d}' for i in range(1, 11)]\n",
    "\n",
    "# One calendar day per entry, starting 2023-01-01\n",
    "start_date = datetime(2023, 1, 1)\n",
    "dates = [start_date + timedelta(days=offset) for offset in range(n_days)]\n",
    "\n",
    "# One record per (date, region, product) combination\n",
    "data = []\n",
    "for date in dates:\n",
    "    # Sine-wave seasonality; it depends only on the day of the year,\n",
    "    # so it is computed once per date rather than once per record\n",
    "    seasonality = 20 * np.sin(2 * np.pi * date.timetuple().tm_yday / 365)\n",
    "\n",
    "    for region in regions:\n",
    "        for product in products:\n",
    "            # Random draws are taken in a fixed order (demand, promotion, price)\n",
    "            # so the seeded output stays the same from run to run\n",
    "            base_demand = np.random.normal(100, 20)\n",
    "            is_promotion = np.random.random() < 0.1  # 10% chance of promotion\n",
    "            base_price = np.random.normal(50, 10)\n",
    "\n",
    "            # A promotion adds 30 units of demand and multiplies the price by 0.8\n",
    "            if is_promotion:\n",
    "                promotion_effect = 30\n",
    "                price = base_price * 0.8\n",
    "            else:\n",
    "                promotion_effect = 0\n",
    "                price = base_price\n",
    "\n",
    "            # Demand is truncated to a whole number and floored at zero\n",
    "            demand = max(0, int(base_demand + seasonality + promotion_effect))\n",
    "\n",
    "            record = {\n",
    "                'date': date,\n",
    "                'region': region,\n",
    "                'product_id': product,\n",
    "                'sales_quantity': demand,\n",
    "                'price': round(price, 2),\n",
    "                'promotion_flag': is_promotion\n",
    "            }\n",
    "            data.append(record)\n",
    "\n",
    "# Assemble all records into a single DataFrame and preview it\n",
    "df = pd.DataFrame(data)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Data Cleaning and Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Derive calendar features from the date column\n",
    "date_parts = df['date'].dt\n",
    "df['month'] = date_parts.month\n",
    "df['day_of_week'] = date_parts.dayofweek\n",
    "df['quarter'] = date_parts.quarter\n",
    "\n",
    "# Revenue per record = units sold x unit price\n",
    "df['revenue'] = df['sales_quantity'].mul(df['price'])\n",
    "\n",
    "# Report the number of missing values in each column\n",
    "missing_counts = df.isna().sum()\n",
    "print(\"Missing values:\")\n",
    "print(missing_counts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Data Analysis and Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Draw one line per region showing its mean daily sales quantity\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "for region in regions:\n",
    "    in_region = df['region'].eq(region)\n",
    "    daily_mean = df.loc[in_region].groupby('date')['sales_quantity'].mean()\n",
    "    ax.plot(daily_mean.index, daily_mean.values, label=region)\n",
    "\n",
    "ax.set_title('Average Daily Sales by Region')\n",
    "ax.set_xlabel('Date')\n",
    "ax.set_ylabel('Sales Quantity')\n",
    "ax.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Compare the mean and spread of sales with and without a promotion\n",
    "sales_by_promo = df.groupby('promotion_flag')['sales_quantity']\n",
    "promo_analysis = sales_by_promo.agg(['mean', 'std'])\n",
    "\n",
    "print(\"\\nPromotion Impact Analysis:\")\n",
    "print(promo_analysis)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Save Processed Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Save to CSV\n",
    "# DataFrame.to_csv does not create missing parent directories, so make sure\n",
    "# the target folder exists first; otherwise this cell fails on a fresh checkout.\n",
    "output_path = Path('../data/processed_sales_data.csv')\n",
    "output_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# index=False keeps the row index out of the file\n",
    "df.to_csv(output_path, index=False)\n",
    "print(\"Data saved to processed_sales_data.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}