In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exploratory Data Analysis\n",
    "## Automated Analytics & Predictive Modeling Tool\n",
    "\n",
    "This notebook demonstrates the exploratory analysis capabilities of the tool."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from datetime import datetime\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Set plotting style\n",
    "plt.style.use('seaborn-v0_8-whitegrid')\n",
    "sns.set_palette(\"husl\")\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load data\n",
    "from src.data_processor import DataProcessor\n",
    "\n",
    "processor = DataProcessor()\n",
    "data = processor.process_pipeline()\n",
    "\n",
    "print(f\"Data shape: {data.shape}\")\n",
    "print(\"\\nFirst 5 rows:\")\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data overview\n",
    "print(\"Data Information:\")\n",
    "print(\"=\" * 50)\n",
    "data.info()\n",
    "\n",
    "print(\"\\n\\nDescriptive Statistics:\")\n",
    "print(\"=\" * 50)\n",
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Time series plots\n",
    "fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n",
    "\n",
    "# Sales trend\n",
    "axes[0, 0].plot(data['date'], data['sales'], marker='o', linewidth=2)\n",
    "axes[0, 0].set_title('Daily Sales Trend', fontsize=14, fontweight='bold')\n",
    "axes[0, 0].set_xlabel('Date')\n",
    "axes[0, 0].set_ylabel('Sales')\n",
    "axes[0, 0].grid(True, alpha=0.3)\n",
    "\n",
    "# Revenue trend\n",
    "axes[0, 1].plot(data['date'], data['revenue'], marker='s', color='green', linewidth=2)\n",
    "axes[0, 1].set_title('Daily Revenue Trend', fontsize=14, fontweight='bold')\n",
    "axes[0, 1].set_xlabel('Date')\n",
    "axes[0, 1].set_ylabel('Revenue')\n",
    "axes[0, 1].grid(True, alpha=0.3)\n",
    "\n",
    "# Users trend\n",
    "axes[1, 0].plot(data['date'], data['users'], marker='^', color='orange', linewidth=2)\n",
    "axes[1, 0].set_title('Daily Active Users', fontsize=14, fontweight='bold')\n",
    "axes[1, 0].set_xlabel('Date')\n",
    "axes[1, 0].set_ylabel('Users')\n",
    "axes[1, 0].grid(True, alpha=0.3)\n",
    "\n",
    "# Conversion rate trend\n",
    "axes[1, 1].plot(data['date'], data['conversion_rate'] * 100, marker='d', color='red', linewidth=2)\n",
    "axes[1, 1].set_title('Conversion Rate Trend', fontsize=14, fontweight='bold')\n",
    "axes[1, 1].set_xlabel('Date')\n",
    "axes[1, 1].set_ylabel('Conversion Rate (%)')\n",
    "axes[1, 1].grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Correlation analysis\n",
    "numeric_cols = ['sales', 'revenue', 'users', 'conversion_rate']\n",
    "correlation_matrix = data[numeric_cols].corr()\n",
    "\n",
    "plt.figure(figsize=(10, 8))\n",
    "sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,\n",
    "            square=True, linewidths=1, cbar_kws={\"shrink\": 0.8})\n",
    "plt.title('Correlation Matrix of Key Metrics', fontsize=16, fontweight='bold')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predictive modeling demonstration\n",
    "from src.predictive_model import PredictiveModel\n",
    "\n",
    "model = PredictiveModel()\n",
    "metrics = model.train(data, target_column='sales')\n",
    "\n",
    "print(\"Model Performance Metrics:\")\n",
    "print(\"=\" * 50)\n",
    "for key, value in metrics.items():\n",
    "    if isinstance(value, float):\n",
    "        print(f\"{key}: {value:.4f}\")\n",
    "    else:\n",
    "        print(f\"{key}: {value}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate and visualize forecast\n",
    "forecast = model.forecast(data, periods=14, target_column='sales')\n",
    "\n",
    "# Create forecast visualization\n",
    "plt.figure(figsize=(14, 7))\n",
    "\n",
    "# Plot historical data\n",
    "plt.plot(data['date'], data['sales'], 'b-', label='Historical Sales', linewidth=2, alpha=0.7)\n",
    "\n",
    "# Plot forecast\n",
    "plt.plot(forecast['date'], forecast['predicted_sales'], 'r--', label='Forecast', linewidth=3)\n",
    "\n",
    "# Plot confidence interval\n",
    "plt.fill_between(forecast['date'], \n",
    "                 forecast['confidence_interval_lower'], \n",
    "                 forecast['confidence_interval_upper'], \n",
    "                 alpha=0.2, color='red', label='90% Confidence Interval')\n",
    "\n",
    "plt.title('14-Day Sales Forecast with Confidence Interval', fontsize=16, fontweight='bold')\n",
    "plt.xlabel('Date', fontsize=12)\n",
    "plt.ylabel('Sales', fontsize=12)\n",
    "plt.legend(fontsize=11)\n",
    "plt.grid(True, alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature importance analysis\n",
    "feature_importance = pd.DataFrame({\n",
    "    'Feature': list(metrics['feature_importance'].keys()),\n",
    "    'Importance': list(metrics['feature_importance'].values())\n",
    "}).sort_values('Importance', key=abs, ascending=False)\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "bars = plt.barh(feature_importance['Feature'], feature_importance['Importance'])\n",
    "\n",
    "# Color bars based on sign\n",
    "for bar in bars:\n",
    "    if bar.get_width() >= 0:\n",
    "        bar.set_color('green')\n",
    "    else:\n",
    "        bar.set_color('red')\n",
    "\n",
    "plt.xlabel('Coefficient Value', fontsize=12)\n",
    "plt.title('Feature Importance (Linear Regression Coefficients)', fontsize=14, fontweight='bold')\n",
    "plt.grid(True, alpha=0.3, axis='x')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Key Insights\n",
    "\n",
    "From this analysis, we can observe:\n",
    "\n",
    "1. **Trend Patterns**: Clear patterns in daily metrics with potential weekly seasonality\n",
    "2. **Correlations**: Strong relationships between different business metrics\n",
    "3. **Forecast Reliability**: Model shows good predictive capability with reasonable confidence intervals\n",
    "4. **Feature Importance**: Time-based features (lags, rolling statistics) are most significant for predictions\n",
    "\n",
    "These insights directly support the \"AI use case\" for proactive performance management."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}