In [None]:
# layer1/notebooks/03_preprocessing_demo.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing Demo\n",
    "\n",
    "Demonstrate the preprocessing pipeline and feature engineering."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.insert(0, '..')\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from layer1.core.preprocessing import DataPreprocessor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the raw insurance-claims CSV. `df` stays the untouched input that\n",
    "# later cells compare the processed frame against.\n",
    "df = pd.read_csv(\n",
    "    '../data/raw/insurance_claims.csv'\n",
    ")\n",
    "\n",
    "# Report the input's dimensions and column names before any preprocessing.\n",
    "print(f\"Original shape: {df.shape}\")\n",
    "print(f\"Original columns: {df.columns.tolist()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run Preprocessing Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the preprocessing pipeline on the raw frame.\n",
    "preprocessor = DataPreprocessor()\n",
    "processed_df = preprocessor.preprocess(df)\n",
    "\n",
    "print(f\"Processed shape: {processed_df.shape}\")\n",
    "\n",
    "# Columns present after preprocessing but absent from the input.\n",
    "# NOTE(review): this diff is only meaningful if preprocess() returns a new\n",
    "# frame rather than adding columns to `df` in place -- confirm.\n",
    "new_cols = set(processed_df.columns) - set(df.columns)\n",
    "\n",
    "print(\"\\nNew columns added:\")\n",
    "if new_cols:\n",
    "    # Sorted so the listing is stable from run to run (set order is not).\n",
    "    for col in sorted(new_cols):\n",
    "        print(f\"  - {col}\")\n",
    "else:\n",
    "    # Say so explicitly instead of leaving a dangling header.\n",
    "    print(\"  (none)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Markov State Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the rows for each distinct value of the 'markov_state' column.\n",
    "state_counts = processed_df['markov_state'].value_counts()\n",
    "print(\"Markov State Distribution:\")\n",
    "print(state_counts)\n",
    "\n",
    "# Draw on an explicit Axes so the labels below apply to this figure only,\n",
    "# independent of whatever pyplot considers the current figure.\n",
    "fig, ax = plt.subplots()\n",
    "state_counts.plot(kind='bar', ax=ax)\n",
    "ax.set_title('Distribution of Markov States')\n",
    "ax.set_xlabel('Markov State')\n",
    "ax.set_ylabel('Count')\n",
    "ax.tick_params(axis='x', labelrotation=45)\n",
    "\n",
    "# tight_layout keeps the rotated tick labels inside the figure; plt.show()\n",
    "# renders the chart without echoing a return value as cell output.\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Groups for Layer 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ask the preprocessor which columns belong to each feature group, then\n",
    "# report which of them are actually present in the processed frame.\n",
    "feature_groups = preprocessor.get_feature_groups()\n",
    "max_shown = 5  # cap on the number of present columns listed per group\n",
    "\n",
    "for group, columns in feature_groups.items():\n",
    "    print(f\"\\n{group.upper()}:\")\n",
    "    available = [c for c in columns if c in processed_df.columns]\n",
    "    missing = [c for c in columns if c not in processed_df.columns]\n",
    "\n",
    "    # List at most `max_shown` present columns, then summarise the rest.\n",
    "    for col in available[:max_shown]:\n",
    "        print(f\"  \u2713 {col}\")\n",
    "    if len(available) > max_shown:\n",
    "        print(f\"  ... and {len(available) - max_shown} more\")\n",
    "\n",
    "    # Expected columns that preprocessing did not produce are listed in full.\n",
    "    if missing:\n",
    "        print(f\"  Missing: {missing}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}