In [None]:

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Indonesia Heart Attack Prediction\n",
    "## Notebook 4: Data Exploration\n",
    "\n",
    "---\n",
    "\n",
    "### Tahap 4 dari Data Science Life Cycle\n",
    "\n",
    "Pada tahap ini, kita akan:\n",
    "1. Exploratory Data Analysis (EDA)\n",
    "2. Univariate Analysis - distribusi setiap variabel\n",
    "3. Bivariate Analysis - hubungan antara features dan target\n",
    "4. Multivariate Analysis - korelasi antar variabel\n",
    "5. Statistical Testing\n",
    "6. Generate insights dan hipotesis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Import Libraries dan Load Clean Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data manipulation\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Visualization\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "\n",
    "# Statistical analysis\n",
    "from scipy import stats\n",
    "from scipy.stats import chi2_contingency, ttest_ind, mannwhitneyu\n",
    "\n",
    "# System utilities\n",
    "import sys\n",
    "sys.path.append('../src')\n",
    "\n",
    "# Import custom modules\n",
    "from data_preprocessing import get_column_types\n",
    "\n",
    "# Settings\n",
    "pd.set_option('display.max_columns', None)\n",
    "plt.style.use('seaborn-v0_8-whitegrid')\n",
    "sns.set_palette('Set2')\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "print(\"Libraries imported successfully!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load cleaned data\n",
    "df = pd.read_csv('../data/heart_attack_data_cleaned.csv')\n",
    "\n",
    "print(f\"Dataset loaded: {df.shape}\")\n",
    "print(f\"Records: {df.shape[0]}, Features: {df.shape[1]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Target Variable Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Target Variable (Heart Attack) Distribution:\")\n",
    "print(\"=\"*60)\n",
    "\n",
    "target_dist = df['heart_attack'].value_counts()\n",
    "target_pct = df['heart_attack'].value_counts(normalize=True) * 100\n",
    "\n",
    "target_summary = pd.DataFrame({\n",
    "    'Count': target_dist,\n",
    "    'Percentage': target_pct\n",
    "})\n",
    "target_summary.index = ['No Heart Attack', 'Heart Attack']\n",
    "\n",
    "print(target_summary)\n",
    "print(f\"\\nClass Ratio: {target_dist[0]/target_dist[1]:.2f}:1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize target distribution\n",
    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
    "\n",
    "# Count plot\n",
    "colors = ['#90EE90', '#FF6B6B']\n",
    "target_dist.plot(kind='bar', ax=axes[0], color=colors)\n",
    "axes[0].set_title('Heart Attack Distribution (Count)', fontsize=14, fontweight='bold')\n",
    "axes[0].set_xlabel('Heart Attack Status')\n",
    "axes[0].set_ylabel('Count')\n",
    "axes[0].set_xticklabels(['No', 'Yes'], rotation=0)\n",
    "axes[0].grid(axis='y', alpha=0.3)\n",
    "\n",
    "for i, v in enumerate(target_dist):\n",
    "    axes[0].text(i, v + 5, str(v), ha='center', va='bottom', fontweight='bold')\n",
    "\n",
    "# Pie chart\n",
    "axes[1].pie(target_dist, labels=['No Heart Attack', 'Heart Attack'], \n",
    "           autopct='%1.1f%%', colors=colors, startangle=90,\n",
    "           explode=(0, 0.1))\n",
    "axes[1].set_title('Heart Attack Distribution (%)', fontsize=14, fontweight='bold')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Univariate Analysis - Demographics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.1 Age Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Age Statistics:\")\n",
    "print(\"=\"*60)\n",
    "print(df['age'].describe())\n",
    "\n",
    "# Visualize age distribution\n",
    "fig, axes = plt.subplots(1, 3, figsize=(18, 5))\n",
    "\n",
    "# Histogram\n",
    "axes[0].hist(df['age'], bins=20, color='skyblue', edgecolor='black')\n",
    "axes[0].set_title('Age Distribution', fontsize=12, fontweight='bold')\n",
    "axes[0].set_xlabel('Age')\n",
    "axes[0].set_ylabel('Frequency')\n",
    "axes[0].axvline(df['age'].mean(), color='red', linestyle='--', label=f'Mean: {df[\"age\"].mean():.1f}')\n",
    "axes[0].axvline(df['age'].median(), color='green', linestyle='--', label=f'Median: {df[\"age\"].median():.1f}')\n",
    "axes[0].legend()\n",
    "axes[0].grid(alpha=0.3)\n",
    "\n",
    "# Box plot\n",
    "axes[1].boxplot(df['age'])\n",
    "axes[1].set_title('Age Box Plot', fontsize=12, fontweight='bold')\n",
    "axes[1].set_ylabel('Age')\n",
    "axes[1].grid(alpha=0.3)\n",
    "\n",
    "# Age groups\n",
    "age_bins = [0, 30, 40, 50, 60, 100]\n",
    "age_labels = ['<30', '30-40', '40-50', '50-60', '60+']\n",
    "df['age_group'] = pd.cut(df['age'], bins=age_bins, labels=age_labels)\n",
    "age_group_counts = df['age_group'].value_counts().sort_index()\n",
    "\n",
    "age_group_counts.plot(kind='bar', ax=axes[2], color='coral')\n",
    "axes[2].set_title('Age Groups Distribution', fontsize=12, fontweight='bold')\n",
    "axes[2].set_xlabel('Age Group')\n",
    "axes[2].set_ylabel('Count')\n",
    "axes[2].set_xticklabels(age_group_counts.index, rotation=0)\n",
    "axes[2].grid(axis='y', alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.2 Gender Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Gender Distribution:\")\n",
    "print(\"=\"*60)\n",
    "gender_dist = df['gender'].value_counts()\n",
    "print(gender_dist)\n",
    "print(f\"\\nGender ratio (M:F): {gender_dist['Male']/gender_dist['Female']:.2f}:1\")\n",
    "\n",
    "# Visualize\n",
    "fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n",
    "\n",
    "gender_dist.plot(kind='bar', ax=axes[0], color=['#6495ED', '#FFB6C1'])\n",
    "axes[0].set_title('Gender Distribution', fontsize=12, fontweight='bold')\n",
    "axes[0].set_xlabel('Gender')\n",
    "axes[0].set_ylabel('Count')\n",
    "axes[0].set_xticklabels(gender_dist.index, rotation=0)\n",
    "axes[0].grid(axis='y', alpha=0.3)\n",
    "\n",
    "for i, v in enumerate(gender_dist):\n",
    "    axes[0].text(i, v + 5, str(v), ha='center', va='bottom', fontweight='bold')\n",
    "\n",
    "axes[1].pie(gender_dist, labels=gender_dist.index, autopct='%1.1f%%',\n",
    "           colors=['#6495ED', '#FFB6C1'], startangle=90)\n",
    "axes[1].set_title('Gender Distribution (%)', fontsize=12, fontweight='bold')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.3 Region and Income Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
    "\n",
    "# Region\n",
    "region_dist = df['region'].value_counts()\n",
    "region_dist.plot(kind='bar', ax=axes[0], color=['#98D8C8', '#F7DC6F'])\n",
    "axes[0].set_title('Region Distribution', fontsize=12, fontweight='bold')\n",
    "axes[0].set_xlabel('Region')\n",
    "axes[0].set_ylabel('Count')\n",
    "axes[0].set_xticklabels(region_dist.index, rotation=0)\n",
    "axes[0].grid(axis='y', alpha=0.3)\n",
    "\n",
    "for i, v in enumerate(region_dist):\n",
    "    axes[0].text(i, v + 5, str(v), ha='center', va='bottom', fontweight='bold')\n",
    "\n",
    "# Income Level\n",
    "income_order = ['Low', 'Middle', 'High']\n",
    "income_dist = df['income_level'].value_counts()[income_order]\n",
    "income_dist.plot(kind='bar', ax=axes[1], color=['#E74C3C', '#F39C12', '#27AE60'])\n",
    "axes[1].set_title('Income Level Distribution', fontsize=12, fontweight='bold')\n",
    "axes[1].set_xlabel('Income Level')\n",
    "axes[1].set_ylabel('Count')\n",
    "axes[1].set_xticklabels(income_dist.index, rotation=0)\n",
    "axes[1].grid(axis='y', alpha=0.3)\n",
    "\n",
    "for i, v in enumerate(income_dist):\n",
    "    axes[1].text(i, v + 5, str(v), ha='center', va='bottom', fontweight='bold')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Univariate Analysis - Clinical Risk Factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clinical binary factors\n",
    "clinical_binary = ['hypertension', 'diabetes', 'obesity', 'family_history', \n",
    "                   'previous_heart_disease']\n",
    "\n",
    "fig, axes = plt.subplots(2, 3, figsize=(18, 10))\n",
    "axes = axes.ravel()\n",
    "\n",
    "for idx, col in enumerate(clinical_binary):\n",
    "    counts = df[col].value_counts().sort_index()\n",
    "    percentages = df[col].value_counts(normalize=True).sort_index() * 100\n",
    "    \n",
    "    axes[idx].bar(['No', 'Yes'], counts, color=['lightgreen', 'salmon'])\n",
    "    axes[idx].set_title(f'{col.replace(\"_\", \" \").title()}', fontsize=12, fontweight='bold')\n",
    "    axes[idx].set_ylabel('Count')\n",
    "    axes[idx].grid(axis='y', alpha=0.3)\n",
    "    \n",
    "    # Add percentages on bars\n",
    "    for i, (v, p) in enumerate(zip(counts, percentages)):\n",
    "        axes[idx].text(i, v + 5, f'{v}\\n({p:.1f}%)', \n",
    "                      ha='center', va='bottom', fontweight='bold')\n",
    "\n",
    "# Hide the last subplot if not used\n",
    "if len(clinical_binary) < 6:\n",
    "    axes[5].axis('off')\n",
    "\n",
    "plt.suptitle('Clinical Risk Factors Distribution', fontsize=14, fontweight='bold')\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Summary statistics\n",
    "print(\"\\nClinical Risk Factors Summary:\")\n",
    "print(\"=\"*60)\n",
    "for col in clinical_binary:\n",
    "    prevalence = (df[col].sum() / len(df)) * 100\n",
    "    print(f\"{col.replace('_', ' ').title()}: {prevalence:.1f}%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.1 Cholesterol and Blood Pressure Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n",
    "\n",
    "# Cholesterol level\n",
    "axes[0, 0].hist(df['cholesterol_level'], bins=30, color='steelblue', edgecolor='black')\n",
    "axes[0, 0].axvline(df['cholesterol_level'].mean(), color='red', linestyle='--', \n",
    "                   label=f'Mean: {df[\"cholesterol_level\"].mean():.1f}')\n",
    "axes[0, 0].axvline(200, color='orange', linestyle='--', label='Desirable (<200)')\n",
    "axes[0, 0].axvline(240, color='darkred', linestyle='--', label='High (>240)')\n",
    "axes[0, 0].set_title('Total Cholesterol Distribution', fontsize=12, fontweight='bold')\n",
    "axes[0, 0].set_xlabel('Cholesterol Level (mg/dL)')\n",
    "axes[0, 0].set_ylabel('Frequency')\n",
    "axes[0, 0].legend()\n",
    "axes[0, 0].grid(alpha=0.3)\n",
    "\n",
    "# Blood Pressure Systolic\n",
    "axes[0, 1].hist(df['blood_pressure_systolic'], bins=30, color='coral', edgecolor='black')\n",
    "axes[0, 1].axvline(df['blood_pressure_systolic'].mean(), color='red', linestyle='--',\n",
    "                   label=f'Mean: {df[\"blood_pressure_systolic\"].mean():.1f}')\n",
    "axes[0, 1].axvline(120, color='green', linestyle='--', label='Normal (<120)')\n",
    "axes[0, 1].axvline(140, color='orange', linestyle='--', label='Stage 1 HT (140)')\n",
    "axes[0, 1].set_title('Systolic Blood Pressure Distribution', fontsize=12, fontweight='bold')\n",
    "axes[0, 1].set_xlabel('Systolic BP (mmHg)')\n",
    "axes[0, 1].set_ylabel('Frequency')\n",
    "axes[0, 1].legend()\n",
    "axes[0, 1].grid(alpha=0.3)\n",
    "\n",
    "# Blood Pressure Diastolic\n",
    "axes[1, 0].hist(df['blood_pressure_diastolic'], bins=30, color='lightcoral', edgecolor='black')\n",
    "axes[1, 0].axvline(df['blood_pressure_diastolic'].mean(), color='red', linestyle='--',\n",
    "                   label=f'Mean: {df[\"blood_pressure_diastolic\"].mean():.1f}')\n",
    "axes[1, 0].axvline(80, color='green', linestyle='--', label='Normal (<80)')\n",
    "axes[1, 0].axvline(90, color='orange', linestyle='--', label='Stage 1 HT (90)')\n",
    "axes[1, 0].set_title('Diastolic Blood Pressure Distribution', fontsize=12, fontweight='bold')\n",
    "axes[1, 0].set_xlabel('Diastolic BP (mmHg)')\n",
    "axes[1, 0].set_ylabel('Frequency')\n",
    "axes[1, 0].legend()\n",
    "axes[1, 0].grid(alpha=0.3)\n",
    "\n",
    "# Fasting Blood Sugar\n",
    "axes[1, 1].hist(df['fasting_blood_sugar'], bins=30, color='gold', edgecolor='black')\n",
    "axes[1, 1].axvline(df['fasting_blood_sugar'].mean(), color='red', linestyle='--',\n",
    "                   label=f'Mean: {df[\"fasting_blood_sugar\"].mean():.1f}')\n",
    "axes[1, 1].axvline(100, color='green', linestyle='--', label='Normal (<100)')\n",
    "axes[1, 1].axvline(126, color='darkred', linestyle='--', label='Diabetes (≥126)')\n",
    "axes[1, 1].set_title('Fasting Blood Sugar Distribution', fontsize=12, fontweight='bold')\n",
    "axes[1, 1].set_xlabel('Blood Sugar (mg/dL)')\n",
    "axes[1, 1].set_ylabel('Frequency')\n",
    "axes[1, 1].legend()\n",
    "axes[1, 1].grid(alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Univariate Analysis - Lifestyle Factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lifestyle_factors = ['smoking_status', 'alcohol_consumption', 'physical_activity', 'dietary_habits']\n",
    "\n",
    "fig, axes = plt.subplots(2, 2, figsize=(16, 10))\n",
    "axes = axes.ravel()\n",
    "\n",
    "colors_map = {\n",
    "    'smoking_status': ['lightgreen', 'yellow', 'salmon'],\n",
    "    'alcohol_consumption': ['lightblue', 'orange', 'red'],\n",
    "    'physical_activity': ['salmon', 'gold', 'lightgreen'],\n",
    "    'dietary_habits': ['lightgreen', 'salmon']\n",
    "}\n",
    "\n",
    "for idx, col in enumerate(lifestyle_factors):\n",
    "    counts = df[col].value_counts()\n",
    "    counts.plot(kind='bar', ax=axes[idx], color=colors_map[col])\n",
    "    axes[idx].set_title(f'{col.replace(\"_\", \" \").title()}', fontsize=12, fontweight='bold')\n",
    "    axes[idx].set_xlabel('')\n",
    "    axes[idx].set_ylabel('Count')\n",
    "    axes[idx].set_xticklabels(counts.index, rotation=45, ha='right')\n",
    "    axes[idx].grid(axis='y', alpha=0.3)\n",
    "    \n",
    "    # Add counts on bars\n",
    "    for i, v in enumerate(counts):\n",
    "        axes[idx].text(i, v + 5, str(v), ha='center', va='bottom', fontweight='bold')\n",
    "\n",
    "plt.suptitle('Lifestyle Factors Distribution', fontsize=14, fontweight='bold')\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Summary\n",
    "print(\"\\nLifestyle Factors Summary:\")\n",
    "print(\"=\"*60)\n",
    "for col in lifestyle_factors:\n",
    "    print(f\"\\n{col.replace('_', ' ').title()}:\")\n",
    "    print(df[col].value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Bivariate Analysis - Features vs Target"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.1 Age vs Heart Attack"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axes = plt.subplots(1, 3, figsize=(18, 5))\n",
    "\n",
    "# Box plot\n",
    "df.boxplot(column='age', by='heart_attack', ax=axes[0])\n",
    "axes[0].set_title('Age Distribution by Heart Attack Status')\n",
    "axes[0].set_xlabel('Heart Attack')\n",
    "axes[0].set_ylabel('Age')\n",
    "axes[0].set_xticklabels(['No', 'Yes'])\n",
    "plt.sca(axes[0])\n",
    "plt.xticks([1, 2], ['No', 'Yes'])\n",
    "\n",
    "# Violin plot\n",
    "sns.violinplot(data=df, x='heart_attack', y='age', ax=axes[1])\n",
    "axes[1].set_title('Age Distribution (Violin Plot)')\n",
    "axes[1].set_xlabel('Heart Attack')\n",
    "axes[1].set_ylabel('Age')\n",
    "axes[1].set_xticklabels(['No', 'Yes'])\n",
    "\n",
    "# Age group vs heart attack\n",
    "age_ha_crosstab = pd.crosstab(df['age_group'], df['heart_attack'], normalize='index') * 100\n",
    "age_ha_crosstab.plot(kind='bar', ax=axes[2], stacked=False, color=['lightgreen', 'salmon'])\n",
    "axes[2].set_title('Heart Attack Rate by Age Group')\n",
    "axes[2].set_xlabel('Age Group')\n",
    "axes[2].set_ylabel('Percentage (%)')\n",
    "axes[2].legend(['No Heart Attack', 'Heart Attack'])\n",
    "axes[2].set_xticklabels(age_ha_crosstab.index, rotation=0)\n",
    "axes[2].grid(axis='y', alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Statistical test\n",
    "no_ha = df[df['heart_attack'] == 0]['age']\n",
    "yes_ha = df[df['heart_attack'] == 1]['age']\n",
    "t_stat, p_value = ttest_ind(no_ha, yes_ha)\n",
    "\n",
    "print(f\"\\nT-test Results:\")\n",
    "print(f\"Mean age (No HA): {no_ha.mean():.2f}\")\n",
    "print(f\"Mean age (Yes HA): {yes_ha.mean():.2f}\")\n",
    "print(f\"T-statistic: {t_stat:.4f}\")\n",
    "print(f\"P-value: {p_value:.4f}\")\n",
    "print(f\"Significant: {'Yes' if p_value < 0.05 else 'No'} (α=0.05)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6.2 Gender vs Heart Attack"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create crosstab\n",
    "gender_ha = pd.crosstab(df['gender'], df['heart_attack'])\n",
    "gender_ha_pct = pd.crosstab(df['gender'], df['heart_attack'], normalize='index') * 100\n",
    "\n",
    "print(\"Gender vs Heart Attack Crosstab:\")\n",
    "print(\"=\"*60)\n",
    "print(gender_ha)\n",
    "print(\"\\nPercentages:\")\n",
    "print(gender_ha_pct)\n",
    "\n",
    "# Visualize\n",
    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
    "\n",
    "# Grouped bar chart\n",
    "gender_ha.plot(kind='bar', ax=axes[0], color=['lightgreen', 'salmon'])\n",
    "axes[0].set_title('Heart Attack Count by Gender', fontsize=12, fontweight='bold')\n",
    "axes[0].set_xlabel('Gender')\n",
    "axes[0].set_ylabel('Count')\n",
    "axes[0].legend(['No Heart Attack', 'Heart Attack'])\n",
    "axes[0].set_xticklabels(gender_ha.index, rotation=0)\n",
    "axes[0].grid(axis='y', alpha=0.3)\n",
    "\n",
    "# Percentage bar chart\n",
    "gender_ha_pct.plot(kind='bar', ax=axes[1], color=['lightgreen', 'salmon'])\n",
    "axes[1].set_title('Heart Attack Rate by Gender', fontsize=12, fontweight='bold')\n",
    "axes[1].set_xlabel('Gender')\n",
    "axes[1].set_ylabel('Percentage (%)')\n",
    "axes[1].legend(['No Heart Attack', 'Heart Attack'])\n",
    "axes[1].set_xticklabels(gender_ha_pct.index, rotation=0)\n",
    "axes[1].grid(axis='y', alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Chi-square test\n",
    "chi2, p_value, dof, expected = chi2_contingency(gender_ha)\n",
    "print(f\"\\nChi-square Test:\")\n",
    "print(f\"Chi2 statistic: {chi2:.4f}\")\n",
    "print(f\"P-value: {p_value:.4f}\")\n",
    "print(f\"Significant: {'Yes' if p_value < 0.05 else 'No'} (α=0.05)")""}
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 6.3 Clinical Risk Factors vs Heart Attack"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clinical_binary = ['hypertension', 'diabetes', 'obesity', 'family_history', 'previous_heart_disease']\n",
"\n",
"fig, axes = plt.subplots(2, 3, figsize=(18, 10))\n",
"axes = axes.ravel()\n",
"\n",
"for idx, col in enumerate(clinical_binary):\n",
"    # Calculate heart attack rate\n",
"    crosstab = pd.crosstab(df[col], df['heart_attack'], normalize='index') * 100\n",
"    \n",
"    crosstab.plot(kind='bar', ax=axes[idx], color=['lightgreen', 'salmon'])\n",
"    axes[idx].set_title(f'{col.replace("", " ").title()} vs Heart Attack', \n",
"                       fontsize=11, fontweight='bold')\n",
"    axes[idx].set_xlabel('')\n",
"    axes[idx].set_ylabel('Percentage (%)')\n",
"    axes[idx].legend(['No HA', 'Heart Attack'], fontsize=9)\n",
"    axes[idx].set_xticklabels(['No', 'Yes'], rotation=0)\n",
"    axes[idx].grid(axis='y', alpha=0.3)\n",
"    \n",
"    # Calculate and display heart attack rate for those with the condition\n",
"    if 1 in df[col].unique():\n",
"        ha_rate = df[df[col] == 1]['heart_attack'].mean() * 100\n",
"        axes[idx].text(0.5, 0.95, f'HA Rate (Yes): {ha_rate:.1f}%',\n",
"                      transform=axes[idx].transAxes, ha='center', va='top',\n",
"                      bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),\n",
"                      fontsize=9, fontweight='bold')\n",
"\n",
"# Hide last subplot\n",
"axes[5].axis('off')\n",
"\n",
"plt.suptitle('Clinical Risk Factors vs Heart Attack', fontsize=14, fontweight='bold')\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Statistical significance\n",
"print("\nStatistical Significance (Chi-square tests):")\n",
"print("="*60)\n",
"for col in clinical_binary:\n",
"    contingency_table = pd.crosstab(df[col], df['heart_attack'])\n",
"    chi2, p_value, dof, expected = chi2_contingency(contingency_table)\n",
"    print(f"{col.replace('', ' ').title():30s} - P-value: {p_value:.4f} {'' if p_value < 0.001 else '' if p_value < 0.01 else '' if p_value < 0.05 else 'ns'}")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 6.4 Lifestyle Factors vs Heart Attack"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lifestyle_factors = ['smoking_status', 'alcohol_consumption', 'physical_activity', 'dietary_habits']\n",
"\n",
"fig, axes = plt.subplots(2, 2, figsize=(16, 10))\n",
"axes = axes.ravel()\n",
"\n",
"for idx, col in enumerate(lifestyle_factors):\n",
"    crosstab = pd.crosstab(df[col], df['heart_attack'], normalize='index') * 100\n",
"    \n",
"    crosstab.plot(kind='bar', ax=axes[idx], color=['lightgreen', 'salmon'])\n",
"    axes[idx].set_title(f'{col.replace("", " ").title()} vs Heart Attack',\n",
"                       fontsize=12, fontweight='bold')\n",
"    axes[idx].set_xlabel('')\n",
"    axes[idx].set_ylabel('Percentage (%)')\n",
"    axes[idx].legend(['No Heart Attack', 'Heart Attack'])\n",
"    axes[idx].set_xticklabels(crosstab.index, rotation=45, ha='right')\n",
"    axes[idx].grid(axis='y', alpha=0.3)\n",
"\n",
"plt.suptitle('Lifestyle Factors vs Heart Attack', fontsize=14, fontweight='bold')\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Statistical tests\n",
"print("\nStatistical Significance (Chi-square tests):")\n",
"print("="*60)\n",
"for col in lifestyle_factors:\n",
"    contingency_table = pd.crosstab(df[col], df['heart_attack'])\n",
"    chi2, p_value, dof, expected = chi2_contingency(contingency_table)\n",
"    print(f"{col.replace('', ' ').title():25s} - P-value: {p_value:.4f} {'' if p_value < 0.001 else '' if p_value < 0.01 else '' if p_value < 0.05 else 'ns'}")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Multivariate Analysis - Correlation Matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Select numerical columns for correlation\n",
"numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()\n",
"\n",
"# Calculate correlation matrix\n",
"corr_matrix = df[numerical_cols].corr()\n",
"\n",
"# Plot correlation heatmap\n",
"plt.figure(figsize=(16, 14))\n",
"mask = np.triu(np.ones_like(corr_matrix, dtype=bool))\n",
"sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f', cmap='coolwarm',\n",
"           center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8})\n",
"plt.title('Correlation Matrix - Numerical Features', fontsize=14, fontweight='bold')\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Top correlations with heart_attack\n",
"print("\nTop Correlations with Heart Attack:")\n",
"print("="*60)\n",
"ha_corr = corr_matrix['heart_attack'].drop('heart_attack').sort_values(ascending=False)\n",
"print(ha_corr.head(10))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7.1 Feature Correlation with Target"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Visualize top correlations with target\n",
"top_n = 15\n",
"top_corr = corr_matrix['heart_attack'].drop('heart_attack').abs().sort_values(ascending=False).head(top_n)\n",
"\n",
"plt.figure(figsize=(12, 8))\n",
"colors = ['red' if x < 0 else 'green' for x in corr_matrix['heart_attack'][top_corr.index]]\n",
"plt.barh(range(len(top_corr)), corr_matrix['heart_attack'][top_corr.index], color=colors)\n",
"plt.yticks(range(len(top_corr)), top_corr.index)\n",
"plt.xlabel('Correlation Coefficient')\n",
"plt.title(f'Top {top_n} Features Correlated with Heart Attack', fontsize=14, fontweight='bold')\n",
"plt.axvline(x=0, color='black', linestyle='--', linewidth=0.8)\n",
"plt.grid(axis='x', alpha=0.3)\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 8. Key Insights and Findings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print("\n" + "="*60)\n",
"print("KEY FINDINGS FROM DATA EXPLORATION")\n",
"print("="*60)\n",
"\n",
"# 1. Target distribution\n",
"ha_pct = (df['heart_attack'].sum() / len(df)) * 100\n",
"print(f"\n1. TARGET VARIABLE:")\n",
"print(f"   - Heart attack prevalence: {ha_pct:.1f}%")\n",
"print(f"   - Class distribution: {'Balanced' if 40 < ha_pct < 60 else 'Imbalanced'}")\n",
"\n",
"# 2. Demographics\n",
"print(f"\n2. DEMOGRAPHICS:")\n",
"print(f"   - Mean age: {df['age'].mean():.1f} years")\n",
"print(f"   - Age range: {df['age'].min():.0f} - {df['age'].max():.0f} years")\n",
"print(f"   - Gender: {(df['gender']=='Male').sum()} Male, {(df['gender']=='Female').sum()} Female")\n",
"\n",
"# 3. Top risk factors\n",
"print(f"\n3. TOP RISK FACTORS (by correlation):")\n",
"top_5_risk = corr_matrix['heart_attack'].drop('heart_attack').abs().sort_values(ascending=False).head(5)\n",
"for i, (feature, corr) in enumerate(top_5_risk.items(), 1):\n",
"    print(f"   {i}. {feature}: {corr:.3f}")\n",
"\n",
"# 4. Clinical factors prevalence\n",
"print(f"\n4. CLINICAL RISK FACTORS PREVALENCE:")\n",
"for col in clinical_binary:\n",
"    prev = (df[col].sum() / len(df)) * 100\n",
"    print(f"   - {col.replace('_', ' ').title()}: {prev:.1f}%")\n",
"\n",
"# 5. Lifestyle patterns\n",
"print(f"\n5. LIFESTYLE PATTERNS:")\n",
"print(f"   - Current smokers: {(df['smoking_status']=='Current').sum()} ({(df['smoking_status']=='Current').sum()/len(df)*100:.1f}%)")\n",
"print(f"   - Low physical activity: {(df['physical_activity']=='Low').sum()} ({(df['physical_activity']=='Low').sum()/len(df)*100:.1f}%)")\n",
"print(f"   - Unhealthy diet: {(df['dietary_habits']=='Unhealthy').sum()} ({(df['dietary_habits']=='Unhealthy').sum()/len(df)*100:.1f}%)")\n",
"\n",
"print("\n" + "="*60)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summary\n",
"\n",
"Pada tahap Data Exploration ini, kita telah:\n",
"\n",
"1. ✅ Target Variable Analysis: Analyzed distribution dan prevalence serangan jantung\n",
"2. ✅ Univariate Analysis: Explored distribusi setiap feature\n",
"   - Demographics (age, gender, region, income)\n",
"   - Clinical risk factors (hypertension, diabetes, obesity, dll)\n",
"   - Lifestyle factors (smoking, alcohol, physical activity, diet)\n",
"3. ✅ Bivariate Analysis: Examined relationships antara features dan target\n",
"   - Age vs heart attack (significant)\n",
"   - Gender vs heart attack\n",
"   - Clinical factors vs heart attack (highly significant)\n",
"   - Lifestyle factors vs heart attack\n",
"4. ✅ Multivariate Analysis: Correlation analysis antar variables\n",
"5. ✅ Statistical Testing: Chi-square and t-tests untuk significance\n",
"6. ✅ Key Insights: Identified top risk factors dan patterns\n",
"\n",
"### Major Insights:\n",
"- [Top risk factors identified from correlation]\n",
"- [Significant demographic patterns]\n",
"- [Important lifestyle associations]\n",
"- [Clinical predictors highlighted]\n",
"\n",
"### Hypotheses for Modeling:\n",
"1. Age adalah strong predictor untuk heart attack\n",
"2. Clinical factors (hypertension, diabetes) highly associated dengan target\n",
"3. Previous heart disease adalah critical indicator\n",
"4. Kombinasi multiple risk factors increases probability significantly\n",
"\n",
"### Next Steps:\n",
"Lanjut ke Notebook 5: Feature Engineering untuk create new features dan prepare data untuk modeling.\n",
"\n",
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

SyntaxError: unexpected character after line continuation character (4115811542.py, line 619)

In [3]:
import pandas as pd

print("==============================================")
print("04. DATA EXPLORATION (EDA)")
print("==============================================")

# Load dataset (gunakan data yang sudah clean jika sudah dilakukan cleaning)
df = pd.read_csv('../data/heart_attack_data.csv')

# Tampilkan 5 baris awal data
print("\n=== PREVIEW DATA (5 BARIS PERTAMA) ===")
display(df.head())

# Informasi struktur dataset
print("\n=== INFORMASI DATA ===")
print(df.info())

# Statistik deskriptif
print("\n=== STATISTIK DESKRIPTIF DATA NUMERIK ===")
print(df.describe())

# Cek distribusi jumlah kategorikal
print("\n=== DISTRIBUSI DATA KATEGORIKAL ===")
print(df.nunique())


04. DATA EXPLORATION (EDA)

=== PREVIEW DATA (5 BARIS PERTAMA) ===


Unnamed: 0,age,gender,region,income_level,hypertension,diabetes,cholesterol_level,obesity,waist_circumference,family_history,...,blood_pressure_diastolic,fasting_blood_sugar,cholesterol_hdl,cholesterol_ldl,triglycerides,EKG_results,previous_heart_disease,medication_usage,participated_in_free_screening,heart_attack
0,60,Male,Rural,Middle,0,1,211,0,83,0,...,62,173,48,121,101,Normal,0,0,0,0
1,53,Female,Urban,Low,0,0,208,0,106,1,...,76,70,58,83,138,Normal,1,0,1,0
2,62,Female,Urban,Low,0,0,231,1,112,1,...,74,118,69,130,171,Abnormal,0,1,0,1
3,73,Male,Urban,Low,1,0,202,0,82,1,...,65,98,52,85,146,Normal,0,1,1,0
4,52,Male,Urban,Middle,1,0,232,0,89,0,...,75,104,59,127,139,Normal,1,0,1,1



=== INFORMASI DATA ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158355 entries, 0 to 158354
Data columns (total 28 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   age                             158355 non-null  int64  
 1   gender                          158355 non-null  object 
 2   region                          158355 non-null  object 
 3   income_level                    158355 non-null  object 
 4   hypertension                    158355 non-null  int64  
 5   diabetes                        158355 non-null  int64  
 6   cholesterol_level               158355 non-null  int64  
 7   obesity                         158355 non-null  int64  
 8   waist_circumference             158355 non-null  int64  
 9   family_history                  158355 non-null  int64  
 10  smoking_status                  158355 non-null  object 
 11  alcohol_consumption             63507 non-null   objec