# Heart Disease Prediction - Data Exploration & Analysis

This notebook provides comprehensive data exploration and analysis for the heart disease prediction dataset.

## Table of Contents
1. [Data Loading and Overview](#data-loading)
2. [Exploratory Data Analysis](#eda)
3. [Feature Analysis](#feature-analysis)
4. [Correlation Analysis](#correlation)
5. [Data Preprocessing](#preprocessing)
6. [Statistical Insights](#statistics)
7. [Conclusions](#conclusions)

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Configure display options: never truncate columns, cap printed rows at 100
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
# NOTE(review): this hides *every* warning raised later in the notebook
# (deprecations, NumPy divide-by-zero RuntimeWarnings, ...). Kept unchanged to
# preserve the notebook's quiet output; consider narrowing it by category.
warnings.filterwarnings('ignore')

# Set plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship the un-suffixed name. Use whichever one is
# installed so this cell does not fail on either side of that rename.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")

print("üìä Libraries imported successfully!")
print("üé® Plotting style configured")

<a id="data-loading"></a>
## 1. Data Loading and Overview

Let's start by loading the heart disease dataset and getting an overview of the data structure.

In [None]:
# Read the heart disease CSV. When the file is absent, synthesise a stand-in
# dataset so the remaining cells can still be executed end to end.
data_path = "../data/heart_disease_dataset.csv"

try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print("‚ùå Dataset not found. Creating sample dataset...")

    # Fixed seed: the fallback data is identical on every run
    np.random.seed(42)
    n_samples = 1000

    # The order of the draws below determines the generated values for the
    # seed above, so the entries must stay in this order.
    data = dict(
        age=np.random.randint(30, 80, n_samples),
        sex=np.random.randint(0, 2, n_samples),
        cp=np.random.randint(0, 4, n_samples),
        trestbps=np.random.randint(90, 200, n_samples),
        chol=np.random.randint(120, 400, n_samples),
        fbs=np.random.randint(0, 2, n_samples),
        restecg=np.random.randint(0, 3, n_samples),
        thalach=np.random.randint(80, 200, n_samples),
        exang=np.random.randint(0, 2, n_samples),
        oldpeak=np.random.uniform(0, 6, n_samples),
        slope=np.random.randint(0, 3, n_samples),
        ca=np.random.randint(0, 4, n_samples),
        thal=np.random.randint(0, 4, n_samples),
    )

    # Each indicator adds a fixed weight to the score; the last term is
    # uniform noise worth at most 0.2.
    risk_terms = [
        0.1 * (data['age'] > 55),
        0.2 * (data['cp'] > 0),
        0.15 * (data['chol'] > 240),
        0.1 * (data['thalach'] < 120),
        0.15 * (data['exang'] == 1),
        0.1 * (data['oldpeak'] > 2),
        0.2 * np.random.random(n_samples),
    ]
    target_prob = sum(risk_terms)

    # Scores above 0.5 are labelled as the positive class
    data['target'] = (target_prob > 0.5).astype(int)
    df = pd.DataFrame(data)

    print(f"‚úÖ Sample dataset created: {df.shape}")
else:
    print("‚úÖ Dataset loaded successfully!")
    print(f"üìä Dataset shape: {df.shape}")

# Report the basic dimensions and the column names
n_rows, n_cols = df.shape
print("\nüìã Dataset Info:")
print(f"Rows: {n_rows}")
print(f"Columns: {n_cols}")
print("\nüìä Column Names:")
print(df.columns.tolist())

In [None]:
# Preview the leading records, then print the column/dtype/non-null summary
print("üîç First 5 rows of the dataset:")
first_rows = df.head(5)
display(first_rows)

print("\nüìä Dataset Info:")
df.info()

In [None]:
# Count missing cells per column and express them as a share of all rows
print("üîç Missing Values Analysis:")
missing_values = df.isnull().sum()
missing_percent = missing_values / len(df) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_percent
})

# Only list the columns that actually contain gaps
has_gaps = missing_df['Missing Count'].gt(0)
display(missing_df.loc[has_gaps])

total_missing = missing_df['Missing Count'].sum()
if total_missing != 0:
    print(f"‚ö†Ô∏è  Total missing values: {total_missing}")
else:
    print("‚úÖ No missing values found!")

<a id="eda"></a>
## 2. Exploratory Data Analysis

Let's explore the distribution of our target variable and basic statistics.

In [None]:
# Target variable analysis
target_col = 'target'  # Adjust if your target column has a different name

print("üéØ Target Variable Analysis:")
print(f"\nTarget distribution:")

# value_counts() orders by frequency, not by class value. Sorting by class
# keeps the table, the count plot and the pie labels below referring to the
# same class even when class 1 is the majority.
target_counts = df[target_col].value_counts().sort_index()
target_percent = target_counts / target_counts.sum() * 100

target_summary = pd.DataFrame({
    'Count': target_counts,
    'Percentage': target_percent
})

display(target_summary)

# Visualize target distribution
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Count plot
sns.countplot(data=df, x=target_col, ax=axes[0])
axes[0].set_title('Target Variable Distribution (Count)')
axes[0].set_xlabel('Heart Disease (0=No, 1=Yes)')

# Pie chart: labels are derived from the classes actually present, so a wedge
# can never be given the other class's name and an unexpected class value is
# shown verbatim instead of breaking the plot.
class_names = {0: 'No Disease', 1: 'Disease'}
pie_labels = [class_names.get(cls, str(cls)) for cls in target_counts.index]
axes[1].pie(target_counts.values, labels=pie_labels, autopct='%1.1f%%')
axes[1].set_title('Target Variable Distribution (Percentage)')

plt.tight_layout()
plt.show()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) as produced by describe()
print("üìä Descriptive Statistics:")
summary_stats = df.describe()
display(summary_stats)

<a id="feature-analysis"></a>
## 3. Feature Analysis

Let's analyze individual features and their relationship with the target variable.

In [None]:
# Separate numerical and categorical features; the target is never a feature.
numerical_features = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col != target_col
]

# 'bool' and 'category' columns match neither np.number nor 'object', so they
# are listed explicitly here; otherwise they would silently drop out of every
# later analysis step.
categorical_features = [
    col for col in df.select_dtypes(include=['object', 'category', 'bool']).columns
    if col != target_col
]

print(f"üìä Numerical features ({len(numerical_features)}): {numerical_features}")
print(f"üìä Categorical features ({len(categorical_features)}): {categorical_features}")

# Identify binary features (likely categorical despite being numeric)
binary_features = [col for col in numerical_features if df[col].nunique() <= 2]

print(f"üî¢ Binary features: {binary_features}")

# Continuous numerical features: every numeric column with more than two
# distinct values.
# NOTE(review): integer-coded categories with 3+ levels (the synthetic data
# has several columns drawn from 3-4 integer codes) also land here and are
# later treated as continuous — confirm this is intended.
continuous_features = [col for col in numerical_features if col not in binary_features]
print(f"üìà Continuous features: {continuous_features}")

In [None]:
# Distribution of continuous features: one histogram per feature, 3 per row
if continuous_features:
    n_features = len(continuous_features)
    cols = 3
    rows = (n_features + cols - 1) // cols  # ceiling division

    # squeeze=False always yields a 2-D array of Axes, so ravel() gives a flat
    # sequence for any feature count. Wrapping the array in a list for the
    # single-feature case made axes[0] an array and broke the .hist() call.
    fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
    axes = axes.ravel()

    for ax, feature in zip(axes, continuous_features):
        ax.hist(df[feature], bins=30, alpha=0.7, edgecolor='black')
        ax.set_title(f'Distribution of {feature}')
        ax.set_xlabel(feature)
        ax.set_ylabel('Frequency')

    # Hide empty subplots
    for ax in axes[n_features:]:
        ax.axis('off')

    plt.suptitle('Distribution of Continuous Features', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()
else:
    print("No continuous features found.")

In [None]:
# Box plots of continuous features by target: one panel per feature, 3 per row
if continuous_features:
    n_features = len(continuous_features)
    cols = 3
    rows = (n_features + cols - 1) // cols  # ceiling division

    # squeeze=False always yields a 2-D array of Axes, so ravel() gives a flat
    # sequence for any feature count. Wrapping the array in a list for the
    # single-feature case made axes[0] an array instead of an Axes.
    fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
    axes = axes.ravel()

    for ax, feature in zip(axes, continuous_features):
        sns.boxplot(data=df, x=target_col, y=feature, ax=ax)
        ax.set_title(f'{feature} by Target')
        ax.set_xlabel('Heart Disease (0=No, 1=Yes)')

    # Hide empty subplots
    for ax in axes[n_features:]:
        ax.axis('off')

    plt.suptitle('Feature Distributions by Target Variable', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

In [None]:
# Categorical/Binary features analysis: per-level target share, 3 panels per row
categorical_and_binary = categorical_features + binary_features

if categorical_and_binary:
    n_features = len(categorical_and_binary)
    cols = 3
    rows = (n_features + cols - 1) // cols  # ceiling division

    # squeeze=False always yields a 2-D array of Axes, so ravel() gives a flat
    # sequence for any feature count. Wrapping the array in a list for the
    # single-feature case made axes[0] an array instead of an Axes.
    fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
    axes = axes.ravel()

    for ax, feature in zip(axes, categorical_and_binary):
        # Row-normalised cross-tabulation: each feature level sums to 100 %
        ct = pd.crosstab(df[feature], df[target_col], normalize='index') * 100
        ct.plot(kind='bar', ax=ax, width=0.8)
        ax.set_title(f'{feature} vs Target (%)')
        ax.set_xlabel(feature)
        ax.set_ylabel('Percentage')
        # crosstab orders its columns by target value, so with a 0/1 target
        # the first series is class 0 and the second is class 1.
        ax.legend(['No Disease', 'Disease'])
        ax.tick_params(axis='x', rotation=45)

    # Hide empty subplots
    for ax in axes[n_features:]:
        ax.axis('off')

    plt.suptitle('Categorical Features vs Target Variable', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

<a id="correlation"></a>
## 4. Correlation Analysis

Let's analyze correlations between features and with the target variable.

In [None]:
# Calculate correlation matrix.
# Restrict to numeric columns explicitly: from pandas 2.0 on, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead.
# For an all-numeric frame the result is unchanged.
correlation_matrix = df.select_dtypes(include=[np.number]).corr()

# Plot correlation heatmap
plt.figure(figsize=(12, 10))
# Mask the upper triangle (diagonal included): the matrix is symmetric, so
# only the lower triangle carries information.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix,
            mask=mask,
            annot=True,
            cmap='coolwarm',
            center=0,
            square=True,
            fmt='.2f')
plt.title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Rank every feature by the magnitude of its correlation with the target
target_correlations = (
    correlation_matrix[target_col]
    .drop(target_col)                       # the target's self-correlation is not informative
    .sort_values(key=abs, ascending=False)  # strongest relationship first, sign ignored
)

print("üéØ Features ranked by correlation with target:")
corr_values = target_correlations.values
target_corr_df = pd.DataFrame({
    'Feature': target_correlations.index,
    'Correlation': corr_values,
    'Abs_Correlation': np.abs(corr_values)
})

display(target_corr_df)

# Horizontal bar chart of the signed correlations, in the ranked order
fig, ax = plt.subplots(figsize=(10, 8))
target_correlations.plot(kind='barh', ax=ax)
ax.set_title('Feature Correlations with Target Variable', fontsize=14, fontweight='bold')
ax.set_xlabel('Correlation Coefficient')
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Identify highly correlated feature pairs
print("üîç Highly correlated feature pairs (|correlation| > 0.7):")

# Leave the target out: a feature that correlates strongly with the target is
# a good predictor, not a redundant one, and must not trigger the "remove
# highly correlated features" recommendation in the summary.
feature_corr = correlation_matrix.drop(index=target_col, columns=target_col, errors='ignore')
feature_names = feature_corr.columns

# Visit each unordered pair once (strict upper triangle of the matrix).
# A NaN correlation (e.g. from a constant column) fails the comparison and is
# skipped.
high_corr_pairs = [
    {
        'Feature 1': feature_names[i],
        'Feature 2': feature_names[j],
        'Correlation': feature_corr.iloc[i, j]
    }
    for i in range(len(feature_names))
    for j in range(i + 1, len(feature_names))
    if abs(feature_corr.iloc[i, j]) > 0.7
]

if high_corr_pairs:
    high_corr_df = pd.DataFrame(high_corr_pairs)
    display(high_corr_df)
else:
    print("‚úÖ No highly correlated feature pairs found.")

<a id="preprocessing"></a>
## 5. Data Preprocessing

Let's prepare the data for machine learning models.

In [None]:
# Per-column overview: dtype, number of distinct values, first few distinct values
print("üìä Data Type Analysis:")

unique_counts = []
sample_values = []
for col in df.columns:
    column = df[col]
    unique_counts.append(column.nunique())
    # At most five distinct values, in order of first appearance
    sample_values.append(list(column.unique()[:5]))

dtype_info = pd.DataFrame({
    'Feature': df.columns,
    'Data_Type': df.dtypes,
    'Unique_Values': unique_counts,
    'Sample_Values': sample_values
})

display(dtype_info)

In [None]:
# Outlier detection using IQR method
def detect_outliers_iqr(df, feature):
    """Return the rows of ``df`` whose ``feature`` value is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``, with Q1/Q3 the 25th/75th percentiles of
    the column. Missing values satisfy neither comparison and are therefore
    never reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    feature : str
        Name of the numeric column.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (all columns, original index) flagged as outliers.
    """
    values = df[feature]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1

    too_low = values.lt(q1 - 1.5 * spread)
    too_high = values.gt(q3 + 1.5 * spread)
    return df[too_low | too_high]

print("üîç Outlier Analysis:")

# One (feature, outlier count) pair per continuous feature
outlier_counts = [
    (feature, len(detect_outliers_iqr(df, feature)))
    for feature in continuous_features
]

# Express each count as a share of all rows as well
outlier_summary = [
    {
        'Feature': feature,
        'Outlier_Count': count,
        'Outlier_Percentage': (count / len(df)) * 100
    }
    for feature, count in outlier_counts
]

if not outlier_summary:
    print("No continuous features for outlier analysis.")
else:
    outlier_df = pd.DataFrame(outlier_summary)
    display(outlier_df)

In [None]:
# Feature scaling analysis: compare the continuous features before and after
# standardisation, numerically and with side-by-side box plots.
if continuous_features:
    print("üìä Feature Scaling Analysis:")

    summary_rows = ['mean', 'std', 'min', 'max']
    raw_values = df[continuous_features]

    # Statistics on the original scale
    print("\nBefore Scaling:")
    scaling_stats = raw_values.describe().loc[summary_rows]
    display(scaling_stats)

    # Standardise each column using statistics fitted on the full dataset
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(raw_values)
    scaled_df = pd.DataFrame(scaled_features, columns=continuous_features)

    # Statistics on the standardised scale
    print("\nAfter Standard Scaling:")
    scaling_stats_after = scaled_df.describe().loc[summary_rows]
    display(scaling_stats_after)

    # Left panel: original values; right panel: standardised values
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    panels = [
        (raw_values, 'Before Scaling'),
        (scaled_df, 'After Standard Scaling'),
    ]
    for ax, (frame, panel_title) in zip(axes, panels):
        frame.boxplot(ax=ax)
        ax.set_title(panel_title)
        ax.tick_params(axis='x', rotation=45)

    plt.suptitle('Effect of Feature Scaling', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

<a id="statistics"></a>
## 6. Statistical Insights

Let's perform statistical tests to understand feature significance.

In [None]:
# Statistical tests for continuous features: two-sample t-test plus Cohen's d
# between the target == 0 and target == 1 groups.
if continuous_features:
    print("üìä Statistical Tests for Continuous Features:")
    print("(Testing difference between disease and no-disease groups)")

    stat_results = []

    for feature in continuous_features:
        # Separate groups. Missing values are dropped up front so that the
        # t-test, the group sizes and the variances all describe the same
        # observations. With NaNs left in, ttest_ind returns NaN (reported
        # below as "not significant") and len() counts rows var() ignores.
        group_0 = df.loc[df[target_col] == 0, feature].dropna()
        group_1 = df.loc[df[target_col] == 1, feature].dropna()

        # Perform t-test (SciPy default: equal variances assumed, which
        # matches the pooled standard deviation used for Cohen's d)
        t_stat, p_value = stats.ttest_ind(group_0, group_1)

        # Calculate effect size (Cohen's d) from the pooled standard deviation
        n_0, n_1 = len(group_0), len(group_1)
        pooled_std = np.sqrt(((n_0 - 1) * group_0.var() +
                              (n_1 - 1) * group_1.var()) /
                             (n_0 + n_1 - 2))
        cohens_d = (group_1.mean() - group_0.mean()) / pooled_std

        stat_results.append({
            'Feature': feature,
            'Mean_NoDisease': group_0.mean(),
            'Mean_Disease': group_1.mean(),
            'T_Statistic': t_stat,
            'P_Value': p_value,
            'Cohens_D': cohens_d,
            'Significant': 'Yes' if p_value < 0.05 else 'No'
        })

    stat_df = pd.DataFrame(stat_results)
    display(stat_df)

    # Interpretation of Cohen's d
    print("\nüìã Cohen's d interpretation:")
    print("‚Ä¢ Small effect: 0.2")
    print("‚Ä¢ Medium effect: 0.5")
    print("‚Ä¢ Large effect: 0.8")

In [None]:
# Chi-square tests for categorical features: test of independence against the
# target plus Cramér's V as effect size.
if categorical_and_binary:
    print("üìä Chi-square Tests for Categorical Features:")

    chi2_results = []

    for feature in categorical_and_binary:
        # Create contingency table
        contingency_table = pd.crosstab(df[feature], df[target_col])

        # Perform chi-square test (SciPy applies Yates' continuity correction
        # by default when the table has one degree of freedom, i.e. 2x2)
        chi2_stat, p_value, dof, expected = stats.chi2_contingency(contingency_table)

        # Calculate Cramér's V (effect size). V is defined on the uncorrected
        # Pearson statistic, so it is computed from a separate call without
        # the continuity correction; otherwise every 2x2 table (all binary
        # features) would be understated.
        chi2_uncorrected = stats.chi2_contingency(contingency_table, correction=False)[0]
        n = contingency_table.sum().sum()
        min_dim = min(contingency_table.shape) - 1
        # A feature with a single observed level gives min_dim == 0; V is
        # undefined there, so report NaN explicitly instead of dividing by 0.
        cramers_v = np.sqrt(chi2_uncorrected / (n * min_dim)) if min_dim > 0 else np.nan

        chi2_results.append({
            'Feature': feature,
            'Chi2_Statistic': chi2_stat,
            'P_Value': p_value,
            'Degrees_of_Freedom': dof,
            'Cramers_V': cramers_v,
            'Significant': 'Yes' if p_value < 0.05 else 'No'
        })

    chi2_df = pd.DataFrame(chi2_results)
    display(chi2_df)

    print("\nüìã Cram√©r's V interpretation:")
    print("‚Ä¢ Small effect: 0.1")
    print("‚Ä¢ Medium effect: 0.3")
    print("‚Ä¢ Large effect: 0.5")

<a id="conclusions"></a>
## 7. Conclusions

Let's summarize our findings and provide recommendations for model building.

In [None]:
# Summary of key findings
print("üìã DATA EXPLORATION SUMMARY")
print("=" * 50)

# Facts the recommendations below depend on
total_missing = df.isnull().sum().sum()
class_counts = df[target_col].value_counts()

print(f"\nüìä Dataset Overview:")
print(f"‚Ä¢ Total samples: {len(df)}")
print(f"‚Ä¢ Total features: {len(df.columns) - 1}")
print(f"‚Ä¢ Target distribution: {class_counts.to_dict()}")
print(f"‚Ä¢ Missing values: {total_missing}")

print(f"\nüîç Feature Analysis:")
print(f"‚Ä¢ Continuous features: {len(continuous_features)}")
print(f"‚Ä¢ Binary features: {len(binary_features)}")
print(f"‚Ä¢ Categorical features: {len(categorical_features)}")

# Top correlated features with target (only when the correlation cell was run)
if 'target_correlations' in locals():
    top_features = target_correlations.head(3)
    print(f"\nüéØ Top 3 features correlated with target:")
    for feature, corr in top_features.items():
        print(f"‚Ä¢ {feature}: {corr:.3f}")

# Recommendations. Items 1 and 4 are derived from the data instead of being
# asserted unconditionally.
# Heuristic for a binary target: the classes count as "reasonably balanced"
# when the rarest one still holds at least this share of the labelled rows.
min_minority_share = 0.3
if len(class_counts) > 1:
    minority_share = class_counts.min() / class_counts.sum()
else:
    minority_share = 0.0  # a single observed class is the extreme imbalance

print(f"\nüí° RECOMMENDATIONS FOR MODEL BUILDING:")
if total_missing == 0:
    print(f"1. ‚úÖ Dataset is ready for machine learning")
else:
    print(f"1. ‚ö†Ô∏è  Impute or drop the {total_missing} missing values before training")
# Tree-based models split on thresholds and are insensitive to feature scale;
# scaling matters for distance- and gradient-based learners.
print(f"2. üîß Apply standard scaling for scale-sensitive models "
      f"(e.g. logistic regression, SVM, k-NN); tree-based models do not need it")
print(f"3. üìä Consider feature selection based on correlation analysis")
if minority_share >= min_minority_share:
    print(f"4. üéØ Target classes are reasonably balanced")
else:
    print(f"4. ‚ö†Ô∏è  Target classes are imbalanced (smallest class: {minority_share:.1%}); "
          f"consider stratified splits or class weights")
print(f"5. üîç Monitor for overfitting due to dataset size")

if 'high_corr_pairs' in locals() and high_corr_pairs:
    print(f"6. ‚ö†Ô∏è  Consider removing highly correlated features")

if 'outlier_df' in locals() and not outlier_df.empty:
    # Only mention features where more than 5 % of the rows are IQR outliers
    outlier_features = outlier_df[outlier_df['Outlier_Percentage'] > 5]['Feature'].tolist()
    if outlier_features:
        print(f"7. üîß Consider outlier treatment for: {outlier_features}")

print(f"\nüöÄ Ready to proceed with model training!")