In [None]:
# Import required libraries
import json
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Notebook-wide display configuration
warnings.simplefilter('ignore')  # suppress every warning category in cell output
for display_option in ('display.max_columns', 'display.width'):
    # None removes the corresponding pandas display limit
    pd.set_option(display_option, None)

# Plot appearance: matplotlib style sheet first, then the seaborn colour palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

print("Libraries imported successfully!")

## 1. Data Loading and Overview

In [None]:
# Load the dataset (path is relative to the notebook's working directory)
df = pd.read_csv('data/diabetes.csv')

# Display basic information: dimensions and column names
print("=" * 60)
print("DATASET OVERVIEW")
print("=" * 60)
# The separator between the two dimensions is U+00D7 MULTIPLICATION SIGN
print(f"\nDataset Shape: {df.shape[0]} rows × {df.shape[1]} columns")
print(f"\nColumn Names: {list(df.columns)}")

In [None]:
# Preview the top of the frame; the bare last expression gets the rich table rendering
print("\nFirst 10 rows of the dataset:")
df.head(n=10)

In [None]:
# Preview the bottom of the frame (five rows, matching the printed caption)
print("\nLast 5 rows of the dataset:")
df.tail(n=5)

In [None]:
# Column dtypes, non-null counts and memory footprint as reported by DataFrame.info()
print("\nData Types and Memory Usage:", "=" * 60, sep="\n")
df.info()

### Feature Description

| Feature | Description | Unit |
|---------|-------------|------|
| Pregnancies | Number of times pregnant | Count |
| Glucose | Plasma glucose concentration (2 hours in oral glucose tolerance test) | mg/dL |
| BloodPressure | Diastolic blood pressure | mm Hg |
| SkinThickness | Triceps skin fold thickness | mm |
| Insulin | 2-Hour serum insulin | mu U/ml |
| BMI | Body mass index | weight(kg)/(height(m))² |
| DiabetesPedigreeFunction | Diabetes pedigree function (genetic influence) | Score |
| Age | Age of the patient | Years |
| Outcome | Target variable (0 = No diabetes, 1 = Diabetes) | Binary |

## 2. Statistical Summary

In [None]:
# describe() output transposed so that each feature occupies one row
print("Statistical Summary of All Features:", "=" * 60, sep="\n")
df.describe().transpose().round(decimals=2)

In [None]:
# Moments and extremes for every column, one row per column
print("\nAdditional Statistics:", "=" * 60, sep="\n")

# Each pair maps an output column label to the DataFrame reduction that fills it
stats_df = pd.DataFrame({
    label: getattr(df, reduction)()
    for label, reduction in (
        ('Mean', 'mean'),
        ('Median', 'median'),
        ('Std Dev', 'std'),
        ('Variance', 'var'),
        ('Skewness', 'skew'),
        ('Kurtosis', 'kurtosis'),
        ('Min', 'min'),
        ('Max', 'max'),
    )
}).round(3)

stats_df

In [None]:
# Number of distinct values held by each column
print("\nUnique Values per Column:", "=" * 60, sep="\n")
for col, n_unique in df.nunique().items():
    print(f"{col}: {n_unique} unique values")

## 3. Missing Value Analysis

In this dataset, **zero values** in certain columns represent missing data (biologically impossible values):
- Glucose = 0 (impossible)
- BloodPressure = 0 (impossible for living person)
- SkinThickness = 0 (unlikely)
- Insulin = 0 (unlikely)
- BMI = 0 (impossible)

In [None]:
# Genuine NaN entries: per-column tally first, grand total second
nan_per_column = df.isna().sum()
print("Actual Missing (NaN) Values:", "=" * 60, sep="\n")
print(nan_per_column)
print(f"\nTotal NaN values: {nan_per_column.sum()}")

In [None]:
# Zeros that stand in for missing measurements
print("\nZero Values Analysis (Hidden Missing Values):", "=" * 60, sep="\n")

# Columns where zero is biologically invalid
zero_invalid_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Tally the zeros of all flagged columns in one pass, then attach the share of rows
zero_tally = df[zero_invalid_cols].eq(0).sum()
zero_counts = {
    col: {'Count': n_zero, 'Percentage': f'{(n_zero / len(df)) * 100:.2f}%'}
    for col, n_zero in zero_tally.items()
}

# Transpose so that each flagged column becomes a row
zero_df = pd.DataFrame(zero_counts).T
print(zero_df)

In [None]:
# Zeros-as-missing overview: absolute counts (left panel) and share of rows (right panel)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One tally per flagged column; the percentages are derived from the same tallies
zero_counts_values = [(df[col] == 0).sum() for col in zero_invalid_cols]
zero_pct_values = [n_zero / len(df) * 100 for n_zero in zero_counts_values]
colors = plt.cm.Reds(np.linspace(0.3, 0.8, len(zero_invalid_cols)))

# (axis, bar heights, title, y-label, vertical label offset, label formatter)
panel_specs = (
    (axes[0], zero_counts_values, 'Count of Zero Values (Missing Data)', 'Count', 5, str),
    (axes[1], zero_pct_values, 'Percentage of Zero Values (Missing Data)', 'Percentage (%)', 1,
     lambda v: f'{v:.1f}%'),
)
for ax, heights, title, ylabel, label_offset, fmt in panel_specs:
    ax.bar(zero_invalid_cols, heights, color=colors, edgecolor='black')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Feature')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)
    # Write each bar's value just above it
    for i, v in enumerate(heights):
        ax.text(i, v + label_offset, fmt(v), ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig('eda_missing_values.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Rows that repeat an earlier row exactly
print("\nDuplicate Rows Analysis:", "=" * 60, sep="\n")
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
duplicate_share = (duplicates / len(df)) * 100
print(f"Percentage of duplicates: {duplicate_share:.2f}%")

## 4. Target Variable Analysis

In [None]:
# Class balance of the target: absolute counts and percentage share per class
print("Target Variable (Outcome) Distribution:", "=" * 60, sep="\n")

outcome_counts = df['Outcome'].value_counts()
outcome_pct = df['Outcome'].value_counts(normalize=True).mul(100)

print(f"\nNo Diabetes (0): {outcome_counts[0]} ({outcome_pct[0]:.2f}%)")
print(f"Diabetes (1): {outcome_counts[1]} ({outcome_pct[1]:.2f}%)")
# Ratio of class-0 rows to class-1 rows
imbalance_ratio = outcome_counts[0] / outcome_counts[1]
print(f"\nClass Imbalance Ratio: {imbalance_ratio:.2f}:1")

In [None]:
# Visualize target distribution as a pie chart (left) and a bar chart (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# value_counts() orders classes by frequency, not by label. The labels, colours and
# explode tuple below are written in [0, 1] order, so align the data to that order
# explicitly instead of relying on class 0 being the majority class.
class_order = [0, 1]
class_labels = ['No Diabetes (0)', 'Diabetes (1)']
ordered_counts = outcome_counts.reindex(class_order, fill_value=0)
ordered_pct = outcome_pct.reindex(class_order, fill_value=0)

# Pie chart: green = no diabetes, red = diabetes (the diabetes slice is pulled out slightly)
colors_pie = ['#2ecc71', '#e74c3c']
explode = (0, 0.05)
axes[0].pie(ordered_counts, explode=explode, labels=class_labels,
            autopct='%1.1f%%', colors=colors_pie, shadow=True, startangle=90)
axes[0].set_title('Target Variable Distribution (Pie Chart)', fontsize=14, fontweight='bold')

# Bar chart of the same counts
bars = axes[1].bar(class_labels, ordered_counts.values,
                   color=colors_pie, edgecolor='black', linewidth=2)
axes[1].set_title('Target Variable Distribution (Bar Chart)', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Count')
axes[1].set_xlabel('Outcome')

# Add "count (percentage)" labels above each bar
for bar, count, pct in zip(bars, ordered_counts.values, ordered_pct.values):
    axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 10,
                 f'{count}\n({pct:.1f}%)', ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig('eda_target_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Feature Distributions

In [None]:
# Histogram of every predictor column, annotated with its mean and median
feature_cols = [name for name in df.columns if name != 'Outcome']

fig, axes = plt.subplots(2, 4, figsize=(16, 10))
axes = axes.ravel()

for idx, col in enumerate(feature_cols):
    ax = axes[idx]
    values = df[col]
    col_mean = values.mean()
    col_median = values.median()

    ax.hist(values, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
    # Dashed reference lines: red for the mean, green for the median
    ax.axvline(col_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {col_mean:.2f}')
    ax.axvline(col_median, color='green', linestyle='--', linewidth=2, label=f'Median: {col_median:.2f}')
    ax.set_title(f'{col} Distribution', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.legend(fontsize=8)

plt.suptitle('Feature Distributions with Mean and Median', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('eda_feature_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Overlaid density-normalised histograms of each feature, split by outcome class
# (these are histograms drawn with density=True, not KDE curves)
fig, axes = plt.subplots(2, 4, figsize=(16, 10))
axes = axes.ravel()

# (outcome value, legend label, fill colour)
outcome_styles = (
    (0, 'No Diabetes', '#2ecc71'),
    (1, 'Diabetes', '#e74c3c'),
)

for idx, col in enumerate(feature_cols):
    ax = axes[idx]
    for outcome, label, color in outcome_styles:
        subset = df.loc[df['Outcome'] == outcome, col]
        ax.hist(subset, bins=25, alpha=0.5, label=label, color=color, density=True)

    ax.set_title(f'{col} by Outcome', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Density')
    ax.legend()

plt.suptitle('Feature Distributions by Diabetes Outcome', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('eda_feature_by_outcome.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Outlier Detection

In [None]:
# One box plot per feature; points drawn beyond the whiskers are the outlier candidates
fig, axes = plt.subplots(2, 4, figsize=(16, 10))
axes = axes.ravel()

for idx, col in enumerate(feature_cols):
    ax = axes[idx]
    # patch_artist=True makes the box a filled patch so its face colour can be set
    box = ax.boxplot(df[col], patch_artist=True)
    box['boxes'][0].set_facecolor('lightblue')
    ax.set_title(f'{col}', fontsize=12, fontweight='bold')
    ax.set_ylabel('Value')

plt.suptitle('Box Plots for Outlier Detection', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('eda_boxplots.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Outlier tally per feature using the 1.5 x IQR fences
print("Outlier Analysis (IQR Method):", "=" * 60, sep="\n")

outlier_summary = []

for col in feature_cols:
    series = df[col]
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # A value is an outlier when it lies strictly outside [lower_bound, upper_bound]
    is_outlier = (series < lower_bound) | (series > upper_bound)
    outlier_count = int(is_outlier.sum())
    outlier_pct = (outlier_count / len(df)) * 100

    outlier_summary.append({
        'Feature': col,
        'Q1': q1,
        'Q3': q3,
        'IQR': iqr,
        'Lower Bound': lower_bound,
        'Upper Bound': upper_bound,
        'Outlier Count': outlier_count,
        'Outlier %': f'{outlier_pct:.2f}%'
    })

# One row per feature, indexed by feature name
outlier_df = pd.DataFrame(outlier_summary).set_index('Feature')
outlier_df.round(2)

In [None]:
# Per-feature box plots with one box per Outcome class
fig, axes = plt.subplots(2, 4, figsize=(16, 10))
axes = axes.ravel()

for idx, col in enumerate(feature_cols):
    ax = axes[idx]
    df.boxplot(column=col, by='Outcome', ax=ax)
    # Title and x-label are set after the pandas call so these values are the ones shown
    ax.set_title(f'{col}', fontsize=12, fontweight='bold')
    ax.set_xlabel('Outcome')

# NOTE(review): pandas is expected to add its own figure title for by= plots; this call
# should replace it - confirm in the rendered figure
plt.suptitle('Box Plots by Diabetes Outcome', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('eda_boxplots_by_outcome.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Correlation Analysis

In [None]:
# Pairwise Pearson correlations between all columns, target included
print("Correlation Matrix:", "=" * 60, sep="\n")

correlation_matrix = df.corr(method='pearson')
correlation_matrix.round(decimals=3)

In [None]:
# Lower-triangle heatmap of the correlation matrix
plt.figure(figsize=(12, 10))

# True on and above the diagonal: those cells are hidden, leaving each pair shown once
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

heatmap_options = dict(
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdYlBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={'shrink': 0.8, 'label': 'Correlation Coefficient'},
)
sns.heatmap(correlation_matrix, **heatmap_options)

plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig('eda_correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Correlation of every feature with the target, strongest positive first
print("\nCorrelation with Target Variable (Outcome):")
print("=" * 60)

target_corr = correlation_matrix['Outcome'].drop('Outcome').sort_values(ascending=False)
print(target_corr.round(3))

# Horizontal bar chart: green bars for positive correlations, red otherwise
plt.figure(figsize=(10, 6))
bar_colors = ['green' if x > 0 else 'red' for x in target_corr.values]
bars = plt.barh(target_corr.index, target_corr.values, color=bar_colors, edgecolor='black')
plt.xlabel('Correlation Coefficient')
plt.title('Feature Correlation with Diabetes Outcome', fontsize=14, fontweight='bold')
plt.axvline(x=0, color='black', linewidth=0.5)

# Add value labels just beyond each bar tip: to the right of positive bars and to the
# left of negative ones, so a label is never drawn on top of its own bar
for bar, val in zip(bars, target_corr.values):
    offset, align = (0.01, 'left') if val >= 0 else (-0.01, 'right')
    plt.text(val + offset, bar.get_y() + bar.get_height()/2, f'{val:.3f}',
             va='center', ha=align, fontsize=10)

plt.tight_layout()
plt.savefig('eda_target_correlation.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Feature Relationships with Target

In [None]:
# Scatter matrix of a hand-picked feature subset, coloured by outcome class
key_features = ['Glucose', 'BMI', 'Age', 'Insulin', 'Outcome']

pairplot_options = dict(
    hue='Outcome',
    palette={0: '#2ecc71', 1: '#e74c3c'},
    diag_kind='kde',
    plot_kws={'alpha': 0.6},
)
g = sns.pairplot(df.loc[:, key_features], **pairplot_options)

# y=1.02 places the title slightly above the top edge of the grid
g.fig.suptitle('Pair Plot of Key Features', fontsize=16, fontweight='bold', y=1.02)
plt.savefig('eda_pairplot.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Per-class feature means, displayed with features as rows and classes as columns
print("\nMean Feature Values by Outcome:", "=" * 60, sep="\n")

grouped_means = df.groupby('Outcome').mean()
grouped_means.transpose().round(decimals=2)

In [None]:
# Grouped bar chart: per-feature mean for each outcome class, drawn side by side
fig, ax = plt.subplots(figsize=(12, 6))

width = 0.35
x = np.arange(len(feature_cols))

# Row 0 / row 1 of grouped_means hold the class means; select the features in plot order
no_diabetes_means = grouped_means.loc[0, feature_cols]
diabetes_means = grouped_means.loc[1, feature_cols]

# Shift each series by half a bar width so the pair straddles its tick
bars1 = ax.bar(x - width/2, no_diabetes_means, width,
               label='No Diabetes (0)', color='#2ecc71', edgecolor='black')
bars2 = ax.bar(x + width/2, diabetes_means, width,
               label='Diabetes (1)', color='#e74c3c', edgecolor='black')

ax.set_ylabel('Mean Value')
ax.set_title('Mean Feature Values by Diabetes Outcome', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(feature_cols, rotation=45, ha='right')
ax.legend()

plt.tight_layout()
plt.savefig('eda_mean_by_outcome.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Violin plot of each feature, one violin per outcome class
fig, axes = plt.subplots(2, 4, figsize=(16, 10))
axes = axes.ravel()

# Same class colours as the other outcome-split figures in this notebook
outcome_palette = {0: '#2ecc71', 1: '#e74c3c'}

for idx, col in enumerate(feature_cols):
    ax = axes[idx]
    sns.violinplot(data=df, x='Outcome', y=col, palette=outcome_palette, ax=ax)
    ax.set_title(f'{col}', fontsize=12, fontweight='bold')
    # Replace the 0 / 1 tick labels with readable class names
    ax.set_xticklabels(['No Diabetes', 'Diabetes'])

plt.suptitle('Violin Plots: Feature Distributions by Outcome', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('eda_violin_plots.png', dpi=150, bbox_inches='tight')
plt.show()

## 9. Key Insights and Findings

In [None]:
# Printed recap of the analysis. Every figure quoted below is read from objects built
# in earlier cells (df, feature_cols, outcome_counts, outcome_pct, zero_invalid_cols,
# target_corr), so the text follows the data that was actually loaded.
print("=" * 70)
print("KEY INSIGHTS AND FINDINGS")
print("=" * 70)

print("\n📊 DATASET OVERVIEW:")
print(f"   • Total samples: {len(df)}")
print(f"   • Total features: {len(feature_cols)}")
print("   • Target classes: 2 (Binary Classification)")

print("\n🎯 TARGET VARIABLE (Outcome):")
print(f"   • No Diabetes (0): {outcome_counts[0]} samples ({outcome_pct[0]:.1f}%)")
print(f"   • Diabetes (1): {outcome_counts[1]} samples ({outcome_pct[1]:.1f}%)")
print(f"   • Class imbalance ratio: {outcome_counts[0]/outcome_counts[1]:.2f}:1")

print("\n⚠️ MISSING VALUES (Zeros as Missing):")
for col in zero_invalid_cols:
    count = (df[col] == 0).sum()
    pct = (count / len(df)) * 100
    print(f"   • {col}: {count} zeros ({pct:.1f}%)")

print("\n📈 TOP CORRELATED FEATURES WITH DIABETES:")
for feature, corr in target_corr.head(5).items():
    print(f"   • {feature}: {corr:.3f}")

# Numbers quoted in the observations: the feature with the largest absolute correlation,
# the share of zero Insulin readings, and the rounded class split
strongest_feature = target_corr.abs().idxmax()
strongest_r = target_corr[strongest_feature]
insulin_zero_pct = (df['Insulin'] == 0).sum() / len(df) * 100

print("\n🔍 KEY OBSERVATIONS:")
print(f"   1. {strongest_feature} is the strongest predictor of diabetes (r = {strongest_r:.2f})")
print("   2. BMI and Age also show moderate positive correlation")
print(f"   3. Insulin has significant missing values ({insulin_zero_pct:.1f}%)")
print(f"   4. Dataset is moderately imbalanced ({outcome_pct[0]:.0f}:{outcome_pct[1]:.0f} ratio)")
print("   5. Diabetic patients show higher mean values for most features")

print("\n💡 RECOMMENDATIONS:")
print("   1. Handle missing values using median/mean imputation")
print("   2. Consider class balancing techniques (SMOTE, class weights)")
print("   3. Focus on Glucose, BMI, and Age as primary features")
print("   4. Consider feature engineering for Age and BMI categories")
print("   5. Address outliers in Insulin and SkinThickness")

print("\n" + "=" * 70)

In [None]:
# Collect the headline EDA numbers and write them to 'eda_summary.json'.
# `json` is imported in the notebook's import cell.
eda_summary = {
    'dataset_info': {
        'total_samples': len(df),
        'total_features': len(feature_cols),
        'target_classes': 2
    },
    'target_distribution': {
        # int()/float() turn the pandas/numpy scalars into builtin numbers for the encoder
        'no_diabetes': int(outcome_counts[0]),
        'diabetes': int(outcome_counts[1]),
        'imbalance_ratio': round(float(outcome_counts[0] / outcome_counts[1]), 2)
    },
    # Zero count per column in which a zero is treated as a missing value
    'missing_values': {col: int((df[col] == 0).sum()) for col in zero_invalid_cols},
    'top_correlations': target_corr.head(5).to_dict(),
    'feature_statistics': df.describe().to_dict()
}

with open('eda_summary.json', 'w', encoding='utf-8') as f:
    json.dump(eda_summary, f, indent=2)

print("EDA Summary saved to 'eda_summary.json'")

---

## 📋 EDA Artifacts Generated

The following files were generated during this analysis:

| File | Description |
|------|-------------|
| `eda_missing_values.png` | Missing values visualization |
| `eda_target_distribution.png` | Target variable distribution |
| `eda_feature_distributions.png` | Feature histograms |
| `eda_feature_by_outcome.png` | Features by outcome |
| `eda_boxplots.png` | Outlier detection box plots |
| `eda_boxplots_by_outcome.png` | Box plots by outcome |
| `eda_correlation_heatmap.png` | Correlation matrix heatmap |
| `eda_target_correlation.png` | Target correlation chart |
| `eda_pairplot.png` | Pair plot of key features |
| `eda_mean_by_outcome.png` | Mean values comparison |
| `eda_violin_plots.png` | Violin plot distributions |
| `eda_summary.json` | Summary statistics JSON |

---

**Author:** MLOps Diabetes Prediction Project  
**Date:** 2025  
**Dataset:** Pima Indians Diabetes Database