# Enhanced EDA & Preprocessing Pipeline for Diabetes Dataset

This notebook converts the enhanced EDA and preprocessing pipeline for the diabetes dataset into an interactive format. It addresses gaps in year column handling, feature engineering, outlier detection, advanced imputation, comprehensive EDA, and high-dimensionality issues.

# 1. Import Required Libraries

Import all necessary libraries including pandas, numpy, matplotlib, seaborn, sklearn, and others used in the pipeline.

In [15]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, TargetEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Set matplotlib backend for compatibility
import matplotlib
matplotlib.use('Agg')

# 2. Utility Functions

Define utility functions for directory creation, binary detection, and target guessing.

In [16]:
# ---------- Utility Functions ----------
def ensure_dir(p: str):
    """Create directory if it doesn't exist"""
    os.makedirs(p, exist_ok=True)

def is_binary_like(s: pd.Series) -> bool:
    """Check if series contains binary-like values"""
    vals = s.dropna().unique()
    if len(vals) == 2:
        return True
    lowered = pd.Series(vals).astype(str).str.lower().unique()
    return set(lowered).issubset({"yes","no","true","false","positive","negative","pos","neg","y","n","1","0"})

def guess_target(df: pd.DataFrame):
    """Automatically detect target column"""
    common = [
        "Outcome","outcome","target","Target","label","Label","class","Class",
        "diabetes","Diabetes","has_diabetes","diabetic","Diabetic"
    ]
    for c in common:
        if c in df.columns:
            return c
    return None

# 3. Enhanced EDA Functions

Define functions for comprehensive exploratory data analysis including missing values analysis, correlation, and visualization generation.

In [None]:
# ---------- Enhanced EDA Functions ----------
def comprehensive_eda(df: pd.DataFrame, reports_dir: str, target_col: str = None):
    """Enhanced EDA with comprehensive analysis"""
    ensure_dir(reports_dir)
    
    print("üîç Running Comprehensive EDA...")
    
    # 1. Basic Dataset Info
    basic_info = {
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024**2,
        'duplicated_rows': df.duplicated().sum()
    }
    
    # 2. Column types and info
    dtypes_df = df.dtypes.astype(str).rename("dtype").reset_index().rename(columns={"index":"column"})
    dtypes_df['unique_values'] = [df[col].nunique() for col in df.columns]
    dtypes_df['null_count'] = [df[col].isnull().sum() for col in df.columns]
    dtypes_df['null_percentage'] = dtypes_df['null_count'].apply(lambda x: round(x / len(df) * 100, 2))
    dtypes_df.to_csv(os.path.join(reports_dir, "01_enhanced_dtypes.csv"), index=False)
    
    # 3. Enhanced missing values analysis with gender-specific context
    miss_analysis = df.isnull().sum().reset_index()
    miss_analysis.columns = ['column', 'missing_count']
    miss_analysis['missing_pct'] = miss_analysis['missing_count'].apply(lambda x: round(x / len(df) * 100, 2))
    
    # Add context for gender-specific features
    miss_analysis['missing_type'] = 'standard'
    gender_specific_features = ['gestational_history', 'gestational_diabetes', 'pregnancy_history']
    
    for feature in gender_specific_features:
        if feature in miss_analysis['column'].values:
            mask = miss_analysis['column'] == feature
            miss_analysis.loc[mask, 'missing_type'] = 'gender_specific'
            
            # If gender column exists, calculate male vs female missing rates
            if 'gender' in df.columns or 'sex' in df.columns:
                gender_col = 'gender' if 'gender' in df.columns else 'sex'
                
                # Calculate missing rates by gender
                gender_stats = []
                for gender_val in df[gender_col].unique():
                    if pd.notna(gender_val):
                        gender_subset = df[df[gender_col] == gender_val]
                        gender_missing = gender_subset[feature].isnull().sum()
                        gender_total = len(gender_subset)
                        gender_pct = round(gender_missing / gender_total * 100, 2) if gender_total > 0 else 0
                        gender_stats.append(f"{gender_val}: {gender_missing}/{gender_total} ({gender_pct}%)")
                
                # Add gender breakdown as a note
                miss_analysis.loc[mask, 'gender_breakdown'] = "; ".join(gender_stats)
    
    # Add gender_breakdown column for non-gender-specific features
    if 'gender_breakdown' not in miss_analysis.columns:
        miss_analysis['gender_breakdown'] = ''
    miss_analysis['gender_breakdown'] = miss_analysis['gender_breakdown'].fillna('')
    
    miss_analysis = miss_analysis.sort_values('missing_pct', ascending=False)
    miss_analysis.to_csv(os.path.join(reports_dir, "02_enhanced_missing_values.csv"), index=False)
    
    # 4. Identify column types
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    
    # Remove target from feature lists if specified
    if target_col and target_col in numeric_cols:
        numeric_cols.remove(target_col)
    if target_col and target_col in categorical_cols:
        categorical_cols.remove(target_col)
    
    # 5. Enhanced numeric analysis
    if numeric_cols:
        numeric_stats = df[numeric_cols].describe()
        
        # Add additional statistics
        numeric_enhanced = numeric_stats.copy()
        for col in numeric_cols:
            data = df[col].dropna()
            numeric_enhanced.loc['skewness', col] = stats.skew(data)
            numeric_enhanced.loc['kurtosis', col] = stats.kurtosis(data)
            numeric_enhanced.loc['cv', col] = data.std() / data.mean() if data.mean() != 0 else 0
        
        numeric_enhanced.round(4).to_csv(os.path.join(reports_dir, "03_enhanced_numeric_analysis.csv"))
    
    # 6. Categorical analysis
    if categorical_cols:
        cat_analysis = []
        for col in categorical_cols:
            unique_vals = df[col].nunique()
            top_category = df[col].mode()[0] if not df[col].mode().empty else 'No Mode'
            top_frequency = df[col].value_counts().iloc[0] if unique_vals > 0 else 0
            
            cat_analysis.append({
                'column': col,
                'unique_categories': unique_vals,
                'top_category': top_category,
                'top_frequency': top_frequency,
                'top_percentage': round(top_frequency / len(df) * 100, 2)
            })
        
        cat_df = pd.DataFrame(cat_analysis)
        cat_df.to_csv(os.path.join(reports_dir, "04_categorical_analysis.csv"), index=False)
    
    # 7. Target distribution (if target specified)
    if target_col and target_col in df.columns:
        target_dist = df[target_col].value_counts().reset_index()
        target_dist.columns = [target_col, 'count']
        target_dist['percentage'] = target_dist['count'].apply(lambda x: round(x / len(df) * 100, 2))
        target_dist.to_csv(os.path.join(reports_dir, "05_target_distribution.csv"), index=False)
    
    # 8. Outlier detection for numeric columns
    outlier_analysis = []
    for col in numeric_cols:
        data = df[col].dropna()
        Q1 = data.quantile(0.25)
        Q3 = data.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        outliers = data[(data < lower_bound) | (data > upper_bound)]
        
        outlier_analysis.append({
            'column': col,
            'outlier_count': len(outliers),
            'outlier_percentage': round(len(outliers) / len(data) * 100, 2),
            'lower_bound': lower_bound,
            'upper_bound': upper_bound
        })
    
    outlier_df = pd.DataFrame(outlier_analysis)
    outlier_df.to_csv(os.path.join(reports_dir, "06_outlier_analysis.csv"), index=False)
    
    # 9. Correlation analysis (numeric columns only)
    if len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols].corr()
        
        # Save correlation matrix
        corr_matrix.round(3).to_csv(os.path.join(reports_dir, "07_correlation_matrix.csv"))
        
        # Create correlation heatmap
        plt.figure(figsize=(12, 10))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, 
                    square=True, linewidths=0.5)
        plt.title('Feature Correlation Heatmap')
        plt.tight_layout()
        plt.savefig(os.path.join(reports_dir, "correlation_heatmap.png"), dpi=300, bbox_inches='tight')
        plt.close()
    
    # 10. Generate comprehensive visualizations
    generate_enhanced_visualizations(df, reports_dir, numeric_cols, categorical_cols, target_col)
    
    print(f"‚úÖ Enhanced EDA completed. Reports saved to: {reports_dir}")
    return basic_info

def generate_enhanced_visualizations(df, reports_dir, numeric_cols, categorical_cols, target_col):
    """Generate comprehensive visualizations for all columns"""
    
    # Create visualizations subdirectory
    viz_dir = os.path.join(reports_dir, "visualizations")
    ensure_dir(viz_dir)
    
    # 1. Numeric columns - Histograms and Box plots
    for col in numeric_cols:
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))
        
        # Histogram
        df[col].hist(bins=50, ax=axes[0], alpha=0.7, edgecolor='black')
        axes[0].set_title(f'Histogram: {col}')
        axes[0].set_xlabel(col)
        axes[0].set_ylabel('Frequency')
        
        # Box plot
        df.boxplot(column=col, ax=axes[1])
        axes[1].set_title(f'Box Plot: {col}')
        axes[1].set_ylabel(col)
        
        plt.tight_layout()
        plt.savefig(os.path.join(viz_dir, f"numeric_{col}.png"), dpi=300, bbox_inches='tight')
        plt.close()
    
    # 2. Categorical columns - Bar plots
    for col in categorical_cols:
        plt.figure(figsize=(12, 6))
        
        # Get value counts
        value_counts = df[col].value_counts()
        
        # Limit to top 20 categories if too many
        if len(value_counts) > 20:
            value_counts = value_counts.head(20)
            title_suffix = " (Top 20)"
        else:
            title_suffix = ""
        
        # Create bar plot
        ax = value_counts.plot(kind='bar', color='skyblue', edgecolor='black')
        plt.title(f'Distribution: {col}{title_suffix}')
        plt.xlabel(col)
        plt.ylabel('Count')
        plt.xticks(rotation=45, ha='right')
        
        # Add value labels on bars
        for i, v in enumerate(value_counts.values):
            ax.text(i, v + max(value_counts.values) * 0.01, str(v), 
                   ha='center', va='bottom', fontsize=9)
        
        plt.tight_layout()
        plt.savefig(os.path.join(viz_dir, f"categorical_{col}.png"), dpi=300, bbox_inches='tight')
        plt.close()
    
    # 3. Target vs Features analysis (if target specified)
    if target_col and target_col in df.columns:
        target_viz_dir = os.path.join(viz_dir, "target_analysis")
        ensure_dir(target_viz_dir)
        
        # Numeric features vs target
        for col in numeric_cols:
            plt.figure(figsize=(12, 5))
            
            # Create subplots
            fig, axes = plt.subplots(1, 2, figsize=(15, 5))
            
            # Box plot by target
            df.boxplot(column=col, by=target_col, ax=axes[0])
            axes[0].set_title(f'{col} by {target_col}')
            
            # Histogram by target
            for target_val in df[target_col].unique():
                subset = df[df[target_col] == target_val][col]
                axes[1].hist(subset, alpha=0.7, label=f'{target_col}={target_val}', bins=30)
            
            axes[1].set_title(f'{col} Distribution by {target_col}')
            axes[1].set_xlabel(col)
            axes[1].set_ylabel('Frequency')
            axes[1].legend()
            
            plt.tight_layout()
            plt.savefig(os.path.join(target_viz_dir, f"target_vs_{col}.png"), dpi=300, bbox_inches='tight')
            plt.close()

# 4. 

Define functions for outlier handling, imputation, feature engineering, and high cardinality categorical variable processing.

In [18]:
# ---------- Enhanced Preprocessing Functions ----------
def detect_outliers_iqr(series, multiplier=1.5):
    """Detect outliers using IQR method"""
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    return (series < lower_bound) | (series > upper_bound)

def handle_outliers(df, numeric_cols, method='cap', multiplier=1.5):
    """Handle outliers in numeric columns"""
    df_clean = df.copy()
    outlier_info = {}
    
    for col in numeric_cols:
        outliers = detect_outliers_iqr(df_clean[col], multiplier)
        outlier_count = outliers.sum()
        
        if outlier_count > 0:
            if method == 'cap':
                # Cap outliers
                Q1 = df_clean[col].quantile(0.25)
                Q3 = df_clean[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - multiplier * IQR
                upper_bound = Q3 + multiplier * IQR
                
                df_clean[col] = np.where(df_clean[col] < lower_bound, lower_bound, df_clean[col])
                df_clean[col] = np.where(df_clean[col] > upper_bound, upper_bound, df_clean[col])
                
            elif method == 'remove':
                # Remove outliers (not recommended for large datasets)
                df_clean = df_clean[~outliers]
        
        outlier_info[col] = {
            'outlier_count': outlier_count,
            'outlier_percentage': round(outlier_count / len(df) * 100, 2),
            'method_applied': method if outlier_count > 0 else 'none'
        }
    
    return df_clean, outlier_info

def enhanced_imputation(df, numeric_cols, categorical_cols, target_col=None):
    """Enhanced imputation strategies for different column types"""
    df_imputed = df.copy()
    imputation_info = {}
    
    # Medical/Health-specific imputation logic
    medical_features = ['bmi', 'hbA1c_level', 'blood_glucose_level', 'sleep_hours']
    
    # Numeric imputation
    for col in numeric_cols:
        missing_count = df_imputed[col].isnull().sum()
        
        if missing_count > 0:
            if col in medical_features:
                # For medical features, use median within similar groups if possible
                if target_col and target_col in df.columns:
                    # Group by target and use median
                    df_imputed[col] = df_imputed.groupby(target_col)[col].transform(
                        lambda x: x.fillna(x.median()) if not x.median() != x.median() else x.fillna(df_imputed[col].median())
                    )
                else:
                    df_imputed[col] = df_imputed[col].fillna(df_imputed[col].median())
                imputation_method = 'group_median' if target_col else 'median'
            else:
                # Regular median imputation for other numeric
                df_imputed[col] = df_imputed[col].fillna(df_imputed[col].median())
                imputation_method = 'median'
            
            imputation_info[col] = {
                'missing_count': missing_count,
                'imputation_method': imputation_method
            }
    
    # Categorical imputation
    for col in categorical_cols:
        missing_count = df_imputed[col].isnull().sum()
        
        if missing_count > 0:
            # Special handling for gender-related features like gestational_history
            if col.lower() in ['gestational_history', 'gestational_diabetes', 'pregnancy_history']:
                # For gender-specific features, use gender-aware imputation
                if 'gender' in df_imputed.columns or 'sex' in df_imputed.columns:
                    gender_col = 'gender' if 'gender' in df_imputed.columns else 'sex'
                    # For males, fill with 'Not Applicable' or 'NA'
                    # For females, use mode within female group
                    for gender_val in df_imputed[gender_col].unique():
                        if pd.notna(gender_val):
                            gender_mask = df_imputed[gender_col] == gender_val
                            if gender_val.lower() in ['male', 'm', 'man']:
                                df_imputed.loc[gender_mask, col] = df_imputed.loc[gender_mask, col].fillna('Not Applicable')
                            else:  # female or other
                                female_mode = df_imputed.loc[gender_mask, col].mode()
                                if len(female_mode) > 0:
                                    df_imputed.loc[gender_mask, col] = df_imputed.loc[gender_mask, col].fillna(female_mode[0])
                                else:
                                    df_imputed.loc[gender_mask, col] = df_imputed.loc[gender_mask, col].fillna('No')
                    imputation_method = 'gender_aware'
                else:
                    # If no gender column, assume mixed population and use conservative approach
                    df_imputed[col] = df_imputed[col].fillna('Not Applicable')
                    imputation_method = 'not_applicable'
            else:
                # Regular categorical imputation for non-gender-specific features
                # Use mode or 'Unknown' if no mode exists
                mode_val = df_imputed[col].mode()
                if len(mode_val) > 0:
                    df_imputed[col] = df_imputed[col].fillna(mode_val[0])
                    imputation_method = 'mode'
                else:
                    df_imputed[col] = df_imputed[col].fillna('Unknown')
                    imputation_method = 'unknown'
            
            imputation_info[col] = {
                'missing_count': missing_count,
                'imputation_method': imputation_method
            }
    
    return df_imputed, imputation_info

def feature_engineering(df, target_col=None):
    """Create additional engineered features"""
    df_engineered = df.copy()
    new_features = []
    
    # 1. BMI-related features
    if 'bmi' in df.columns:
        # BMI risk categories (more detailed)
        df_engineered['bmi_risk_level'] = pd.cut(df_engineered['bmi'], 
                                               bins=[0, 18.5, 25, 30, 35, float('inf')],
                                               labels=['underweight', 'normal', 'overweight', 'obese_1', 'obese_2'])
        new_features.append('bmi_risk_level')
    
    # 2. Age-related features
    if 'age' in df.columns:
        # Age risk for diabetes (medical domain knowledge)
        df_engineered['age_diabetes_risk'] = pd.cut(df_engineered['age'],
                                                  bins=[0, 35, 45, 65, float('inf')],
                                                  labels=['low_risk', 'moderate_risk', 'high_risk', 'very_high_risk'])
        new_features.append('age_diabetes_risk')
    
    # 3. Combined health risk score
    health_indicators = ['hypertension', 'heart_disease', 'family_history']
    available_indicators = [col for col in health_indicators if col in df.columns]
    
    if available_indicators:
        df_engineered['health_risk_score'] = df_engineered[available_indicators].sum(axis=1)
        new_features.append('health_risk_score')
    
    # 4. Lifestyle score
    lifestyle_factors = []
    
    # Physical activity scoring
    if 'physical_activity' in df.columns:
        activity_map = {'low': 0, 'moderate': 1, 'high': 2}
        df_engineered['activity_score'] = df_engineered['physical_activity'].map(activity_map).fillna(0)
        lifestyle_factors.append('activity_score')
        new_features.append('activity_score')
    
    # Sleep quality scoring
    if 'sleep_hours' in df.columns:
        # Optimal sleep is 7-9 hours
        df_engineered['sleep_quality'] = df_engineered['sleep_hours'].apply(
            lambda x: 2 if 7 <= x <= 9 else (1 if 6 <= x <= 10 else 0) if pd.notna(x) else 0
        )
        lifestyle_factors.append('sleep_quality')
        new_features.append('sleep_quality')
    
    # Combined lifestyle score
    if lifestyle_factors:
        df_engineered['lifestyle_score'] = df_engineered[lifestyle_factors].sum(axis=1)
        new_features.append('lifestyle_score')
    
    # 5. Geographic risk (if environmental_risk is available)
    if 'environmental_risk' in df.columns and 'urban_rural' in df.columns:
        # Combine environmental risk with urban/rural
        urban_risk_map = {'urban': 1.1, 'rural': 0.9}  # Urban areas might have higher risk
        df_engineered['location_risk'] = (df_engineered['environmental_risk'] * 
                                        df_engineered['urban_rural'].map(urban_risk_map).fillna(1.0))
        new_features.append('location_risk')
    
    return df_engineered, new_features

def handle_high_cardinality_categorical(df, categorical_cols, target_col=None, max_categories=10):
    """Handle high cardinality categorical variables"""
    df_processed = df.copy()
    encoding_info = {}
    
    for col in categorical_cols:
        unique_count = df_processed[col].nunique()
        
        if unique_count > max_categories:
            # For high cardinality columns like 'location' (states)
            if col == 'location':
                # Group by frequency - keep top states, others as 'Other'
                value_counts = df_processed[col].value_counts()
                top_categories = value_counts.head(max_categories).index.tolist()
                df_processed[col] = df_processed[col].apply(
                    lambda x: x if x in top_categories else 'Other'
                )
                encoding_info[col] = {
                    'method': 'frequency_grouping',
                    'kept_categories': len(top_categories) + 1,  # +1 for 'Other'
                    'original_categories': unique_count
                }
            
            elif target_col and target_col in df.columns:
                # Use target encoding for other high cardinality categorical variables
                # This is more sophisticated than frequency grouping
                target_encoder = TargetEncoder()
                df_processed[f'{col}_target_encoded'] = target_encoder.fit_transform(
                    df_processed[[col]], df_processed[target_col]
                )
                
                # Keep original column and add encoded version
                encoding_info[col] = {
                    'method': 'target_encoding',
                    'new_column': f'{col}_target_encoded',
                    'original_categories': unique_count
                }
            else:
                # Fallback to frequency grouping
                value_counts = df_processed[col].value_counts()
                top_categories = value_counts.head(max_categories).index.tolist()
                df_processed[col] = df_processed[col].apply(
                    lambda x: x if x in top_categories else 'Other'
                )
                encoding_info[col] = {
                    'method': 'frequency_grouping',
                    'kept_categories': len(top_categories) + 1,
                    'original_categories': unique_count
                }
    
    return df_processed, encoding_info

# 5. Load Dataset

Load the raw diabetes dataset from a CSV file and display basic information about the data.

In [19]:
# Load dataset
# Update the path below to the correct location of your diabetes dataset
df = pd.read_csv(r'C:\Users\ASUS TUF A15\Downloads\diabetes_dataset_E.csv')
print(f"Dataset loaded: {df.shape[0]} rows √ó {df.shape[1]} columns")

# Auto-detect target column
target_col = guess_target(df)
if target_col:
    print(f"üéØ Target column: {target_col}")
else:
    print("‚ö†Ô∏è  No target column detected. Please specify manually.")
    target_col = None  # Set manually if needed

# Display first few rows
df.head()

Dataset loaded: 100000 rows √ó 28 columns
üéØ Target column: diabetes


Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,...,diet_pattern,sleep_hours,alcohol_intake,family_history,medication_use,gestational_history,urban_rural,region_income,environmental_risk,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,...,balanced,4,none,0,0,0.0,urban,low,7,0
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,...,balanced,9,none,0,1,1.0,urban,medium,9,0
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,...,balanced,5,occasional,1,0,,urban,medium,10,0
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,...,balanced,6,regular,1,0,,rural,low,2,0
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,...,balanced,5,none,0,0,0.0,rural,medium,6,0


# 6. Identify Column Types

Analyze and categorize columns into numeric and categorical types, with special handling for the year column.

In [20]:
# Identify column types
numeric_cols = [c for c in df.columns 
               if pd.api.types.is_numeric_dtype(df[c]) and c != target_col]
categorical_cols = [c for c in df.columns 
                   if df[c].dtype == "object" and c != target_col]

# Special handling for 'year' column - treat as categorical
if 'year' in numeric_cols:
    print("Moving 'year' from numeric to categorical (ordinal treatment)")
    numeric_cols.remove('year')
    categorical_cols.append('year')
    # Convert year to string to treat as categorical
    df['year'] = df['year'].astype(str)

print(f"Numeric columns: {numeric_cols}")
print(f"Categorical columns: {categorical_cols}")

Moving 'year' from numeric to categorical (ordinal treatment)
Numeric columns: ['age', 'race:AfricanAmerican', 'race:Asian', 'race:Caucasian', 'race:Hispanic', 'race:Other', 'hypertension', 'heart_disease', 'bmi', 'hbA1c_level', 'blood_glucose_level', 'sleep_hours', 'family_history', 'medication_use', 'gestational_history', 'environmental_risk']
Categorical columns: ['gender', 'location', 'smoking_history', 'bmi_category', 'age_group', 'physical_activity', 'diet_pattern', 'alcohol_intake', 'urban_rural', 'region_income', 'year']


# 7. Run Comprehensive EDA

Execute the comprehensive EDA function to generate reports, visualizations, and analyses on the dataset, including missing values, correlations, and outlier detection.

In [21]:
# Run comprehensive EDA
reports_dir = 'data/reports_enhanced'
basic_info = comprehensive_eda(df, reports_dir=reports_dir, target_col=target_col)
print(f"Basic info: {basic_info}")

üîç Running Comprehensive EDA...
‚úÖ Enhanced EDA completed. Reports saved to: data/reports_enhanced
Basic info: {'total_rows': 100000, 'total_columns': 28, 'memory_usage_mb': np.float64(70.94676303863525), 'duplicated_rows': np.int64(0)}


# 8. Handle Outliers

Apply outlier detection and handling using the IQR method, capping or removing outliers as specified.

In [23]:
# Handle outliers
df, outlier_info = handle_outliers(df, numeric_cols, method='cap')
print("Outlier handling completed.")
print(f"Outlier info: {outlier_info}")

Outlier handling completed.
Outlier info: {'age': {'outlier_count': np.int64(0), 'outlier_percentage': np.float64(0.0), 'method_applied': 'none'}, 'race:AfricanAmerican': {'outlier_count': np.int64(20223), 'outlier_percentage': np.float64(20.22), 'method_applied': 'cap'}, 'race:Asian': {'outlier_count': np.int64(20015), 'outlier_percentage': np.float64(20.02), 'method_applied': 'cap'}, 'race:Caucasian': {'outlier_count': np.int64(19876), 'outlier_percentage': np.float64(19.88), 'method_applied': 'cap'}, 'race:Hispanic': {'outlier_count': np.int64(19888), 'outlier_percentage': np.float64(19.89), 'method_applied': 'cap'}, 'race:Other': {'outlier_count': np.int64(19998), 'outlier_percentage': np.float64(20.0), 'method_applied': 'cap'}, 'hypertension': {'outlier_count': np.int64(7485), 'outlier_percentage': np.float64(7.48), 'method_applied': 'cap'}, 'heart_disease': {'outlier_count': np.int64(3942), 'outlier_percentage': np.float64(3.94), 'method_applied': 'cap'}, 'bmi': {'outlier_count':

# 9. Perform Enhanced Imputation

Impute missing values in numeric and categorical columns using enhanced strategies, including group-based and gender-aware imputation.

In [22]:
# Perform enhanced imputation
df, imputation_info = enhanced_imputation(df, numeric_cols, categorical_cols, target_col)
print("Imputation completed.")
print(f"Imputation info: {imputation_info}")

Imputation completed.
Imputation info: {'gestational_history': {'missing_count': np.int64(41448), 'imputation_method': 'median'}}


# 10. Apply Feature Engineering

Create new features such as BMI risk levels, age diabetes risk, health risk scores, and lifestyle scores based on existing columns.

In [11]:
# Apply feature engineering
df, new_features = feature_engineering(df, target_col)
print(f"Feature engineering completed. New features: {new_features}")

# Update column lists with new categorical features
new_categorical = [f for f in new_features if df[f].dtype == 'object']
categorical_cols.extend(new_categorical)

new_numeric = [f for f in new_features if f not in new_categorical]
numeric_cols.extend(new_numeric)

print(f"Updated numeric columns: {numeric_cols}")
print(f"Updated categorical columns: {categorical_cols}")

Feature engineering completed. New features: ['bmi_risk_level', 'age_diabetes_risk', 'health_risk_score', 'activity_score', 'sleep_quality', 'lifestyle_score', 'location_risk']
Updated numeric columns: ['age', 'race:AfricanAmerican', 'race:Asian', 'race:Caucasian', 'race:Hispanic', 'race:Other', 'hypertension', 'heart_disease', 'bmi', 'hbA1c_level', 'blood_glucose_level', 'sleep_hours', 'family_history', 'medication_use', 'gestational_history', 'environmental_risk', 'bmi_risk_level', 'age_diabetes_risk', 'health_risk_score', 'activity_score', 'sleep_quality', 'lifestyle_score', 'location_risk']
Updated categorical columns: ['gender', 'location', 'smoking_history', 'bmi_category', 'age_group', 'physical_activity', 'diet_pattern', 'alcohol_intake', 'urban_rural', 'region_income', 'year']


# 11. Handle High Cardinality Categorical Variables

Process high cardinality categorical columns using frequency grouping or target encoding to reduce dimensionality.

In [12]:
# Handle high cardinality categorical variables
df, encoding_info = handle_high_cardinality_categorical(df, categorical_cols, target_col, max_categories=15)
print("High cardinality handling completed.")
print(f"Encoding info: {encoding_info}")

High cardinality handling completed.
Encoding info: {'location': {'method': 'frequency_grouping', 'kept_categories': 16, 'original_categories': 55}}


# 12. Prepare ML-Ready Data

Perform one-hot encoding, scaling of numeric features, and optional feature selection to prepare the data for machine learning models.

In [13]:
# Prepare ML-ready data

# One-hot encoding for categorical variables
categorical_for_encoding = [col for col in categorical_cols 
                           if not any(f'{col}_target_encoded' in colname for colname in df.columns)]

if categorical_for_encoding:
    print(f"Applying one-hot encoding to: {categorical_for_encoding}")
    df = pd.get_dummies(df, columns=categorical_for_encoding, drop_first=False)

# Update numeric columns list (include target encoded features, exclude categorical features)
target_encoded_cols = [col for col in df.columns if 'target_encoded' in col]

# Filter numeric_cols to only include truly numeric columns that exist in df
final_numeric_cols = []
for col in numeric_cols:
    if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
        final_numeric_cols.append(col)

# Add target encoded columns
final_numeric_cols.extend(target_encoded_cols)

# Scaling numeric features (excluding year which is now categorical)
if final_numeric_cols:
    print(f"Scaling {len(final_numeric_cols)} numeric features...")
    scaler = StandardScaler()
    df[final_numeric_cols] = scaler.fit_transform(df[final_numeric_cols])
    
    # Save scaler for later use
    import joblib
    scaler_path = 'data/processed_enhanced/feature_scaler.pkl'
    ensure_dir('data/processed_enhanced')
    joblib.dump(scaler, scaler_path)
    print(f"Scaler saved to: {scaler_path}")

# Feature selection (optional - select top K features for numeric columns only)
if target_col and target_col in df.columns and len(df.columns) > 50:
    print("Feature selection (too many features detected)...")
    
    # Separate features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]
    
    # Only apply feature selection to numeric columns
    numeric_feature_cols = [col for col in X.columns if pd.api.types.is_numeric_dtype(X[col])]
    categorical_feature_cols = [col for col in X.columns if not pd.api.types.is_numeric_dtype(X[col])]
    
    if numeric_feature_cols and len(numeric_feature_cols) > 30:
        # Select top K numeric features
        k = min(20, len(numeric_feature_cols))  # Select top 20 numeric or all available
        selector = SelectKBest(score_func=f_classif, k=k)
        X_numeric_selected = selector.fit_transform(X[numeric_feature_cols], y)
        
        # Get selected feature names
        selected_numeric_features = pd.Series(numeric_feature_cols)[selector.get_support()].tolist()
        
        # Combine selected numeric features with all categorical features and target
        selected_features = selected_numeric_features + categorical_feature_cols + [target_col]
        
        df = df[selected_features]
        
        # Save feature selection info
        feature_scores = pd.DataFrame({
            'feature': numeric_feature_cols,
            'score': selector.scores_,
            'selected': selector.get_support()
        }).sort_values('score', ascending=False)
        
        feature_scores.to_csv('data/processed_enhanced/feature_selection_report.csv', index=False)
        print(f"Selected {k} numeric features out of {len(numeric_feature_cols)} (kept all {len(categorical_feature_cols)} categorical features)")
    else:
        print("Skipping feature selection - not enough numeric features or features already manageable")

print(f"Final dataset: {df.shape[0]} rows √ó {df.shape[1]} columns")

Applying one-hot encoding to: ['gender', 'location', 'smoking_history', 'bmi_category', 'age_group', 'physical_activity', 'diet_pattern', 'alcohol_intake', 'urban_rural', 'region_income', 'year']
Scaling 21 numeric features...
Scaler saved to: data/processed_enhanced/feature_scaler.pkl
Feature selection (too many features detected)...
Selected 20 numeric features out of 75 (kept all 2 categorical features)
Final dataset: 100000 rows √ó 23 columns


# 13. Save Processed Data

Save the human-readable and ML-ready versions of the processed dataset to CSV files, along with reports and summaries.

In [14]:
# Save processed data

# Ensure output directory exists
ensure_dir('data/processed_enhanced')

# Save human-readable version
readable_path = 'data/processed_enhanced/diabetes_enhanced_readable.csv'
df.to_csv(readable_path, index=False)

# Save ML-ready version (same as readable in this case, since we did the transformations in place)
ml_path = 'data/processed_enhanced/diabetes_enhanced_ml_ready.csv'
df.to_csv(ml_path, index=False)

print("‚úÖ Processed data saved!")
print(f"üìÅ Human-readable data: {readable_path}")
print(f"ü§ñ ML-ready data: {ml_path}")
print(f"üìä Final dataset: {df.shape[0]} rows √ó {df.shape[1]} columns")
print(f"üìã EDA reports available in: {reports_dir}")

‚úÖ Processed data saved!
üìÅ Human-readable data: data/processed_enhanced/diabetes_enhanced_readable.csv
ü§ñ ML-ready data: data/processed_enhanced/diabetes_enhanced_ml_ready.csv
üìä Final dataset: 100000 rows √ó 23 columns
üìã EDA reports available in: data/reports_enhanced
