In [None]:
# Cell 1: Import Libraries and Setup
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import pearsonr, spearmanr
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Plot appearance: matplotlib style sheet, then the seaborn colour palette
plt.style.use("seaborn-v0_8")
sns.set_palette("husl")

# Suppress every warning for the rest of the session
warnings.filterwarnings("ignore")

# Notebook-wide figure defaults
plt.rcParams.update({"figure.figsize": (12, 8), "font.size": 10})

# Report banner
print("📊 Abalone Age Prediction - Comprehensive EDA", "=" * 50, sep="\n")

In [None]:
# Cell 1: Import Libraries and Setup
# NOTE(review): this cell repeats the preceding "Cell 1" cell line for line;
# running it again only re-imports the same modules and re-applies the same
# settings, so it looks safe to delete.
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import pearsonr, spearmanr
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Configure plotting
plt.style.use("seaborn-v0_8")
sns.set_palette("husl")
# Hides all warnings raised by later cells.
warnings.filterwarnings("ignore")

# Set figure size defaults
plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams["font.size"] = 10

print("📊 Abalone Age Prediction - Comprehensive EDA")
print("=" * 50)

In [None]:
# Cell 2: Load and Initial Data Inspection
# Load the dataset (the path is resolved relative to the working directory)
df = pd.read_csv("../data/abalone.csv")

print("🔍 DATASET OVERVIEW")
print("=" * 30)
print(f"Dataset shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print("\n📋 Column Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appended a stray "None" line).
df.info()

print("\n🎯 First 5 rows:")
df.head()

In [None]:
# Cell 3: Data Quality Assessment
print("🔍 DATA QUALITY ASSESSMENT")
print("=" * 40)

# Missing values: absolute count and share of rows, worst columns first
print("Missing values per column:")
missing_values = df.isnull().sum()
missing_percent = missing_values.div(len(df)).mul(100)
missing_df = pd.concat(
    [missing_values.rename("Missing Count"), missing_percent.rename("Missing Percentage")],
    axis=1,
).sort_values("Missing Count", ascending=False)
print(missing_df)

# Number of rows that exactly repeat an earlier row
duplicates = df.duplicated().sum()
print(f"\n🔄 Duplicate rows: {duplicates}")

# Column dtypes
print("\n📊 Data types:")
print(df.dtypes)

# Distinct values of every object-typed column
categorical_cols = df.select_dtypes(include=["object"]).columns
print("\n🏷️ Categorical columns unique values:")
for name in categorical_cols:
    print(f"{name}: {df[name].unique()}")

In [None]:
# Cell 4: Basic Statistical Summary
print("📈 STATISTICAL SUMMARY")
print("=" * 30)

# describe() output for the numeric columns
print("Numerical columns summary:")
numerical_summary = df.describe()
print(numerical_summary)

# Shape and spread measures that describe() does not report
print("\n📊 Additional Statistics:")
numerical_cols = df.select_dtypes(include=[np.number]).columns
numeric_frame = df[numerical_cols]

column_max = numeric_frame.max()
column_min = numeric_frame.min()
stats_df = pd.DataFrame(
    {
        "Skewness": numeric_frame.skew(),
        "Kurtosis": numeric_frame.kurtosis(),
        "Variance": numeric_frame.var(),
        "Range": column_max - column_min,
    }
)
print(stats_df)

In [None]:
# Cell 5: Target Variable Analysis (Rings/Age)
print("🎯 TARGET VARIABLE ANALYSIS")
print("=" * 35)

# Use 'Rings' as the target when present, otherwise fall back to the last column
target_col = "Rings" if "Rings" in df.columns else df.columns[-1]
print(f"Target variable: {target_col}")

target = df[target_col]
target_mean = target.mean()
target_median = target.median()

# 2x2 grid: histogram, box plot, Q-Q plot, density plot
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
hist_ax, box_ax = axes[0]
qq_ax, density_ax = axes[1]

# Top-left: histogram with mean and median markers
hist_ax.hist(target, bins=30, alpha=0.7, color="skyblue", edgecolor="black")
hist_ax.axvline(target_mean, color="red", linestyle="--", label=f"Mean: {target_mean:.2f}")
hist_ax.axvline(target_median, color="green", linestyle="--", label=f"Median: {target_median:.2f}")
hist_ax.set_title(f"Distribution of {target_col}")
hist_ax.set_xlabel(target_col)
hist_ax.set_ylabel("Frequency")
hist_ax.legend()

# Top-right: box plot
box_ax.boxplot(target)
box_ax.set_title(f"Box Plot of {target_col}")
box_ax.set_ylabel(target_col)

# Bottom-left: Q-Q plot against the normal distribution
stats.probplot(target, dist="norm", plot=qq_ax)
qq_ax.set_title(f"Q-Q Plot of {target_col}")

# Bottom-right: density plot
target.plot(kind="density", ax=density_ax, color="purple")
density_ax.set_title(f"Density Plot of {target_col}")
density_ax.set_xlabel(target_col)

plt.tight_layout()
plt.show()

# Numeric summary of the target
print(f"\n📊 {target_col} Statistics:")
print(f"Mean: {target_mean:.2f}")
print(f"Median: {target_median:.2f}")
print(f"Mode: {target.mode().iloc[0]:.2f}")
print(f"Standard Deviation: {target.std():.2f}")
print(f"Skewness: {target.skew():.2f}")
print(f"Kurtosis: {target.kurtosis():.2f}")
print(f"Range: {target.min():.2f} - {target.max():.2f}")

In [None]:
# Cell 6: Categorical Variable Analysis
print("🏷️ CATEGORICAL VARIABLES ANALYSIS")
print("=" * 40)

categorical_cols = df.select_dtypes(include=["object"]).columns

if len(categorical_cols) == 0:
    print("No categorical variables found in the dataset.")

# One report per categorical column (the loop does nothing when there are none)
for col in categorical_cols:
    print(f"\n📊 Analysis of {col}:")

    # Absolute frequencies
    value_counts = df[col].value_counts()
    print("Value counts:")
    print(value_counts)

    # Relative frequencies in percent
    proportions = df[col].value_counts(normalize=True) * 100
    print("\nProportions (%):")
    print(proportions.round(2))

    # Three panels: counts, shares, mean target per category
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    count_ax, pie_ax, target_ax = axes

    value_counts.plot(kind="bar", ax=count_ax, color="lightcoral")
    count_ax.set_title(f"Count of {col}")
    count_ax.set_xlabel(col)
    count_ax.set_ylabel("Count")
    count_ax.tick_params(axis="x", rotation=45)

    pie_ax.pie(value_counts.values, labels=value_counts.index, autopct="%1.1f%%", startangle=90)
    pie_ax.set_title(f"Distribution of {col}")

    # The third panel stays empty when the target column is not in the frame
    if target_col in df.columns:
        mean_target_by_category = df.groupby(col)[target_col].mean()
        mean_target_by_category.plot(kind="bar", ax=target_ax, color="lightgreen")
        target_ax.set_title(f"Average {target_col} by {col}")
        target_ax.set_xlabel(col)
        target_ax.set_ylabel(f"Average {target_col}")
        target_ax.tick_params(axis="x", rotation=45)

    plt.tight_layout()
    plt.show()

In [None]:
# Cell 7: Numerical Variables Distribution Analysis
print("📊 NUMERICAL VARIABLES DISTRIBUTION")
print("=" * 40)

numerical_cols = df.select_dtypes(include=[np.number]).columns
n_cols = len(numerical_cols)

# Three plots per row; keep at least one row so plt.subplots() stays valid.
n_rows = max(1, (n_cols + 2) // 3)
# squeeze=False always returns a 2-D array of Axes, so ravel() yields a flat
# sequence of individual Axes for every grid shape. The previous "[axes]"
# wrapping broke the single-row case (it produced a list holding one array).
fig, axes = plt.subplots(n_rows, 3, figsize=(18, 6 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    values = df[col]

    # Density-normalised histogram so the KDE curve shares its y-scale
    ax.hist(values, bins=30, alpha=0.7, density=True, color="skyblue", edgecolor="black")

    # KDE curve on top of the histogram
    values.plot(kind="density", ax=ax, color="red", linewidth=2)

    # Mean and median reference lines
    col_mean = values.mean()
    col_median = values.median()
    ax.axvline(col_mean, color="orange", linestyle="--", label=f"Mean: {col_mean:.2f}")
    ax.axvline(col_median, color="green", linestyle="--", label=f"Median: {col_median:.2f}")

    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Density")
    ax.legend()

# Remove the grid cells that have no column to show
for unused_ax in axes[n_cols:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Distribution statistics
print("\n📈 Distribution Characteristics:")
for col in numerical_cols:
    skew = df[col].skew()
    kurt = df[col].kurtosis()

    skew_interpretation = "Right-skewed" if skew > 0.5 else "Left-skewed" if skew < -0.5 else "Approximately symmetric"
    # pandas reports *excess* kurtosis (Fisher's definition, normal == 0.0),
    # so the tail classification is relative to 0, not to 3.
    kurt_interpretation = "Heavy-tailed" if kurt > 0 else "Light-tailed" if kurt < 0 else "Normal-tailed"

    print(f"{col}: {skew_interpretation} (skew: {skew:.2f}), {kurt_interpretation} (kurtosis: {kurt:.2f})")

In [None]:
# Cell 8: Outlier Detection and Analysis
print("🚨 OUTLIER DETECTION AND ANALYSIS", "=" * 40, sep="\n")


def detect_outliers_iqr(data, column):
    """Flag rows whose ``column`` value lies outside the 1.5 * IQR fences.

    Returns a ``(outliers, lower_bound, upper_bound)`` tuple: the rows of
    ``data`` strictly below the lower fence or strictly above the upper
    fence, followed by the two fence values.
    """
    values = data[column]
    first_quartile, third_quartile = (values.quantile(q) for q in (0.25, 0.75))
    spread = third_quartile - first_quartile

    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread

    # Strict comparisons: values exactly on a fence are not outliers
    outside_fences = values.lt(lower_bound) | values.gt(upper_bound)
    return data[outside_fences], lower_bound, upper_bound


def detect_outliers_zscore(data, column, threshold=3):
    """Detect outliers using the Z-score method.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    column : label
        Column whose values are standardised.
    threshold : float, default 3
        A row is an outlier when the absolute z-score of its ``column``
        value is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` flagged as outliers. Rows with a missing value
        in ``column`` are never flagged.
    """
    values = data[column].to_numpy(dtype=float, na_value=np.nan)
    # nan_policy="omit" leaves missing values out of the mean/std. With the
    # default ("propagate") a single NaN turns every z-score into NaN, so no
    # outlier would ever be reported for that column.
    z_scores = np.abs(stats.zscore(values, nan_policy="omit"))
    # NaN z-scores compare False, so missing rows drop out of the mask.
    return data[z_scores > threshold]


# Outlier analysis for each numerical column
outlier_summary = []
zscore_outliers_by_col = {}  # kept so the extreme-outlier report can reuse them

# Two box plots per row; keep at least one row so plt.subplots() stays valid.
n_box_rows = max(1, (len(numerical_cols) + 1) // 2)
# squeeze=False + ravel() gives a flat sequence of Axes for every column
# count. The previous "[axes]" wrapping broke the exactly-two-columns case.
fig, axes = plt.subplots(n_box_rows, 2, figsize=(15, 6 * n_box_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    # IQR method
    outliers_iqr, lower_bound, upper_bound = detect_outliers_iqr(df, col)

    # Z-score method (helper default threshold: |z| > 3)
    outliers_zscore = detect_outliers_zscore(df, col)
    zscore_outliers_by_col[col] = outliers_zscore

    iqr_percentage = len(outliers_iqr) / len(df) * 100

    # Store results
    outlier_summary.append(
        {
            "Column": col,
            "IQR_Outliers": len(outliers_iqr),
            "IQR_Percentage": iqr_percentage,
            "ZScore_Outliers": len(outliers_zscore),
            "ZScore_Percentage": len(outliers_zscore) / len(df) * 100,
        }
    )

    # Box plot (vertical is the default orientation)
    ax.boxplot(df[col])
    ax.set_title(f"Box Plot: {col}\nIQR Outliers: {len(outliers_iqr)} ({iqr_percentage:.1f}%)")
    ax.set_ylabel(col)

# Remove the grid cells that have no column to show
for unused_ax in axes[len(numerical_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Outlier summary table
outlier_df = pd.DataFrame(outlier_summary)
print("\n📊 Outlier Summary:")
print(outlier_df)

# Extreme outliers (beyond 3 standard deviations): these are exactly the rows
# the z-score helper returned above, so reuse them instead of re-filtering.
print("\n🔍 Extreme Outliers Analysis:")
for col in numerical_cols:
    extreme_outliers = zscore_outliers_by_col[col]
    if len(extreme_outliers) > 0:
        print(f"{col}: {len(extreme_outliers)} extreme outliers ({len(extreme_outliers) / len(df) * 100:.2f}%)")
        print(f"  Values: {extreme_outliers[col].tolist()}")

In [None]:
# Cell 9: Correlation Analysis
print("🔗 CORRELATION ANALYSIS")
print("=" * 30)

# Pairwise correlations between the numeric columns
correlation_matrix = df[numerical_cols].corr()

# Heatmap with the upper triangle (diagonal included) masked out
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    cmap="coolwarm",
    center=0,
    square=True,
    fmt=".2f",
    cbar_kws={"shrink": 0.8},
)
plt.title("Correlation Matrix Heatmap")
plt.tight_layout()
plt.show()

# Strong correlations (absolute value > 0.7): visit each unordered pair once
# by walking the strict upper triangle in row-major order
print("\n🔥 Strong Correlations (|r| > 0.7):")
labels = correlation_matrix.columns
upper_rows, upper_cols = np.triu_indices(len(labels), k=1)
strong_corr = [
    {
        "Variable 1": labels[row],
        "Variable 2": labels[col_pos],
        "Correlation": correlation_matrix.iloc[row, col_pos],
    }
    for row, col_pos in zip(upper_rows, upper_cols)
    if abs(correlation_matrix.iloc[row, col_pos]) > 0.7
]

if not strong_corr:
    print("No strong correlations found.")
else:
    strong_corr_df = pd.DataFrame(strong_corr)
    print(strong_corr_df.sort_values("Correlation", key=abs, ascending=False))

# Correlation of every other numeric column with the target, strongest first
if target_col in numerical_cols:
    target_corr = correlation_matrix[target_col].drop(target_col).sort_values(key=abs, ascending=False)

    print(f"\n🎯 Correlation with {target_col}:")
    print(target_corr)

    # Horizontal bars: red for negative coefficients, green otherwise
    bar_colors = np.where(target_corr.values < 0, "red", "green").tolist()
    plt.figure(figsize=(10, 8))
    target_corr.plot(kind="barh", color=bar_colors)
    plt.title(f"Correlation with {target_col}")
    plt.xlabel("Correlation Coefficient")
    plt.tight_layout()
    plt.show()

In [None]:
# Cell 10: Feature Relationships and Scatter Plots
print("🔍 FEATURE RELATIONSHIPS")
print("=" * 30)

# Pairwise scatter plots for highly correlated features with target
if target_col in numerical_cols:
    # Get top correlated features with target
    target_corr = df[numerical_cols].corrwith(df[target_col]).abs().sort_values(ascending=False)
    top_features = target_corr.head(6).index.tolist()  # Top 6 including target

    if target_col not in top_features:
        top_features.append(target_col)

    # sns.pairplot is a figure-level function: it creates its own figure, so
    # a preceding plt.figure() would only leave a blank extra figure in the
    # output. The title is attached to the grid's own figure instead.
    pair_grid = sns.pairplot(df[top_features], diag_kind="kde", plot_kws={"alpha": 0.6})
    pair_grid.figure.suptitle("Pairwise Relationships - Top Correlated Features", y=1.02)
    plt.show()

    # Individual scatter plots with target
    other_features = [col for col in top_features if col != target_col][:4]  # Top 4 features

    if other_features:
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        axes = axes.flatten()

        for ax, feature in zip(axes, other_features):
            # Scatter plot
            ax.scatter(df[feature], df[target_col], alpha=0.6, color="blue")

            # Least-squares line, drawn between the two x extremes only.
            # Plotting it through every unsorted data point made the dashed
            # line retrace itself and smear the dash pattern.
            slope_intercept = np.polyfit(df[feature], df[target_col], 1)
            x_ends = np.array([df[feature].min(), df[feature].max()])
            ax.plot(x_ends, np.polyval(slope_intercept, x_ends), "r--", alpha=0.8)

            # Calculate correlation
            corr_coef = df[feature].corr(df[target_col])

            ax.set_xlabel(feature)
            ax.set_ylabel(target_col)
            ax.set_title(f"{feature} vs {target_col}\nCorrelation: {corr_coef:.3f}")
            ax.grid(True, alpha=0.3)

        # Drop panels left empty when fewer than four features are available
        for unused_ax in axes[len(other_features):]:
            fig.delaxes(unused_ax)

        plt.tight_layout()
        plt.show()

In [None]:
# Cell 11: Categorical vs Numerical Analysis
print("🔀 CATEGORICAL vs NUMERICAL ANALYSIS")
print("=" * 45)

if len(categorical_cols) > 0 and len(numerical_cols) > 0:
    # Only the first 4 numerical columns are tested and plotted (2x2 grid)
    num_cols_subset = numerical_cols[:4]

    for cat_col in categorical_cols:
        print(f"\n📊 Analysis of {cat_col} vs Numerical Variables:")

        # Statistical tests
        for num_col in num_cols_subset:
            # One array of values per category level
            groups = [df[df[cat_col] == category][num_col].values for category in df[cat_col].unique()]

            # One-way ANOVA across the category levels. The test stays
            # best-effort, but only ordinary errors are caught (a bare
            # "except:" also trapped KeyboardInterrupt) and the reason is shown.
            try:
                f_stat, p_value = stats.f_oneway(*groups)
                print(f"{cat_col} vs {num_col}: F-statistic = {f_stat:.3f}, p-value = {p_value:.3f}")
            except Exception as exc:
                print(f"{cat_col} vs {num_col}: Could not perform ANOVA ({exc})")

        # Visualizations
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        axes = axes.flatten()

        for ax, num_col in zip(axes, num_cols_subset):
            # Box plot of the numeric column split by category level
            df.boxplot(column=num_col, by=cat_col, ax=ax)
            ax.set_title(f"{num_col} by {cat_col}")
            ax.set_xlabel(cat_col)
            ax.set_ylabel(num_col)

        plt.tight_layout()
        plt.show()

        # Summary statistics by category
        print(f"\n📈 Summary Statistics of Numerical Variables by {cat_col}:")
        summary_by_cat = df.groupby(cat_col)[num_cols_subset].agg(["mean", "median", "std"])
        print(summary_by_cat.round(3))

In [None]:
# Cell 12: Advanced Statistical Analysis
print("📊 ADVANCED STATISTICAL ANALYSIS")
print("=" * 40)

# Normality tests
print("🔍 Normality Tests (Shapiro-Wilk):")
print("H0: Data is normally distributed")
print("H1: Data is not normally distributed")
print("-" * 50)

# Each column is tested on a reproducible random sample of at most 1000 rows
# (fixed seed), not on the full column.
sample_size = min(1000, len(df))

normality_results = []
for col in numerical_cols:
    sample_data = df[col].sample(sample_size, random_state=42)

    stat, p_value = stats.shapiro(sample_data)
    # H0 (normality) is kept when p exceeds the 5% level
    is_normal = p_value > 0.05

    normality_results.append({"Variable": col, "Statistic": stat, "P-value": p_value, "Is_Normal": is_normal})

    print(f"{col}: Statistic = {stat:.4f}, P-value = {p_value:.4f}, Normal: {is_normal}")

normality_df = pd.DataFrame(normality_results)

# Anderson-Darling test for normality
print("\n🔍 Anderson-Darling Test Results:")
for col in numerical_cols[:3]:  # Limit for space
    result = stats.anderson(df[col], dist="norm")
    print(f"{col}: Statistic = {result.statistic:.4f}")
    # Report the first significance level whose critical value exceeds the
    # statistic; the for/else branch runs only when none does.
    for critical_value, significance_level in zip(result.critical_values, result.significance_level):
        if result.statistic < critical_value:
            print(f"  At {significance_level}% significance: Data appears normal")
            break
    else:
        print("  Data does not appear normal at standard significance levels")

# Variance homogeneity test (if categorical variables exist)
if len(categorical_cols) > 0:
    print("\n🔍 Levene's Test for Variance Homogeneity:")
    for cat_col in categorical_cols:
        for num_col in numerical_cols[:3]:  # Limit for space
            # One array of values per category level
            groups = [df[df[cat_col] == category][num_col].values for category in df[cat_col].unique()]
            # Best-effort test: catch ordinary errors only (a bare "except:"
            # also trapped KeyboardInterrupt) and show why the test failed.
            try:
                stat, p_value = stats.levene(*groups)
                print(f"{num_col} across {cat_col}: Levene statistic = {stat:.4f}, p-value = {p_value:.4f}")
            except Exception as exc:
                print(f"{num_col} across {cat_col}: Could not perform Levene's test ({exc})")

In [None]:
# Cell 13: Dimensionality Reduction Analysis
print("🔄 DIMENSIONALITY REDUCTION ANALYSIS")
print("=" * 45)

# Prepare data for PCA
# NOTE(review): numerical_cols holds every numeric column, so a numeric target
# column is part of the PCA input as well — confirm this is intended.
numerical_data = df[numerical_cols].copy()

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numerical_data)

# Perform PCA (all components are kept)
pca = PCA()
pca_result = pca.fit_transform(scaled_data)

# Explained variance
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)
n_components = len(explained_variance_ratio)

print("📊 PCA Results:")
print(f"Number of components: {n_components}")
# Only report the 2- and 3-component totals that exist; indexing them
# unconditionally raised IndexError for frames with fewer numeric columns.
for k in (2, 3):
    if n_components >= k:
        print(f"Total variance explained by first {k} components: {cumulative_variance[k - 1]:.3f}")

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Scree plot
axes[0, 0].plot(range(1, n_components + 1), explained_variance_ratio, "bo-")
axes[0, 0].set_title("Scree Plot")
axes[0, 0].set_xlabel("Principal Component")
axes[0, 0].set_ylabel("Explained Variance Ratio")
axes[0, 0].grid(True, alpha=0.3)

# Cumulative variance plot
axes[0, 1].plot(range(1, n_components + 1), cumulative_variance, "ro-")
axes[0, 1].axhline(y=0.8, color="green", linestyle="--", label="80% Variance")
axes[0, 1].axhline(y=0.95, color="orange", linestyle="--", label="95% Variance")
axes[0, 1].set_title("Cumulative Explained Variance")
axes[0, 1].set_xlabel("Number of Components")
axes[0, 1].set_ylabel("Cumulative Explained Variance")
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# PCA scatter plot (first 2 components), coloured by the target value
if n_components >= 2:
    scatter = axes[1, 0].scatter(pca_result[:, 0], pca_result[:, 1], c=df[target_col], cmap="viridis", alpha=0.6)
    axes[1, 0].set_title("PCA: First Two Components")
    axes[1, 0].set_xlabel(f"PC1 ({explained_variance_ratio[0]:.3f} variance)")
    axes[1, 0].set_ylabel(f"PC2 ({explained_variance_ratio[1]:.3f} variance)")
    plt.colorbar(scatter, ax=axes[1, 0], label=target_col)
else:
    # A single component has no second axis to plot against
    axes[1, 0].set_axis_off()

# Component loadings heatmap: up to the first 4 components. The column labels
# are sized from the data so fewer than 4 components no longer breaks the frame.
n_loading_components = min(4, n_components)
components_df = pd.DataFrame(
    pca.components_[:n_loading_components].T,
    columns=[f"PC{i + 1}" for i in range(n_loading_components)],
    index=numerical_cols,
)

sns.heatmap(components_df, annot=True, cmap="coolwarm", center=0, ax=axes[1, 1])
axes[1, 1].set_title("PCA Component Loadings")

plt.tight_layout()
plt.show()

# Component interpretation
print("\n🔍 Component Loadings (First 3 Components):")
for i in range(min(3, n_components)):
    print(f"\nPC{i + 1} (Explains {explained_variance_ratio[i]:.3f} of variance):")
    loadings = pd.Series(pca.components_[i], index=numerical_cols)
    # Rank by magnitude, but print the signed loading
    top_loadings = loadings.abs().sort_values(ascending=False).head()
    for feature in top_loadings.index:
        print(f"  {feature}: {loadings[feature]:.3f}")

In [None]:
# Cell 14: Data Quality and Recommendations
print("✅ DATA QUALITY SUMMARY & RECOMMENDATIONS")
print("=" * 50)

total_missing = df.isnull().sum().sum()

# Absolute correlation of each numeric column with the target, strongest
# first, computed once for the whole cell. target_corr keeps the target's own
# entry (|r| = 1) because a later cell reads it; feature_corr drops that entry
# so the counts below need no "-1" corrections.
has_numeric_target = target_col in numerical_cols
if has_numeric_target:
    target_corr = df[numerical_cols].corrwith(df[target_col]).abs().sort_values(ascending=False)
    feature_corr = target_corr.drop(target_col)

# Data quality summary
print("📋 DATA QUALITY SUMMARY:")
print(f"✓ Dataset shape: {df.shape[0]} rows, {df.shape[1]} columns")
print(f"✓ Missing values: {total_missing} total")
print(f"✓ Duplicate rows: {df.duplicated().sum()}")
print(f"✓ Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Feature summary
print("\n📊 FEATURE SUMMARY:")
print(f"✓ Numerical features: {len(numerical_cols)}")
print(f"✓ Categorical features: {len(categorical_cols)}")

# Distribution summary (Shapiro-Wilk verdicts collected in normality_results)
normal_features = int(sum(result["Is_Normal"] for result in normality_results))
print(f"✓ Normally distributed features: {normal_features}/{len(numerical_cols)}")

# Correlation insights
if has_numeric_target:
    high_corr_features = int((feature_corr > 0.5).sum())
    print(f"✓ Features highly correlated with target (|r| > 0.5): {high_corr_features}")

print("\n🔍 KEY INSIGHTS:")

# Target variable insights
if target_col in df.columns:
    print(f"• {target_col} distribution: Mean = {df[target_col].mean():.2f}, Std = {df[target_col].std():.2f}")

    skew = df[target_col].skew()
    if abs(skew) > 0.5:
        skew_direction = "right" if skew > 0 else "left"
        print(f"• {target_col} is {skew_direction}-skewed (skewness = {skew:.2f})")

# Outlier insights. total_outliers adds up the per-column counts, so a row
# that is extreme in several columns is counted several times.
total_outliers = sum(result["IQR_Outliers"] for result in outlier_summary)
print(f"• Total outliers detected (IQR method): {total_outliers}")

# Rows with at least one value outside the 1.5 * IQR fences. This is the
# figure that can be compared with the number of rows.
numeric_values = df[numerical_cols]
q1 = numeric_values.quantile(0.25)
q3 = numeric_values.quantile(0.75)
iqr = q3 - q1
outside_fences = numeric_values.lt(q1 - 1.5 * iqr, axis="columns") | numeric_values.gt(q3 + 1.5 * iqr, axis="columns")
rows_with_outliers = int(outside_fences.any(axis="columns").sum())
print(f"• Rows with at least one IQR outlier: {rows_with_outliers} ({rows_with_outliers / len(df) * 100:.1f}% of rows)")

# Feature relationship insights
if has_numeric_target and len(feature_corr) > 0:
    top_feature = feature_corr.index[0]
    print(f"• Strongest predictor: {top_feature} (correlation = {feature_corr[top_feature]:.3f})")

print("\n💡 RECOMMENDATIONS:")

# Missing data recommendations
if total_missing > 0:
    print("• Handle missing values before modeling")

# Outlier recommendations: the 5% rule is applied to affected rows, not to
# the column-summed count (which could exceed the number of rows).
if rows_with_outliers > len(df) * 0.05:
    print("• Consider outlier treatment (>5% of data points are outliers)")

# Normality recommendations
non_normal_features = len(numerical_cols) - normal_features
if non_normal_features > 0:
    print(f"• Consider data transformation for {non_normal_features} non-normal features")

# Correlation recommendations. The target's own |r| = 1 is never below 0.1,
# so subtracting 1 for it (as before) under-counted the weak features.
if has_numeric_target:
    weak_features = int((feature_corr < 0.1).sum())
    if weak_features > 0:
        print(f"• Consider feature selection: {weak_features} features have weak correlation with target")

# PCA recommendations: first component count whose cumulative variance is >= 80%
variance_80_components = int(np.argmax(cumulative_variance >= 0.8)) + 1
if variance_80_components < len(numerical_cols):
    print(f"• Dimensionality reduction possible: {variance_80_components} components explain 80% variance")

print("\n🎯 MODELING SUGGESTIONS:")
print("• Consider ensemble methods due to feature complexity")
print("• Evaluate both linear and non-linear models")
print("• Use cross-validation for robust model evaluation")
print("• Monitor for overfitting given the feature relationships")

print("\n📝 NEXT STEPS:")
print("1. Handle data quality issues (missing values, outliers)")
print("2. Feature engineering based on correlation insights")
print("3. Split data into train/validation/test sets")
print("4. Baseline model development")
print("5. Feature selection and hyperparameter tuning")

In [None]:
# Cell 15: Export Summary Report
import json

print("📄 GENERATING SUMMARY REPORT")
print("=" * 35)


def _json_default(value):
    """Fallback encoder for json.dump: unwrap numpy scalars, stringify the rest.

    numpy integers and booleans are not JSON serialisable, so with
    ``default=str`` they were written as strings ("0") instead of numbers.
    """
    if isinstance(value, np.generic):
        return value.item()
    return str(value)


# Absolute correlation of each feature with the target, strongest first.
# Computed here rather than read from a variable left behind by another cell.
if target_col in numerical_cols:
    report_corr = (
        df[numerical_cols].corrwith(df[target_col]).abs().sort_values(ascending=False).drop(target_col)
    )
else:
    report_corr = pd.Series(dtype=float)

has_predictor = len(report_corr) > 0
has_target = target_col in df.columns

# Create a comprehensive summary dictionary
summary_report = {
    "dataset_info": {
        "shape": df.shape,
        "memory_usage_mb": df.memory_usage(deep=True).sum() / 1024**2,
        "missing_values": df.isnull().sum().sum(),
        "duplicates": df.duplicated().sum(),
    },
    "feature_info": {
        "numerical_features": len(numerical_cols),
        "categorical_features": len(categorical_cols),
        "total_features": len(df.columns),
    },
    "target_analysis": {
        "target_variable": target_col,
        "mean": df[target_col].mean() if has_target else None,
        "std": df[target_col].std() if has_target else None,
        "skewness": df[target_col].skew() if has_target else None,
        "range": [df[target_col].min(), df[target_col].max()] if has_target else None,
    },
    "correlation_insights": {
        # strong_corr only exists when the correlation cell has been run
        "strong_correlations": len(strong_corr) if "strong_corr" in locals() else 0,
        "top_predictor": report_corr.index[0] if has_predictor else None,
        "top_correlation": float(report_corr.iloc[0]) if has_predictor else None,
    },
    "data_quality": {
        # Per-column IQR outlier counts summed and divided by the row count;
        # a row flagged in several columns is counted once per column.
        "outlier_percentage": (total_outliers / len(df)) * 100,
        "normal_features": normal_features,
        "non_normal_features": len(numerical_cols) - normal_features,
    },
    "pca_insights": {
        "components_for_80_variance": int(variance_80_components),
        # None when the data has a single principal component
        "first_two_components_variance": float(cumulative_variance[1]) if len(cumulative_variance) > 1 else None,
    },
}

# Save summary to file
with open("../data/eda_summary.json", "w", encoding="utf-8") as f:
    json.dump(summary_report, f, indent=2, default=_json_default)

print("✅ EDA Summary saved to '../data/eda_summary.json'")

# Display final summary
print("\n🎉 EDA COMPLETE!")
print("=" * 20)
print("Key files generated:")
print("• EDA summary: ../data/eda_summary.json")
print("\nThis comprehensive EDA provides insights for:")
print("• Data preprocessing decisions")
print("• Feature engineering strategies")
print("• Model selection guidance")
print("• Performance evaluation metrics")

print("\n📱 Dataset Overview:")
print(f"• {df.shape[0]:,} samples with {df.shape[1]} features")
print(f"• Target variable: {target_col}")
print("• Ready for ML pipeline development! 🚀")