# **AI TECH INSTITUTE** · *Intermediate AI & Data Science*
### Week 04 · Notebook 05 – Statistical EDA Fundamentals
**Instructor:** Amir Charkhi  |  **Goal:** Master statistical techniques for exploratory data analysis.

> Format: short theory → quick practice → build understanding → mini-challenges.


---
## Learning Objectives
- Use statistical summaries to understand data
- Detect outliers and anomalies statistically
- Understand relationships between variables
- Build a comprehensive EDA workflow

## 1. The Five-Number Summary & Beyond
We start with the basics and then go deeper.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Suppress every warning for the rest of the session so cell output stays clean.
warnings.filterwarnings('ignore')

# Set style
sns.set_style('whitegrid')  # seaborn "whitegrid" theme for the plots that follow
plt.rcParams['figure.figsize'] = (12, 5)  # default figure size used when a plot does not pass figsize

In [None]:
# Generate realistic e-commerce data
np.random.seed(42)  # fixed seed: every run produces the same dataset
n_customers = 1000
n_injected_outliers = 10

# Create customer purchase data.
# NOTE: the columns draw from the global NumPy RNG in the order written below,
# so reordering them changes the generated values.
customer_data = pd.DataFrame({
    'customer_id': range(1, n_customers + 1),
    # Normal ages are clipped to 18-80 and then truncated to whole years.
    'age': np.random.normal(35, 12, n_customers).clip(18, 80).astype(int),
    'total_purchases': np.random.poisson(5, n_customers),
    'avg_order_value': np.random.lognormal(3.5, 0.8, n_customers),
    'days_since_signup': np.random.exponential(180, n_customers),
    'email_opens': np.random.binomial(20, 0.3, n_customers),
    'is_premium': np.random.choice([0, 1], n_customers, p=[0.8, 0.2])
})

# Add some outliers: multiply the order value of 10 *distinct* customers by 10.
# replace=False matters here: sampling with replacement can pick the same row
# twice, which would silently inject fewer outliers than intended.
outlier_rows = np.random.choice(customer_data.index, n_injected_outliers, replace=False)
customer_data.loc[outlier_rows, 'avg_order_value'] *= 10

print("Dataset Overview:")
print(customer_data.head())
print(f"\nShape: {customer_data.shape}")

In [None]:
def comprehensive_summary(df, column):
    """Build an extended descriptive-statistics profile for one column.

    Missing values are removed before any statistic is computed; only the
    ``missing`` entry is counted on the raw column.

    Args:
        df: DataFrame that contains ``column``.
        column: label of the column to summarise.

    Returns:
        pandas Series keyed by statistic name (count, missing, mean, median,
        mode, std, variance, min, Q1, Q2, Q3, max, IQR, range, skewness,
        kurtosis, CV). ``mode`` is None when there are no non-missing values;
        ``CV`` is the coefficient of variation in percent.
    """
    raw = df[column]
    values = raw.dropna()

    # Compute each reused quantity once.
    modes = values.mode()
    q1, q2, q3 = (values.quantile(q) for q in (0.25, 0.50, 0.75))
    lowest, highest = values.min(), values.max()
    mean, std = values.mean(), values.std()

    profile = {}
    profile['count'] = len(values)
    profile['missing'] = raw.isna().sum()
    profile['mean'] = mean
    profile['median'] = values.median()
    # With ties, mode() returns several values; the first one is reported.
    profile['mode'] = modes[0] if len(modes) > 0 else None
    profile['std'] = std
    profile['variance'] = values.var()
    profile['min'] = lowest
    profile['Q1'] = q1
    profile['Q2'] = q2
    profile['Q3'] = q3
    profile['max'] = highest
    profile['IQR'] = q3 - q1
    profile['range'] = highest - lowest
    profile['skewness'] = values.skew()
    profile['kurtosis'] = values.kurtosis()
    profile['CV'] = std / mean * 100  # Coefficient of variation, in percent

    return pd.Series(profile)

# Apply to our data: print every statistic, one per line
print("Comprehensive Summary: Average Order Value")
print("="*50)
summary = comprehensive_summary(customer_data, 'avg_order_value')
for key, value in summary.items():
    # CV is a percentage, so it gets one decimal and a % sign; the rest get two decimals.
    rendered = f"{value:.1f}%" if key == 'CV' else f"{value:.2f}"
    print(f"{key:15s}: {rendered}")

In [None]:
# Visual statistical summary: box plot, histogram and violin plot side by side
order_values = customer_data['avg_order_value']
mean_value = order_values.mean()
median_value = order_values.median()

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
box_ax, hist_ax, violin_ax = axes

# Panel 1 - box plot with annotations
box = box_ax.boxplot(order_values, vert=True, patch_artist=True)
box['boxes'][0].set_facecolor('lightblue')
box_ax.set_ylabel('Average Order Value ($)')
box_ax.set_title('Five-Number Summary')

# Label the quartiles just to the right of the box (the box sits at x=1)
quartiles = order_values.quantile([0.25, 0.5, 0.75])
for label, q in zip(['Q1', 'Median', 'Q3'], quartiles):
    box_ax.annotate(f'{label}: ${q:.0f}', xy=(1.2, q), fontsize=10)

# Panel 2 - histogram with mean and median marked
hist_ax.hist(order_values, bins=50, density=True, alpha=0.7, color='green')
hist_ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: ${mean_value:.0f}')
hist_ax.axvline(median_value, color='blue', linestyle='--', label=f'Median: ${median_value:.0f}')
hist_ax.set_xlabel('Average Order Value ($)')
hist_ax.set_ylabel('Density')
hist_ax.set_title('Distribution with Central Tendency')
hist_ax.legend()

# Panel 3 - violin plot for shape understanding
parts = violin_ax.violinplot(
    [order_values],
    positions=[1],
    showmeans=True,
    showmedians=True,
)
violin_ax.set_ylabel('Average Order Value ($)')
violin_ax.set_title('Distribution Shape')
violin_ax.set_xticks([])

plt.tight_layout()
plt.show()

**Exercise 1 – Interpret Skewness (easy)**  
Calculate and interpret skewness for different variables.


In [None]:
# Your turn
# Calculate skewness for age, total_purchases, and days_since_signup
# Interpret what each tells you about the distribution


<details>
<summary><b>Solution</b></summary>

```python
variables = ['age', 'total_purchases', 'days_since_signup']

# Compute each skewness once; the values feed both the printed report and the
# histogram titles below.
skewness = {var: customer_data[var].skew() for var in variables}

print("Skewness Analysis:")
print("="*50)
for var in variables:
    skew = skewness[var]
    print(f"\n{var}:")
    print(f"  Skewness: {skew:.3f}")

    if abs(skew) < 0.5:
        interpretation = "Fairly symmetric"
    elif skew > 0:
        # |skew| >= 0.5 at this point, so the sign alone gives the direction.
        # Comparing against 0.5 here would label a skew of exactly 0.5 as left-skewed.
        interpretation = "Right-skewed (long tail to the right)"
    else:
        interpretation = "Left-skewed (long tail to the left)"

    print(f"  Interpretation: {interpretation}")

    # Only a strong skew (|skew| > 1) triggers a transformation hint.
    if skew > 1:
        print(f"  💡 Consider log transformation")
    elif skew < -1:
        print(f"  💡 Consider square transformation")

# Visualize: one histogram per variable, titled with its skewness
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, var in zip(axes, variables):
    ax.hist(customer_data[var], bins=30, alpha=0.7, edgecolor='black')
    ax.set_title(f'{var}\nSkew: {skewness[var]:.2f}')
    ax.set_xlabel(var)
plt.tight_layout()
plt.show()
```
</details>

## 2. Statistical Outlier Detection
Multiple methods to find anomalies.

In [None]:
def detect_outliers_multiple_methods(df, column):
    """Compare different outlier detection methods on one column.

    Missing values are dropped first. Four detectors are applied:

    * ``'IQR'`` - values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.
    * ``'Z-Score'`` - values more than 3 standard deviations from the mean.
    * ``'Modified Z-Score'`` - median/MAD based score with |score| > 3.5.
    * ``'Isolation Forest'`` - scikit-learn model with 5% contamination.

    Args:
        df: DataFrame that contains ``column``.
        column: label of the numeric column to scan.

    Returns:
        dict mapping method name to a Series holding the flagged values
        (each Series keeps the row labels of ``df``).

    Raises:
        ImportError: if scikit-learn is not installed.
    """
    data = df[column].dropna()

    outliers = {}

    # Method 1: IQR Method
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers['IQR'] = data[(data < lower_bound) | (data > upper_bound)]

    # Method 2: Z-Score Method
    z_scores = np.abs(stats.zscore(data))
    outliers['Z-Score'] = data[z_scores > 3]

    # Method 3: Modified Z-Score (using MAD)
    median = data.median()
    abs_deviation = np.abs(data - median)
    mad = np.median(abs_deviation)
    if mad > 0:
        # 0.6745 is the 75th percentile of the standard normal; it rescales
        # the MAD so the score is comparable to an ordinary z-score.
        modified_z_scores = 0.6745 * (data - median) / mad
    else:
        # MAD is 0 when more than half of the values equal the median.
        # Dividing by it would turn every other value into +/-inf and flag
        # it, so fall back to the mean absolute deviation (1.253314 is the
        # matching normal-consistency constant, sqrt(pi/2)).
        mean_ad = abs_deviation.mean()
        if mean_ad > 0:
            modified_z_scores = (data - median) / (1.253314 * mean_ad)
        else:
            # No spread at all: nothing can be an outlier.
            modified_z_scores = pd.Series(0.0, index=data.index)
    outliers['Modified Z-Score'] = data[np.abs(modified_z_scores) > 3.5]

    # Method 4: Isolation Forest
    # Imported here so scikit-learn is only required when this function runs.
    from sklearn.ensemble import IsolationForest
    iso_forest = IsolationForest(contamination=0.05, random_state=42)
    # The model expects a 2-D array: one row per sample, one feature column.
    predictions = iso_forest.fit_predict(data.values.reshape(-1, 1))
    outliers['Isolation Forest'] = data[predictions == -1]  # -1 marks anomalies

    return outliers

# Apply outlier detection and report how many values each method flags
outliers = detect_outliers_multiple_methods(customer_data, 'avg_order_value')

print("Outlier Detection Results:")
print("="*50)
for method, outlier_values in outliers.items():
    n_found = len(outlier_values)
    print(f"{method:20s}: {n_found} outliers found")
    if n_found == 0:
        # No values flagged, so there is no range to show.
        continue
    print(f"  Range: ${outlier_values.min():.0f} - ${outlier_values.max():.0f}")

In [None]:
# Visualize outliers: one scatter panel per detection method
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
methods = list(outliers.keys())
all_values = customer_data['avg_order_value']

# axes.flat walks the 2x2 grid row by row, pairing one panel with each method.
for ax, method in zip(axes.flat, methods):
    outlier_values = outliers[method]

    # Plot all data
    ax.scatter(range(len(customer_data)), all_values,
               alpha=0.5, s=20, label='Normal')

    # Highlight outliers. Each outlier Series was filtered from the original
    # column and keeps its row labels, so its index gives the x positions
    # directly - no need to search the column for matching float values.
    ax.scatter(outlier_values.index, outlier_values,
               color='red', s=50, label='Outliers', zorder=5)

    ax.set_xlabel('Customer Index')
    ax.set_ylabel('Average Order Value ($)')
    ax.set_title(f'{method} Method\n({len(outlier_values)} outliers)')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

**Exercise 2 – Outlier Impact Analysis (medium)**  
Compare statistics with and without outliers.


In [None]:
# Your turn
# Remove outliers using IQR method
# Compare mean, median, std before and after
# Which statistics are robust to outliers?


<details>
<summary><b>Solution</b></summary>

```python
# Remove outliers using IQR
data = customer_data['avg_order_value']
Q1, Q3 = data.quantile(0.25), data.quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only values inside the fences (both ends inclusive)
data_clean = data[data.between(lower, upper)]
n_outliers = len(data) - len(data_clean)

# Compare statistics: the same five measures for both versions of the data
stat_labels = ['Mean', 'Median', 'Std Dev', 'Q1', 'Q3']
versions = [('With Outliers', data), ('Without Outliers', data_clean)]
comparison = pd.DataFrame(
    {
        title: [
            series.mean(),
            series.median(),
            series.std(),
            series.quantile(0.25),
            series.quantile(0.75),
        ]
        for title, series in versions
    },
    index=stat_labels,
)

# Relative change caused by removing the outliers, in percent
before = comparison['With Outliers']
after = comparison['Without Outliers']
comparison['% Change'] = (after - before) / before * 100

print(f"Outlier Impact Analysis ({n_outliers} outliers removed)")
print("="*60)
print(comparison.round(2))

pct_change = comparison['% Change']
largest_quartile_shift = pct_change[['Q1', 'Q3']].abs().max()

print("\n📊 Insights:")
print(f"• Mean changed by {pct_change['Mean']:.1f}% (NOT robust)")
print(f"• Median changed by {pct_change['Median']:.1f}% (Robust)")
print(f"• Std Dev changed by {pct_change['Std Dev']:.1f}% (NOT robust)")
print(f"• Quartiles changed by <{largest_quartile_shift:.1f}% (Robust)")
```
</details>

## 3. Correlation Analysis & Relationships

In [None]:
# Different types of correlation
def correlation_analysis(df):
    """Compute Pearson, Spearman and Kendall correlation matrices.

    Only the numeric columns of ``df`` take part.

    Args:
        df: DataFrame to analyse.

    Returns:
        Tuple ``(pearson_corr, spearman_corr, kendall_corr)`` of square
        DataFrames indexed by the numeric column labels.
    """
    numeric_frame = df[df.select_dtypes(include=[np.number]).columns]

    # pearson  -> linear relationships
    # spearman -> monotonic relationships (rank based)
    # kendall  -> ordinal relationships (concordant vs. discordant pairs)
    correlation_methods = ('pearson', 'spearman', 'kendall')

    return tuple(numeric_frame.corr(method=name) for name in correlation_methods)

pearson, spearman, kendall = correlation_analysis(customer_data)

# Visualize different correlations: one heatmap per coefficient, left to right
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

heatmap_panels = [
    (pearson, 'Pearson Correlation\n(Linear Relationships)'),
    (spearman, 'Spearman Correlation\n(Monotonic Relationships)'),
    (kendall, 'Kendall Correlation\n(Ordinal Relationships)'),
]

for ax, (matrix, title) in zip(axes, heatmap_panels):
    # center=0 keeps the diverging colormap symmetric around "no correlation"
    sns.heatmap(
        matrix,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        ax=ax,
        cbar_kws={"shrink": 0.8},
    )
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Statistical significance of correlations
def correlation_significance(df, var1, var2):
    """Print correlation tests for two columns and plot them with a trend line.

    Computes Pearson and Spearman coefficients with their p-values, prints a
    short interpretation of the Pearson result (significant when p < 0.05),
    and shows a scatter plot with a fitted straight line.

    Args:
        df: DataFrame that contains both columns.
        var1: label of the column drawn on the x axis.
        var2: label of the column drawn on the y axis.
    """
    # Drop missing values per column, then keep only the rows both still share
    first = df[var1].dropna()
    second = df[var2].dropna()
    shared_rows = first.index.intersection(second.index)
    x_values, y_values = first[shared_rows], second[shared_rows]

    pearson_r, pearson_p = stats.pearsonr(x_values, y_values)
    spearman_r, spearman_p = stats.spearmanr(x_values, y_values)

    print(f"Correlation between {var1} and {var2}:")
    print("="*50)
    print(f"Pearson:  r = {pearson_r:.3f}, p-value = {pearson_p:.4f}")
    print(f"Spearman: ρ = {spearman_r:.3f}, p-value = {spearman_p:.4f}")

    # Interpretation of the Pearson result
    if pearson_p < 0.05:
        magnitude = abs(pearson_r)
        if magnitude > 0.7:
            strength = "Strong"
        elif magnitude > 0.3:
            strength = "Moderate"
        else:
            strength = "Weak"
        direction = "positive" if pearson_r > 0 else "negative"
        print(f"\n✅ Significant {strength} {direction} linear relationship")
    else:
        print(f"\n❌ No significant linear relationship")

    # Least-squares straight line through the points (degree-1 polynomial)
    trend = np.poly1d(np.polyfit(x_values, y_values, 1))

    plt.figure(figsize=(8, 5))
    plt.scatter(x_values, y_values, alpha=0.5)
    plt.plot(x_values, trend(x_values), "r--", alpha=0.8, label=f'r = {pearson_r:.3f}')
    plt.xlabel(var1)
    plt.ylabel(var2)
    plt.title(f'Relationship: {var1} vs {var2}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Test a relationship
correlation_significance(customer_data, 'total_purchases', 'email_opens')

## 4. Distribution Testing & Normality

In [None]:
def test_normality(data, var_name):
    """Run several normality tests on a sample and print a verdict.

    Shapiro-Wilk, Kolmogorov-Smirnov and D'Agostino-Pearson each produce a
    p-value; the sample is reported as normal when at least two of the three
    have p > 0.05. Anderson-Darling is printed for reference but does not
    vote, because it reports critical values rather than a p-value.

    Args:
        data: 1-D NumPy array or pandas Series of numeric observations.
            Missing values are ignored.
        var_name: label used in the printed output.
    """
    # A single NaN makes every test below return NaN, which would then be
    # counted as "not normal". Drop missing values before testing.
    data = data[~pd.isna(data)]

    print(f"Normality Tests for {var_name}:")
    print("="*50)

    # Shapiro-Wilk Test
    stat_sw, p_sw = stats.shapiro(data[:5000])  # Limited to 5000 samples
    print(f"Shapiro-Wilk: statistic={stat_sw:.4f}, p-value={p_sw:.4f}")

    # Kolmogorov-Smirnov Test
    # NOTE: the reference normal uses the mean and std estimated from this
    # same sample, so the p-value is optimistic (the standard K-S
    # distribution assumes fully specified parameters). Treat it as a rough
    # indicator only.
    stat_ks, p_ks = stats.kstest(data, 'norm', args=(data.mean(), data.std()))
    print(f"Kolmogorov-Smirnov: statistic={stat_ks:.4f}, p-value={p_ks:.4f}")

    # Anderson-Darling Test
    result_ad = stats.anderson(data, dist='norm')
    print(f"Anderson-Darling: statistic={result_ad.statistic:.4f}")
    print(f"  Critical values: {result_ad.critical_values}")
    print(f"  Significance levels: {result_ad.significance_level}%")

    # D'Agostino-Pearson Test
    stat_dp, p_dp = stats.normaltest(data)
    print(f"D'Agostino-Pearson: statistic={stat_dp:.4f}, p-value={p_dp:.4f}")

    # Overall conclusion: majority vote of the three p-value based tests
    p_values = [p_sw, p_ks, p_dp]
    normal_count = sum(p > 0.05 for p in p_values)

    print(f"\n📊 Conclusion: {normal_count}/3 tests suggest normality")
    if normal_count >= 2:
        print("✅ Data appears to be normally distributed")
    else:
        print("⚠️ Data may not be normally distributed")
        print("Consider transformations: log, sqrt, or Box-Cox")

# Test normality for age
test_normality(customer_data['age'].values, 'Age')

In [None]:
# Visual normality assessment
def visual_normality_check(data, var_name):
    """Show four diagnostic plots for judging whether a sample is normal.

    The 2x2 figure contains a histogram with the fitted normal density, a
    Q-Q plot, a box plot and a P-P plot.

    Args:
        data: 1-D NumPy array or pandas Series of numeric observations.
        var_name: label used in the plot titles.
    """
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    # Parameters of the normal distribution fitted to the sample
    mu, std = data.mean(), data.std()

    # Histogram with normal overlay
    axes[0, 0].hist(data, bins=30, density=True, alpha=0.7, color='blue', edgecolor='black')
    x = np.linspace(data.min(), data.max(), 100)
    axes[0, 0].plot(x, stats.norm.pdf(x, mu, std), 'r-', linewidth=2, label='Normal fit')
    axes[0, 0].set_title(f'Histogram of {var_name}')
    axes[0, 0].set_xlabel('Value')
    axes[0, 0].set_ylabel('Density')
    axes[0, 0].legend()

    # Q-Q plot: sample quantiles against theoretical normal quantiles
    stats.probplot(data, dist="norm", plot=axes[0, 1])
    axes[0, 1].set_title('Q-Q Plot')

    # Box plot
    axes[1, 0].boxplot(data, vert=True)
    axes[1, 0].set_title('Box Plot')
    axes[1, 0].set_ylabel('Value')

    # P-P plot: cumulative probabilities, not quantiles. For the i-th
    # smallest observation, compare the fitted normal CDF at that value with
    # its empirical cumulative probability. The (i - 0.5) / n plotting
    # position keeps every point strictly inside (0, 1).
    sorted_data = np.sort(data)
    n_obs = len(sorted_data)
    empirical_cdf = (np.arange(1, n_obs + 1) - 0.5) / n_obs
    theoretical_cdf = stats.norm.cdf(sorted_data, mu, std)

    axes[1, 1].scatter(theoretical_cdf, empirical_cdf, alpha=0.5)
    # Points on this diagonal mean the sample matches the fitted normal exactly
    axes[1, 1].plot([0, 1], [0, 1], 'r--', linewidth=2)
    axes[1, 1].set_xlabel('Theoretical Cumulative Probability')
    axes[1, 1].set_ylabel('Empirical Cumulative Probability')
    axes[1, 1].set_title('P-P Plot')

    plt.suptitle(f'Normality Assessment: {var_name}', fontsize=14)
    plt.tight_layout()
    plt.show()

visual_normality_check(customer_data['age'].values, 'Age')

**Exercise 3 – Transform to Normality (medium)**  
Apply transformations to make skewed data more normal.


In [None]:
# Your turn
# Take avg_order_value (skewed) and try:
# 1. Log transformation
# 2. Square root transformation
# 3. Box-Cox transformation
# Which works best?


<details>
<summary><b>Solution</b></summary>

```python
# Original data
original = customer_data['avg_order_value'].values

# Transformations
log_transform = np.log1p(original)  # log(1+x) to handle zeros
sqrt_transform = np.sqrt(original)
# Box-Cox needs strictly positive input, hence the +1 shift
boxcox_transform, lambda_param = stats.boxcox(original + 1)

# Candidates to compare, keyed by display name
transformations = {
    'Original': original,
    'Log': log_transform,
    'Square Root': sqrt_transform,
    'Box-Cox': boxcox_transform
}


def _normality_row(name, values):
    """Summarise how close *values* is to normal (Shapiro test on the first 5000 points)."""
    _, p_value = stats.shapiro(values[:5000])
    return {
        'Transformation': name,
        'Shapiro p-value': p_value,
        'Skewness': stats.skew(values),
        'Kurtosis': stats.kurtosis(values),
        'Normal?': 'Yes' if p_value > 0.05 else 'No'
    }


# Test normality for each
results = [_normality_row(name, data) for name, data in transformations.items()]

results_df = pd.DataFrame(results)
print("Transformation Comparison:")
print(results_df.to_string(index=False))
print(f"\nBox-Cox lambda: {lambda_param:.3f}")

# Visualize every candidate: histogram with the fitted normal density on top
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
for ax, (name, data) in zip(axes, transformations.items()):
    ax.hist(data, bins=30, density=True, alpha=0.7)
    mu, std = data.mean(), data.std()
    x = np.linspace(data.min(), data.max(), 100)
    ax.plot(x, stats.norm.pdf(x, mu, std), 'r-', linewidth=2)
    ax.set_title(f'{name}\nSkew: {stats.skew(data):.2f}')
    ax.set_xlabel('Value')

plt.suptitle('Transformation Effects on Distribution')
plt.tight_layout()
plt.show()

# The candidate with the highest Shapiro p-value is reported as best
best_row = results_df['Shapiro p-value'].idxmax()
best_name = results_df.loc[best_row, 'Transformation']
print(f"\n✅ Best transformation: {best_name}")
```
</details>

## 5. Comprehensive EDA Framework

In [None]:
class StatisticalEDA:
    """Text-based statistical EDA report for a pandas DataFrame.

    Attributes:
        df: the DataFrame under analysis (stored by reference, not copied).
        numeric_cols: labels of the numeric columns.
        categorical_cols: labels of the object/category columns.
    """

    def __init__(self, df):
        """Store *df* and split its columns into numeric and categorical."""
        self.df = df
        self.numeric_cols = df.select_dtypes(include=[np.number]).columns
        self.categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    def basic_info(self):
        """Print shape, memory usage, dtype counts and missing-value counts."""
        frame = self.df
        n_rows, n_cols = frame.shape
        megabytes = frame.memory_usage(deep=True).sum() / 1024**2

        print("📊 DATASET OVERVIEW")
        print("="*50)
        print(f"Shape: {n_rows} rows × {n_cols} columns")
        print(f"Memory usage: {megabytes:.2f} MB")
        print(f"\nColumn Types:")
        print(frame.dtypes.value_counts())
        print(f"\nMissing Values:")

        # Keep only the columns that actually have gaps
        missing = frame.isnull().sum()
        missing = missing[missing > 0]
        if missing.empty:
            print("No missing values")
        else:
            print(missing.sort_values(ascending=False))

    def univariate_analysis(self):
        """Print key statistics and a Shapiro-Wilk verdict for each numeric column."""
        print("\n📈 UNIVARIATE ANALYSIS")
        print("="*50)

        for col in self.numeric_cols:
            values = self.df[col].dropna()
            iqr = values.quantile(0.75) - values.quantile(0.25)

            print(f"\n{col}:")
            print(f"  Mean: {values.mean():.2f}, Median: {values.median():.2f}")
            print(f"  Std: {values.std():.2f}, IQR: {iqr:.2f}")
            print(f"  Skewness: {values.skew():.2f}, Kurtosis: {values.kurtosis():.2f}")

            # The normality test is skipped for samples of 20 values or fewer
            if len(values) <= 20:
                continue
            _, p_value = stats.shapiro(values[:5000])
            verdict = 'Yes' if p_value > 0.05 else 'No'
            print(f"  Normal: {verdict} (p={p_value:.4f})")

    def bivariate_analysis(self):
        """Print every pair of numeric columns whose |Pearson r| exceeds 0.5."""
        print("\n🔗 BIVARIATE ANALYSIS")
        print("="*50)

        corr_matrix = self.df[self.numeric_cols].corr()
        labels = corr_matrix.columns
        values = corr_matrix.to_numpy()

        # Upper triangle without the diagonal: each unordered pair exactly once
        upper_rows, upper_cols = np.triu_indices(len(labels), k=1)
        strong_corr = [
            (labels[r], labels[c], values[r, c])
            for r, c in zip(upper_rows, upper_cols)
            if abs(values[r, c]) > 0.5
        ]

        if not strong_corr:
            print("No strong correlations found")
            return

        print("Strong correlations (|r| > 0.5):")
        # Strongest relationship first
        ranked = sorted(strong_corr, key=lambda pair: abs(pair[2]), reverse=True)
        for var1, var2, corr in ranked:
            print(f"  {var1} ↔ {var2}: {corr:.3f}")

    def outlier_summary(self):
        """Print the IQR-rule outlier count for each numeric column that has any."""
        print("\n⚠️ OUTLIER SUMMARY")
        print("="*50)

        for col in self.numeric_cols:
            values = self.df[col].dropna()
            q1, q3 = values.quantile(0.25), values.quantile(0.75)
            iqr = q3 - q1
            low_fence, high_fence = q1 - 1.5*iqr, q3 + 1.5*iqr

            flagged = values[(values < low_fence) | (values > high_fence)]
            if flagged.empty:
                continue
            share = len(flagged)/len(values)*100
            print(f"{col}: {len(flagged)} outliers ({share:.1f}%)")

    def generate_report(self):
        """Run all sections in order: overview, univariate, bivariate, outliers."""
        sections = (
            self.basic_info,
            self.univariate_analysis,
            self.bivariate_analysis,
            self.outlier_summary,
        )
        for run_section in sections:
            run_section()

# Run comprehensive EDA on the customer dataset and print the full report
eda = StatisticalEDA(df=customer_data)
eda.generate_report()

**Exercise 4 – Custom EDA Function (hard)**  
Create a function that automatically detects and reports data quality issues.


In [None]:
# Your turn
# Create data_quality_check() function that:
# 1. Checks for duplicates
# 2. Identifies constant columns
# 3. Finds high-cardinality categoricals
# 4. Detects potential data leakage


<details>
<summary><b>Solution</b></summary>

```python
def data_quality_check(df):
    """Print a data quality assessment and return the issues found.

    Checks, in order: duplicate rows, constant columns, high-cardinality
    categorical columns (> 95% unique), near-perfectly correlated numeric
    pairs (|r| > 0.99, a possible sign of leakage), missing values (> 50%
    is an issue) and IQR-rule outlier prevalence (> 10% is an issue).

    Args:
        df: DataFrame to inspect. An empty frame is accepted.

    Returns:
        List of human-readable issue strings; empty when nothing was found.
    """
    print("🔍 DATA QUALITY CHECK")
    print("="*60)

    issues = []
    n_rows = len(df)

    def pct_of_rows(count):
        """Express *count* as a percentage of the row count (0.0 for an empty frame)."""
        return count / n_rows * 100 if n_rows else 0.0

    # 1. Check for duplicates
    n_duplicates = df.duplicated().sum()
    if n_duplicates > 0:
        issues.append(f"⚠️ {n_duplicates} duplicate rows found")
        print(f"Duplicate rows: {n_duplicates} ({pct_of_rows(n_duplicates):.1f}%)")
    else:
        print("✅ No duplicate rows")

    # 2. Identify constant columns
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    for col in constant_cols:
        issues.append(f"⚠️ Column '{col}' has only one unique value")

    if constant_cols:
        print(f"\nConstant columns: {constant_cols}")
    else:
        print("\n✅ No constant columns")

    # 3. High cardinality check
    print("\nCardinality Check:")
    for col in df.select_dtypes(include=['object', 'category']).columns:
        cardinality = df[col].nunique()
        # Guard against an empty frame, where the ratio would divide by zero
        cardinality_ratio = cardinality / n_rows if n_rows else 0.0

        if cardinality_ratio > 0.95:
            issues.append(f"⚠️ Column '{col}' has very high cardinality ({cardinality_ratio:.1%})")
            print(f"  {col}: {cardinality} unique values ({cardinality_ratio:.1%} of rows)")

    # 4. Potential data leakage detection
    print("\nData Leakage Check:")
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    if len(numeric_cols) > 1:
        # One correlation matrix instead of one .corr() call per pair
        corr_values = df[numeric_cols].corr().to_numpy()
        for i, col in enumerate(numeric_cols):
            # Start at i + 1 so each unordered pair is reported once, not
            # once as (A, B) and again as (B, A).
            for j in range(i + 1, len(numeric_cols)):
                other_col = numeric_cols[j]
                corr = corr_values[i, j]
                if abs(corr) > 0.99:
                    issues.append(f"⚠️ Potential leakage: '{col}' and '{other_col}' are perfectly correlated")
                    print(f"  {col} ↔ {other_col}: correlation = {corr:.3f}")

    # 5. Missing value patterns
    print("\nMissing Value Patterns:")
    missing_cols = df.columns[df.isnull().any()]
    if len(missing_cols) > 0:
        for col in missing_cols:
            missing_pct = pct_of_rows(df[col].isnull().sum())
            if missing_pct > 50:
                issues.append(f"⚠️ Column '{col}' has {missing_pct:.1f}% missing values")
            print(f"  {col}: {missing_pct:.1f}% missing")
    else:
        print("  No missing values")

    # 6. Outlier prevalence
    print("\nOutlier Prevalence:")
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        is_outlier = (df[col] < Q1 - 1.5*IQR) | (df[col] > Q3 + 1.5*IQR)
        outlier_pct = pct_of_rows(is_outlier.sum())

        if outlier_pct > 10:
            issues.append(f"⚠️ Column '{col}' has {outlier_pct:.1f}% outliers")
            print(f"  {col}: {outlier_pct:.1f}% outliers")

    # Summary
    print("\n" + "="*60)
    print(f"SUMMARY: Found {len(issues)} potential issues")
    if issues:
        print("\nIssues to address:")
        for issue in issues:
            print(f"  {issue}")
    else:
        print("✅ Data quality looks good!")

    return issues

# Run data quality check on the customer dataset; keep the issue list for later use
issues = data_quality_check(df=customer_data)
```
</details>

## 6. Mini-Challenges
- **M1 (easy):** Calculate and interpret the coefficient of variation for all numeric columns
- **M2 (medium):** Implement a function to detect multimodal distributions
- **M3 (hard):** Build an automated EDA report generator with visualizations

In [None]:
# Your turn - try the challenges!


<details>
<summary><b>Solutions</b></summary>

```python
# M1 - Coefficient of Variation
numeric_columns = customer_data.select_dtypes(include=[np.number])

cv_results = []
for col, series in numeric_columns.items():
    mean, std = series.mean(), series.std()
    # CV is undefined for a zero mean; report it as infinite variability
    cv = np.inf if mean == 0 else (std / mean) * 100
    cv_results.append({'Column': col, 'CV': cv})

cv_df = pd.DataFrame(cv_results).sort_values('CV')
print("Coefficient of Variation (Relative Variability):")
for column, cv_value in zip(cv_df['Column'], cv_df['CV']):
    # Below 30% is low, below 60% is moderate, anything else is high
    if cv_value < 30:
        interpretation = "Low"
    elif cv_value < 60:
        interpretation = "Moderate"
    else:
        interpretation = "High"
    print(f"{column:20s}: {cv_value:6.1f}% ({interpretation} variability)")

# M2 - Multimodal Detection
from scipy.signal import find_peaks

def detect_multimodal(data, col_name):
    """Estimate the number of modes in a sample from its histogram and plot them.

    A 50-bin histogram is scaled so its tallest bin is 1.0. A bin counts as a
    mode when it is a local maximum at least 0.3 high and at least 5 bins
    away from any other detected mode.

    Args:
        data: 1-D array-like of numeric observations.
        col_name: label used in the plot title and printed message.

    Returns:
        Number of modes detected.
    """
    hist, bins = np.histogram(data, bins=50)
    tallest = hist.max()
    if tallest > 0:
        hist = hist / tallest  # Normalize so the tallest bin has height 1.0
    bin_centers = (bins[:-1] + bins[1:]) / 2

    # find_peaks never reports the first or last sample as a peak, yet skewed
    # data often has its mode in an edge bin. Padding with a zero on each
    # side lets edge bins qualify as local maxima.
    padded = np.concatenate(([0.0], hist, [0.0]))
    peaks, _ = find_peaks(padded, height=0.3, distance=5)
    peaks = peaks - 1  # undo the one-element offset added by the padding

    n_modes = len(peaks)

    plt.figure(figsize=(10, 5))
    plt.hist(data, bins=50, alpha=0.7, density=True)

    # Mark each mode at the centre of its bin
    for peak in peaks:
        plt.axvline(bin_centers[peak], color='red', linestyle='--', alpha=0.7)

    plt.title(f'{col_name}: {n_modes} mode(s) detected')
    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.show()

    if n_modes > 1:
        print(f"⚠️ {col_name} appears to be multimodal with {n_modes} peaks")
        print("This might indicate multiple subpopulations or segments")
    return n_modes

detect_multimodal(customer_data['avg_order_value'].values, 'Average Order Value')

# M3 - Automated EDA Report
def generate_visual_eda_report(df, output_file='eda_report.html'):
    """Write a self-contained HTML EDA report for *df*.

    The report contains the frame's shape, the ``describe()`` table and a
    correlation heatmap of the numeric columns, embedded as a base64 PNG so
    the HTML file has no external dependencies.

    Args:
        df: DataFrame to report on.
        output_file: path of the HTML file to create (overwritten if present).
    """
    import base64
    from io import BytesIO, StringIO

    html_report = StringIO()
    # Declare the charset: the report contains non-ASCII text such as "×"
    html_report.write('<html><head><meta charset="utf-8"><title>EDA Report</title></head><body>')
    html_report.write('<h1>Automated EDA Report</h1>')

    # Basic info
    html_report.write('<h2>Dataset Overview</h2>')
    html_report.write(f'<p>Shape: {df.shape[0]} rows × {df.shape[1]} columns</p>')

    # Statistics table
    html_report.write('<h2>Statistical Summary</h2>')
    html_report.write(df.describe().to_html())

    # Correlation heatmap. Restrict to numeric columns first: DataFrame.corr()
    # raises on text columns in recent pandas versions.
    numeric_df = df.select_dtypes(include=[np.number])
    if numeric_df.shape[1] > 0:
        fig = plt.figure(figsize=(10, 8))
        try:
            sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', center=0)
            plt.title('Correlation Matrix')

            # Save plot to an in-memory PNG
            buffer = BytesIO()
            fig.savefig(buffer, format='png')
        finally:
            # Always release the figure, even if rendering fails
            plt.close(fig)

        img_str = base64.b64encode(buffer.getvalue()).decode('ascii')
        html_report.write('<h2>Correlation Matrix</h2>')
        html_report.write(f'<img src="data:image/png;base64,{img_str}" />')

    html_report.write('</body></html>')

    # Save to file; explicit UTF-8 so the result does not depend on the
    # platform's default encoding
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_report.getvalue())

    print(f"✅ Report saved to {output_file}")

# Generate report (uncomment to run)
# generate_visual_eda_report(customer_data, 'customer_eda.html')
```
</details>

## Wrap-Up & Next Steps
✅ You can perform comprehensive statistical summaries  
✅ You know multiple methods for outlier detection  
✅ You can assess relationships and correlations  
✅ You can test distributions and normality  
✅ You built a complete EDA framework  

**Next:** Sampling Theory and Law of Large Numbers - Understanding the foundations of statistical inference!
