In [1]:
# Question: Evaluating Data Distribution
# Description: Analyze the distribution of a numeric column using histograms and descriptive statistics.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

def analyze_distribution(df, column_name):
    """
    Analyze the distribution of a numeric column with:
    - Descriptive statistics
    - Histogram with KDE
    - Boxplot
    - Normality test
    """
    
    # 1. Basic Information
    print(f"\n{'='*50}")
    print(f"ANALYZING DISTRIBUTION OF: {column_name}")
    print(f"{'='*50}")
    
    # 2. Descriptive Statistics
    print("\nDESCRIPTIVE STATISTICS:")
    stats = df[column_name].describe(percentiles=[.01, .05, .25, .5, .75, .95, .99])
    print(stats)
    
    # Additional statistics
    print(f"\nSkewness: {df[column_name].skew():.3f}")
    print(f"Kurtosis: {df[column_name].kurtosis():.3f}")
    
    # 3. Normality Test
    print("\nNORMALITY TEST (Shapiro-Wilk):")
    stat, p = stats.shapiro(df[column_name].dropna())
    print(f"Test Statistic: {stat:.3f}, p-value: {p:.4f}")
    print("Data appears normally distributed" if p > 0.05 else "Data does not appear normally distributed")
    
    # 4. Visualization
    plt.figure(figsize=(15, 5))
    
    # Histogram with KDE
    plt.subplot(1, 2, 1)
    sns.histplot(data=df, x=column_name, kde=True, bins=30)
    plt.title(f'Distribution of {column_name}')
    plt.axvline(df[column_name].mean(), color='r', linestyle='--', label='Mean')
    plt.axvline(df[column_name].median(), color='g', linestyle='-', label='Median')
    plt.legend()
    
    # Boxplot
    plt.subplot(1, 2, 2)
    sns.boxplot(data=df, y=column_name)
    plt.title(f'Boxplot of {column_name}')
    
    plt.tight_layout()
    plt.show()
    
    # 5. Outlier Analysis
    q1 = df[column_name].quantile(0.25)
    q3 = df[column_name].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    
    outliers = df[(df[column_name] < lower_bound) | (df[column_name] > upper_bound)]
    print(f"\nPotential outliers (using IQR method): {len(outliers)}")
    if len(outliers) > 0:
        print(outliers[[column_name]].sort_values(column_name))

# Example Usage
if __name__ == "__main__":
    # Load dataset (replace with your data)
    print("Loading sample dataset...")
    diamonds = sns.load_dataset('diamonds')
    
    # Analyze price distribution
    analyze_distribution(diamonds, 'price')
    
    # Analyze carat distribution
    analyze_distribution(diamonds, 'carat')

Loading sample dataset...

ANALYZING DISTRIBUTION OF: price

DESCRIPTIVE STATISTICS:
count    53940.000000
mean      3932.799722
std       3989.439738
min        326.000000
1%         429.000000
5%         544.000000
25%        950.000000
50%       2401.000000
75%       5324.250000
95%      13107.100000
99%      17378.220000
max      18823.000000
Name: price, dtype: float64

Skewness: 1.618
Kurtosis: 2.178

NORMALITY TEST (Shapiro-Wilk):


AttributeError: 'Series' object has no attribute 'shapiro'