# In [2]:
# Question: Advanced Data Profiling and Outlier Detection
# Description: Perform detailed data profiling including outlier detection for numeric columns.


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import zscore

def _plot_distribution(values, column_name):
    """Draw a horizontal boxplot and a 20-bin histogram of *values* side by side."""
    fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(12, 6))

    box_ax.boxplot(values, vert=False, patch_artist=True)
    box_ax.set_title(f'Boxplot of {column_name}')

    hist_ax.hist(values, bins=20, color='skyblue', edgecolor='black')
    hist_ax.set_title(f'Histogram of {column_name}')

    fig.tight_layout()
    plt.show()


def advanced_data_profiling(data, column_name, *, z_threshold=3.0,
                            iqr_multiplier=1.5, show_plots=True):
    """Profile one numeric column and flag its outliers two ways.

    The column is coerced to numeric (non-convertible entries and missing
    values are dropped), summary statistics are printed, and outliers are
    detected with both the Z-score rule and the IQR (Tukey fence) rule.

    Args:
        data: Anything accepted by ``pd.DataFrame(...)`` (e.g. a dict of
            lists or an existing DataFrame).
        column_name: Name of the column to profile.
        z_threshold: Absolute Z-score above which a value is an outlier.
        iqr_multiplier: Multiple of the IQR used to place the fences below
            Q1 and above Q3.
        show_plots: When true, display a boxplot and a histogram of the
            column. Set to false for headless / batch use.

    Returns:
        A tuple ``(description, outliers_zscore, outliers_iqr)`` where
        ``description`` is the result of ``describe()`` on the cleaned
        column and the other two are the subsets of the cleaned column
        flagged by each method (original index labels are preserved).

    Raises:
        ValueError: If ``column_name`` is not a column of ``data`` or the
            column contains no values convertible to a number.
    """
    df = pd.DataFrame(data)

    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the dataset.")

    # Coerce to numeric; anything unparsable becomes NaN and is dropped.
    col_data = pd.to_numeric(df[column_name], errors='coerce').dropna()

    if col_data.empty:
        raise ValueError(f"Column '{column_name}' has no valid numeric data.")

    # Descriptive statistics
    description = col_data.describe()
    print(f"Descriptive Statistics for {column_name}:\n", description)

    # Z-score outliers. With zero (or undefined) spread the Z-score is a
    # 0/0 division that yields NaN and a runtime warning; no value can be
    # an outlier in that case, so skip the computation entirely.
    spread = col_data.std(ddof=0)
    if np.isfinite(spread) and spread > 0:
        z_scores = np.asarray(zscore(col_data))
    else:
        z_scores = np.zeros(len(col_data))
    outliers_zscore = col_data[np.abs(z_scores) > z_threshold]

    # IQR outliers: anything outside [Q1 - k*IQR, Q3 + k*IQR].
    q1 = col_data.quantile(0.25)
    q3 = col_data.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - iqr_multiplier * iqr
    upper_bound = q3 + iqr_multiplier * iqr
    outliers_iqr = col_data[(col_data < lower_bound) | (col_data > upper_bound)]

    print(f"\nOutliers (Z-score method):\n{outliers_zscore}")
    print(f"\nOutliers (IQR method):\n{outliers_iqr}")

    if show_plots:
        _plot_distribution(col_data, column_name)

    return description, outliers_zscore, outliers_iqr

