# Outlier Sensitivity Analysis

This notebook performs a comprehensive sensitivity analysis comparing results with and without outliers across different detection methods.

## Objectives
- Compare statistical measures before and after outlier removal
- Evaluate impact on model performance
- Visualize differences in data distributions
- Provide recommendations for outlier handling strategy

In [None]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

# Import our custom modules
import outliers
import utils
import cleaning

# Set plotting style: matplotlib's bundled seaborn-v0_8 sheet plus seaborn's "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# The status message previously contained a mis-decoded (mojibake) emoji; restored here.
print("📊 Outlier Sensitivity Analysis Setup Complete")

## 1. Load and Prepare Data

In [None]:
# Load sample bike demand data (path is relative to the notebook's working directory)
bike_data = pd.read_csv('../data/sample-data.csv')

# Quick structural overview: dimensions, column names and the first rows.
# The headings previously contained mis-decoded (mojibake) emoji; restored here.
print("📈 Bike Demand Dataset:")
print(f"Shape: {bike_data.shape}")
print(f"Columns: {bike_data.columns.tolist()}")
print("\nFirst few rows:")
print(bike_data.head())

# Descriptive statistics as reported by DataFrame.describe()
print("\n📊 Basic Statistics:")
print(bike_data.describe())

## 2. Outlier Detection Summary

In [None]:
# Define columns to analyze (these names are reused by later cells)
numeric_columns = ['hour', 'temperature', 'humidity', 'demand']
target_column = 'demand'
feature_columns = ['hour', 'temperature', 'humidity']

# Generate outlier summary for different methods
outlier_methods = ['iqr', 'zscore', 'modified_zscore']
summary_df = outliers.outlier_summary(bike_data, numeric_columns, outlier_methods)

print("🔍 Outlier Detection Summary:")
print(summary_df)

# Reshape the summary so each analysed column is a row and each method a bar series.
# NOTE(review): relies on summary_df exposing 'Column', 'Method', 'Outlier_Count' and
# 'Outlier_Percentage' with one row per (Column, Method) pair - confirm against
# outliers.outlier_summary.
summary_pivot = summary_df.pivot(index='Column', columns='Method', values='Outlier_Count')
summary_pivot_pct = summary_df.pivot(index='Column', columns='Method', values='Outlier_Percentage')

# Create visualization of outlier counts: (pivoted data, panel title, y-axis label)
summary_panels = [
    (summary_pivot, 'Outlier Count by Detection Method', 'Number of Outliers'),
    (summary_pivot_pct, 'Outlier Percentage by Detection Method', 'Percentage of Outliers'),
]

fig, axes = plt.subplots(1, len(summary_panels), figsize=(15, 6))

# Both panels share identical formatting, so draw them in one loop
for ax, (panel_data, panel_title, panel_ylabel) in zip(axes, summary_panels):
    panel_data.plot(kind='bar', ax=ax)
    ax.set_title(panel_title)
    ax.set_ylabel(panel_ylabel)
    ax.tick_params(axis='x', rotation=45)
    ax.legend(title='Method')

plt.tight_layout()
plt.show()

print("\n✅ Outlier detection summary complete")

## 3. Visual Comparison: Before and After Outlier Removal

In [None]:
# Create comprehensive visual comparison.
# One figure row per analysed column: a box plot of the original data next to every
# outlier-removal method, followed by histograms of the original, IQR-cleaned and
# Z-score-cleaned values.
columns_to_plot = ['temperature', 'humidity', 'demand']

# (method key, title suffix, bar colour, series label) for the cleaned histograms
histogram_methods = [
    ('iqr', 'IQR Method', 'red', 'IQR Cleaned'),
    ('zscore', 'Z-score Method', 'green', 'Z-score Cleaned'),
]

fig, axes = plt.subplots(len(columns_to_plot), 4, figsize=(20, 15))
fig.suptitle('Data Distribution: Original vs Outlier Removal Methods', fontsize=16, fontweight='bold')

for i, column in enumerate(columns_to_plot):
    original_data = bike_data[column]

    # Clean this column once per method and reuse the result for every panel,
    # instead of calling remove_outliers again for the histograms.
    cleaned_by_method = {
        method: outliers.remove_outliers(bike_data, [column], method=method)[column]
        for method in outlier_methods
    }

    # Box plots
    box_data = [original_data] + [cleaned_by_method[method] for method in outlier_methods]
    box_labels = ['Original'] + [method.upper() for method in outlier_methods]

    axes[i, 0].boxplot(box_data)
    # Label the boxes through the axis rather than boxplot(labels=...): that keyword
    # was renamed to tick_labels in Matplotlib 3.9, so this form works on both sides
    # of the rename.
    axes[i, 0].set_xticklabels(box_labels)
    axes[i, 0].set_title(f'{column.title()} - Box Plot Comparison')
    axes[i, 0].tick_params(axis='x', rotation=45)

    # Histograms: original data first, then one panel per cleaned variant
    histogram_panels = [(original_data, 'Original Distribution', 'blue', 'Original')]
    for method, title_suffix, color, label in histogram_methods:
        cleaned_series = cleaned_by_method.get(method)
        if cleaned_series is None:
            # Method is not part of outlier_methods; compute it on demand
            cleaned_series = outliers.remove_outliers(bike_data, [column], method=method)[column]
        histogram_panels.append((cleaned_series, title_suffix, color, label))

    for panel_index, (series, title_suffix, color, label) in enumerate(histogram_panels, start=1):
        ax = axes[i, panel_index]
        ax.hist(series, bins=20, alpha=0.7, color=color, label=label)
        ax.set_title(f'{column.title()} - {title_suffix}')
        ax.set_xlabel(column.title())
        ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

print("📊 Visual comparison complete")

## 4. Statistical Impact Analysis

In [None]:
# Perform comprehensive sensitivity analysis
def custom_analysis(df):
    """Summarise how demand relates to the other numeric columns in ``df``.

    Always reports the correlation of every column in the module-level
    ``numeric_columns`` with ``demand``. When ``df`` has more than five rows,
    a single-feature linear regression of ``demand`` on ``temperature`` is
    fitted and scored on the same rows it was fitted on.

    Args:
        df: DataFrame containing at least the columns named in
            ``numeric_columns`` (including ``temperature`` and ``demand``).

    Returns:
        dict with key ``demand_correlations`` and, when the regression was
        fitted, ``regression_r2``, ``regression_mse``, ``regression_coef``
        and ``regression_intercept``.
    """
    # Correlation of each numeric column with demand
    demand_corr = df[numeric_columns].corr()['demand']
    summary = {'demand_correlations': demand_corr.to_dict()}

    # Not enough rows for the regression: report correlations only
    if len(df) <= 5:
        return summary

    # Simple linear regression: temperature vs demand
    temperature = df[['temperature']].values
    demand = df['demand'].values

    fitted = LinearRegression()
    fitted.fit(temperature, demand)
    predicted = fitted.predict(temperature)

    summary['regression_r2'] = r2_score(demand, predicted)
    summary['regression_mse'] = mean_squared_error(demand, predicted)
    summary['regression_coef'] = fitted.coef_[0]
    summary['regression_intercept'] = fitted.intercept_
    return summary

# Run sensitivity analysis
sensitivity_results = outliers.sensitivity_analysis(
    bike_data,
    target_column='demand',
    feature_columns=feature_columns,
    outlier_methods=outlier_methods,
    analysis_func=custom_analysis
)

print("🔬 Sensitivity Analysis Results:")
print("=" * 50)

# Create comparison table: one row per entry returned by the sensitivity analysis.
# NOTE(review): assumes every entry carries 'data_shape' and 'target_stats' (with
# 'mean' and 'std') - confirm against outliers.sensitivity_analysis.
comparison_data = []

for method_name, variant_results in sensitivity_results.items():
    target_stats = variant_results['target_stats']
    custom_metrics = variant_results.get('custom_analysis', {})

    comparison_data.append({
        'Method': method_name,
        'Data_Points': variant_results['data_shape'][0],
        'Demand_Mean': target_stats['mean'],
        'Demand_Std': target_stats['std'],
        # Missing metrics are recorded as NaN rather than the string 'N/A': a single
        # string would turn the whole column into object dtype, and DataFrame.round
        # below would then skip it silently.
        'Demand_Skew': target_stats.get('skew', np.nan),
        'Temp_Correlation': custom_metrics.get('demand_correlations', {}).get('temperature', np.nan),
        'Regression_R2': custom_metrics.get('regression_r2', np.nan),
        'Regression_Coef': custom_metrics.get('regression_coef', np.nan),
        # Entries without removal info (e.g. the untouched dataset) count as zero removed
        'Rows_Removed': variant_results.get('rows_removed', 0),
        'Removal_Pct': variant_results.get('removal_percentage', 0.0),
    })

comparison_df = pd.DataFrame(comparison_data)

print("\n📊 Comparison Table:")
print(comparison_df.round(4))

# Save comparison results
comparison_path = utils.save_with_timestamp(
    df=comparison_df,
    prefix="outlier_sensitivity_comparison",
    source="analysis",
    ext="csv"
)

print(f"\n💾 Comparison results saved to: {comparison_path}")

## 5. Model Performance Impact

In [None]:
# Compare model performance with and without outliers
def evaluate_model_performance(df, model_type='linear'):
    """Fit a regressor on ``df`` and score it on the same rows.

    Features and target come from the module-level ``feature_columns`` and
    ``target_column``. The scores are in-sample: the model predicts the data
    it was trained on.

    Args:
        df: DataFrame holding the feature and target columns.
        model_type: ``'linear'`` selects ``LinearRegression``; any other value
            selects a 50-tree ``RandomForestRegressor`` with ``random_state=42``.

    Returns:
        ``{'error': 'Insufficient data'}`` when ``df`` has fewer than 10 rows,
        otherwise a dict with ``r2_score``, ``mse``, ``rmse``, ``mean_actual``
        and ``std_actual``.
    """
    # Need minimum data for meaningful evaluation
    if len(df) < 10:
        return {'error': 'Insufficient data'}

    features = df[feature_columns]
    target = df[target_column]

    estimator = (
        LinearRegression()
        if model_type == 'linear'
        else RandomForestRegressor(n_estimators=50, random_state=42)
    )
    estimator.fit(features, target)
    predictions = estimator.predict(features)

    # Compute the squared error once and derive RMSE from it
    squared_error = mean_squared_error(target, predictions)

    metrics = {}
    metrics['r2_score'] = r2_score(target, predictions)
    metrics['mse'] = squared_error
    metrics['rmse'] = np.sqrt(squared_error)
    metrics['mean_actual'] = target.mean()
    metrics['std_actual'] = target.std()
    return metrics

# Test different models on the original data and on each outlier-cleaned variant
evaluation_frames = {'original': bike_data}
for method in outlier_methods:
    evaluation_frames[method] = outliers.remove_outliers(bike_data, numeric_columns, method=method)

# model_results keeps the '<method>_<model>' keys that the recommendations cell looks up.
# model_comparison collects the successfully evaluated combinations for the table and
# plots, recording method and model directly instead of splitting them out of the key.
model_results = {}
model_comparison = []

for method, frame in evaluation_frames.items():
    for model_type in ('linear', 'rf'):
        results = evaluate_model_performance(frame, model_type)
        model_results[f'{method}_{model_type}'] = results

        # Variants with too few rows report {'error': ...} and are left out of the table
        if 'error' in results:
            continue

        model_comparison.append({
            'Method': method,
            'Model': model_type,
            'R2_Score': results['r2_score'],
            'RMSE': results['rmse'],
            'Mean_Actual': results['mean_actual'],
            'Std_Actual': results['std_actual']
        })

# Create model performance comparison; explicit columns keep the frame well-formed
# even when no variant could be evaluated
model_comparison_df = pd.DataFrame(
    model_comparison,
    columns=['Method', 'Model', 'R2_Score', 'RMSE', 'Mean_Actual', 'Std_Actual']
)

print("🤖 Model Performance Comparison:")
print(model_comparison_df.round(4))

if model_comparison_df.empty:
    # Without rows there is nothing to pivot or plot; skip the figure instead of raising
    print("No dataset variant had enough rows to evaluate; skipping performance plots")
else:
    # Visualize model performance: methods on the x-axis, one bar series per model type
    pivot_r2 = model_comparison_df.pivot(index='Method', columns='Model', values='R2_Score')
    pivot_rmse = model_comparison_df.pivot(index='Method', columns='Model', values='RMSE')

    # (pivoted data, panel title, y-axis label)
    metric_panels = [
        (pivot_r2, 'Model R² Score Comparison', 'R² Score'),
        (pivot_rmse, 'Model RMSE Comparison', 'RMSE'),
    ]

    fig, axes = plt.subplots(1, len(metric_panels), figsize=(15, 6))

    for ax, (panel_data, panel_title, panel_ylabel) in zip(axes, metric_panels):
        panel_data.plot(kind='bar', ax=ax)
        ax.set_title(panel_title)
        ax.set_ylabel(panel_ylabel)
        ax.tick_params(axis='x', rotation=45)
        ax.legend(title='Model Type')

    plt.tight_layout()
    plt.show()

print("\n✅ Model performance analysis complete")

## 6. Regression Fit Comparison

In [None]:
# Create regression fit comparison plots: one temperature-vs-demand scatter per dataset
# variant, each overlaid with its own least-squares line
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Regression Fit Comparison: Temperature vs Demand', fontsize=16, fontweight='bold')

datasets = {
    'Original': bike_data,
    'IQR Cleaned': outliers.remove_outliers(bike_data, numeric_columns, method='iqr'),
    'Z-Score Cleaned': outliers.remove_outliers(bike_data, numeric_columns, method='zscore'),
    'Modified Z-Score Cleaned': outliers.remove_outliers(bike_data, numeric_columns, method='modified_zscore')
}

colors = ['blue', 'red', 'green', 'orange']

# axes.flat walks the 2x2 grid row by row, so the panels land at (0,0), (0,1), (1,0), (1,1)
for ax, color, (name, data) in zip(axes.flat, colors, datasets.items()):
    # Scatter plot
    ax.scatter(data['temperature'], data['demand'], alpha=0.6, color=color)

    # Fit regression line only when there are enough points for it to be meaningful
    if len(data) > 5:
        X = data[['temperature']].values
        y = data['demand'].values

        model = LinearRegression()
        model.fit(X, y)

        # Plot regression line across the observed temperature range
        x_range = np.linspace(data['temperature'].min(), data['temperature'].max(), 100)
        y_pred = model.predict(x_range.reshape(-1, 1))
        ax.plot(x_range, y_pred, color='black', linewidth=2, linestyle='--')

        # Calculate R² on the same points the line was fitted to
        r2 = r2_score(y, model.predict(X))

        ax.set_title(f'{name}\nR² = {r2:.4f}, N = {len(data)}')
    else:
        ax.set_title(f'{name}\nInsufficient data')

    ax.set_xlabel('Temperature')
    ax.set_ylabel('Demand')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("📈 Regression fit comparison complete")

## 7. Recommendations and Summary

In [None]:
# Generate recommendations based on analysis
print("🎯 OUTLIER ANALYSIS RECOMMENDATIONS")
print("=" * 50)

# Find the removal method whose linear model has the highest R² above the original.
# Missing results, or results that only carry an 'error' key, count as R² = 0.
original_r2 = model_results.get('original_linear', {}).get('r2_score', 0)
best_method = None
best_r2 = original_r2

for method in outlier_methods:
    method_r2 = model_results.get(f'{method}_linear', {}).get('r2_score', 0)
    if method_r2 > best_r2:
        best_r2 = method_r2
        best_method = method

# Relative improvement over the original R², computed once and reused for both the
# printout and the saved summary.
# - No better method: 0, as before.
# - Original R² of exactly 0: the ratio is undefined, so record NaN instead of dividing
#   by zero.
# - Otherwise divide by the magnitude so a gain stays positive even for a negative baseline.
if not best_method:
    improvement = 0
elif original_r2 == 0:
    improvement = np.nan
else:
    improvement = ((best_r2 - original_r2) / abs(original_r2)) * 100

print("\n📊 ANALYSIS SUMMARY:")
print(f"Original dataset R² score: {original_r2:.4f}")
if best_method:
    print(f"Best performing method: {best_method.upper()} (R² = {best_r2:.4f})")
    if original_r2 == 0:
        print("Performance improvement: n/a (original R² is 0)")
    else:
        print(f"Performance improvement: {improvement:.2f}%")
else:
    print("No outlier removal method improved model performance")

# Per-method totals taken from the detection summary built earlier in the notebook
print("\n🔍 OUTLIER DETECTION INSIGHTS:")
for method in outlier_methods:
    method_summary = summary_df[summary_df['Method'] == method]
    total_outliers = method_summary['Outlier_Count'].sum()
    avg_percentage = method_summary['Outlier_Percentage'].mean()
    print(f"- {method.upper()}: {total_outliers} total outliers ({avg_percentage:.2f}% average)")

print("\n💡 RECOMMENDATIONS:")
if best_method:
    print(f"1. Use {best_method.upper()} method for outlier detection in this dataset")
    print("2. This method provides the best balance of outlier removal and model performance")
else:
    print("1. Consider keeping outliers as they may contain valuable information")
    print("2. Outlier removal did not improve model performance for this dataset")

print("3. Monitor outliers regularly as they may indicate data quality issues")
print("4. Consider domain expertise when deciding on outlier treatment")
print("5. Document outlier assumptions and treatment decisions for reproducibility")

print("\n📋 NEXT STEPS:")
print("- Implement chosen outlier detection method in data preprocessing pipeline")
print("- Set up automated outlier monitoring and alerting")
print("- Document outlier handling decisions in project documentation")
print("- Consider ensemble methods that are robust to outliers")

# Save final summary as a single-row table
summary_results = {
    'analysis_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'dataset_shape': bike_data.shape,
    'methods_tested': outlier_methods,
    'original_r2': original_r2,
    'best_method': best_method,
    'best_r2': best_r2,
    'improvement_percentage': improvement
}

summary_df_final = pd.DataFrame([summary_results])
summary_path = utils.save_with_timestamp(
    df=summary_df_final,
    prefix="outlier_analysis_summary",
    source="analysis",
    ext="csv"
)

print(f"\n💾 Analysis summary saved to: {summary_path}")
print("\n✅ Outlier sensitivity analysis complete!")