# Homework 07: Outliers + Risk Analysis

**Assignment**: Implement reusable functions to detect and handle outliers, run sensitivity analysis, and reflect on assumptions.

## Objectives
- Implement outlier detection (IQR, Z-score) and outlier treatment (winsorizing) functions
- Apply methods to financial data
- Conduct sensitivity analysis
- Document assumptions and risks

In [None]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import utils

# Seed NumPy's global random number generator so that any later np.random
# draws in this notebook are repeatable from run to run.
np.random.seed(42)

# Banner marking the start of the notebook's output.
print("📊 Homework 07: Outliers + Risk Analysis")

## 1. Load Financial Data

In [None]:
# Load price history for the tickers below and derive per-symbol daily
# percentage changes in close price and volume for the outlier analysis.
symbols = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA']
print(f"Loading data for: {symbols}")

# NOTE(review): assumes utils.fetch_multiple_stocks returns a long-format
# DataFrame with 'symbol', 'date', 'close' and 'volume' columns — confirm
# against the utils module.
data = utils.fetch_multiple_stocks(symbols, prefer_alphavantage=False, period='1y')

# Bind financial_data up front so the name exists even when loading fails:
# the later cells test `financial_data.empty` and would otherwise raise
# NameError instead of being skipped.
financial_data = pd.DataFrame()

if not data.empty:
    # Calculate returns for analysis
    data_with_returns = []
    for symbol in symbols:
        # Work on one ticker at a time, in date order, so pct_change never
        # compares the last row of one symbol with the first row of the next.
        symbol_data = data[data['symbol'] == symbol].copy()
        symbol_data = symbol_data.sort_values('date')
        symbol_data['daily_return'] = symbol_data['close'].pct_change()
        symbol_data['volume_change'] = symbol_data['volume'].pct_change()
        data_with_returns.append(symbol_data)

    financial_data = pd.concat(data_with_returns, ignore_index=True)
    # Drops every row holding a NaN in any column; this includes the first
    # row of each symbol, whose percentage changes are undefined.
    financial_data = financial_data.dropna()

    print(f"✅ Data loaded: {financial_data.shape}")
    print(f"Columns: {list(financial_data.columns)}")
else:
    print("❌ Failed to load data")

## 2. Implement Outlier Detection Functions

In [None]:
def detect_outliers_iqr(series, multiplier=1.5):
    """
    Flag the values of a series that fall outside its IQR fences.

    A value counts as an outlier when it is strictly below
    ``Q1 - multiplier * IQR`` or strictly above ``Q3 + multiplier * IQR``,
    where Q1 and Q3 are the 25th and 75th percentiles of ``series`` and
    ``IQR = Q3 - Q1``.

    Parameters:
    -----------
    series : pd.Series
        Values to screen
    multiplier : float, default=1.5
        How many IQRs beyond the quartiles the fences are placed

    Returns:
    --------
    pd.Series
        Boolean mask aligned with ``series`` (True = outlier)
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    fence_offset = multiplier * (third_quartile - first_quartile)

    above_fence = series > third_quartile + fence_offset
    below_fence = series < first_quartile - fence_offset
    return below_fence | above_fence

def detect_outliers_zscore(series, threshold=3.0):
    """
    Detect outliers using Z-score method.

    Each value is standardised with the mean and population standard
    deviation (ddof=0) of the non-missing values in ``series``; a value is an
    outlier when the absolute z-score is strictly greater than ``threshold``.

    Missing values are skipped when computing the statistics and are never
    flagged themselves, so one NaN no longer hides every real outlier. A
    series with no spread (empty, all-NaN, a single value or constant) has
    no defined z-scores and yields an all-False mask.

    Parameters:
    -----------
    series : pd.Series
        Input data series
    threshold : float, default=3.0
        Z-score threshold for outlier detection

    Returns:
    --------
    pd.Series
        Boolean series indicating outliers (True = outlier), indexed like
        ``series``
    """
    center = series.mean()
    # ddof=0 keeps the population standard deviation, which is what
    # scipy.stats.zscore uses by default.
    spread = series.std(ddof=0)

    # Zero or undefined spread would turn the division below into NaN/inf.
    if not np.isfinite(spread) or spread == 0:
        return pd.Series(False, index=series.index, dtype=bool)

    z_scores = (series - center).abs() / spread
    # NaN z-scores (from NaN inputs) compare as False, i.e. "not an outlier".
    outliers = (z_scores > threshold).to_numpy()
    return pd.Series(outliers, index=series.index)

def winsorize_series(series, lower=0.05, upper=0.95):
    """
    Cap the tails of a series at two of its own quantiles.

    Values below the ``lower`` quantile are raised to it and values above
    the ``upper`` quantile are lowered to it; everything in between is left
    untouched, so the result has the same length and index as the input.

    Parameters:
    -----------
    series : pd.Series
        Values to winsorize
    lower : float, default=0.05
        Quantile (between 0 and 1) used as the floor
    upper : float, default=0.95
        Quantile (between 0 and 1) used as the cap

    Returns:
    --------
    pd.Series
        Copy of ``series`` with both tails clipped
    """
    floor_value, cap_value = (series.quantile(q) for q in (lower, upper))
    return series.clip(lower=floor_value, upper=cap_value)

print("✅ Outlier detection functions implemented")

## 3. Apply Outlier Detection to Financial Data

In [None]:
if not financial_data.empty:
    # Focus on daily returns for outlier analysis.
    # NOTE(review): the returns of all symbols are pooled into one series, so
    # the fences and z-scores below are computed across tickers rather than
    # per ticker — confirm this is intended.
    returns = financial_data['daily_return']

    # Apply outlier detection methods
    outliers_iqr = detect_outliers_iqr(returns)
    outliers_zscore = detect_outliers_zscore(returns)

    # Create winsorized version
    returns_winsorized = winsorize_series(returns)

    # Add outlier flags to dataframe (later cells read these columns)
    financial_data['outlier_iqr'] = outliers_iqr
    financial_data['outlier_zscore'] = outliers_zscore
    financial_data['return_winsorized'] = returns_winsorized

    print("📊 Outlier Detection Results:")
    print(f"IQR method: {outliers_iqr.sum()} outliers ({outliers_iqr.mean():.2%})")
    print(f"Z-score method: {outliers_zscore.sum()} outliers ({outliers_zscore.mean():.2%})")

    # Visualize outliers
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Box plot. Tick labels are set on the axis instead of through boxplot's
    # `labels=` keyword, which Matplotlib 3.9 deprecated in favour of
    # `tick_labels=`; set_xticklabels works on both sides of that rename.
    axes[0,0].boxplot([returns, returns_winsorized])
    axes[0,0].set_xticklabels(['Original', 'Winsorized'])
    axes[0,0].set_title('Returns Distribution')
    axes[0,0].set_ylabel('Daily Return')

    # Histogram. Both series share the bin edges of the original returns so
    # bar heights are directly comparable; independent 50-bin grids would give
    # the narrower winsorized range much thinner bins.
    shared_bins = np.histogram_bin_edges(returns, bins=50)
    axes[0,1].hist(returns, bins=shared_bins, alpha=0.7, label='Original')
    axes[0,1].hist(returns_winsorized, bins=shared_bins, alpha=0.7, label='Winsorized')
    axes[0,1].set_title('Returns Histogram')
    axes[0,1].legend()

    # Scatter plots with outliers highlighted: the boolean flag drives the
    # colour map, so flagged points sit at the opposite end of 'coolwarm'.
    axes[1,0].scatter(range(len(returns)), returns, c=outliers_iqr, cmap='coolwarm', alpha=0.6)
    axes[1,0].set_title('IQR Outliers (Red = Outlier)')
    axes[1,0].set_xlabel('Observation')
    axes[1,0].set_ylabel('Daily Return')

    axes[1,1].scatter(range(len(returns)), returns, c=outliers_zscore, cmap='coolwarm', alpha=0.6)
    axes[1,1].set_title('Z-score Outliers (Red = Outlier)')
    axes[1,1].set_xlabel('Observation')
    axes[1,1].set_ylabel('Daily Return')

    plt.tight_layout()
    plt.show()

## 4. Sensitivity Analysis

In [None]:
if not financial_data.empty:
    # Four variants of the same data: untouched, with the IQR-flagged rows
    # removed, with the z-score-flagged rows removed, and with the returns
    # replaced by their winsorized values (no rows removed).
    data_original = financial_data.copy()
    data_no_iqr_outliers = financial_data.loc[~financial_data['outlier_iqr']].copy()
    data_no_zscore_outliers = financial_data.loc[~financial_data['outlier_zscore']].copy()
    data_winsorized = financial_data.assign(daily_return=financial_data['return_winsorized'])

    datasets = dict(zip(
        ['Original', 'No IQR Outliers', 'No Z-score Outliers', 'Winsorized'],
        [data_original, data_no_iqr_outliers, data_no_zscore_outliers, data_winsorized],
    ))

    # One row of descriptive statistics of the daily returns per variant.
    summary_stats = []
    for name, dataset in datasets.items():
        returns = dataset['daily_return']
        summary_stats.append({
            'Dataset': name,
            'Count': len(returns),
            'Mean': returns.mean(),
            'Median': returns.median(),
            'Std': returns.std(),
            'Min': returns.min(),
            'Max': returns.max(),
            'Skewness': returns.skew(),
            'Kurtosis': returns.kurtosis(),
        })

    summary_df = pd.DataFrame(summary_stats)
    print("📊 Summary Statistics Comparison:")
    print(summary_df.round(6))

## 5. Linear Regression Analysis

In [None]:
if not financial_data.empty:
    # Simple regression: predict returns using volume changes, once per
    # outlier treatment, to see how the fit reacts to the treatment.
    regression_results = []
    # Keep each fit so the plots below reuse it instead of refitting the
    # same model a second time.
    fitted = {}

    for name, dataset in datasets.items():
        # Prepare data
        X = dataset[['volume_change']].fillna(0)
        y = dataset['daily_return']

        # Fit model
        model = LinearRegression()
        model.fit(X, y)

        # Predictions and in-sample metrics (scored on the training rows)
        y_pred = model.predict(X)
        r2 = r2_score(y, y_pred)
        mae = mean_absolute_error(y, y_pred)

        fitted[name] = (X, y, model, r2)

        result = {
            'Dataset': name,
            'Coefficient': model.coef_[0],
            'Intercept': model.intercept_,
            'R²': r2,
            'MAE': mae,
            'Sample Size': len(y)
        }
        regression_results.append(result)

    regression_df = pd.DataFrame(regression_results)
    print("\n📈 Linear Regression Comparison:")
    print(regression_df.round(6))

    # Visualize regression results: one panel per treatment
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.flatten()

    for ax, (name, (X, y, model, r2)) in zip(axes, fitted.items()):
        ax.scatter(X['volume_change'], y, alpha=0.5, s=10)

        # A fitted straight line is fully described by its two end points, so
        # draw it between the smallest and largest x instead of threading it
        # through every sample in unsorted row order.
        x_line = pd.DataFrame({
            'volume_change': [X['volume_change'].min(), X['volume_change'].max()]
        })
        ax.plot(x_line['volume_change'], model.predict(x_line), 'r-', linewidth=2)

        ax.set_title(f'{name}\nR² = {r2:.4f}')
        ax.set_xlabel('Volume Change')
        ax.set_ylabel('Daily Return')

    plt.tight_layout()
    plt.show()

## 6. Reflection and Analysis

### Method Selection and Rationale

**IQR Method (1.5 multiplier):**
- Chosen for its robustness to extreme values
- Works well with financial returns which often have fat tails
- Less sensitive to distributional assumptions than Z-score

**Z-score Method (threshold = 3.0):**
- Assumes normal distribution of returns
- More conservative than the 1.5×IQR rule: for normally distributed data the IQR fences sit near ±2.7σ, so a ±3σ cutoff flags fewer points
- Useful for comparison with IQR method

**Winsorizing (5th-95th percentiles):**
- Preserves sample size while reducing extreme influence
- Maintains data structure for time series analysis
- Caps extreme values instead of discarding them, although clipping at the 5th/95th percentiles alters roughly 10% of observations

### Key Assumptions

1. **Stationarity**: Assumes return distribution is stable over time
2. **Independence**: Daily returns are independent observations
3. **Outlier Definition**: Extreme values are truly anomalous, not structural
4. **Linear Relationship**: Volume changes have linear relationship with returns

### Observed Impacts

Based on the analysis above:
- **Sample Size**: Outlier removal reduces sample size, potentially affecting statistical power
- **Distribution Shape**: Outlier treatment affects skewness and kurtosis
- **Model Performance**: Different treatments show varying R² and MAE values
- **Parameter Estimates**: Regression coefficients change with outlier treatment

### Risks if Assumptions are Wrong

1. **False Positives**: Removing legitimate extreme market events
   - Risk: Missing important market signals (crashes, rallies)
   - Impact: Models may underestimate tail risks

2. **Distributional Misspecification**: Assuming normality when returns have fat tails
   - Risk: Z-score method may be too aggressive or conservative
   - Impact: Incorrect outlier identification

3. **Time-Varying Volatility**: Ignoring volatility clustering
   - Risk: Treating high-volatility periods as outliers
   - Impact: Removing important market regime information

4. **Structural Breaks**: Market conditions change over time
   - Risk: Historical outliers may become normal in new regimes
   - Impact: Models may be poorly calibrated for current conditions

### Recommendations

1. **Context-Aware Approach**: Consider market conditions when identifying outliers
2. **Multiple Methods**: Use ensemble of detection methods for robustness
3. **Domain Knowledge**: Incorporate financial expertise in outlier assessment
4. **Sensitivity Testing**: Always test model sensitivity to outlier treatment
5. **Documentation**: Maintain clear records of outlier treatment decisions

## 7. Summary and Conclusions

In [None]:
# Closing report: a fixed checklist of what the notebook did, followed by
# the outlier counts (only when the sensitivity cell has produced
# summary_df) and a few standing caveats.
print(
    "\n🎯 Homework 07 Summary:",
    "✅ Implemented outlier detection functions (IQR, Z-score, Winsorizing)",
    "✅ Applied methods to financial returns data",
    "✅ Conducted sensitivity analysis on summary statistics",
    "✅ Compared linear regression performance across treatments",
    "✅ Documented assumptions, risks, and recommendations",
    sep="\n",
)

print("\n📊 Key Findings:")
# At notebook top level locals() is the global namespace, so this checks
# whether the sensitivity-analysis cell has been run.
if 'summary_df' in locals():
    print(
        f"- IQR method identified {financial_data['outlier_iqr'].sum()} outliers",
        f"- Z-score method identified {financial_data['outlier_zscore'].sum()} outliers",
        "- Outlier treatment significantly affects distribution moments",
        "- Model performance varies with outlier handling approach",
        sep="\n",
    )

print(
    "\n⚠️ Important Considerations:",
    "- Financial data often contains legitimate extreme values",
    "- Outlier treatment should be context-aware and well-documented",
    "- Always test sensitivity of results to outlier handling decisions",
    sep="\n",
)