Stage 7: Outlier Analysis - Project Implementation

Portfolio Risk Management System

This script implements outlier detection and sensitivity analysis for the project. It integrates with the existing project pipeline and uses the outliers.py module.

In [None]:
import sys
import os

# Extend the import search path with the project's src/ directory. This has
# to run before the project-local imports (utils, outliers) further down,
# which presumably live in that directory.
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Project-local modules.
import utils
import outliers

import warnings

# Suppress all warnings for the remainder of the run.
warnings.filterwarnings('ignore')

print("🔍 Stage 7: Outlier Analysis - Portfolio Risk Management")

## Load project data

In [None]:
def load_project_data(symbols=None, period='2y'):
    """Load price data and derive the features used by the outlier analysis.

    Args:
        symbols: Iterable of ticker symbols to fetch. When omitted, the
            original default set (AAPL, MSFT, GOOGL, AMZN, TSLA) is used.
        period: History window forwarded to ``utils.fetch_multiple_stocks``.

    Returns:
        A DataFrame containing the fetched columns plus ``daily_return``,
        ``log_return``, ``price_range`` and ``volume_normalized``, with every
        row holding a missing or infinite value removed. Returns ``None``
        when the fetch yields no data.
    """
    if symbols is None:
        symbols = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA']
    else:
        symbols = list(symbols)
    print(f"Loading data for: {symbols}")

    raw_data = utils.fetch_multiple_stocks(symbols, prefer_alphavantage=False, period=period)

    # Treat a missing result the same as an empty one instead of crashing
    # on ``None.empty``.
    if raw_data is None or raw_data.empty:
        print("❌ Failed to load data")
        return None

    # Basic preprocessing: features are computed per symbol so that returns
    # and rolling windows never span two different tickers.
    processed_data = []
    for symbol in symbols:
        symbol_data = raw_data[raw_data['symbol'] == symbol].copy()
        symbol_data = symbol_data.sort_values('date')

        # Calculate returns and basic features.
        close = symbol_data['close']
        volume = symbol_data['volume']
        symbol_data['daily_return'] = close.pct_change()
        symbol_data['log_return'] = np.log(close / close.shift(1))
        symbol_data['price_range'] = (symbol_data['high'] - symbol_data['low']) / close
        symbol_data['volume_normalized'] = volume / volume.rolling(20).mean()

        processed_data.append(symbol_data)

    df = pd.concat(processed_data, ignore_index=True)
    # The ratios and the logarithm above can produce +/-inf (zero price or
    # zero rolling volume). dropna() alone keeps those rows, and infinite
    # values break the downstream statistics and model fitting, so they are
    # converted to NaN first.
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    print(f"✅ Data loaded: {df.shape}")
    return df

## Outlier detection and analysis

In [None]:
def analyze_outliers(df):
    """Summarise and visualise outliers in the engineered feature columns.

    Prints the table produced by ``outliers.outlier_summary`` and draws one
    box plot per analysed column, overlaying the points flagged by either
    the IQR or the Z-score detector.

    Args:
        df: DataFrame containing the ``daily_return``, ``log_return``,
            ``price_range`` and ``volume_normalized`` columns.

    Returns:
        The object returned by ``outliers.outlier_summary`` (unrounded; the
        rounding to two decimals is applied for display only).
    """
    print("\n📊 Outlier Detection Analysis")

    # Columns to analyze (four columns, matching the 2x2 grid below).
    analysis_columns = ['daily_return', 'log_return', 'price_range', 'volume_normalized']

    # Generate outlier summary.
    summary = outliers.outlier_summary(df, analysis_columns)
    print("\nOutlier Summary by Method:")
    print(summary.round(2))

    # Visualize outliers.
    _, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.flatten()

    for ax, col in zip(axes, analysis_columns):
        # Original data. The tick label is set separately because the
        # ``labels=`` keyword of boxplot is deprecated in Matplotlib 3.9.
        ax.boxplot([df[col].dropna()])
        ax.set_xticklabels(['Original'])

        # Outlier detection results; presumably boolean masks aligned with
        # df[col], since they are OR-ed together and used for indexing.
        iqr_outliers = outliers.detect_outliers_iqr(df[col])
        zscore_outliers = outliers.detect_outliers_zscore(df[col])

        # Mark every point flagged by at least one detector.
        outlier_data = df[col][iqr_outliers | zscore_outliers]
        if len(outlier_data) > 0:
            ax.scatter([1] * len(outlier_data), outlier_data,
                       color='red', alpha=0.6, s=20, label='Outliers')
            # Only request a legend when a labelled artist exists; calling
            # legend() on an axes without one emits a warning and draws an
            # empty legend.
            ax.legend()

        ax.set_title(f'{col}\nIQR: {iqr_outliers.sum()}, Z-score: {zscore_outliers.sum()}')

    plt.tight_layout()
    plt.show()

    return summary

## Sensitivity analysis

In [None]:
def perform_sensitivity_analysis(df):
    """Compare summary statistics with and without outliers.

    Delegates to ``outliers.sensitivity_analysis`` for the ``iqr``,
    ``zscore`` and ``modified_zscore`` methods, passing a custom analysis
    callback, and prints the per-method results.

    Args:
        df: DataFrame containing the ``daily_return``, ``log_return`` and
            ``price_range`` columns.

    Returns:
        The mapping returned by ``outliers.sensitivity_analysis``, keyed by
        method name.
    """
    print("\n🔬 Sensitivity Analysis")

    analysis_columns = ['daily_return', 'log_return', 'price_range']
    target_column = 'daily_return'

    def correlation_analysis(data):
        """Return correlation and return-distribution statistics for ``data``."""
        abs_corr = data[analysis_columns].corr().abs().to_numpy()
        # Ignore the diagonal: every column correlates perfectly with itself,
        # which would pin the maximum at 1.0 and inflate the mean.
        off_diagonal = abs_corr[~np.eye(abs_corr.shape[0], dtype=bool)]
        returns = data[target_column].dropna()
        return {
            'mean_correlation': float(np.nanmean(off_diagonal)),
            'max_correlation': float(np.nanmax(off_diagonal)),
            'return_volatility': data[target_column].std(),
            'return_skewness': stats.skew(returns),
            'return_kurtosis': stats.kurtosis(returns)
        }

    # Run sensitivity analysis.
    sensitivity_results = outliers.sensitivity_analysis(
        df, target_column, analysis_columns,
        outlier_methods=['iqr', 'zscore', 'modified_zscore'],
        analysis_func=correlation_analysis
    )

    # Display results. The removal and custom-analysis entries are optional
    # per method, so each is only printed when present.
    print("\nSensitivity Analysis Results:")
    for method, results in sensitivity_results.items():
        print(f"\n{method.upper()}:")
        print(f"  Data shape: {results['data_shape']}")
        if 'rows_removed' in results:
            print(f"  Rows removed: {results['rows_removed']} ({results['removal_percentage']:.1f}%)")

        if 'custom_analysis' in results:
            analysis = results['custom_analysis']
            print(f"  Return volatility: {analysis['return_volatility']:.6f}")
            print(f"  Return skewness: {analysis['return_skewness']:.4f}")
            print(f"  Return kurtosis: {analysis['return_kurtosis']:.4f}")

    return sensitivity_results

## Model comparison with/without outliers

In [None]:
def model_comparison(df):
    """Fit the same linear model on raw and outlier-filtered data and compare.

    The target is the next row's ``daily_return`` within each symbol. One
    model is trained on all rows, a second on the rows kept by
    ``outliers.remove_outliers`` (IQR method); R², RMSE and sample counts
    are printed and plotted side by side.

    Args:
        df: DataFrame with ``symbol``, ``daily_return``, ``log_return``,
            ``price_range`` and ``volume_normalized`` columns.

    Returns:
        Dict with keys ``'original'`` and ``'outliers_removed'``, each
        mapping to a dict of ``'r2'``, ``'rmse'`` and ``'n_samples'``.
    """
    print("\n📈 Model Impact Analysis")

    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score, mean_squared_error
    from sklearn.model_selection import train_test_split

    def _fit_and_score(features, target):
        """Train on an 80/20 split (fixed seed) and score the held-out part."""
        f_train, f_test, t_train, t_test = train_test_split(
            features, target, test_size=0.2, random_state=42)
        model = LinearRegression()
        model.fit(f_train, t_train)
        predicted = model.predict(f_test)
        return {
            'r2': r2_score(t_test, predicted),
            'rmse': np.sqrt(mean_squared_error(t_test, predicted)),
            'n_samples': len(features)
        }

    # Features and target.
    feature_cols = ['log_return', 'price_range', 'volume_normalized']
    target_col = 'daily_return'

    # Next-day return prediction: shift the target back by one row per symbol.
    df_model = df.copy()
    df_model['target'] = df_model.groupby('symbol')[target_col].shift(-1)
    df_model = df_model.dropna()

    results = {}

    # Baseline: all rows.
    results['original'] = _fit_and_score(df_model[feature_cols], df_model['target'])

    # Same model after IQR-based outlier removal.
    df_clean = outliers.remove_outliers(df_model, feature_cols + [target_col], method='iqr')
    results['outliers_removed'] = _fit_and_score(df_clean[feature_cols], df_clean['target'])

    print("\nModel Performance Comparison:")
    for label, scores in results.items():
        print(f"{label}:")
        print(f"  R²: {scores['r2']:.6f}")
        print(f"  RMSE: {scores['rmse']:.6f}")
        print(f"  Samples: {scores['n_samples']}")

    # Side-by-side bar charts of both metrics.
    _, (ax_r2, ax_rmse) = plt.subplots(1, 2, figsize=(12, 5))
    methods = list(results.keys())

    ax_r2.bar(methods, [results[name]['r2'] for name in methods], alpha=0.7)
    ax_r2.set_ylabel('R² Score')
    ax_r2.set_title('Model R² Comparison')

    ax_rmse.bar(methods, [results[name]['rmse'] for name in methods], alpha=0.7)
    ax_rmse.set_ylabel('RMSE')
    ax_rmse.set_title('Model RMSE Comparison')

    plt.tight_layout()
    plt.show()

    return results

## Risk assessment

In [None]:
def assess_outlier_risks(df):
    """Quantify how much data and distributional shape outlier removal costs.

    For each analysed column this reports the share of rows flagged by the
    IQR and Z-score detectors, and the relative change in mean and standard
    deviation after dropping the IQR-flagged rows.

    Args:
        df: DataFrame containing the ``daily_return``, ``log_return`` and
            ``price_range`` columns.

    Returns:
        Dict with keys ``'data_loss_risk'``, ``'bias_risk'`` and
        ``'model_assumptions'``. The first two map column names to their
        percentage figures; ``'model_assumptions'`` is returned empty.
        Percentages whose baseline is zero or missing are reported as NaN.
    """
    print("\n⚠️ Outlier Risk Assessment")

    def _pct_change(new_value, baseline):
        """Relative change in percent; NaN when the baseline is 0 or missing."""
        # Return means are often ~0 and a constant column has std 0; dividing
        # by such a baseline would yield inf or raise ZeroDivisionError.
        if pd.isna(baseline) or baseline == 0:
            return float('nan')
        return ((new_value - baseline) / baseline) * 100

    analysis_columns = ['daily_return', 'log_return', 'price_range']

    risk_assessment = {
        'data_loss_risk': {},
        'bias_risk': {},
        'model_assumptions': {}
    }

    n_rows = len(df)

    for col in analysis_columns:
        # Data loss risk: share of rows each detector would remove.
        iqr_outliers = outliers.detect_outliers_iqr(df[col])
        zscore_outliers = outliers.detect_outliers_zscore(df[col])

        if n_rows:
            data_loss_iqr = (iqr_outliers.sum() / n_rows) * 100
            data_loss_zscore = (zscore_outliers.sum() / n_rows) * 100
        else:
            # An empty frame has no meaningful removal percentage.
            data_loss_iqr = data_loss_zscore = float('nan')

        risk_assessment['data_loss_risk'][col] = {
            'iqr_loss_pct': data_loss_iqr,
            'zscore_loss_pct': data_loss_zscore
        }

        # Bias risk: impact of IQR-based removal on the distribution.
        original_mean = df[col].mean()
        original_std = df[col].std()

        df_no_outliers = df[~iqr_outliers]
        clean_mean = df_no_outliers[col].mean()
        clean_std = df_no_outliers[col].std()

        risk_assessment['bias_risk'][col] = {
            'mean_change_pct': _pct_change(clean_mean, original_mean),
            'std_change_pct': _pct_change(clean_std, original_std)
        }

    # Print risk assessment.
    print("\nData Loss Risk (% of data removed):")
    for col, risks in risk_assessment['data_loss_risk'].items():
        print(f"  {col}: IQR={risks['iqr_loss_pct']:.1f}%, Z-score={risks['zscore_loss_pct']:.1f}%")

    print("\nDistribution Bias Risk (% change in statistics):")
    for col, risks in risk_assessment['bias_risk'].items():
        print(f"  {col}: Mean={risks['mean_change_pct']:.1f}%, Std={risks['std_change_pct']:.1f}%")

    # Recommendations.
    print("\n💡 Recommendations:")
    print("1. Use Modified Z-score for financial returns (more robust)")
    print("2. Consider winsorization instead of removal for volume data")
    print("3. Apply different thresholds for different asset classes")
    print("4. Monitor outlier patterns over time for regime changes")
    print("5. Document all outlier treatment decisions for reproducibility")

    return risk_assessment

## Main execution

In [None]:
def main():
    """Run the Stage 7 workflow end to end.

    Loads the project data, runs the outlier, sensitivity, model-impact and
    risk analyses, then saves the outlier summary through
    ``utils.save_with_timestamp``. Returns early (``None``) when no data
    could be loaded.
    """
    # Load data.
    df = load_project_data()
    if df is None:
        return

    # Perform analyses. Only the outlier summary is persisted below; the
    # other three are invoked for the output they print and plot.
    outlier_summary = analyze_outliers(df)
    perform_sensitivity_analysis(df)
    model_comparison(df)
    assess_outlier_risks(df)

    # Save results.
    output_path = utils.save_with_timestamp(
        df=outlier_summary,
        prefix="outlier_analysis_summary",
        source="project_stage7",
        ext="csv"
    )

    print(f"\n💾 Analysis results saved to: {output_path}")

    closing_lines = (
        "\n✅ Stage 7: Outlier Analysis Complete",
        "Key deliverables:",
        "- Reusable outlier detection functions in src/outliers.py",
        "- Comprehensive sensitivity analysis",
        "- Model impact assessment",
        "- Risk and assumption documentation",
    )
    for line in closing_lines:
        print(line)


if __name__ == "__main__":
    main()