Stage 8: Exploratory Data Analysis (EDA) - Project Implementation
Portfolio Risk Management System

This script implements comprehensive EDA for the financial dataset.
It includes visualizations, statistical summaries, and correlation analysis.

In [None]:
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# The search path must be extended before `utils` is imported; the helper
# module is presumably located in ../src (relative to the working directory).
sys.path.append('../src')
import utils  # noqa: E402

# Suppress every warning category for the whole session (keeps output tidy,
# but also hides deprecation notices from the libraries used below).
warnings.filterwarnings('ignore')

print("📊 Stage 8: Exploratory Data Analysis - Portfolio Risk Management")

In [None]:
def _add_eda_features(symbol_data):
    """Add the derived EDA columns to one symbol's date-sorted rows.

    Args:
        symbol_data: DataFrame for a single symbol with at least the
            ``close``, ``high``, ``low`` and ``volume`` columns. It is
            modified in place.

    Returns:
        The same DataFrame with the feature columns added.
    """
    close = symbol_data['close']

    # Price-based features
    symbol_data['daily_return'] = close.pct_change()
    symbol_data['log_return'] = np.log(close / close.shift(1))
    symbol_data['price_range'] = (symbol_data['high'] - symbol_data['low']) / close

    # Moving averages
    symbol_data['sma_20'] = close.rolling(20).mean()
    symbol_data['sma_50'] = close.rolling(50).mean()

    # Volatility measures
    symbol_data['volatility_20'] = symbol_data['daily_return'].rolling(20).std()
    symbol_data['volatility_50'] = symbol_data['daily_return'].rolling(50).std()

    # Volume features
    symbol_data['volume_ma_20'] = symbol_data['volume'].rolling(20).mean()
    symbol_data['volume_ratio'] = symbol_data['volume'] / symbol_data['volume_ma_20']

    # Technical indicators
    symbol_data['rsi'] = calculate_rsi(close)

    return symbol_data


def load_and_prepare_data():
    """Load price data for a fixed symbol list and engineer EDA features.

    Data is fetched through ``utils.fetch_multiple_stocks`` and features are
    computed separately per symbol so rolling windows never span two symbols.

    Returns:
        A DataFrame containing the fetched columns plus the engineered
        features, with every row that has a missing value in any column
        removed. ``None`` is returned when nothing could be loaded or when
        no complete row remains after feature engineering.
    """
    symbols = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA']
    print(f"Loading data for EDA: {symbols}")

    raw_data = utils.fetch_multiple_stocks(symbols, prefer_alphavantage=False, period='2y')

    # Treat a missing result the same way as an empty one instead of crashing.
    if raw_data is None or raw_data.empty:
        print("❌ Failed to load data")
        return None

    # Enhanced preprocessing for EDA
    processed_data = []
    for symbol in symbols:
        symbol_data = raw_data[raw_data['symbol'] == symbol].copy()
        symbol_data = symbol_data.sort_values('date')
        processed_data.append(_add_eda_features(symbol_data))

    df = pd.concat(processed_data, ignore_index=True)
    # The rolling features leave NaNs at the start of each symbol's history
    # (the 50-row windows are the longest); those warm-up rows are dropped.
    df = df.dropna()

    # Every later stage assumes at least one observation, so signal failure
    # the same way as a failed download.
    if df.empty:
        print("❌ No complete observations left after feature engineering")
        return None

    print(f"✅ Data prepared for EDA: {df.shape}")
    return df

In [None]:
def calculate_rsi(prices, window=14):
    """Calculate the Relative Strength Index (RSI) of a price series.

    Average gain and average loss are simple rolling means of the positive
    and negative row-to-row price changes over ``window`` changes.

    Args:
        prices: pandas Series of prices; changes are taken between
            consecutive rows.
        window: Number of price changes in each averaging window.

    Returns:
        A Series aligned with ``prices``. The first ``window`` entries are
        NaN because that many real price changes are needed before a value
        can be produced. A window with gains but no losses yields 100; a
        window with no price movement at all yields NaN (0 / 0).
    """
    delta = prices.diff()
    # clip() keeps the leading NaN produced by diff(), so the first window is
    # not padded with a fabricated zero change.
    gain = delta.clip(lower=0).rolling(window=window).mean()
    loss = (-delta).clip(lower=0).rolling(window=window).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

In [None]:
def basic_statistics(df):
    """Print statistical summaries of the prepared dataset.

    Args:
        df: DataFrame with ``date`` and ``symbol`` columns plus the numeric
            feature columns listed in ``numeric_cols`` below.

    Returns:
        The ``describe()`` table of the numeric feature columns.
    """
    print("\n📈 Basic Statistical Summary")

    # Numeric columns for analysis
    numeric_cols = ['close', 'volume', 'daily_return', 'log_return', 'price_range',
                    'volatility_20', 'volume_ratio', 'rsi']

    # Overall statistics
    print("\nOverall Dataset Statistics:")
    print(f"Total observations: {len(df):,}")
    print(f"Date range: {df['date'].min()} to {df['date'].max()}")
    # str() keeps the join working when symbol labels are not strings.
    print(f"Symbols: {', '.join(str(s) for s in df['symbol'].unique())}")
    print(f"Missing values: {df[numeric_cols].isnull().sum().sum()}")

    # Detailed statistics by column
    stats_summary = df[numeric_cols].describe()
    print("\nDetailed Statistics:")
    print(stats_summary.round(6))

    # Distribution characteristics
    print("\nDistribution Characteristics:")
    for col in ('daily_return', 'log_return', 'volatility_20'):
        data = df[col].dropna()
        print(f"\n{col}:")
        if data.empty:
            # The shape statistics are undefined without observations.
            print("  No observations available")
            continue
        print(f"  Skewness: {stats.skew(data):.4f}")
        # scipy's default is Fisher (excess) kurtosis: 0 for a normal sample.
        print(f"  Kurtosis: {stats.kurtosis(data):.4f}")
        print(f"  Jarque-Bera test p-value: {stats.jarque_bera(data)[1]:.6f}")

    return stats_summary

In [None]:
def distribution_analysis(df):
    """Plot histograms with a fitted normal curve and normal Q-Q plots.

    Two 2x2 figures are shown, one panel per variable in ``dist_vars``.

    Args:
        df: DataFrame containing the columns listed in ``dist_vars``.
    """
    print("\n📊 Distribution Analysis")

    # Key variables for distribution analysis
    dist_vars = ['daily_return', 'log_return', 'volatility_20', 'volume_ratio']

    # Infinite values (e.g. a ratio with a zero denominator) would make the
    # histogram range and the normal fit fail, so they are treated as missing.
    samples = {
        var: df[var].replace([np.inf, -np.inf], np.nan).dropna()
        for var in dist_vars
    }

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    for ax, var in zip(axes.flatten(), dist_vars):
        data = samples[var]

        # Histogram with normal overlay
        ax.hist(data, bins=50, density=True, alpha=0.7, edgecolor='black')

        # Normal distribution overlay
        mu, sigma = stats.norm.fit(data)
        x = np.linspace(data.min(), data.max(), 100)
        ax.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', lw=2, label='Normal fit')

        skewness = stats.skew(data)
        kurt = stats.kurtosis(data)
        ax.set_title(f'{var}\nSkew: {skewness:.3f}, Kurt: {kurt:.3f}')
        ax.set_xlabel(var)
        ax.set_ylabel('Density')
        ax.legend()

    plt.tight_layout()
    plt.show()

    # Q-Q plots for normality assessment
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    for ax, var in zip(axes.flatten(), dist_vars):
        stats.probplot(samples[var], dist="norm", plot=ax)
        ax.set_title(f'Q-Q Plot: {var}')

    plt.tight_layout()
    plt.show()

In [None]:
def time_series_analysis(df):
    """Plot price history, rolling volatility and monthly volume per symbol.

    The first figure holds one price panel per symbol (close, SMA 20,
    SMA 50) laid out three per row. The second figure shows the 20-day
    rolling volatility of every symbol and the average monthly volume.

    Args:
        df: DataFrame with ``date``, ``symbol``, ``close``, ``sma_20``,
            ``sma_50``, ``volatility_20`` and ``volume`` columns.
    """
    print("\n⏰ Time Series Analysis")

    symbols = df['symbol'].unique()
    if len(symbols) == 0:
        # Nothing to draw; the bar plot below would fail on an empty frame.
        print("No data available for time series analysis")
        return

    # Price evolution by symbol.
    # Size the grid from the number of symbols instead of assuming five:
    # a fixed 2x3 grid overflows with more than six symbols and leaves
    # blank panels with fewer than five.
    n_cols = 3
    n_rows = -(-len(symbols) // n_cols)  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows), squeeze=False)
    panels = axes.flatten()

    for ax, symbol in zip(panels, symbols):
        symbol_data = df[df['symbol'] == symbol].sort_values('date')

        # Price chart
        ax.plot(symbol_data['date'], symbol_data['close'], label='Close Price')
        ax.plot(symbol_data['date'], symbol_data['sma_20'], label='SMA 20', alpha=0.7)
        ax.plot(symbol_data['date'], symbol_data['sma_50'], label='SMA 50', alpha=0.7)
        ax.set_title(f'{symbol} Price Evolution')
        ax.set_xlabel('Date')
        ax.set_ylabel('Price ($)')
        ax.legend()
        ax.tick_params(axis='x', rotation=45)

    # Remove every empty subplot
    for unused in panels[len(symbols):]:
        fig.delaxes(unused)

    plt.tight_layout()
    plt.show()

    # Return volatility over time
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    volatility_ax, volume_ax = axes

    # Rolling volatility
    for symbol in symbols:
        symbol_data = df[df['symbol'] == symbol].sort_values('date')
        volatility_ax.plot(symbol_data['date'], symbol_data['volatility_20'],
                           label=symbol, alpha=0.7)

    volatility_ax.set_title('20-Day Rolling Volatility')
    volatility_ax.set_xlabel('Date')
    volatility_ax.set_ylabel('Volatility')
    volatility_ax.legend()
    volatility_ax.tick_params(axis='x', rotation=45)

    # Volume patterns
    # to_datetime is a no-op for datetime columns and lets the monthly
    # grouping work when dates arrive as strings.
    months = pd.to_datetime(df['date']).dt.to_period('M')
    monthly_volume = df.groupby([months, 'symbol'])['volume'].mean().unstack()
    monthly_volume.plot(kind='bar', ax=volume_ax, alpha=0.7)
    volume_ax.set_title('Average Monthly Volume by Symbol')
    volume_ax.set_xlabel('Month')
    volume_ax.set_ylabel('Average Volume')
    volume_ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

In [None]:
def correlation_analysis(df):
    """Compute and plot pooled, per-symbol and cross-symbol correlations.

    Args:
        df: DataFrame with ``date`` and ``symbol`` columns plus the numeric
            columns listed in ``corr_vars``.

    Returns:
        A tuple ``(corr_matrix, symbol_correlations)``: the correlation
        matrix over all rows, and a dict mapping each symbol to the
        correlation matrix of its own rows.
    """
    print("\n🔗 Correlation Analysis")

    # Select numeric variables for correlation
    corr_vars = ['close', 'volume', 'daily_return', 'log_return', 'price_range',
                 'volatility_20', 'volume_ratio', 'rsi']

    # Overall correlation matrix
    corr_matrix = df[corr_vars].corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                square=True, fmt='.3f', cbar_kws={'label': 'Correlation'})
    plt.title('Correlation Matrix - All Variables')
    plt.tight_layout()
    plt.show()

    # Symbol-specific correlations
    print("\nCorrelation Analysis by Symbol:")
    symbol_correlations = {}

    # One grouped pass instead of re-filtering the whole frame per symbol;
    # sort=False keeps the symbols in order of first appearance.
    for symbol, symbol_data in df.groupby('symbol', sort=False):
        symbol_corr = symbol_data[corr_vars].corr()
        symbol_correlations[symbol] = symbol_corr

        # Key correlations
        return_vol_corr = symbol_corr.loc['daily_return', 'volatility_20']
        volume_return_corr = symbol_corr.loc['daily_return', 'volume_ratio']

        print(f"\n{symbol}:")
        print(f"  Return-Volatility correlation: {return_vol_corr:.4f}")
        print(f"  Return-Volume correlation: {volume_return_corr:.4f}")

    # Cross-symbol return correlations
    return_pivot = df.pivot_table(values='daily_return', index='date', columns='symbol')
    return_corr = return_pivot.corr()

    plt.figure(figsize=(8, 6))
    sns.heatmap(return_corr, annot=True, cmap='coolwarm', center=0,
                square=True, fmt='.3f')
    plt.title('Cross-Symbol Return Correlations')
    plt.tight_layout()
    plt.show()

    return corr_matrix, symbol_correlations

In [None]:
def bivariate_analysis(df):
    """Draw scatter plots with a pooled regression line for key variable pairs.

    Each of the four panels colours the points by symbol, overlays a single
    regression line fitted on all rows, and shows the pairwise correlation
    in its title.

    Args:
        df: DataFrame with a ``symbol`` column and the variables named in
            ``relationships``.
    """
    print("\n🔄 Bivariate Analysis")

    # Key relationships to explore
    relationships = [
        ('daily_return', 'volatility_20', 'Return vs Volatility'),
        ('daily_return', 'volume_ratio', 'Return vs Volume Ratio'),
        ('price_range', 'volatility_20', 'Price Range vs Volatility'),
        ('rsi', 'daily_return', 'RSI vs Return'),
    ]

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    for i, (ax, (x_var, y_var, title)) in enumerate(zip(axes.flatten(), relationships)):
        # Scatter plot with regression line
        sns.scatterplot(data=df, x=x_var, y=y_var, hue='symbol',
                        alpha=0.6, ax=ax)

        # Add regression line
        sns.regplot(data=df, x=x_var, y=y_var, scatter=False,
                    color='red', ax=ax)

        # Calculate correlation
        corr = df[x_var].corr(df[y_var])
        ax.set_title(f'{title}\nCorrelation: {corr:.4f}')

        # Remove legend for cleaner look (only the first panel keeps it).
        # get_legend() returns None when no legend was drawn, so guard it.
        if i > 0:
            legend = ax.get_legend()
            if legend is not None:
                legend.remove()

    plt.tight_layout()
    plt.show()

In [None]:
def sector_comparison(df):
    """Compare performance across symbols (used as a proxy for sectors).

    Args:
        df: DataFrame with ``symbol``, ``date``, ``close``, ``volume``,
            ``daily_return`` and ``price_range`` columns.

    Returns:
        A DataFrame with one row per symbol and the columns ``Symbol``,
        ``Avg_Return``, ``Return_Std``, ``Sharpe_Ratio``, ``Max_Drawdown``,
        ``Avg_Volume`` and ``Price_Range_Avg``.
    """
    print("\n🏢 Symbol Comparison Analysis")

    # Performance metrics by symbol
    metric_columns = ['Symbol', 'Avg_Return', 'Return_Std', 'Sharpe_Ratio',
                      'Max_Drawdown', 'Avg_Volume', 'Price_Range_Avg']
    performance_metrics = []

    # sort=False keeps the symbols in order of first appearance.
    for symbol, symbol_data in df.groupby('symbol', sort=False):
        # Drawdown depends on row order, so enforce chronological order here
        # rather than relying on how the caller arranged the frame.
        symbol_data = symbol_data.sort_values('date')

        avg_return = symbol_data['daily_return'].mean()
        return_std = symbol_data['daily_return'].std()
        # Mean over std of the daily returns (no risk-free rate, not
        # annualised). Undefined when the returns do not vary.
        sharpe_ratio = avg_return / return_std if return_std > 0 else np.nan

        performance_metrics.append({
            'Symbol': symbol,
            'Avg_Return': avg_return,
            'Return_Std': return_std,
            'Sharpe_Ratio': sharpe_ratio,
            'Max_Drawdown': calculate_max_drawdown(symbol_data['close']),
            'Avg_Volume': symbol_data['volume'].mean(),
            'Price_Range_Avg': symbol_data['price_range'].mean(),
        })

    # Explicit columns keep the frame well-formed even with no symbols.
    perf_df = pd.DataFrame(performance_metrics, columns=metric_columns)
    print("\nPerformance Metrics by Symbol:")
    print(perf_df.round(6))

    # Visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    risk_ax, sharpe_ax = axes[0, 0], axes[0, 1]
    volume_ax, range_ax = axes[1, 0], axes[1, 1]

    # Return vs Risk
    risk_ax.scatter(perf_df['Return_Std'], perf_df['Avg_Return'])
    for symbol, risk, ret in zip(perf_df['Symbol'], perf_df['Return_Std'],
                                 perf_df['Avg_Return']):
        risk_ax.annotate(symbol, (risk, ret))
    risk_ax.set_xlabel('Risk (Return Std)')
    risk_ax.set_ylabel('Average Return')
    risk_ax.set_title('Risk-Return Profile')

    # Sharpe Ratio comparison
    sharpe_ax.bar(perf_df['Symbol'], perf_df['Sharpe_Ratio'])
    sharpe_ax.set_title('Sharpe Ratio Comparison')
    sharpe_ax.set_ylabel('Sharpe Ratio')

    # Volume comparison
    volume_ax.bar(perf_df['Symbol'], perf_df['Avg_Volume'])
    volume_ax.set_title('Average Volume Comparison')
    volume_ax.set_ylabel('Average Volume')
    volume_ax.tick_params(axis='x', rotation=45)

    # Price range comparison
    range_ax.bar(perf_df['Symbol'], perf_df['Price_Range_Avg'])
    range_ax.set_title('Average Price Range Comparison')
    range_ax.set_ylabel('Average Price Range')
    range_ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    return perf_df

In [None]:
def calculate_max_drawdown(prices):
    """Calculate the maximum drawdown of a price sequence.

    The drawdown at each point is the relative distance of the price from
    the highest price seen so far; the most negative value is returned.

    Args:
        prices: Prices in chronological order, as a pandas Series or any
            array-like accepted by ``pd.Series``.

    Returns:
        The largest peak-to-trough decline as a non-positive fraction
        (``-0.5`` means a 50% fall from the running peak), or NaN when
        ``prices`` is empty.
    """
    prices = pd.Series(prices, dtype=float)
    running_peak = prices.cummax()
    drawdown = (prices - running_peak) / running_peak
    return drawdown.min()

In [None]:
def _strongest_off_diagonal(corr_matrix):
    """Find the largest-magnitude correlation between two different variables.

    Returns:
        ``((var_a, var_b), value)`` with the signed correlation, or ``None``
        when the matrix has fewer than two variables or no finite
        off-diagonal entry.
    """
    if corr_matrix.shape[0] < 2 or corr_matrix.shape[1] < 2:
        return None
    pairs = corr_matrix.unstack().dropna()
    first = pairs.index.get_level_values(0)
    second = pairs.index.get_level_values(1)
    # Exclude the diagonal by label rather than by comparing values to 1.0,
    # which is sensitive to floating-point rounding.
    off_diagonal = pairs[first != second]
    if off_diagonal.empty:
        return None
    pair = off_diagonal.abs().idxmax()
    return pair, off_diagonal.loc[pair]


def generate_insights(df, stats_summary, corr_matrix, perf_df):
    """Derive and print a list of headline findings from the EDA results.

    Args:
        df: Prepared DataFrame with ``daily_return``, ``volatility_20`` and
            ``volume_ratio`` columns.
        stats_summary: Summary table from the statistics stage. Accepted for
            interface compatibility; not used here.
        corr_matrix: Square correlation matrix (DataFrame) of the features.
        perf_df: Per-symbol metrics with ``Symbol`` and ``Sharpe_Ratio``
            columns.

    Returns:
        The list of insight strings, in the order they were printed.
    """
    print("\n💡 Key EDA Insights")

    insights = []

    # Distribution insights
    returns_skew = stats.skew(df['daily_return'].dropna())
    if abs(returns_skew) > 0.5:
        insights.append(f"Returns show significant skewness ({returns_skew:.3f}), indicating asymmetric distribution")

    # Volatility insights
    vol_mean = df['volatility_20'].mean()
    vol_std = df['volatility_20'].std()
    insights.append(f"Average volatility: {vol_mean:.4f} with high variation (std: {vol_std:.4f})")

    # Correlation insights: report the signed value and which pair it is.
    strongest = _strongest_off_diagonal(corr_matrix)
    if strongest is not None:
        (var_a, var_b), strongest_corr = strongest
        insights.append(f"Strongest correlation found: {strongest_corr:.4f} ({var_a} vs {var_b})")

    # Performance insights (skipped when no symbol has a defined ratio).
    sharpe = perf_df['Sharpe_Ratio'].dropna()
    if not sharpe.empty:
        best_performer = perf_df.loc[sharpe.idxmax(), 'Symbol']
        worst_performer = perf_df.loc[sharpe.idxmin(), 'Symbol']
        insights.append(f"Best risk-adjusted performer: {best_performer}")
        insights.append(f"Worst risk-adjusted performer: {worst_performer}")

    # Volume insights
    volume_return_corr = df['volume_ratio'].corr(df['daily_return'])
    if abs(volume_return_corr) > 0.1:
        insights.append(f"Volume shows meaningful correlation with returns ({volume_return_corr:.4f})")

    print("\nKey Findings:")
    for i, insight in enumerate(insights, 1):
        print(f"{i}. {insight}")

    return insights

In [None]:
def main():
    """Run the full EDA pipeline and save the per-symbol performance metrics.

    Returns:
        A summary dict with the keys ``dataset_shape``, ``date_range``,
        ``symbols`` and ``key_insights``, or ``None`` when no data could be
        prepared.
    """
    # Load and prepare data
    df = load_and_prepare_data()
    if df is None:
        return None

    # Perform EDA components
    stats_summary = basic_statistics(df)
    distribution_analysis(df)
    time_series_analysis(df)
    corr_matrix, _symbol_correlations = correlation_analysis(df)
    bivariate_analysis(df)
    perf_df = sector_comparison(df)
    insights = generate_insights(df, stats_summary, corr_matrix, perf_df)

    # Collect EDA results. Only the performance metrics are written to disk;
    # the summary is handed back to the caller instead of being discarded.
    eda_summary = {
        'dataset_shape': df.shape,
        'date_range': f"{df['date'].min()} to {df['date'].max()}",
        'symbols': list(df['symbol'].unique()),
        'key_insights': insights,
    }

    # Save performance metrics
    output_path = utils.save_with_timestamp(
        df=perf_df,
        prefix="eda_performance_metrics",
        source="project_stage8",
        ext="csv"
    )

    print(f"\n💾 EDA results saved to: {output_path}")

    print("\n✅ Stage 8: Exploratory Data Analysis Complete")
    print("Key deliverables:")
    print("- Comprehensive statistical summaries")
    print("- Distribution analysis and normality tests")
    print("- Time series pattern identification")
    print("- Correlation matrix and bivariate analysis")
    print("- Performance comparison across symbols")
    print("- Key insights for feature engineering")

    return eda_summary


if __name__ == "__main__":
    main()