Stage 10a: Regression Modeling - Project Implementation
Portfolio Risk Management System

This script implements regression modeling with diagnostics and evaluation.
It includes a train-test split, feature selection, and comprehensive model analysis.

In [None]:
import sysimport ossys.path.append('../src')import pandas as pdimport numpy as npimport matplotlib.pyplot as pltimport seaborn as snsfrom scipy import statsimport utilsfrom sklearn.linear_model import LinearRegression, Ridge, Lassofrom sklearn.ensemble import RandomForestRegressorfrom sklearn.model_selection import train_test_split, cross_val_scorefrom sklearn.preprocessing import StandardScalerfrom sklearn.metrics import mean_squared_error, r2_score, mean_absolute_errorfrom sklearn.pipeline import Pipelineimport warningswarnings.filterwarnings('ignore')print("📈 Stage 10a: Regression Modeling - Portfolio Risk Management")

In [None]:
def _add_symbol_features(symbol_data):
    """Add return, technical, lag and target columns to one symbol's rows.

    Args:
        symbol_data: DataFrame for a single symbol, already sorted by date,
            with 'close', 'high', 'low' and 'volume' columns. It is modified
            in place.

    Returns:
        The same DataFrame with the engineered columns added.
    """
    close = symbol_data['close']

    # Base features
    symbol_data['daily_return'] = close.pct_change()
    symbol_data['log_return'] = np.log(close / close.shift(1))
    symbol_data['price_range'] = (symbol_data['high'] - symbol_data['low']) / close

    # Technical features
    symbol_data['sma_20'] = close.rolling(20).mean()
    symbol_data['volatility_20'] = symbol_data['daily_return'].rolling(20).std()
    symbol_data['volume_ma_20'] = symbol_data['volume'].rolling(20).mean()
    symbol_data['volume_ratio'] = symbol_data['volume'] / symbol_data['volume_ma_20']

    # Engineered features
    symbol_data['vol_adj_return'] = symbol_data['daily_return'] / symbol_data['volatility_20']
    symbol_data['price_momentum_10'] = close.pct_change(10)
    symbol_data['rsi'] = calculate_rsi(close)

    # Lag features
    symbol_data['return_lag_1'] = symbol_data['daily_return'].shift(1)
    symbol_data['return_lag_2'] = symbol_data['daily_return'].shift(2)
    symbol_data['vol_lag_1'] = symbol_data['volatility_20'].shift(1)

    # Target variable: the next row's daily return for the same symbol
    symbol_data['target_return'] = symbol_data['daily_return'].shift(-1)

    return symbol_data


def load_engineered_data():
    """Load raw prices and rebuild the engineered feature dataset.

    For demo purposes the key features are recreated here from raw data
    obtained via ``utils.fetch_multiple_stocks`` (called with
    ``prefer_alphavantage=False`` and ``period='2y'``) for a fixed list of
    five symbols. Features are computed separately per symbol so rolling
    windows and shifts never mix rows of different symbols.

    Returns:
        A DataFrame containing the raw columns plus the engineered feature
        and target columns, with every row that holds a missing or infinite
        value removed; ``None`` when no raw data could be loaded.
    """
    print("Loading engineered features dataset...")

    # For demo, recreate key features
    symbols = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA']
    raw_data = utils.fetch_multiple_stocks(symbols, prefer_alphavantage=False, period='2y')

    # Guard against both "nothing returned" and "empty frame returned".
    if raw_data is None or raw_data.empty:
        print("❌ Failed to load data")
        return None

    # Create essential features, one symbol at a time
    processed_data = [
        _add_symbol_features(
            raw_data[raw_data['symbol'] == symbol].copy().sort_values('date')
        )
        for symbol in symbols
    ]

    df = pd.concat(processed_data, ignore_index=True)

    # Ratio features (e.g. daily_return / volatility_20) become +/-inf when
    # the denominator is zero. dropna() alone would keep those rows, and
    # scikit-learn estimators refuse non-finite input, so treat inf as missing.
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    print(f"✅ Dataset loaded: {df.shape}")
    return df

In [None]:
def calculate_rsi(prices, window=14):
    """Calculate the RSI indicator using simple rolling averages.

    Args:
        prices: pandas Series of prices in chronological order.
        window: Number of consecutive price changes averaged for the gain
            and loss components.

    Returns:
        A Series aligned with ``prices``. The first ``window`` entries are
        NaN because fewer than ``window`` price changes are available there.
        Entries are also NaN where the average gain and average loss are both
        zero, and 100 where only the average loss is zero.
    """
    delta = prices.diff()

    # clip() leaves the leading NaN from diff() as NaN, so the first window is
    # not padded with a fictitious zero change (which would emit an RSI value
    # one row early, computed from only ``window - 1`` real changes).
    gain = delta.clip(lower=0).rolling(window=window).mean()
    loss = (-delta).clip(lower=0).rolling(window=window).mean()

    rs = gain / loss
    return 100 - (100 / (1 + rs))

In [None]:
def prepare_features_target(df):
    """Build the feature matrix and target vector from the engineered dataset.

    Args:
        df: DataFrame holding the feature columns listed below plus
            'target_return', 'date' and 'symbol'.

    Returns:
        Tuple ``(X, y, dates, symbols, feature_cols)`` where ``X`` and ``y``
        contain only rows without missing values, ``dates`` and ``symbols``
        are the matching 'date' and 'symbol' columns, and ``feature_cols`` is
        the ordered list of feature names.
    """
    print("\n🎯 Preparing Features and Target")

    # Define feature sets and combine them in a fixed order
    feature_groups = (
        ['daily_return', 'log_return', 'price_range', 'volatility_20', 'volume_ratio'],  # base
        ['vol_adj_return', 'price_momentum_10', 'rsi'],  # technical
        ['return_lag_1', 'return_lag_2', 'vol_lag_1'],  # lag
    )
    feature_cols = [column for group in feature_groups for column in group]
    target_col = 'target_return'

    # Create feature matrix and target vector
    features = df[feature_cols].copy()
    target = df[target_col].copy()

    # Keep only rows where every feature and the target are present
    complete = features.notnull().all(axis=1) & target.notnull()
    X = features[complete]
    y = target[complete]
    dates = df.loc[complete, 'date']
    symbols = df.loc[complete, 'symbol']

    print(f"Feature matrix shape: {X.shape}")
    print(f"Target vector shape: {y.shape}")
    print(f"Features used: {feature_cols}")

    return X, y, dates, symbols, feature_cols

In [None]:
def time_series_split(X, y, dates, test_size=0.2):
    """Perform a time-aware train-test split.

    Rows are ordered by ``dates`` and the earliest ``1 - test_size`` fraction
    becomes the training set; nothing is shuffled. Rows sharing the same date
    keep their original relative order.

    Args:
        X: Feature DataFrame.
        y: Target Series, row-aligned with ``X``.
        dates: Series of observation dates, row-aligned with ``X``.
        test_size: Fraction of the latest rows held out for testing; must
            lie in [0, 1].

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``test_size`` is outside [0, 1] or the inputs do not
            all have the same number of rows.
    """
    # A fraction above 1 would yield a negative split index and silently
    # produce overlapping, wrongly sized splits.
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")
    if not (len(X) == len(y) == len(dates)):
        raise ValueError("X, y and dates must have the same number of rows")

    print("\n📅 Time Series Train-Test Split")

    # Sort by date to maintain temporal order. A stable sort keeps the input
    # order of rows that share a date (several symbols per day), so the split
    # is reproducible.
    order = np.argsort(dates.to_numpy(), kind='stable')
    X_sorted = X.iloc[order]
    y_sorted = y.iloc[order]
    dates_sorted = dates.iloc[order]

    # Split based on time.
    # NOTE: the cut is positional, so rows of one date can still land on both
    # sides of the boundary.
    split_idx = int(len(X_sorted) * (1 - test_size))

    X_train, X_test = X_sorted.iloc[:split_idx], X_sorted.iloc[split_idx:]
    y_train, y_test = y_sorted.iloc[:split_idx], y_sorted.iloc[split_idx:]
    train_dates = dates_sorted.iloc[:split_idx]
    test_dates = dates_sorted.iloc[split_idx:]

    print(f"Train period: {train_dates.min()} to {train_dates.max()}")
    print(f"Test period: {test_dates.min()} to {test_dates.max()}")
    print(f"Train samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")

    return X_train, X_test, y_train, y_test

In [None]:
def build_regression_models(X_train, y_train, X_test, y_test):
    """Fit several regressors on the training split and score both splits.

    Linear, Ridge and Lasso regression are trained on standardized features
    (scaler fitted on the training split only); the Random Forest is trained
    on the unscaled features.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        X_test: Test feature matrix.
        y_test: Test target values.

    Returns:
        Tuple ``(results, scaler)``. ``results`` maps each model name to a
        dict with the fitted 'model', train/test 'r2', 'rmse' and 'mae'
        scores, and the train/test predictions. ``scaler`` is the fitted
        ``StandardScaler``.
    """
    print("\n🔧 Building Regression Models")

    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(alpha=1.0),
        'Lasso Regression': Lasso(alpha=0.01),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    }

    # Scale features (statistics come from the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    results = {}
    for name, model in models.items():
        print(f"\nTraining {name}...")

        # Random Forest doesn't need scaling; every other model uses the
        # standardized matrices.
        if name == 'Random Forest':
            train_inputs, test_inputs = X_train, X_test
        else:
            train_inputs, test_inputs = X_train_scaled, X_test_scaled

        # Fit model and predict on both splits
        model.fit(train_inputs, y_train)
        y_train_pred = model.predict(train_inputs)
        y_test_pred = model.predict(test_inputs)

        # Calculate metrics
        scores = {
            'train_r2': r2_score(y_train, y_train_pred),
            'test_r2': r2_score(y_test, y_test_pred),
            'train_rmse': np.sqrt(mean_squared_error(y_train, y_train_pred)),
            'test_rmse': np.sqrt(mean_squared_error(y_test, y_test_pred)),
            'train_mae': mean_absolute_error(y_train, y_train_pred),
            'test_mae': mean_absolute_error(y_test, y_test_pred),
        }

        results[name] = {
            'model': model,
            **scores,
            'y_train_pred': y_train_pred,
            'y_test_pred': y_test_pred,
        }

        print(f"  Train R²: {scores['train_r2']:.4f}, Test R²: {scores['test_r2']:.4f}")
        print(f"  Train RMSE: {scores['train_rmse']:.6f}, Test RMSE: {scores['test_rmse']:.6f}")

    return results, scaler

In [None]:
def model_diagnostics(results, X_train, y_train, X_test, y_test, feature_cols):
    """Plot and print diagnostics for the model with the highest test R².

    Draws a 2x3 figure (residuals vs fitted, Q-Q plot, residual histogram,
    actual vs predicted for train and test, and feature importances or
    coefficients), then prints residual summary statistics, a Shapiro-Wilk
    normality test on the first 5000 training residuals, and a Durbin-Watson
    statistic.

    Args:
        results: Mapping of model name to a result dict containing at least
            'model', 'train_r2', 'test_r2', 'y_train_pred' and 'y_test_pred'.
        X_train: Training features (not used by this function).
        y_train: Training target values.
        X_test: Test features (not used by this function).
        y_test: Test target values.
        feature_cols: Feature names, in the order the model was trained on.

    Returns:
        Tuple ``(best_model_name, best_model)`` where ``best_model`` is the
        corresponding entry of ``results``.
    """
    print("\n🔍 Model Diagnostics")

    # Select best model based on test R²
    best_model_name = max(results.keys(), key=lambda key: results[key]['test_r2'])
    best_model = results[best_model_name]
    estimator = best_model['model']
    fitted_train = best_model['y_train_pred']
    fitted_test = best_model['y_test_pred']
    train_r2 = best_model['train_r2']
    test_r2 = best_model['test_r2']

    print(f"Best model: {best_model_name} (Test R²: {test_r2:.4f})")

    # Residual analysis
    train_residuals = y_train - fitted_train
    test_residuals = y_test - fitted_test

    # Diagnostic plots
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    (ax_resid, ax_qq, ax_hist), (ax_train, ax_test, ax_weights) = axes

    # 1. Residuals vs Fitted
    ax_resid.scatter(fitted_train, train_residuals, alpha=0.6, s=20)
    ax_resid.axhline(y=0, color='red', linestyle='--')
    ax_resid.set_xlabel('Fitted Values')
    ax_resid.set_ylabel('Residuals')
    ax_resid.set_title('Residuals vs Fitted (Train)')

    # 2. Q-Q Plot
    stats.probplot(train_residuals, dist="norm", plot=ax_qq)
    ax_qq.set_title('Q-Q Plot (Train Residuals)')

    # 3. Histogram of residuals
    ax_hist.hist(train_residuals, bins=50, alpha=0.7, edgecolor='black')
    ax_hist.set_xlabel('Residuals')
    ax_hist.set_ylabel('Frequency')
    ax_hist.set_title('Residual Distribution (Train)')

    # 4. Actual vs Predicted (Train)
    train_span = [y_train.min(), y_train.max()]
    ax_train.scatter(y_train, fitted_train, alpha=0.6, s=20)
    ax_train.plot(train_span, train_span, 'r--', lw=2)
    ax_train.set_xlabel('Actual Returns')
    ax_train.set_ylabel('Predicted Returns')
    ax_train.set_title(f'Train: Actual vs Predicted (R²={train_r2:.4f})')

    # 5. Actual vs Predicted (Test)
    test_span = [y_test.min(), y_test.max()]
    ax_test.scatter(y_test, fitted_test, alpha=0.6, s=20)
    ax_test.plot(test_span, test_span, 'r--', lw=2)
    ax_test.set_xlabel('Actual Returns')
    ax_test.set_ylabel('Predicted Returns')
    ax_test.set_title(f'Test: Actual vs Predicted (R²={test_r2:.4f})')

    # 6. Feature importance (if available), otherwise coefficients
    if hasattr(estimator, 'feature_importances_'):
        feature_imp = pd.DataFrame({
            'feature': feature_cols,
            'importance': estimator.feature_importances_,
        }).sort_values('importance', ascending=True)

        ax_weights.barh(feature_imp['feature'], feature_imp['importance'])
        ax_weights.set_xlabel('Feature Importance')
        ax_weights.set_title('Feature Importance')
    elif hasattr(estimator, 'coef_'):
        coef_df = pd.DataFrame({
            'feature': feature_cols,
            'coefficient': estimator.coef_,
        }).sort_values('coefficient', key=abs, ascending=True)

        # Red bars mark negative coefficients, blue bars non-negative ones.
        colors = ['red' if value < 0 else 'blue' for value in coef_df['coefficient']]
        ax_weights.barh(coef_df['feature'], coef_df['coefficient'], color=colors, alpha=0.7)
        ax_weights.set_xlabel('Coefficient Value')
        ax_weights.set_title('Model Coefficients')
        ax_weights.axvline(x=0, color='black', linestyle='-', alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Statistical tests
    print("\nResidual Analysis:")
    print(f"Mean residual: {train_residuals.mean():.8f} (should be ~0)")
    print(f"Residual std: {train_residuals.std():.6f}")
    print(f"Skewness: {stats.skew(train_residuals):.4f}")
    print(f"Kurtosis: {stats.kurtosis(train_residuals):.4f}")

    # Normality test
    shapiro_stat, shapiro_p = stats.shapiro(train_residuals[:5000])  # Limit sample size
    print(f"Shapiro-Wilk normality test p-value: {shapiro_p:.6f}")

    # Durbin-Watson test for autocorrelation: squared successive differences
    # of the residuals divided by the squared residuals.
    successive_diff = np.diff(train_residuals)
    dw_stat = np.sum(successive_diff**2) / np.sum(train_residuals**2)
    print(f"Durbin-Watson statistic: {dw_stat:.4f} (2.0 = no autocorrelation)")

    return best_model_name, best_model

In [None]:
def cross_validation_analysis(X, y, feature_cols):
    """Run 5-fold cross-validation for each model and plot the R² scores.

    Linear, Ridge and Lasso regression are wrapped in a pipeline that
    standardizes the features first; the Random Forest pipeline has no
    scaling step.

    Args:
        X: Feature matrix.
        y: Target values.
        feature_cols: Feature names (not used by this function).

    Returns:
        Dict mapping model name to 'mean_cv_score', 'std_cv_score' and the
        raw 'cv_scores' array.
    """
    print("\n🔄 Cross-Validation Analysis")

    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(alpha=1.0),
        'Lasso Regression': Lasso(alpha=0.01),
        'Random Forest': RandomForestRegressor(n_estimators=50, random_state=42),
    }

    cv_results = {}
    for name, model in models.items():
        # Random Forest pipeline without scaling; other models with scaling
        steps = [('model', model)]
        if name != 'Random Forest':
            steps.insert(0, ('scaler', StandardScaler()))
        pipeline = Pipeline(steps)

        # 5-fold cross-validation
        # NOTE(review): an integer cv is not time-aware, unlike
        # time_series_split; confirm plain folds are intended here.
        cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
        mean_score = cv_scores.mean()
        std_score = cv_scores.std()

        cv_results[name] = {
            'mean_cv_score': mean_score,
            'std_cv_score': std_score,
            'cv_scores': cv_scores,
        }

        print(f"{name}: CV R² = {mean_score:.4f} ± {std_score:.4f}")

    # Visualization: one bar per model, error bar = std across folds
    plt.figure(figsize=(10, 6))
    model_names = list(cv_results.keys())
    means = [cv_results[label]['mean_cv_score'] for label in model_names]
    stds = [cv_results[label]['std_cv_score'] for label in model_names]

    plt.bar(model_names, means, yerr=stds, alpha=0.7, capsize=5)
    plt.ylabel('Cross-Validation R² Score')
    plt.title('Model Comparison - Cross-Validation Results')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    return cv_results

In [None]:
def model_interpretation(best_model_name, best_model, feature_cols):
    """Print an interpretation of the best model.

    For models exposing ``coef_`` the intercept and a coefficient table
    (sorted by absolute value) are printed, followed by a one-line reading of
    the five largest coefficients. Otherwise, for models exposing
    ``feature_importances_``, the importance table and the top five features
    are printed. Models with neither attribute only get the header line.

    Args:
        best_model_name: Display name of the model.
        best_model: Result dict whose 'model' entry is the fitted estimator.
        feature_cols: Feature names, in the order the model was trained on.
    """
    print(f"\n🔍 Model Interpretation: {best_model_name}")

    model = best_model['model']
    is_linear = hasattr(model, 'coef_')

    if is_linear:
        # Linear model interpretation
        coefficients = model.coef_

        print(f"Intercept: {model.intercept_:.6f}")
        print("\nCoefficient Analysis:")

        coef_df = pd.DataFrame({
            'Feature': feature_cols,
            'Coefficient': coefficients,
            'Abs_Coefficient': np.abs(coefficients),
        })
        coef_df = coef_df.sort_values('Abs_Coefficient', ascending=False)

        print(coef_df.round(6))

        print("\nInterpretation:")
        strongest = coef_df.head(5)
        for feature, coefficient in zip(strongest['Feature'], strongest['Coefficient']):
            if coefficient > 0:
                direction = "increases"
            else:
                direction = "decreases"
            print(f"• {feature}: {direction} target return by {abs(coefficient):.6f} per unit")

    elif hasattr(model, 'feature_importances_'):
        # Tree-based model interpretation
        feature_imp = pd.DataFrame({
            'Feature': feature_cols,
            'Importance': model.feature_importances_,
        })
        feature_imp = feature_imp.sort_values('Importance', ascending=False)

        print("Feature Importance Analysis:")
        print(feature_imp.round(6))

        print("\nTop 5 Most Important Features:")
        leading = feature_imp.head(5)
        for feature, importance in zip(leading['Feature'], leading['Importance']):
            print(f"• {feature}: {importance:.4f}")

In [None]:
def risk_assessment(results):
    """Print model assumptions, known risks and a per-model overfitting check.

    Args:
        results: Mapping of model name to a dict containing at least
            'train_r2' and 'test_r2'. A model is flagged as overfitting when
            its train R² exceeds its test R² by more than 0.1.
    """
    assumptions = (
        "1. Linear relationship between features and target (for linear models)",
        "2. Independence of residuals",
        "3. Homoscedasticity (constant variance)",
        "4. Normality of residuals",
        "5. No perfect multicollinearity",
    )
    risks = (
        "1. Overfitting - monitor train vs test performance",
        "2. Regime changes - model may not adapt to market shifts",
        "3. Feature stability - engineered features may become less predictive",
        "4. Data quality - outliers and missing values impact performance",
    )

    print("\n⚠️ Risk Assessment and Assumptions")

    print("Model Assumptions:")
    for line in assumptions:
        print(line)

    print("\nModel Risks:")
    for line in risks:
        print(line)

    # Performance comparison
    print("\nModel Performance Summary:")
    for name, result in results.items():
        test_r2 = result['test_r2']
        overfitting = result['train_r2'] - test_r2
        if overfitting > 0.1:
            verdict = "  ⚠️ High overfitting detected"
        else:
            verdict = "  ✅ Acceptable generalization"

        print(f"{name}:")
        print(f"  Test R²: {test_r2:.4f}")
        print(f"  Overfitting: {overfitting:.4f}")
        print(verdict)

In [None]:
def main():
    """Run the full Stage 10a workflow from data loading to saved results.

    Steps: load the engineered dataset, build features and target, split
    chronologically, train the models, run diagnostics, cross-validate,
    interpret the best model, print the risk assessment, and save a summary
    table through ``utils.save_with_timestamp``. Returns early when no data
    could be loaded.
    """
    # Load data
    df = load_engineered_data()
    if df is None:
        return

    # Prepare features and target
    X, y, dates, symbols, feature_cols = prepare_features_target(df)

    # Time series split
    X_train, X_test, y_train, y_test = time_series_split(X, y, dates)

    # Build models
    results, scaler = build_regression_models(X_train, y_train, X_test, y_test)

    # Model diagnostics
    best_model_name, best_model = model_diagnostics(
        results, X_train, y_train, X_test, y_test, feature_cols
    )

    # Cross-validation
    cv_results = cross_validation_analysis(X, y, feature_cols)

    # Model interpretation
    model_interpretation(best_model_name, best_model, feature_cols)

    # Risk assessment
    risk_assessment(results)

    # Save results: one row per model, columns mapped from the result keys
    summary_columns = {
        'Train_R2': 'train_r2',
        'Test_R2': 'test_r2',
        'Test_RMSE': 'test_rmse',
        'Test_MAE': 'test_mae',
    }
    summary_data = {'Model': list(results.keys())}
    for column, metric in summary_columns.items():
        summary_data[column] = [entry[metric] for entry in results.values()]
    model_summary = pd.DataFrame(summary_data)

    output_path = utils.save_with_timestamp(
        df=model_summary,
        prefix="regression_model_results",
        source="project_stage10a",
        ext="csv"
    )

    print(f"\n💾 Model results saved to: {output_path}")

    print("\n✅ Stage 10a: Regression Modeling Complete")
    print("Key deliverables:")
    deliverables = (
        "- Multiple regression models trained and compared",
        "- Comprehensive residual diagnostics",
        "- Cross-validation analysis",
        "- Model interpretation and coefficient analysis",
        "- Risk assessment and assumption validation",
    )
    for item in deliverables:
        print(item)


if __name__ == "__main__":
    main()