# Homework 10a: Linear Regression Modeling

**Assignment**: Build a linear regression model with residual diagnostics and interpretation.

## Objectives
- Fit linear regression using engineered features
- Perform comprehensive residual analysis
- Interpret coefficients and model performance
- Test model assumptions

In [None]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import utils
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

print("üìà Homework 10a: Linear Regression Modeling")

## 1. Load Engineered Features Dataset

In [None]:
# Load data with engineered features.
# Produces `df`: one row per (symbol, date) with the engineered predictors and
# the next-day return target. `df` is always bound (empty on failure) so the
# `if not df.empty` guards in later cells work even when the fetch fails.
symbols = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA']
raw_data = utils.fetch_multiple_stocks(symbols, prefer_alphavantage=False, period='2y')

if not raw_data.empty:
    # Recreate engineered features (from homework 09)
    processed_data = []

    for symbol in symbols:
        # Date-ordered copy of one symbol so shift/rolling operate along time
        # and never mix rows from different symbols.
        symbol_data = raw_data[raw_data['symbol'] == symbol].copy().sort_values('date')

        # Basic features
        symbol_data['daily_return'] = symbol_data['close'].pct_change()
        symbol_data['log_return'] = np.log(symbol_data['close'] / symbol_data['close'].shift(1))

        # Moving averages
        symbol_data['sma_20'] = symbol_data['close'].rolling(20).mean()
        symbol_data['volume_ma_20'] = symbol_data['volume'].rolling(20).mean()

        # Engineered features
        rolling_vol = symbol_data['daily_return'].rolling(20).std()
        symbol_data['vol_adj_return'] = symbol_data['daily_return'] / rolling_vol

        volume_ratio = symbol_data['volume'] / symbol_data['volume_ma_20']
        price_momentum = symbol_data['close'].pct_change(10)
        # sign(x) * |x| is x itself, so the 10-day momentum is used directly.
        symbol_data['volume_momentum'] = volume_ratio * price_momentum

        # Target variable: the following row's (next trading day's) return
        symbol_data['target_return'] = symbol_data['daily_return'].shift(-1)

        processed_data.append(symbol_data)

    df = pd.concat(processed_data, ignore_index=True)
    # A zero rolling volatility or zero average volume turns the ratios above
    # into +/-inf, which dropna() alone would keep; treat them as missing.
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    print(f"‚úÖ Dataset loaded: {df.shape}")
    print(f"Symbols: {df['symbol'].unique()}")
    print(f"Date range: {df['date'].min()} to {df['date'].max()}")
else:
    # Bind an empty frame so downstream cells skip cleanly instead of raising NameError.
    df = pd.DataFrame()
    print("‚ùå Failed to load data")

## 2. Prepare Features and Target

In [None]:
if not df.empty:
    # Names of the response column and of the predictor columns
    target_col = 'target_return'
    feature_cols = ['daily_return', 'vol_adj_return', 'volume_momentum']

    # Independent copies of the predictors (X) and the response (y)
    X = df.loc[:, feature_cols].copy()
    y = df.loc[:, target_col].copy()

    # Keep a row only when every feature and the target are present
    mask = X.notna().all(axis=1) & y.notna()
    X, y = X[mask], y[mask]

    print(f"üìä Feature Matrix: {X.shape}")
    print(f"üéØ Target Vector: {y.shape}")

    # Summary statistics of the predictors
    print("\nüìà Feature Statistics:")
    feature_summary = X.describe()
    print(feature_summary.round(4))

    # Location, spread and extremes of the response
    print("\nüéØ Target Statistics:")
    print(f"Mean: {y.mean():.6f}")
    print(f"Std: {y.std():.6f}")
    print(f"Range: [{y.min():.6f}, {y.max():.6f}]")

## 3. Train-Test Split and Scaling

In [None]:
if not df.empty:
    # Split data chronologically (oldest ~80% of rows train, newest ~20% test).
    # The target is the next-day return, so a shuffled split would put dates
    # from the test period into the training set and leak future information
    # into the fit. Every row dated before the cutoff trains; the rest tests.
    row_dates = df.loc[X.index, 'date']
    cutoff_date = row_dates.sort_values().iloc[int(len(row_dates) * 0.8)]
    in_train = row_dates < cutoff_date

    X_train, X_test = X[in_train], X[~in_train]
    y_train, y_test = y[in_train], y[~in_train]

    print(f"Chronological split at {cutoff_date}")
    print(f"üîÑ Train set: {X_train.shape}")
    print(f"üîÑ Test set: {X_test.shape}")

    # Scale features: statistics are learned from the training rows only and
    # then applied unchanged to the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert back to DataFrame for easier handling (keeps column names and row index)
    X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_cols, index=X_train.index)
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_cols, index=X_test.index)

    print("\n‚öñÔ∏è Features scaled using StandardScaler")
    print("Scaled feature means (should be ~0):")
    print(X_train_scaled.mean().round(6))
    print("Scaled feature stds (should be ~1):")
    print(X_train_scaled.std().round(6))

## 4. Fit Linear Regression Model

In [None]:
if not df.empty:
    # Least-squares fit on the standardized training features
    lr_model = LinearRegression()
    lr_model.fit(X_train_scaled, y_train)

    # Predictions for the rows the model was fit on and for the held-out rows
    y_train_pred, y_test_pred = (
        lr_model.predict(X_train_scaled),
        lr_model.predict(X_test_scaled),
    )

    print("üìà Linear Regression Model Fitted")
    print(f"Intercept: {lr_model.intercept_:.6f}")
    print("\nCoefficients:")
    for name, weight in zip(feature_cols, lr_model.coef_):
        print(f"  {name}: {weight:.6f}")

    # R², RMSE and MAE for each partition
    train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_mae, test_mae = (
        mean_absolute_error(y_train, y_train_pred),
        mean_absolute_error(y_test, y_test_pred),
    )

    print("\nüìä Model Performance:")
    report_rows = [
        ("Train R¬≤", train_r2),
        ("Test R¬≤", test_r2),
        ("Train RMSE", train_rmse),
        ("Test RMSE", test_rmse),
        ("Train MAE", train_mae),
        ("Test MAE", test_mae),
    ]
    for label, score in report_rows:
        print(f"{label}: {score:.6f}")

## 5. Residual Analysis

In [None]:
if not df.empty:
    # Residual = observed minus predicted, for each partition
    train_residuals = y_train - y_train_pred
    test_residuals = y_test - y_test_pred

    # 2x2 grid of diagnostic panels, unpacked row by row
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    ax_fitted, ax_qq, ax_hist, ax_scale = axes.ravel()

    # Panel 1: residuals against fitted values, with a zero reference line
    ax_fitted.scatter(y_train_pred, train_residuals, alpha=0.6, s=20)
    ax_fitted.axhline(y=0, color='red', linestyle='--')
    ax_fitted.set_xlabel('Fitted Values')
    ax_fitted.set_ylabel('Residuals')
    ax_fitted.set_title('Residuals vs Fitted Values')

    # Panel 2: normal Q-Q plot (title set after probplot draws its own)
    stats.probplot(train_residuals, dist="norm", plot=ax_qq)
    ax_qq.set_title('Q-Q Plot of Residuals')

    # Panel 3: histogram of the residuals
    ax_hist.hist(train_residuals, bins=50, alpha=0.7, edgecolor='black')
    ax_hist.set_xlabel('Residuals')
    ax_hist.set_ylabel('Frequency')
    ax_hist.set_title('Distribution of Residuals')

    # Panel 4: square root of absolute residuals against fitted values
    sqrt_abs_residuals = np.sqrt(np.abs(train_residuals))
    ax_scale.scatter(y_train_pred, sqrt_abs_residuals, alpha=0.6, s=20)
    ax_scale.set_xlabel('Fitted Values')
    ax_scale.set_ylabel('‚àö|Residuals|')
    ax_scale.set_title('Scale-Location Plot')

    plt.tight_layout()
    plt.show()

    # Moments of the training residuals
    print("\nüìä Residual Analysis:")
    print(f"Mean residual: {train_residuals.mean():.8f} (should be ~0)")
    print(f"Std residual: {train_residuals.std():.6f}")
    print(f"Skewness: {stats.skew(train_residuals):.4f}")
    print(f"Kurtosis: {stats.kurtosis(train_residuals):.4f}")

    # Shapiro-Wilk normality test on at most the first 5000 residuals
    shapiro_sample = train_residuals[:5000]
    shapiro_stat, shapiro_p = stats.shapiro(shapiro_sample)
    print("\nShapiro-Wilk normality test:")
    print(f"Statistic: {shapiro_stat:.6f}, p-value: {shapiro_p:.6f}")
    
    # Durbin-Watson test for autocorrelation
    def durbin_watson(residuals):
        """Return the Durbin-Watson statistic of a residual sequence.

        The statistic is the sum of squared successive differences divided by
        the sum of squared residuals, so the order of ``residuals`` matters.

        Args:
            residuals: 1-D array-like of residuals (Series, ndarray, list, ...).

        Returns:
            The statistic as a float, or NaN when the sum of squared
            residuals is zero (which includes empty input).
        """
        # Coerce once so plain lists work and pandas index alignment cannot interfere.
        values = np.asarray(residuals, dtype=float)
        denominator = np.sum(values ** 2)
        if denominator == 0:
            # Undefined ratio; report NaN instead of dividing by zero.
            return float('nan')
        return float(np.sum(np.diff(values) ** 2) / denominator)
    
    # Durbin-Watson compares each residual with the one before it, so the
    # residuals must be in time order. The row index follows the per-symbol,
    # date-sorted concatenation built when the dataset was loaded, so sorting
    # by index restores chronological order within each symbol (a shuffled
    # split would otherwise leave the residuals in random order).
    # NOTE(review): one difference per symbol boundary still spans two symbols.
    dw_stat = durbin_watson(train_residuals.sort_index())
    print(f"\nDurbin-Watson statistic: {dw_stat:.4f}")
    print("(Values around 2 indicate no autocorrelation)")

## 6. Model Interpretation

In [None]:
if not df.empty:
    # Feature importance visualization: one row per feature, ordered by
    # absolute coefficient so the largest effect is drawn at the top of the chart.
    coefficients = pd.DataFrame({
        'Feature': feature_cols,
        'Coefficient': lr_model.coef_,
        'Abs_Coefficient': np.abs(lr_model.coef_)
    }).sort_values('Abs_Coefficient', ascending=True)

    plt.figure(figsize=(10, 6))
    # Red bars for negative coefficients, blue otherwise
    colors = ['red' if x < 0 else 'blue' for x in coefficients['Coefficient']]
    plt.barh(coefficients['Feature'], coefficients['Coefficient'], color=colors, alpha=0.7)
    plt.xlabel('Coefficient Value')
    plt.title('Linear Regression Coefficients')
    plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    plt.tight_layout()
    plt.show()

    print("\nüîç Model Interpretation:")
    print("\nCoefficient Analysis:")
    # The model was fit on StandardScaler-transformed features, so each
    # coefficient is the change in the target per one standard deviation of
    # the raw feature, not per raw unit.
    for name, coef in zip(coefficients['Feature'], coefficients['Coefficient']):
        direction = "increases" if coef > 0 else "decreases"
        print(f"‚Ä¢ {name}: {direction} target by {abs(coef):.6f} per one-standard-deviation increase")
    
    # Predicted-versus-actual scatter, one panel per partition
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    panels = [
        (axes[0], y_train, y_train_pred, f'Training Set (R¬≤ = {train_r2:.4f})'),
        (axes[1], y_test, y_test_pred, f'Test Set (R¬≤ = {test_r2:.4f})'),
    ]
    for panel, actual, predicted, heading in panels:
        panel.scatter(actual, predicted, alpha=0.6, s=20)
        # Dashed identity line spanning the observed range of the actual values
        lowest, highest = actual.min(), actual.max()
        panel.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
        panel.set_xlabel('Actual Returns')
        panel.set_ylabel('Predicted Returns')
        panel.set_title(heading)

    plt.tight_layout()
    plt.show()
    
    # Recap of the four regression assumptions and where each was examined
    print("\n‚úÖ Model Assumptions Check:")
    print("1. Linearity: Check residuals vs fitted plot")
    print(f"2. Independence: Durbin-Watson = {dw_stat:.4f}")
    print("3. Homoscedasticity: Check scale-location plot")
    print(f"4. Normality: Shapiro-Wilk p = {shapiro_p:.6f}")

    # Headline numbers for the held-out partition
    print("\nüìà Model Performance Summary:")
    print(f"‚Ä¢ Explains {test_r2*100:.2f}% of variance in test set")
    print(f"‚Ä¢ Average prediction error: {test_mae:.6f}")
    print(f"‚Ä¢ Root mean squared error: {test_rmse:.6f}")

    # Flag a train/test R² gap larger than 0.05 as potential overfitting
    overfitting = train_r2 - test_r2
    verdict = (
        "‚ö†Ô∏è Potential overfitting detected"
        if overfitting > 0.05
        else "‚úÖ No significant overfitting"
    )
    print(f"{verdict} (train R¬≤ - test R¬≤ = {overfitting:.4f})")

## 7. Model Validation and Robustness

In [None]:
if not df.empty:
    # Per-symbol validation of the fitted model.
    # Only held-out (test) rows are scored: scoring every row of a symbol
    # would mostly re-evaluate data the model was trained on and would not
    # show how the model generalizes.
    print("\nüîÑ Cross-Validation by Symbol:")

    symbol_performance = []
    # Symbol label of each held-out row, looked up once instead of
    # re-filtering df inside every loop iteration.
    held_out_symbols = df.loc[X_test.index, 'symbol']

    for symbol in df['symbol'].unique():
        # Row labels of this symbol's held-out observations
        symbol_indices = held_out_symbols.index[held_out_symbols == symbol]

        if len(symbol_indices) > 50:  # Minimum data points
            X_symbol = X.loc[symbol_indices]
            y_symbol = y.loc[symbol_indices]

            # Scale with the already-fitted scaler and keep the column names,
            # matching the DataFrame the model was fit on.
            X_symbol_scaled = pd.DataFrame(
                scaler.transform(X_symbol), columns=feature_cols, index=X_symbol.index
            )

            # Predict
            y_symbol_pred = lr_model.predict(X_symbol_scaled)

            # Calculate metrics
            symbol_r2 = r2_score(y_symbol, y_symbol_pred)
            symbol_rmse = np.sqrt(mean_squared_error(y_symbol, y_symbol_pred))

            symbol_performance.append({
                'Symbol': symbol,
                'R¬≤': symbol_r2,
                'RMSE': symbol_rmse,
                'N_samples': len(symbol_indices)
            })

    # Explicit columns keep the frame well-formed (and the summaries below
    # NaN rather than raising KeyError) when no symbol passes the size threshold.
    symbol_df = pd.DataFrame(symbol_performance, columns=['Symbol', 'R¬≤', 'RMSE', 'N_samples'])
    print(symbol_df.round(4))

    # Spread of the per-symbol R² values
    print("\nüéØ Model Consistency:")
    print(f"R¬≤ range: {symbol_df['R¬≤'].min():.4f} to {symbol_df['R¬≤'].max():.4f}")
    print(f"R¬≤ std: {symbol_df['R¬≤'].std():.4f}")
    print(f"Average R¬≤: {symbol_df['R¬≤'].mean():.4f}")
    
    # Collect the fitted parameters and headline metrics in one mapping
    model_results = dict(
        coefficients=dict(zip(feature_cols, lr_model.coef_)),
        intercept=lr_model.intercept_,
        train_r2=train_r2,
        test_r2=test_r2,
        train_rmse=train_rmse,
        test_rmse=test_rmse,
        features_used=feature_cols,
    )

    print("\nüíæ Model Results Summary:")
    for key, value in model_results.items():
        # Numbers are shown with six decimals; everything else uses its default text form
        rendered = f"{value:.6f}" if isinstance(value, (int, float)) else f"{value}"
        print(f"{key}: {rendered}")

## 8. Summary and Conclusions

### Model Performance
- **R² Score**: Measures the proportion of variance in the target explained by the model
- **RMSE**: Root mean squared error in same units as target
- **MAE**: Mean absolute error, less sensitive to outliers than RMSE

### Key Findings
1. **Feature Importance**: Coefficients show relative impact of each feature
2. **Model Assumptions**: Residual analysis reveals assumption violations
3. **Generalization**: Cross-validation shows model consistency across symbols

### Limitations
- Linear models assume linear relationships
- Financial returns often exhibit non-linear patterns
- Model may not capture regime changes or volatility clustering

### Next Steps
1. **Feature Engineering**: Add interaction terms or polynomial features
2. **Regularization**: Try Ridge/Lasso regression to prevent overfitting
3. **Non-linear Models**: Consider tree-based or neural network models
4. **Time Series**: Account for temporal dependencies in data