# Prediction Analysis & Confidence Intervals

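This notebook visualizes the performance of our trained random forest model on the unseen test data. For every test row it also derives an interval from the spread of the individual trees' predictions, and compares predicted rates against actual rates.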

In [None]:
# Load Artifacts & Setup
import os
import joblib
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Artifact locations, all relative to the notebook's parent directory
MODEL_PATH, X_TEST_PATH, Y_TEST_PATH = (
    os.path.join('..', 'models', artifact_name)
    for artifact_name in (
        'random_forest_model.joblib',
        'X_test.parquet',
        'y_test.parquet',
    )
)

# Load the persisted model together with the held-out features and targets.
# If any artifact is missing, report it and fall back to empty placeholders
# (model = None) so the later cells can skip their work instead of crashing.
try:
    model = joblib.load(MODEL_PATH)
    X_test = pd.read_parquet(X_TEST_PATH)
    y_test = pd.read_parquet(Y_TEST_PATH)
except FileNotFoundError as missing_artifact:
    print(f"Error: {missing_artifact}")
    print("Please run `src/modeling.py` first to generate the model artifacts.")
    model, X_test, y_test = None, pd.DataFrame(), pd.DataFrame()
else:
    print("Successfully loaded model and test data.")


Successfully loaded model and test data.


In [None]:
# Calculate Prediction Intervals

def get_prediction_intervals(model, X_test, percentile=90):
    """
    Calculates prediction intervals for a RandomForestRegressor model.

    Every estimator in ``model.estimators_`` predicts each row of ``X_test``.
    The per-row mean of those predictions is reported as the point estimate,
    and the per-row percentiles of the same predictions form the interval.

    Parameters
    ----------
    model : fitted ensemble
        Must expose ``estimators_``, an iterable of objects with ``predict``.
    X_test : pandas.DataFrame or array-like
        Feature rows to predict. A DataFrame is converted to a NumPy array
        once before being handed to the individual estimators.
    percentile : float, default 90
        Width of the central band in percent; must lie in [0, 100].
        ``90`` yields the 5th and 95th percentiles of the tree predictions.

    Returns
    -------
    pandas.DataFrame
        Columns ``predicted_rate``, ``lower_bound`` and ``upper_bound``, one
        row per row of ``X_test``, indexed 0..n-1 (the index of ``X_test``
        is not carried over).

    Raises
    ------
    ValueError
        If ``percentile`` is outside [0, 100].

    Notes
    -----
    The band only measures disagreement between the trees. Because the point
    estimate is a mean while the bounds are percentiles, a skewed set of tree
    predictions can place ``predicted_rate`` outside
    ``[lower_bound, upper_bound]``.
    """
    if not 0 <= percentile <= 100:
        raise ValueError(
            f"percentile must be between 0 and 100, got {percentile!r}"
        )

    print("Calculating prediction intervals...")

    # scikit-learn forests fit their trees on plain arrays, so passing a
    # DataFrame to each tree emits a feature-name warning per tree and repeats
    # the frame -> array conversion. Convert once up front instead.
    features = X_test.to_numpy() if isinstance(X_test, pd.DataFrame) else X_test

    # Shape (n_estimators, n_rows): one row of predictions per tree
    preds_array = np.stack([tree.predict(features) for tree in model.estimators_])

    # Split the excluded mass evenly between the two tails
    lower_percentile = (100 - percentile) / 2
    upper_percentile = 100 - lower_percentile
    lower_bound, upper_bound = np.percentile(
        preds_array, [lower_percentile, upper_percentile], axis=0
    )

    # Main prediction: mean over all tree predictions
    mean_prediction = preds_array.mean(axis=0)

    results_df = pd.DataFrame({
        'predicted_rate': mean_prediction,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound
    })

    print("Calculation complete.")
    return results_df

# `model` is set to None when loading failed; test the sentinel explicitly
# rather than relying on the estimator object's truthiness.
if model is not None:
    prediction_results = get_prediction_intervals(model, X_test)

    # The concat below pairs rows purely by position (both indexes are reset),
    # so a length mismatch would silently yield NaN-padded rows.
    assert len(y_test) == len(prediction_results), (
        "y_test and the predictions must have the same number of rows"
    )

    # Combine with the actual values for plotting
    final_df = pd.concat([
        y_test.reset_index(drop=True),
        prediction_results.reset_index(drop=True)
    ], axis=1).rename(columns={'rate_hourly': 'actual_rate'})

    display(final_df.head())


Calculating prediction intervals...
Calculation complete.




Unnamed: 0,actual_rate,predicted_rate,lower_bound,upper_bound
0,400.0,445.0,400.0,500.0
1,200.0,265.530952,200.0,300.0
2,400.0,399.0,400.0,400.0
3,350.0,407.5,400.0,450.0
4,400.0,413.5,350.0,500.0


In [None]:
# Visualize Results with Binned Bar Chart

# `model` is None when the artifacts could not be loaded; skip the plot then.
if model is not None:
    # Bin the Data
    # Create bins for the actual hourly rate to group the data.
    bin_size = 50
    actual_rates = final_df['actual_rate']
    min_val = int(np.floor(actual_rates.min() / bin_size)) * bin_size
    # Bins are left-closed ([lo, hi)), so the top edge must lie strictly above
    # the maximum rate. Rounding up with ceil() leaves a maximum that is an
    # exact multiple of bin_size sitting on the open edge, i.e. in no bin.
    max_val = (int(np.floor(actual_rates.max() / bin_size)) + 1) * bin_size

    bins = np.arange(min_val, max_val + bin_size, bin_size)
    # NOTE(review): plotly may typeset text wrapped in `$...$` as LaTeX when
    # MathJax is available - confirm these labels render as intended.
    labels = [f"${int(bins[i])} - ${int(bins[i+1])}" for i in range(len(bins)-1)]

    final_df['actual_rate_bin'] = pd.cut(actual_rates, bins=bins, labels=labels, right=False)

    # Prepare Data for Comparison
    # For each bin, we need the actual midpoint and the median of our predictions.

    # Median prediction per bin. observed=False keeps empty bins (as NaN) so the
    # result has exactly one row per label, in label order - the midpoint
    # assignment below depends on that positional alignment.
    binned_predictions = (
        final_df.groupby('actual_rate_bin', observed=False)['predicted_rate']
        .median()
        .reset_index()
    )
    assert len(binned_predictions) == len(labels)

    # Midpoint of each actual-rate bin, used as the "Actual" value
    binned_predictions['actual_midpoint'] = bins[:-1] + bin_size / 2

    # Melt the df into a long format suitable for a grouped bar chart
    plot_df = pd.melt(
        binned_predictions,
        id_vars=['actual_rate_bin'],
        value_vars=['actual_midpoint', 'predicted_rate'],
        var_name='type',
        value_name='rate'
    )
    plot_df['type'] = plot_df['type'].map({'actual_midpoint': 'Actual Rate', 'predicted_rate': 'Predicted Rate'})

    # Create the Bar Chart
    # Side-by-side bars compare the bin midpoint with the median prediction.
    fig = px.bar(
        plot_df,
        x='actual_rate_bin',
        y='rate',
        color='type',
        barmode='group', # Creates the side-by-side bars
        labels={
            "actual_rate_bin": "Actual Hourly Rate Bins (USD)",
            "rate": "Hourly Rate (USD)",
            "type": "Rate Type"
        },
        title="Actual vs. Median Predicted Rate for Each Pay Tier"
    )

    fig.update_layout(
        height=600,
        legend_title_text=None,
        # Keep the bins in ascending numeric order on the x-axis
        xaxis={'categoryorder':'array', 'categoryarray': labels}
    )

    fig.show()






# 4. Visualize Individual Predictions with Jitter

The bar chart above provides a good summary of performance, but it's also useful to see the individual predictions. Many test rows share the same actual rate, so a plain scatter plot suffers from heavy overplotting and becomes unreadable. To show every prediction legibly, we can use a scatter plot with **jitter** and **transparency**.

-   **Jitter:** A small amount of random noise is added to the vertical position of each point. This spreads the overlapping points out into "prediction clouds."
-   **Transparency:** The points are made semi-transparent, so the areas with the most dense predictions appear darker.

This gives us an intuitive and professional view of the model's behavior on individual data points.


In [None]:
# Create the Jittered Scatter Plot
# `model` is None when the artifacts could not be loaded; skip the plot then.
if model is not None:
    # Create a copy to avoid modifying the original dataframe
    jitter_df = final_df.copy()

    # Add a small amount of random "jitter" to the y-axis values.
    # The amount of jitter is scaled to the standard deviation of the predictions.
    jitter_strength = 0.2
    prediction_std = jitter_df['predicted_rate'].std()
    if not np.isfinite(prediction_std):
        # std() is NaN for fewer than two rows; without this guard every
        # jittered value would become NaN and nothing would be drawn.
        prediction_std = 0.0

    # Seeded generator so Restart & Run All reproduces the same figure
    jitter_rng = np.random.default_rng(42)
    jitter_noise = jitter_rng.standard_normal(len(jitter_df)) * prediction_std * jitter_strength
    jitter_df['predicted_rate_jittered'] = jitter_df['predicted_rate'] + jitter_noise

    fig = go.Figure()

    # Add the jittered and transparent scatter points
    fig.add_trace(go.Scatter(
        x=jitter_df['actual_rate'],
        y=jitter_df['predicted_rate_jittered'],
        mode='markers',
        name='Individual Predictions',
        marker=dict(
            color='blue',
            opacity=0.3, # Transparency to show density
            size=6
        )
    ))

    # Add a perfect prediction line (y=x) for reference.
    # Its extent uses the un-jittered predictions, so it is independent of the noise.
    max_val = max(jitter_df['actual_rate'].max(), jitter_df['predicted_rate'].max())
    min_val = min(jitter_df['actual_rate'].min(), jitter_df['predicted_rate'].min())
    fig.add_trace(go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        mode='lines',
        name='Perfect Prediction',
        line=dict(color='red', dash='dash')
    ))

    fig.update_layout(
        title='Actual vs. Predicted Rates (with Jitter)',
        xaxis_title='Actual Hourly Rate (USD)',
        yaxis_title='Predicted Hourly Rate (USD)',
        legend_title="Legend",
        height=700
    )

    fig.show()
