# Predictive Modeling - Figure Generation

This notebook generates all figures for the predictive modeling chapter with consistent matplotlib styling.

In [None]:
# Setup and styling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm

# Apply our custom matplotlib style
plt.style.use('notes-base.mplstyle')

# Load data for Tufte models
url = 'https://raw.githubusercontent.com/alexanderthclark/pols4728/refs/heads/main/data/tufte_midterms.csv'
df = pd.read_csv(url)

# Row selections used throughout the notebook, built once so that the fits and
# the df1/df2 subsets below are guaranteed to use the same rows.
is_original = df.in_original == True
is_post_1970 = df.year > 1970

# Tufte analysis
# All three regressions are fit on the `in_original` rows only.
# tufte_model and tufte_model_sub differ only in the second regressor
# (delta_rdi vs dpi_pc_pct_yoy).
original_rows = df[is_original]
tufte_model = smf.ols(formula='vote_loss ~ pres_approval + delta_rdi', data=original_rows).fit()
tufte_model_sub = smf.ols(formula='vote_loss ~ pres_approval + dpi_pc_pct_yoy', data=original_rows).fit()
tufte_approval_only = smf.ols(formula='vote_loss ~ pres_approval', data=original_rows).fit()

# Constant-only model on the post-1970 rows. The column of ones is sized from
# the selected rows rather than hard-coded, so the fit keeps working if the
# CSV gains or loses an election.
post_1970_vote_loss = df.loc[is_post_1970, 'vote_loss']
out_of_sample_average = sm.OLS(post_1970_vote_loss, np.ones(len(post_1970_vote_loss))).fit()

# Predictions and residuals are computed for every row of df (not just the
# rows the models were fit on), so both subsets below carry them.
multi_reg_predictions = tufte_model_sub.predict(df[['pres_approval', 'dpi_pc_pct_yoy']])
df['single_lin_reg_residuals'] = df.vote_loss - tufte_approval_only.predict(df.pres_approval)
df['multi_reg_residuals'] = df.vote_loss - multi_reg_predictions
df['multi_reg_predictions'] = multi_reg_predictions

# df1: rows the models were fit on; df2: rows after 1970.
# Both are taken after the columns above were added, so they include them.
df1 = df[is_original]
df2 = df[is_post_1970]

In [None]:
# Model 1: Presidential approval only
# Left panel: vote loss against approval for df1 ('training') and df2 ('test')
# with the fitted line. Right panel: the same model's residuals by year.
intercept = tufte_approval_only.params.iloc[0]
slope = tufte_approval_only.params.iloc[1]
eq = r"$\hat{y} = " + fr"{intercept:+.1f}{slope:+.1f}x$"

fig, axs = plt.subplots(1,2, figsize=(8,3))
fit_ax, resid_ax = axs[0], axs[1]

# Scatter of both samples, colours fixed so the legend is unambiguous
for sample, colour, tag in ((df1, 'C0', 'training'), (df2, 'C1', 'test')):
    fit_ax.scatter(sample.pres_approval, sample.vote_loss, color=colour, label=tag)

fit_ax.set_ylabel("Standardized Vote Loss")
fit_ax.set_xlabel("Presidential Approval Rating")
fit_ax.set_title(eq)
fit_ax.legend()

# Fitted line, drawn only across the approval range seen in df1
coefs = tufte_approval_only.params.values
approval_span = [df1.pres_approval.min(), df1.pres_approval.max()]
fitted_span = [coefs @ np.array([1, approval]) for approval in approval_span]
fit_ax.plot(approval_span, fitted_span, ls='dashed')

# residuals
resid_ax.set_title("Residual by year")
for sample in (df1, df2):
    resid_ax.scatter(sample.year, sample.single_lin_reg_residuals)
resid_ax.axhline(0, ls='dotted', color='gray')
resid_ax.set_ylabel("Residual")
resid_ax.set_xlabel("Year")

plt.suptitle('Model 1')
plt.tight_layout()

# Save for LaTeX
plt.savefig("../tex/images/model1_scatter.pdf", transparent=True)
plt.show()

In [None]:
# Model 2: Multi-regression model
# Left panel: observed vote loss against the model's prediction for df1
# ('training') and df2 ('test'). Right panel: residuals by year.
fig, axs = plt.subplots(1,2, figsize=(8,3))
pred_ax, resid_ax = axs[0], axs[1]

samples = (('training', df1), ('test', df2))

for tag, sample in samples:
    pred_ax.scatter(sample.multi_reg_predictions, sample.vote_loss, label=tag)
pred_ax.set_xlabel("Predicted")
pred_ax.set_ylabel("Observed")
pred_ax.set_title("Predicted vs Actual")

# Reference line y = x, spanning the range of the df1 predictions
d = (df1.multi_reg_predictions.min(), df1.multi_reg_predictions.max())
pred_ax.plot(d, d, color='gray', ls='dotted')
pred_ax.legend()

for _, sample in samples:
    resid_ax.scatter(sample.year, sample.multi_reg_residuals)
resid_ax.axhline(0, ls='dotted', color='gray')
resid_ax.set_ylabel("Residual")
resid_ax.set_xlabel("Year")
resid_ax.set_title("Residual by year")

plt.suptitle("Model 2")
plt.tight_layout()

# Save for LaTeX
plt.savefig("../tex/images/model2_scatter.pdf", transparent=True)
plt.show()

In [None]:
# Sine function examples - bias-variance with sine data
# Top panel: the sine curve alone. Each lower panel: one noisy sample of n
# points with a linear and a quadratic least-squares fit drawn over the curve.
np.random.seed(11)

x = np.linspace(0, 2*np.pi, 2_000)
y = np.sin(x)
n_axs = 3
fig, axs = plt.subplots(n_axs,1, figsize=(7,9), sharex=True, sharey=True)

# NOTE(review): the grid x runs to 2*pi while the title and ticks below refer
# to 0..pi (the range the samples are drawn from) - confirm which is intended.
curve_ax = axs[0]
curve_ax.set_aspect(1)
curve_ax.plot(x, y)
curve_ax.set_title(r"Sine from 0 to $\pi$")
curve_ax.set_xticks([0, np.pi/2, np.pi])
curve_ax.set_xticklabels(['0', r"$\pi/2$", r"$\pi$"])

# Remember the limits of the bare sine plot; they are re-applied to every
# panel after the fits have been drawn.
xlim, ylim = curve_ax.get_xlim(), curve_ax.get_ylim()

n_sims = 100
n=10
fit_specs = ((1, 'Linear fit', 'C1'), (2, 'Quadratic fit', 'C2'))
for i in range(n_sims):
    # A sample is drawn on every iteration, so the RNG stream advances for all
    # n_sims simulations even though only a few of them are plotted.
    x0 = np.random.uniform(0, np.pi, size=n)
    noise = np.random.normal(0, 0.1, size=n)
    y0 = np.sin(x0) + noise

    # Only simulations 1 .. n_axs-1 get a panel (panel 0 holds the bare curve)
    if i <= 0 or i >= n_axs:
        continue

    panel = axs[i]
    panel.set_aspect(1)
    panel.plot(x, y, lw=0.5, color='gray')
    panel.scatter(x0, y0, ec='black', fc='white')

    # Add both linear and quadratic regression
    for degree, tag, colour in fit_specs:
        coeffs = np.polyfit(x0, y0, degree)
        panel.plot(x, np.polyval(coeffs, x), label=tag, lw=2, color=colour)

    panel.legend()
    panel.set_title(f"Dataset {i} - Linear & Quadratic Regression")

for panel in axs:
    panel.set_ylim(ylim)
    panel.set_xlim(xlim)

plt.tight_layout()

# Save for LaTeX
plt.savefig("../tex/images/sine_examples.pdf", transparent=True)
plt.show()

In [None]:
# Bias-variance scatter plot - quadratic truth with noise
#
# Each row draws a fresh noisy training sample and test sample around
# y = x**2, fits a line and a quadratic to the training sample, and shows both
# fits against the training data (left) and the test data (right). The
# right-hand titles compare the two fits' R-squared on the test sample.
#
# This cell does not seed the RNG itself, so the samples depend on the state
# of NumPy's global generator at the time the cell runs.


def _r_squared(observed, predicted):
    """Return 1 - SS_res / SS_tot of `predicted` against `observed`."""
    ss_res = np.sum((observed - predicted)**2)
    ss_tot = np.sum((observed - np.mean(observed))**2)
    return 1 - (ss_res / ss_tot)


x = np.linspace(0, 2, 2_000)
y = x**2  # True function
n_axs = 3
n_cols = 2

# squeeze=False keeps axs two-dimensional even when n_axs == 1
fig, axs = plt.subplots(n_axs, n_cols, figsize=(9, 10), sharex='col',
                        sharey='row', squeeze=False)

n = 30
n_test = 20

# Standard deviation of the Gaussian noise added to each sample.
# NOTE(review): the test noise is smaller than the training noise - confirm
# this asymmetry is intended.
noise_sd_train = 10
noise_sd_test = 6

for i in range(0, n_axs):
    # Generate training data
    x_train = np.random.uniform(0, 2, size=n)
    noise_train = np.random.normal(0, noise_sd_train, size=n)
    y_train = x_train**2 + noise_train

    # Generate test data: same x range and true function as the training
    # sample, but with the (smaller) test noise scale
    x_test = np.random.uniform(0, 2, size=n_test)
    noise_test = np.random.normal(0, noise_sd_test, size=n_test)
    y_test = x_test**2 + noise_test

    # Fit models on training data
    p1 = np.polyfit(x_train, y_train, 1)
    p2 = np.polyfit(x_train, y_train, 2)

    # Fitted curves evaluated on the dense grid, shared by both columns
    y_lin = np.polyval(p1, x)
    y_quad = np.polyval(p2, x)

    # LEFT COLUMN: Training data
    ax = axs[i, 0]
    ax.plot(x, y, lw=0.5, color='gray', alpha=0.5)
    ax.scatter(x_train, y_train, ec='black', fc='white', s=50, zorder=5)
    ax.plot(x, y_lin, label='Linear fit', lw=2, color='C1')
    ax.plot(x, y_quad, label='Quadratic fit', lw=1, color='C2')
    ax.legend(loc='upper left')
    ax.set_title(f"Training Set {i+1}")

    # RIGHT COLUMN: Test data
    # (labels are attached here but no legend is drawn on this panel)
    ax = axs[i, 1]
    ax.plot(x, y, lw=0.5, color='gray', alpha=0.5, label='True: $y=x^2$')
    ax.scatter(x_test, y_test, ec='blue', fc='lightblue', s=30, alpha=0.6, zorder=5)

    # Apply fitted models to test data
    ax.plot(x, y_lin, label='Linear pred', lw=2, color='C1')
    ax.plot(x, y_quad, label='Quadratic pred', lw=1, color='C2')

    # Calculate test R-squared
    r2_lin_test = _r_squared(y_test, np.polyval(p1, x_test))
    r2_quad_test = _r_squared(y_test, np.polyval(p2, x_test))

    # Add comparison to title with R² values
    if r2_quad_test > r2_lin_test:
        title_text = f"Test Set {i+1} - Quad R² = {r2_quad_test:.2f} > Lin R² = {r2_lin_test:.2f}"
    else:
        title_text = f"Test Set {i+1} - Lin R² = {r2_lin_test:.2f} > Quad R² = {r2_quad_test:.2f}"

    ax.set_title(title_text)

# Set labels: y on the left column of every row, x on the bottom row only.
# Indexing by column/last row keeps this correct for any n_axs.
for ax in axs[:, 0]:
    ax.set_ylabel('y')
for ax in axs[-1, :]:
    ax.set_xlabel('x')

plt.suptitle('Quadratic Truth with Noise: Training vs Test Performance', fontsize=14, y=1.02)
plt.tight_layout()

# Save for LaTeX
# The suptitle is placed at y=1.02, i.e. above the figure box, so the saved
# bounding box must be expanded to the drawn artists or the title is cropped.
plt.savefig("../tex/images/scatter_bias_variance.pdf", transparent=True, bbox_inches='tight')
plt.show()