In [22]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.formula.api as smf

In [23]:
lr_data = pd.read_csv('CDC-2019-2023-DATA_nums.csv', low_memory=False)

In [24]:
lr_data.columns

Index(['Unnamed: 0', 'BIRTHSEX', 'MENTHLTH', 'POORHLTH', 'ADDEPEV3', 'DECIDE',
       'DIFFALON', 'IYEAR', 'ACEDEPRS', 'ACEDRINK', 'ACEDRUGS', 'ACEPRISN',
       'ACEDIVRC', 'ACEPUNCH', 'ACEHURT1', 'ACESWEAR', 'ACETOUCH', 'ACETTHEM',
       'ACEHVSEX', 'EMPLOY1', 'AVEDRNK3', 'EXEROFT1', 'STRENGTH', 'PHYSHLTH'],
      dtype='object')

In [25]:
lr_data = lr_data.drop(['Unnamed: 0'], axis=1)

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.preprocessing import SplineTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression


In [27]:
ace_vars = ["ACEDEPRS", "ACESWEAR", "ACETTHEM"]

num_cols_base = ['AVEDRNK3', 'EXEROFT1', 'STRENGTH', 'PHYSHLTH', 'POORHLTH']
other_cat_cols = ['IYEAR', 'EMPLOY1']

In [28]:
def adjusted_r2_score(y_true, y_pred, n_features):
    r2 = r2_score(y_true, y_pred)
    n = len(y_true)
    return 1 - (1 - r2) * (n - 1) / (n - n_features - 1)

In [29]:
ace_models = {}           # fitted models
ace_metrics = {}          # RMSE, R2 for train/test
ace_categories_map = {}   # ACE category labels
ace_predictions = {}      # predicted curves

for ace in ace_vars:
    print(f"Fitting model for ACE variable: {ace}")

    # Columns for this model
    num_cols = num_cols_base.copy()
    cat_cols = other_cat_cols + [ace]

    all_cols = ['MENTHLTH'] + num_cols + cat_cols

    df_ace = lr_data.dropna(subset=all_cols).copy()

    # Split X/y
    X = df_ace[num_cols + cat_cols]
    y = df_ace['MENTHLTH']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Preprocess: one-hot encode categorical, pass through numeric
    preprocess_ace = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(drop='first'), cat_cols),
            ('num', 'passthrough', num_cols)
        ]
    )

    # Pipeline for preprocessing + spline regression
    preprocess = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(drop='first'), cat_cols),
            ('num', 'passthrough', num_cols)
        ]
    )

    model = Pipeline([
        ("preprocess", preprocess),
        ("spline", SplineTransformer(degree=3, n_knots=8, include_bias=False)),
        ("linreg", LinearRegression())
    ])

    # Fit model
    model.fit(X_train, y_train)

    # Predict
    yhat_train = model.predict(X_train)
    yhat_test = model.predict(X_test)

    n_features = model.named_steps['preprocess'].transform(X_train).shape[1]

    metrics = {
    "Train RMSE": mean_squared_error(y_train, yhat_train) ** 0.5,
    "Test RMSE": mean_squared_error(y_test, yhat_test) ** 0.5,
    "Train R2": r2_score(y_train, yhat_train),
    "Test R2": r2_score(y_test, yhat_test),
    "Train Adjusted R2": adjusted_r2_score(y_train, yhat_train, n_features),
    "Test Adjusted R2": adjusted_r2_score(y_test, yhat_test, n_features)
}

    ace_metrics[ace] = metrics
    print(metrics)

    # ---------------------------------------------------
    # Build prediction curves for dashboard visualization
    # ---------------------------------------------------

    categories = sorted(df_ace[ace].dropna().unique())
    ace_categories_map[ace] = categories

    # Construct "typical person" baseline row
    base = {}
    for col in num_cols:
        base[col] = df_ace[col].mean()
    for col in other_cat_cols:
        base[col] = df_ace[col].mode()[0]

    # Build a row per ACE value
    plot_df = pd.DataFrame([base.copy() for _ in categories])
    plot_df[ace] = categories

    # Ensure column order matches the model's training data
    plot_df = plot_df[num_cols + other_cat_cols + [ace]]

    # Predict spline curve
    preds = model.predict(plot_df)
    ace_predictions[ace] = preds

Fitting model for ACE variable: ACEDEPRS
{'Train RMSE': 7.820848580273953, 'Test RMSE': 8.113683404630423, 'Train R2': 0.20972787506968837, 'Test R2': 0.19558876742849718, 'Train Adjusted R2': 0.20923738839830663, 'Test Adjusted R2': 0.193588076251483}
Fitting model for ACE variable: ACESWEAR
{'Train RMSE': 7.972782292352605, 'Test RMSE': 7.733606671376591, 'Train R2': 0.19753414445349282, 'Test R2': 0.2146914224475659, 'Train Adjusted R2': 0.19699878800564452, 'Test Adjusted R2': 0.21259166689261277}
Fitting model for ACE variable: ACETTHEM
{'Train RMSE': 7.930070952283632, 'Test RMSE': 7.919401106150371, 'Train R2': 0.19442619970896724, 'Test R2': 0.198161426924493, 'Train Adjusted R2': 0.19388863534292244, 'Test Adjusted R2': 0.19601675671900853}


In [31]:
bar_color = "#a4a4e3" 

ace_labels_pretty = {
    "ACEDEPRS": "ACEDEPRS",
    "ACESWEAR": "ACESWEAR",
    "ACETTHEM": "ACETTHEM"
}

fig_ace = go.Figure()
buttons = []

for i, ace in enumerate(ace_vars):

    x_vals = ace_categories_map[ace]
    y_vals = ace_predictions[ace]

    # Metrics for this ACE variable
    adj_r2 = ace_metrics[ace]["Test Adjusted R2"]

    # Make first visible
    is_visible = (i == 0)

    fig_ace.add_trace(
        go.Bar(
            x=x_vals,
            y=y_vals,
            name=ace_labels_pretty.get(ace, ace),
            visible=is_visible,
            marker_color=bar_color
        )
    )

    # Visibility mask
    visible_mask = [False] * len(ace_vars)
    visible_mask[i] = True

    buttons.append(
        dict(
            label=ace_labels_pretty.get(ace, ace),
            method="update",
            args=[
                {"visible": visible_mask},
                {
                    "title": (
                        f"Cubic Spline Prediction of Bad Mental Health Days<br>"
                        f"<sup>{ace_labels_pretty.get(ace, ace)} — "
                        f"Test Adjusted R² = {adj_r2:.3f}</sup>"
                    ),
                    "xaxis": {"title": ace},
                    "yaxis": {"title": "Predicted Bad Mental Health Days (MENTHLTH)"}
                }
            ]
        )
    )

# Default title uses the first ACE variable
first_ace = ace_vars[0]
first_adj_r2 = ace_metrics[first_ace]["Test Adjusted R2"]

fig_ace.update_layout(
    title=(
        f"Cubic Spline Prediction of Bad Mental Health Days<br>"
        f"<sup>{ace_labels_pretty.get(first_ace, first_ace)} — "
        f"Test Adjusted R² = {first_adj_r2:.3f}</sup>"
    ),
    xaxis_title=first_ace,
    yaxis_title="Predicted Bad Mental Health Days (MENTHLTH)",
    updatemenus=[
        dict(
            buttons=buttons,
            direction="down",
            showactive=True,
            x=1.05,
            xanchor="left",
            y=1.0,
            yanchor="top"
        )
    ]
)

fig_ace.show()
