# In [3]:  (notebook cell prompt left over from export; kept as a comment so the file parses)
from functools import lru_cache

import pandas as pd
import numpy as np
import plotly.express as px

from dash import Dash, html, dcc, callback, Output, Input
import dash_ag_grid as dag
import dash_bootstrap_components as dbc

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    log_loss,
    confusion_matrix,
    roc_curve,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# data prep
# Mapping used to recode the text answers of the target column to numbers.
_YES_NO_CODES = {"Yes": 1, "No": 0}

# Load the survey extract and clean it in one chain:
#   1. exclude rows whose IYEAR is 2024,
#   2. drop every row that has a missing value in any column,
#   3. remove the "Unnamed: 0" column (presumably a saved index -- confirm).
df = (
    pd.read_csv("CDC-2019-2021-2023-DATA.csv", low_memory=False)
    .query("IYEAR != 2024")
    .dropna()
    .drop(columns="Unnamed: 0")
)

# Recode the target to 1.0 / 0.0 so it can be used as a binary label.
df["ADDEPEV3"] = df["ADDEPEV3"].replace(_YES_NO_CODES).astype(float)



# Target and predictors
# y is the binary label column; X holds the 17 predictor columns that the
# preprocessing step later splits into categorical and numeric groups.
y = df.loc[:, "ADDEPEV3"]
X = df.loc[
    :,
    [
        "BIRTHSEX",
        "MENTHLTH",
        "POORHLTH",
        "DECIDE",
        "DIFFALON",
        "IYEAR",
        "ACEDEPRS",
        "ACEDRINK",
        "ACEDRUGS",
        "ACEPRISN",
        "ACEDIVRC",
        "ACEPUNCH",
        "ACEHURT1",
        "ACESWEAR",
        "ACETOUCH",
        "ACETTHEM",
        "ACEHVSEX",
    ],
]

# Columns handed to the model as-is (numeric passthrough).
nums = ["POORHLTH", "MENTHLTH"]

# Columns that get one-hot encoded. The order is significant: it determines
# the order of the encoded features and therefore of the fitted coefficients.
_leading_cats = ["IYEAR", "BIRTHSEX", "ACEDEPRS", "DECIDE", "DIFFALON"]
_remaining_ace_cats = [
    "ACEDRINK", "ACEDRUGS", "ACEPRISN", "ACEDIVRC", "ACEPUNCH",
    "ACEHURT1", "ACESWEAR", "ACETOUCH", "ACETTHEM", "ACEHVSEX",
]
cats = _leading_cats + _remaining_ace_cats

# Preprocessing: one-hot encode the categorical columns (dropping the first
# level of each) and pass the numeric columns through untouched.
_one_hot = OneHotEncoder(drop="first")
preprocess = ColumnTransformer(
    [
        ("encoder", _one_hot, cats),
        ("numeric", "passthrough", nums),
    ]
)

# Full model: preprocessing followed by a logistic regression classifier.
_classifier = LogisticRegression(max_iter=1000)
pipe = Pipeline(
    [
        ("preprocess", preprocess),
        ("model", _classifier),
    ]
)


# app setup
# Callback exceptions are suppressed because the components the model
# callback targets (sliders, graphs) are only created when a tab is rendered,
# so they are absent from the initial layout.
app = Dash(__name__, suppress_callback_exceptions=True)

# (label, value) for each tab, in display order.
_TAB_SPECS = (
    ("README: Project Overview", "tab1"),
    ("Data Table", "tab2"),
    ("Models", "tab3"),
)

app.layout = html.Div(
    children=[
        html.H1("Behavioral Risk Mental Health Dashboard"),
        dcc.Tabs(
            id="tabs",
            value="tab1",
            children=[
                dcc.Tab(label=tab_label, value=tab_value)
                for tab_label, tab_value in _TAB_SPECS
            ],
        ),
        # Placeholder that the tab callback fills with the selected tab's content.
        html.Div(id="tabs-content"),
    ]
)


# tab content
def _render_overview_tab():
    """Build the README tab: project summary, dataset notes, target, features."""
    return html.Div(
        [
            html.H2(
                "Behavioral Risk Mental Health Dashboard: Predicting Mental Health with Behavioral Risk Factor Variables"
            ),
            html.P(
                """This app uses behavioral risk variables from 2019, 2021, and 2023 to predict mental health outcomes,
                    focusing primarily on variables relating to adverse childhood experiences, as well as a few other variables."""
            ),
            html.H3("About the Dataset"),
            html.P(
                """This dataset comes from the CDC's Behavioral Risk Factor Surveillance System,
                    a system of comprehensive telephone surveys conducted every year regarding health-related risk behaviors,
                    chronic health conditions, and use of preventative health services for adults in the United States."""
            ),
            html.H3("Target Variable"),
            html.P(
                [
                    html.B("ADDEPEV3: "),
                    "Ever told you had a depressive disorder? (1 = Yes, 0 = No).",
                ]
            ),
            html.H3("Key Features"),
            html.Ul(
                [
                    html.Li("Interactive view of the cleaned dataset."),
                    html.Li(
                        "Logistic regression to predict depression from behavioral and ACE variables."
                    ),
                    html.Li(
                        "Adjustable train/test split and classification threshold with live performance updates."
                    ),
                ]
            ),
        ]
    )


def _render_data_tab():
    """Build the data tab: an AG Grid showing every row and column of ``df``."""
    # NOTE(review): the whole dataframe is converted to records each time this
    # tab is opened; consider caching or paginating if the dataset is large.
    return html.Div(
        [
            dag.AgGrid(
                rowData=df.to_dict("records"),
                columnDefs=[{"field": c} for c in df.columns],
            )
        ]
    )


def _render_models_tab():
    """Build the model tab: two sliders plus the placeholders the model callback fills."""
    # Dash style dictionaries use camelCased CSS property names.
    section_style = {"marginBottom": "30px"}

    return html.Div(
        [
            html.H2("Logistic Regression Model"),

            html.Div(
                [
                    html.Label("Test set size (%)"),
                    dcc.Slider(
                        id="test-size-slider",
                        min=10,
                        max=50,
                        step=5,
                        value=30,
                        marks={i: f"{i}%" for i in range(10, 55, 5)},
                    ),
                ],
                style=section_style,
            ),

            html.Div(
                [
                    html.Label("Classification threshold"),
                    dcc.Slider(
                        id="threshold-slider",
                        min=0.1,
                        max=0.9,
                        step=0.05,
                        value=0.5,
                        marks={
                            0.1: "0.1",
                            0.3: "0.3",
                            0.5: "0.5",
                            0.7: "0.7",
                            0.9: "0.9",
                        },
                    ),
                ],
                style=section_style,
            ),

            html.H3("Model Performance"),
            html.Div(id="logit-metrics"),

            html.Br(),
            html.H3("Confusion Matrix (Test Set)"),
            dcc.Graph(id="logit-confusion"),

            html.Br(),
            html.H3("ROC Curve"),
            dcc.Graph(id="logit-roc"),

            html.Br(),
            html.H3("Top Logistic Regression Coefficients"),
            dcc.Graph(id="logit-coefs"),
        ]
    )


# Maps each tab value from the layout's dcc.Tabs to the function that builds it.
_TAB_RENDERERS = {
    "tab1": _render_overview_tab,
    "tab2": _render_data_tab,
    "tab3": _render_models_tab,
}


@callback(Output("tabs-content", "children"), Input("tabs", "value"))
def render_content(tab):
    """Return the component tree for the selected tab.

    Args:
        tab: The ``value`` of the selected tab ("tab1", "tab2" or "tab3").

    Returns:
        An ``html.Div`` with the tab's content, or ``None`` (empty content)
        when ``tab`` is not one of the known values.
    """
    renderer = _TAB_RENDERERS.get(tab)
    if renderer is None:
        return None
    return renderer()


# model + callback 


@lru_cache(maxsize=16, typed=True)
def _fit_logit(test_size):
    """Fit the pipeline for one split size and return everything that does not
    depend on the classification threshold.

    The split uses a fixed ``random_state``, so the result for a given
    ``test_size`` is the same on every call; caching it avoids refitting the
    model each time only the threshold changes. ``typed=True`` keeps an int
    ``test_size`` (absolute row count) separate from an equal-valued float
    (proportion), which ``train_test_split`` treats differently.

    Returns:
        Tuple ``(y_test, p_test, ll, fpr, tpr, auc, coef_df)``.
    """
    # Train/test split, stratified on the target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0, stratify=y
    )

    # NOTE: this fits the shared module-level pipeline in place, so after a
    # cache hit `pipe` still holds the fit from the most recent cache miss.
    pipe.fit(X_train, y_train)

    # predicted probability of the positive class on the test set
    p_test = pipe.predict_proba(X_test)[:, 1]

    # probability-based metrics (independent of the threshold)
    ll = log_loss(y_test, p_test)
    fpr, tpr, _ = roc_curve(y_test, p_test)
    auc = roc_auc_score(y_test, p_test)

    # coefficient importance
    logit = pipe.named_steps["model"]
    preprocess_step = pipe.named_steps["preprocess"]

    try:
        feature_names = preprocess_step.get_feature_names_out()
    except AttributeError:
        # Fallback when the preprocessing step cannot report feature names
        feature_names = [f"feature_{i}" for i in range(logit.coef_.shape[1])]

    coefs = logit.coef_.ravel()

    # keep the 15 coefficients with the largest absolute value
    coef_df = (
        pd.DataFrame(
            {
                "feature": feature_names,
                "coefficient": coefs,
                "abs_coeff": np.abs(coefs),
            }
        )
        .sort_values("abs_coeff", ascending=False)
        .head(15)
    )

    return y_test, p_test, ll, fpr, tpr, auc, coef_df


def do_logit(test_size, threshold):
    """
    Fit logistic regression with given test_size and threshold.
    Returns metrics, confusion matrix, ROC info, and coefficient table.

    The fit itself is cached per ``test_size``; only the thresholded
    predictions, accuracy and confusion matrix are recomputed on each call.

    Args:
        test_size: Test-set size passed to ``train_test_split``.
        threshold: Probabilities greater than or equal to this value are
            labelled 1, all others 0.

    Returns:
        Tuple ``(acc, ll, cm, fpr, tpr, auc, coef_df)``: accuracy, log loss,
        confusion matrix, ROC false/true positive rates, ROC AUC, and a
        DataFrame of the 15 largest-magnitude coefficients. ``fpr`` and
        ``tpr`` come from the cache and should be treated as read-only.
    """
    y_test, p_test, ll, fpr, tpr, auc, coef_df = _fit_logit(test_size)

    # threshold-dependent part: hard labels and the metrics built on them
    y_hat_test = (p_test >= threshold).astype(int)
    acc = accuracy_score(y_test, y_hat_test)
    cm = confusion_matrix(y_test, y_hat_test)

    # hand out a copy so callers cannot alter the cached coefficient table
    return acc, ll, cm, fpr, tpr, auc, coef_df.copy()


@callback(
    Output("logit-metrics", "children"),
    Output("logit-confusion", "figure"),
    Output("logit-roc", "figure"),
    Output("logit-coefs", "figure"),
    Input("test-size-slider", "value"),
    Input("threshold-slider", "value"),
)
def update_logit_tab(test_size_pct, threshold):
    """Recompute the model outputs whenever either slider changes.

    Args:
        test_size_pct: Test-set size as a percentage (slider value).
        threshold: Classification threshold (slider value).

    Returns:
        Tuple of (metrics list, confusion-matrix figure, ROC figure,
        coefficient bar chart), in the order of the callback outputs.
    """

    def with_margins(fig):
        # Same 40px margin on every side for all three figures.
        fig.update_layout(margin={"l": 40, "r": 40, "t": 40, "b": 40})
        return fig

    # The slider is in percent; the model helper takes a proportion.
    accuracy, loss, matrix, fpr, tpr, auc, top_coefs = do_logit(
        test_size_pct / 100.0, threshold
    )

    # ----- metrics text ----- #
    summary_lines = [
        f"Test size: {test_size_pct}%",
        f"Threshold: {threshold:.2f}",
        f"Accuracy: {accuracy:.3f}",
        f"Log loss: {loss:.3f}",
        f"AUC: {auc:.3f}",
    ]
    metrics = html.Ul([html.Li(line) for line in summary_lines])

    # ----- confusion matrix heatmap ----- #
    predicted_axis = ["Predicted: No depression", "Predicted: Yes depression"]
    actual_axis = ["Actual: No depression", "Actual: Yes depression"]
    cm_fig = with_margins(
        px.imshow(
            matrix,
            text_auto=True,
            x=predicted_axis,
            y=actual_axis,
            labels={"x": "Predicted label", "y": "Actual label", "color": "Count"},
        )
    )

    # ----- ROC curve ----- #
    roc_fig = px.area(
        x=fpr,
        y=tpr,
        labels={"x": "False positive rate", "y": "True positive rate"},
        title=f"ROC Curve (AUC = {auc:.3f})",
    )
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    roc_fig.add_shape(type="line", x0=0, y0=0, x1=1, y1=1, line={"dash": "dash"})
    with_margins(roc_fig)

    # ----- coefficient importance bar chart ----- #
    # Ascending sort so the largest |coefficient| is the last row plotted.
    bars_data = top_coefs.sort_values("abs_coeff", ascending=True)
    coef_fig = with_margins(
        px.bar(
            bars_data,
            x="coefficient",
            y="feature",
            orientation="h",
            title="Top Logistic Regression Coefficients (by |beta|)",
        )
    )

    return metrics, cm_fig, roc_fig, coef_fig


if __name__ == "__main__":
    # Start the Dash server in debug mode on port 8051 when run as a script.
    app.run(port=8051, debug=True)
