In [None]:
# Logistic Regression using our replace_map

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# -------- 0) Load data--------

df = pd.read_csv("CDC_2019Subset.csv")


# -------- 1) Group's replacement map --------
replace_map = {
    'BIRTHSEX': {1:'Male', 2:'Female', 7:np.nan, 9:np.nan},
    'MENTHLTH': {88:0, 77:np.nan, 99:np.nan},
    'POORHLTH': {88:0, 77:np.nan, 99:np.nan},
    'ADDEPEV3': {1:'Yes', 2:'No', 7:np.nan, 9:np.nan},
    'DECIDE':   {1:'Yes', 2:'No', 7:np.nan, 9:np.nan},
    'DIFFALON': {1:'Yes', 2:'No', 7:np.nan, 9:np.nan},
    'ACEDEPRS': {1:'Yes', 2:'No', 7:np.nan, 9:np.nan},
    'ACEDRINK': {1:'Yes', 2:'No', 7:np.nan, 9:np.nan},
    'ACEDRUGS': {1:'Yes', 2:'No', 7:np.nan, 9:np.nan},
    'ACEPRISN': {1:'Yes', 2:'No', 7:np.nan, 9:np.nan},
    'ACEDIVRC': {1:'Yes', 2:'No', 8:'Parents not married', 7:np.nan, 9:np.nan},
    'ACEPUNCH': {1:'Never', 2:'Once', 3:'More than once', 7:np.nan, 9:np.nan},
    'ACEHURT1': {1:'Never', 2:'Once', 3:'More than once', 7:np.nan, 9:np.nan},
    'ACESWEAR': {1:'Never', 2:'Once', 3:'More than once', 7:np.nan, 9:np.nan},
    'ACETOUCH': {1:'Never', 2:'Once', 3:'More than once', 7:np.nan, 9:np.nan},
    'ACETTHEM': {1:'Never', 2:'Once', 3:'More than once', 7:np.nan, 9:np.nan},
    'ACEHVSEX': {1:'Never', 2:'Once', 3:'More than once', 7:np.nan, 9:np.nan}
}

# Apply mapping where columns exist
for col, mapping in replace_map.items():
    if col in df.columns:
        df[col] = df[col].replace(mapping)

#### assigning baseline fix, manually picking the references: GROUP LETS KEEP THIS CONSISTENT
baseline_fix = {
    # binary baseline drop='first' uses the 0_* level
    'ADDEPEV3': {'Yes':'1_Yes', 'No':'0_No'},       # <-- NOTE: this is for predictors only; y is set later
    'DECIDE':   {'Yes':'1_Yes', 'No':'0_No'},
    'DIFFALON': {'Yes':'1_Yes', 'No':'0_No'},
    'BIRTHSEX': {'Male':'1_Male', 'Female':'0_Female'},

    # 3-level ACE frequency vars â€” make 'Never' the reference
    'ACEPUNCH': {'Never':'0_Never', 'Once':'1_Once', 'More than once':'2_More_than_once'},
    'ACEHURT1': {'Never':'0_Never', 'Once':'1_Once', 'More than once':'2_More_than_once'},
    'ACESWEAR': {'Never':'0_Never', 'Once':'1_Once', 'More than once':'2_More_than_once'},
    'ACETOUCH': {'Never':'0_Never', 'Once':'1_Once', 'More than once':'2_More_than_once'},
    'ACETTHEM': {'Never':'0_Never', 'Once':'1_Once', 'More than once':'2_More_than_once'},
    'ACEHVSEX': {'Never':'0_Never', 'Once':'1_Once', 'More than once':'2_More_than_once'},

    # binary ACEs
    'ACEDEPRS': {'Yes':'1_Yes', 'No':'0_No'},
    'ACEDRINK': {'Yes':'1_Yes', 'No':'0_No'},
    'ACEDRUGS': {'Yes':'1_Yes', 'No':'0_No'},
    'ACEPRISN': {'Yes':'1_Yes', 'No':'0_No'},
    'ACEDIVRC': {'Yes':'1_Yes', 'No':'0_No', 'Parents not married':'2_Parents_not_married'},
}

for col, m in baseline_fix.items():
    if col in df.columns:
        df[col] = df[col].replace(m)


# -------- 2) Choose variables --------

# our target variable is ADDEPEV3 (depression ever)
core_mh_vars = ['CADULT1', 'BIRTHSEX', 'MENTHLTH', 'POORHLTH',
                'ADDEPEV3', 'DECIDE', 'DIFFALON']
optional_mh_vars = [
    'ACEDEPRS','ACEDRINK','ACEDRUGS','ACEPRISN','ACEDIVRC',
    'ACEPUNCH','ACEHURT1','ACESWEAR','ACETOUCH','ACETTHEM','ACEHVSEX'
]
all_vars = core_mh_vars + optional_mh_vars
present = [c for c in all_vars if c in df.columns]
df = df[present].copy()

# -------- 3) Target and predictors --------

# Target variable (depression: 1 = Yes, 0 = No)
if 'ADDEPEV3' not in df.columns:
    raise ValueError("ADDEPEV3 not found in df after subsetting.")

df['depressed'] = df['ADDEPEV3'].map({
    '1_Yes': 1, '0_No': 0,   # after baseline_fix
    'Yes': 1, 'No': 0         
})

y = df['depressed']


# Predictors = everything except ADDEPEV3 (og coded version) and target
predictors = [c for c in df.columns if c not in ['ADDEPEV3','depressed','_STATE']]
X = df[predictors].copy()

# -------- 4) Identify column types dynamically --------
# Numeric-like (already numeric after mapping): MENTHLTH, POORHLTH, CADULT1
numeric_cols = [c for c in ['MENTHLTH','POORHLTH','CADULT1'] if c in X.columns]

# Everything else treat as categorical (strings or codes), incl. _STATE, BIRTHSEX, Yes/No, Never/Once/More than once
categorical_cols = [c for c in X.columns if c not in numeric_cols]

# -------- 5) Keep rows with known target (don't over-drop predictors yet) --------
mask = ~y.isna()
X = X.loc[mask].copy()
y = y.loc[mask].copy()
print("Rows after requiring target present:", len(X))

# -------- 6) Preprocess & model --------
# For categoricals: impute most frequent, then OneHotEncode (drop first to avoid dummy trap)
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

# For numeric: impute median
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_pipe, categorical_cols),
        ('num', num_pipe, numeric_cols)
    ]
)

model = Pipeline(steps=[
    ('prep', preprocessor),
    ('logit', LogisticRegression(max_iter=2000, class_weight='balanced'))
])

# incase all rows are dropped 
if len(X) == 0:
    raise ValueError("No rows left after filtering for target. Check ADDEPEV3 mapping.")

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit
model.fit(X_train, y_train)

# -------- 7) Evaluate --------
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))

# -------- 8) Bonus: show feature names & odds ratios --------
# Recover feature names after OHE
ohe = model.named_steps['prep'].named_transformers_['cat'].named_steps['ohe']
cat_feat_names = ohe.get_feature_names_out(categorical_cols).tolist()
all_feat_names = cat_feat_names + numeric_cols

coefs = model.named_steps['logit'].coef_.ravel()
odds = np.exp(coefs)
coef_table = (
    pd.DataFrame({'feature': all_feat_names, 'coef': coefs, 'odds_ratio': odds})
      .sort_values('odds_ratio', ascending=False)
)
print("\nTop positive odds ratios:\n", coef_table.head(15))
print("\nMost protective (lowest) odds ratios:\n", coef_table.tail(15))


  df = pd.read_csv("CDC_2019Subset.csv")


Rows after requiring target present: 415985
Accuracy: 0.8016515018570382
ROC AUC: 0.7984842339439385

Confusion matrix:
 [[57202 10159]
 [ 6343  9493]]

Classification report:
               precision    recall  f1-score   support

         0.0       0.90      0.85      0.87     67361
         1.0       0.48      0.60      0.53     15836

    accuracy                           0.80     83197
   macro avg       0.69      0.72      0.70     83197
weighted avg       0.82      0.80      0.81     83197


Top positive odds ratios:
                       feature      coef  odds_ratio
1                DECIDE_1_Yes  1.239595    3.454213
3              ACEDEPRS_1_Yes  0.869418    2.385522
2              DIFFALON_1_Yes  0.487980    1.629023
16  ACETOUCH_2_More_than_once  0.417423    1.518044
15            ACETOUCH_1_Once  0.403355    1.496839
14  ACESWEAR_2_More_than_once  0.276468    1.318465
18  ACETTHEM_2_More_than_once  0.169525    1.184742
17            ACETTHEM_1_Once  0.126532    1.134886


In [29]:
from sklearn.metrics import log_loss

logloss = log_loss(y_test, y_prob)
print("Log Loss:", round(logloss, 4))


Log Loss: 0.5454


#checking

In [27]:
# Get feature names after preprocessing
ohe = model.named_steps['prep'].named_transformers_['cat'].named_steps['ohe']
cat_feature_names = ohe.get_feature_names_out(model.named_steps['prep'].transformers_[0][2]).tolist()

num_feature_names = model.named_steps['prep'].transformers_[1][2]  # numeric columns
all_features = cat_feature_names + num_feature_names

print("Total features used:", len(all_features))
print("Sample of features:\n", all_features[:20])


Total features used: 24
Sample of features:
 ['BIRTHSEX_1_Male', 'DECIDE_1_Yes', 'DIFFALON_1_Yes', 'ACEDEPRS_1_Yes', 'ACEDRINK_1_Yes', 'ACEDRUGS_1_Yes', 'ACEPRISN_1_Yes', 'ACEDIVRC_1_Yes', 'ACEDIVRC_2_Parents_not_married', 'ACEPUNCH_1_Once', 'ACEPUNCH_2_More_than_once', 'ACEHURT1_1_Once', 'ACEHURT1_2_More_than_once', 'ACESWEAR_1_Once', 'ACESWEAR_2_More_than_once', 'ACETOUCH_1_Once', 'ACETOUCH_2_More_than_once', 'ACETTHEM_1_Once', 'ACETTHEM_2_More_than_once', 'ACEHVSEX_1_Once']


In [28]:
import pandas as pd
import numpy as np

coefs = model.named_steps['logit'].coef_.ravel()
odds = np.exp(coefs)

coef_table = (
    pd.DataFrame({'feature': all_features, 'coef': coefs, 'odds_ratio': odds})
      .sort_values('odds_ratio', ascending=False)
)

display(coef_table.head(15))  # strongest positive predictors
display(coef_table.tail(15))  # strongest protective predictors


Unnamed: 0,feature,coef,odds_ratio
1,DECIDE_1_Yes,1.239595,3.454213
3,ACEDEPRS_1_Yes,0.869418,2.385522
2,DIFFALON_1_Yes,0.48798,1.629023
16,ACETOUCH_2_More_than_once,0.417423,1.518044
15,ACETOUCH_1_Once,0.403355,1.496839
14,ACESWEAR_2_More_than_once,0.276468,1.318465
18,ACETTHEM_2_More_than_once,0.169525,1.184742
17,ACETTHEM_1_Once,0.126532,1.134886
4,ACEDRINK_1_Yes,0.098216,1.103201
19,ACEHVSEX_1_Once,0.097924,1.102879


Unnamed: 0,feature,coef,odds_ratio
19,ACEHVSEX_1_Once,0.097924,1.102879
21,MENTHLTH,0.092097,1.096471
20,ACEHVSEX_2_More_than_once,0.051474,1.052822
22,POORHLTH,0.022287,1.022537
10,ACEPUNCH_2_More_than_once,0.012807,1.012889
5,ACEDRUGS_1_Yes,-0.020141,0.98006
13,ACESWEAR_1_Once,-0.030779,0.96969
9,ACEPUNCH_1_Once,-0.042194,0.958684
11,ACEHURT1_1_Once,-0.066208,0.935936
12,ACEHURT1_2_More_than_once,-0.091604,0.912466
