# **Logistic Regression**

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
from sklearn.model_selection import train_test_split

In [11]:
# Location of the combined CDC 2019/2021/2023 CSV.
# NOTE(review): this is an absolute path on one machine; point DATA_PATH at
# your own copy (ideally a path relative to the project) before running.
DATA_PATH = '/Users/randap/Desktop/MSDS/Fall25/DS6021/ML-Healthdata-project/joining_data/CDC-2019-2021-2023-DATA.csv'

# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the mixed-type warning that chunked
# inference raises on columns mixing text and missing values.
df = pd.read_csv(DATA_PATH, low_memory=False)

  df = pd.read_csv('/Users/randap/Desktop/MSDS/Fall25/DS6021/ML-Healthdata-project/joining_data/CDC-2019-2021-2023-DATA.csv')


In [12]:
# Recode the target column: 'Yes' becomes 1.0 and 'No' becomes 0.0.
# Values that are not keys of the mapping are left untouched by replace().
target_codes = {'Yes': 1, 'No': 0}
df['ADDEPEV3'] = df['ADDEPEV3'].replace(target_codes).astype(float)

In [13]:
# Tally how many rows carry each ADDEPEV3 value (value_counts leaves out missing values by default).
df['ADDEPEV3'].value_counts()

ADDEPEV3
0.0    1030169
1.0     252728
Name: count, dtype: int64

In [14]:
# Preview the first rows of the frame to check the columns and their raw values.
df.head()

Unnamed: 0.1,Unnamed: 0,BIRTHSEX,MENTHLTH,POORHLTH,ADDEPEV3,DECIDE,DIFFALON,ACEDEPRS,ACEDRINK,ACEDRUGS,ACEPRISN,ACEDIVRC,ACEPUNCH,ACEHURT1,ACESWEAR,ACETOUCH,ACETTHEM,ACEHVSEX,IYEAR
0,0,,0.0,0.0,0.0,No,No,No,No,No,No,Yes,Never,More than once,Never,Never,Never,Never,2019
1,1,,0.0,10.0,0.0,No,No,No,Yes,No,No,No,Never,Once,More than once,Never,Never,Never,2019
2,2,,30.0,0.0,0.0,No,No,No,No,No,No,No,,Never,Never,Never,Never,Never,2019
3,3,,0.0,0.0,0.0,No,,,,,,,,,,,,,2019
4,4,,0.0,,0.0,No,No,No,No,No,No,No,Never,Never,Never,Never,Never,Never,2019


In [16]:
# Keep only rows whose IYEAR is not 2024 (the 2023 dataset also contains some 2024 rows).
not_2024 = df['IYEAR'].ne(2024)
df = df.loc[not_2024]

In [17]:
# Confirm that no 2024 rows remain: list the row count for each remaining IYEAR value.
df['IYEAR'].value_counts()

IYEAR
2021    438693
2019    418268
2023    408012
Name: count, dtype: int64

In [18]:
# Complete-case filter: discard every row with a missing value in ANY column,
# since the model cannot be fitted on NaNs.
# NOTE(review): this check covers all columns of df, not only the ones used as
# predictors, and the value counts printed later in this notebook suggest only
# about 15,790 rows survive it -- confirm that this much data loss is intended.
df = df.dropna(axis=0, how='any')

In [20]:
# Preview the frame again, now that rows with missing values have been removed.
df.head()

Unnamed: 0.1,Unnamed: 0,BIRTHSEX,MENTHLTH,POORHLTH,ADDEPEV3,DECIDE,DIFFALON,ACEDEPRS,ACEDRINK,ACEDRUGS,ACEPRISN,ACEDIVRC,ACEPUNCH,ACEHURT1,ACESWEAR,ACETOUCH,ACETTHEM,ACEHVSEX,IYEAR
309121,309121,Male,0.0,3.0,0.0,No,No,No,Yes,Yes,Yes,No,More than once,More than once,More than once,Never,Never,Never,2019
309123,309123,Male,0.0,1.0,0.0,No,No,No,No,No,No,No,Never,Never,Never,Never,Never,Never,2019
309124,309124,Male,1.0,0.0,0.0,No,No,No,No,No,No,No,Never,More than once,More than once,Never,Never,Never,2019
309126,309126,Female,0.0,0.0,0.0,No,No,No,No,No,No,No,Never,Never,Never,More than once,Never,Never,2019
309133,309133,Female,1.0,0.0,0.0,No,No,No,No,No,No,No,Never,Once,Never,Never,Never,Never,2019


In [21]:
# Split the frame into the response (y) and the predictor matrix (X).

# Predictor columns, listed in the order they will appear in X.
predictor_cols = [
    'BIRTHSEX', 'MENTHLTH', 'POORHLTH',
    'DECIDE', 'DIFFALON', 'IYEAR',
    'ACEDEPRS', 'ACEDRINK', 'ACEDRUGS', 'ACEPRISN',
    'ACEDIVRC', 'ACEPUNCH', 'ACEHURT1', 'ACESWEAR',
    'ACETOUCH', 'ACETTHEM', 'ACEHVSEX',
]

y = df['ADDEPEV3']
X = df[predictor_cols]

In [38]:
# Hold out part of the data for evaluation.

test_sz = 0.2  # fraction of rows placed in the test partition

# Stratifying on y keeps the class proportions the same in both partitions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_sz,
    random_state=123,
    stratify=y,
)

In [23]:
# Group the predictors by how they are preprocessed:
# `nums` are passed to the model as-is, `cats` are one-hot encoded.

nums = [
    'POORHLTH',
    'MENTHLTH',
]

cats = [
    'IYEAR',
    'BIRTHSEX',
    'ACEDEPRS',
    'DECIDE',
    'DIFFALON',
    'ACEDRINK',
    'ACEDRUGS',
    'ACEPRISN',
    'ACEDIVRC',
    'ACEPUNCH',
    'ACEHURT1',
    'ACESWEAR',
    'ACETOUCH',
    'ACETTHEM',
    'ACEHVSEX',
]

In [26]:
# Show the frequency of every level for each categorical predictor.

for cat_col in cats:
    level_counts = X[cat_col].value_counts()
    print(level_counts)

IYEAR
2023    9322
2021    3652
2019    2816
Name: count, dtype: int64
BIRTHSEX
Female    9138
Male      6652
Name: count, dtype: int64
ACEDEPRS
No     11815
Yes     3975
Name: count, dtype: int64
DECIDE
No     13135
Yes     2655
Name: count, dtype: int64
DIFFALON
No     13947
Yes     1843
Name: count, dtype: int64
ACEDRINK
No     11396
Yes     4394
Name: count, dtype: int64
ACEDRUGS
No     13693
Yes     2097
Name: count, dtype: int64
ACEPRISN
No     14307
Yes     1483
Name: count, dtype: int64
ACEDIVRC
No                     11059
Yes                     4459
Parents not married      272
Name: count, dtype: int64
ACEPUNCH
Never             12761
More than once     2241
Once                788
Name: count, dtype: int64
ACEHURT1
Never             11211
More than once     3342
Once               1237
Name: count, dtype: int64
ACESWEAR
Never             9277
More than once    5508
Once              1005
Name: count, dtype: int64
ACETOUCH
Never             13528
More than once     1444
Onc

## **Performing The Logistic Regression**

In [27]:
# Preprocessing: one-hot encode the categorical columns and pass the numeric
# columns through unchanged.

# drop='first' omits the first level of each categorical column, so that level
# acts as the reference category.
one_hot = OneHotEncoder(drop='first')

preprocess = ColumnTransformer(
    transformers=[
        ('encoder', one_hot, cats),
        ('numeric', 'passthrough', nums),
    ]
)

In [30]:
# Chain the preprocessing step and the classifier, then fit on the training data.

# max_iter is raised to 2000 to give the solver more iterations to converge.
classifier = LogisticRegression(max_iter=2000)

pipe = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', classifier),
    ]
)
pipe.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('encoder', ...), ('numeric', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,2000


In [36]:
# Score the test set and turn probabilities into 0/1 labels with a cutoff.

thresh = 0.5  # probabilities at or above this value are labelled 1

# predict_proba returns one column per class; column 1 is the probability of
# the second class (label 1.0 for this target).
class_probs = pipe.predict_proba(X_test)
p = class_probs[:, 1]

meets_cutoff = p >= thresh
y_hat = meets_cutoff.astype(int)

In [37]:
# Side-by-side table of the true label, the predicted probability (rounded to
# three decimals) and the predicted label for every test row.

results_columns = {
    "Actual Depressed": y_test,
    "Predicted Prob Depressed": p.round(3),
    "Predicted Label": y_hat,
}
results = pd.DataFrame(results_columns)
results

Unnamed: 0,Actual Depressed,Predicted Prob Depressed,Predicted Label
518701,0.0,0.329,0
1097094,0.0,0.700,1
314052,0.0,0.332,0
1074362,0.0,0.143,0
516748,0.0,0.230,0
...,...,...,...
1174712,0.0,0.112,0
1071044,0.0,0.433,0
1071689,1.0,0.297,0
1071129,0.0,0.419,0


## **Model Evaluation**

In [35]:
# Evaluate on the test set: accuracy uses the thresholded labels, log loss uses
# the predicted probabilities.
acc = accuracy_score(y_test, y_hat)
ll = log_loss(y_test, p)

print(f'Accuracy: {acc!s}')
print(f'Log Loss: {ll!s}')

Accuracy: 0.7701076630778974
Log Loss: 0.4921285283848655
