## Health Survey Risk Factors
source: https://www.kaggle.com/datasets/cdc/behavioral-risk-factor-surveillance-system

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression,LinearRegression,Lasso,Ridge,ElasticNet
from xgboost import XGBClassifier
from sklearn.naive_bayes import BernoulliNB, CategoricalNB, GaussianNB, MultinomialNB  # Naive Bayes classifiers


In [2]:
df=pd.read_csv('/Users/aliceqichaowu/Desktop/38615/FinalProject/brfss2013/brfss2013_new.csv',low_memory=False)
df.head()

Unnamed: 0,genhlth,physhlth,menthlth,hlthpln1,persdoc2,medcost,checkup1,sleptim1,bphigh4,bloodcho,...,fc60_,strfreq_,pamiss1_,X_pastrng,X_lmtact1,X_lmtwrk1,X_lmtscl1,X_rfseat2,X_rfseat3,X_age80
0,Good,0.0,0.0,Yes,"Yes, only one",No,Within past year,6.0,No,Yes,...,506.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,50.0
1,Good,3.0,2.0,Yes,"Yes, only one",No,Within past year,9.0,No,Yes,...,474.0,0.0,0.0,Did not meet muscle strengthening recommendations,Told have arthritis and have limited usual act...,Told have arthritis and have limited work,Told have arthritis and social activities limi...,Always or almost always wear seat belt,Always wear seat belt,55.0
2,Very good,2.0,0.0,Yes,"Yes, only one",No,Within past 2 years,8.0,No,Yes,...,417.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,64.0
3,Good,10.0,2.0,Yes,"Yes, only one",No,5 or more years ago,6.0,Yes,Yes,...,406.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,66.0
4,Very good,0.0,0.0,Yes,"Yes, only one",No,Within past year,8.0,Yes,Yes,...,512.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,49.0


In [40]:
for i in range(len(df.columns)):
    if df.columns[i]=='genhlth':
        print('the index of the general health column:', i)

the index of the general health column: 18


In [41]:
## select columns with general health and other predictors
df=df.iloc[:,18:]


In [42]:
## Check missing values
# print('There are %i nan in the dataframe' % df.isna().sum().sum())
df.isnull().sum()/df.shape[0]

genhlth      0.004036
physhlth     0.022281
menthlth     0.017543
poorhlth     0.494440
hlthpln1     0.003872
               ...   
X_rfseat3    0.079185
X_flshot6    0.697634
X_pneumo2    0.709784
X_aidtst3    0.111016
X_age80      0.000022
Length: 312, dtype: float64

In [56]:
## find out how many columns have missing values < 10% data pts for each column
## 0.1: 0.69; 0.15: 0.491; 0.2: 0.42;
mask=df.isnull().sum()/df.shape[0]<0.1
features=df.columns[mask]
df1=df[features]
print('Percentage of columns remained: ', len(features)/len(df.columns))

Percentage of columns remained:  0.3974358974358974


In [57]:
rows_remain=len(df1[df1.isnull().any(axis=1)])/len(df1)
print('Percentage of rows remained after removing missing values: ',rows_remain )

Percentage of rows remained after removing missing values:  0.3100869299984749


In [58]:
df1.dropna(inplace=True)
df1.reset_index(drop=True,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.dropna(inplace=True)


In [61]:
df1.tail()
df1.shape

(339282, 124)

In [65]:
import os  
os.makedirs('/Users/aliceqichaowu/Desktop/38615/brfss2013/', exist_ok=True)  
df1.to_csv('/Users/aliceqichaowu/Desktop/38615/brfss2013/brfss2013_new.csv',index=False)  


## Transform categorical data

In [14]:
df_cat=df.select_dtypes(include='object')
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
df_num=df.select_dtypes(include=numerics)


(339282, 84)

In [30]:
for i in range(df_cat.shape[1]):
    col_name=df_cat.columns[i]
    df_cat[col_name]=df_cat[col_name].astype('category').cat.codes
  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat[col_name]=df_cat[col_name].astype('category').cat.codes


In [36]:
df_cat

Unnamed: 0,genhlth,hlthpln1,persdoc2,medcost,checkup1,bphigh4,bloodcho,cvdinfr4,cvdcrhd4,cvdstrk3,...,X_veg23,X_fruitex,X_vegetex,X_totinda,X_pastrng,X_lmtact1,X_lmtwrk1,X_lmtscl1,X_rfseat2,X_rfseat3
0,2,1,2,0,4,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,2,1,2,0,4,0,1,0,0,0,...,0,1,0,1,0,1,1,1,0,0
2,4,1,2,0,2,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,2,1,2,0,0,2,1,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4,4,1,2,0,4,2,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339277,2,1,2,0,4,0,1,0,0,0,...,0,1,0,1,0,2,2,3,0,0
339278,2,1,2,0,0,2,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
339279,1,1,0,0,4,2,1,1,1,0,...,0,1,0,1,0,2,2,2,0,0
339280,1,1,0,1,4,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [72]:

df1['new_genhlth']=df1['genhlth'].astype('category').cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['new_genhlth']=df1['genhlth'].astype('category').cat.codes


In [86]:
## fair=1, good=2, excellent=0, 
df1['genhlth'].replace(['Excellent','Very good','Good','Fair','Poor'],
                        [1,2,3,4,5], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [88]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
df1_num=df1.select_dtypes(include=numerics)

In [128]:
y = df1_num.iloc[:,0]
y = y.astype('category')
X = df1_num.iloc[:,1:]
df1_num

Unnamed: 0,genhlth,physhlth,menthlth,sleptim1,children,height3,alcday5,fruitju1,fruit1,fvbeans,...,grenday_,orngday_,vegeda1_,X_frutsum,X_vegesum,maxvo2_,fc60_,strfreq_,pamiss1_,X_age80
0,3,0.0,0.0,6.0,2.0,510.0,0.0,305.0,301.0,310.0,...,43.0,29.0,43.0,20.0,148.0,2950.0,506.0,0.0,0.0,50.0
1,3,3.0,2.0,9.0,0.0,504.0,220.0,301.0,203.0,202.0,...,29.0,33.0,100.0,46.0,191.0,2765.0,474.0,0.0,0.0,55.0
2,2,2.0,0.0,8.0,0.0,504.0,208.0,202.0,306.0,202.0,...,33.0,17.0,57.0,49.0,136.0,2432.0,417.0,0.0,0.0,64.0
3,3,10.0,2.0,6.0,0.0,600.0,210.0,0.0,302.0,101.0,...,33.0,10.0,100.0,7.0,243.0,2370.0,406.0,0.0,0.0,66.0
4,2,0.0,0.0,8.0,0.0,503.0,0.0,205.0,206.0,0.0,...,43.0,0.0,100.0,157.0,143.0,2987.0,512.0,0.0,0.0,49.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339277,3,0.0,30.0,5.0,0.0,500.0,0.0,0.0,0.0,101.0,...,43.0,14.0,71.0,0.0,228.0,2395.0,411.0,0.0,0.0,65.0
339278,3,1.0,3.0,6.0,1.0,510.0,204.0,320.0,308.0,205.0,...,50.0,71.0,100.0,94.0,292.0,3415.0,585.0,0.0,0.0,47.0
339279,4,14.0,15.0,6.0,0.0,500.0,0.0,101.0,302.0,101.0,...,100.0,29.0,100.0,107.0,329.0,2654.0,455.0,0.0,0.0,58.0
339280,4,15.0,20.0,7.0,3.0,505.0,0.0,102.0,103.0,0.0,...,13.0,43.0,71.0,500.0,127.0,3431.0,588.0,0.0,0.0,37.0


In [129]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [130]:
# Normalization
scaler = StandardScaler()  # normalization: zero mean, unit variance
scaler.fit(X_train)  # scaling factor determined from the training set

X_train = scaler.transform(X_train)
scaler.fit(X_test) 
X_test = scaler.transform(X_test)

In [None]:
## logistic
mdl_lr = LogisticRegression('l2')
mdl_lr.fit(X_train, y_train)
y_pred_LR = np.squeeze(mdl_lr.predict(X_test))

In [136]:
## xgboost
mdl_xgb = XGBClassifier(booster='gblinear')#booster='gblinear'
mdl_xgb.fit(X_train, y_train)
y_pred_xgb = np.squeeze(mdl_xgb.predict(X_test))




In [139]:
## Logistic regression + Elastic net
mdl_lr2 = LogisticRegression(penalty='elasticnet',solver='saga',l1_ratio=0.5)
mdl_lr2.fit(X_train, y_train)
y_pred_lr2 = np.squeeze(mdl_lr2.predict(X_test))




In [144]:
# Create Naive Bayes classifier
mnb = GaussianNB()

# Train the model
mnb.fit(X_train, y_train);

# Predict output
y_pred = mnb.predict(X_test)

# Predicted probabilities per outcome class
P_pred = mnb.predict_proba(X_test)

# # Create dataframe with the predicted probabilities
# df_mnb = pd.DataFrame(X_test.to_numpy(), columns=['sorethroat', 'fever', 'headache'])
# df_mnb['flu'] = y_test.to_numpy()
# df_mnb['pred_label'] = y_pred
# df_mnb['prob_notflu'] = P_pred[:,0]
# df_mnb['prob_flu'] = P_pred[:,1]
# df_mnb

In [145]:
# Evaluation metrics
target_names = ['Excellent','Very good','Good','Fair','Poor']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Excellent       0.34      0.24      0.29     12743
   Very good       0.43      0.67      0.52     23570
        Good       0.42      0.21      0.28     20107
        Fair       0.31      0.25      0.28      8243
        Poor       0.39      0.61      0.47      3194

    accuracy                           0.40     67857
   macro avg       0.38      0.40      0.37     67857
weighted avg       0.39      0.40      0.37     67857

