## Health Survey Risk Factors
source: https://www.kaggle.com/datasets/nguyenngocphung/behavioral-risk-factor-surveillance-system2013

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [2]:
df=pd.read_csv('brfss2013.csv', encoding= 'unicode_escape', low_memory=False)

In [3]:
df.shape

(491775, 330)

In [4]:
for i in range(len(df.columns)):
    if df.columns[i]=='genhlth':
        print('the index of the general health column:', i)

the index of the general health column: 18


## EDA

In [5]:
## select columns with general health and other predictors
df=df.iloc[:,18:]

In [6]:
## Check missing values
# print('There are %i nan in the dataframe' % df.isna().sum().sum())
df.isnull().sum()/df.shape[0] 

genhlth      0.004036
physhlth     0.022281
menthlth     0.017543
poorhlth     0.494440
hlthpln1     0.003872
               ...   
X_rfseat3    0.079185
X_flshot6    0.697634
X_pneumo2    0.709784
X_aidtst3    0.111016
X_age80      0.000022
Length: 312, dtype: float64

- <0.1 : 339282 * 124 69% row remain
- <0.15 : 250425 * 135 49% row remain
- <0.2 : 209095 * 140 42% row remain

In [7]:
mask=(df.isnull().sum()/df.shape[0])<0.1
features=df.columns[mask]
df1 = df[features]
df1.dropna(inplace=True)
df1.reset_index(drop=True,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [8]:
df1

Unnamed: 0,genhlth,physhlth,menthlth,hlthpln1,persdoc2,medcost,checkup1,sleptim1,bphigh4,bloodcho,...,fc60_,strfreq_,pamiss1_,X_pastrng,X_lmtact1,X_lmtwrk1,X_lmtscl1,X_rfseat2,X_rfseat3,X_age80
0,Good,0.0,0.0,Yes,"Yes, only one",No,Within past year,6.0,No,Yes,...,506.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,50.0
1,Good,3.0,2.0,Yes,"Yes, only one",No,Within past year,9.0,No,Yes,...,474.0,0.0,0.0,Did not meet muscle strengthening recommendations,Told have arthritis and have limited usual act...,Told have arthritis and have limited work,Told have arthritis and social activities limi...,Always or almost always wear seat belt,Always wear seat belt,55.0
2,Very good,2.0,0.0,Yes,"Yes, only one",No,Within past 2 years,8.0,No,Yes,...,417.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,64.0
3,Good,10.0,2.0,Yes,"Yes, only one",No,5 or more years ago,6.0,Yes,Yes,...,406.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,66.0
4,Very good,0.0,0.0,Yes,"Yes, only one",No,Within past year,8.0,Yes,Yes,...,512.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,49.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339277,Good,0.0,30.0,Yes,"Yes, only one",No,Within past year,5.0,No,Yes,...,411.0,0.0,0.0,Did not meet muscle strengthening recommendations,Told have arthritis and no limited usual activ...,Told have arthritis and no limited work,Told have arthritis and social activities not ...,Always or almost always wear seat belt,Always wear seat belt,65.0
339278,Good,1.0,3.0,Yes,"Yes, only one",No,5 or more years ago,6.0,Yes,Yes,...,585.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,47.0
339279,Fair,14.0,15.0,Yes,More than one,No,Within past year,6.0,Yes,Yes,...,455.0,0.0,0.0,Did not meet muscle strengthening recommendations,Told have arthritis and no limited usual activ...,Told have arthritis and no limited work,Told have arthritis and social activities limi...,Always or almost always wear seat belt,Always wear seat belt,58.0
339280,Fair,15.0,20.0,Yes,More than one,Yes,Within past year,7.0,No,Yes,...,588.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,37.0


In [9]:
df1.to_csv('df_row_col.csv')

- 39 numerical columns
- 36 YES/NO

In [10]:
df1.columns[df1.stack().str.contains('Yes').any(level=1)]

  """Entry point for launching an IPython kernel.


Index(['hlthpln1', 'persdoc2', 'medcost', 'bphigh4', 'bloodcho', 'cvdinfr4',
       'cvdcrhd4', 'cvdstrk3', 'asthma3', 'chcscncr', 'chcocncr', 'chccopd1',
       'havarth3', 'addepev2', 'chckidny', 'diabete3', 'veteran3', 'internet',
       'qlactlm2', 'useequip', 'blind', 'decide', 'diffwalk', 'diffdres',
       'diffalon', 'smoke100', 'exerany2', 'flushot6', 'X_rfhype5',
       'X_ltasth1', 'X_casthm1', 'X_rfbmi5', 'X_rfsmok3', 'drnkany5',
       'X_rfbing5', 'X_rfdrhv4'],
      dtype='object')

In [11]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
df1.select_dtypes(include=numerics)

Unnamed: 0,physhlth,menthlth,sleptim1,children,height3,alcday5,fruitju1,fruit1,fvbeans,fvgreen,...,grenday_,orngday_,vegeda1_,X_frutsum,X_vegesum,maxvo2_,fc60_,strfreq_,pamiss1_,X_age80
0,0.0,0.0,6.0,2.0,510.0,0.0,305.0,301.0,310.0,203.0,...,43.0,29.0,43.0,20.0,148.0,2950.0,506.0,0.0,0.0,50.0
1,3.0,2.0,9.0,0.0,504.0,220.0,301.0,203.0,202.0,202.0,...,29.0,33.0,100.0,46.0,191.0,2765.0,474.0,0.0,0.0,55.0
2,2.0,0.0,8.0,0.0,504.0,208.0,202.0,306.0,202.0,310.0,...,33.0,17.0,57.0,49.0,136.0,2432.0,417.0,0.0,0.0,64.0
3,10.0,2.0,6.0,0.0,600.0,210.0,0.0,302.0,101.0,310.0,...,33.0,10.0,100.0,7.0,243.0,2370.0,406.0,0.0,0.0,66.0
4,0.0,0.0,8.0,0.0,503.0,0.0,205.0,206.0,0.0,203.0,...,43.0,0.0,100.0,157.0,143.0,2987.0,512.0,0.0,0.0,49.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339277,0.0,30.0,5.0,0.0,500.0,0.0,0.0,0.0,101.0,203.0,...,43.0,14.0,71.0,0.0,228.0,2395.0,411.0,0.0,0.0,65.0
339278,1.0,3.0,6.0,1.0,510.0,204.0,320.0,308.0,205.0,315.0,...,50.0,71.0,100.0,94.0,292.0,3415.0,585.0,0.0,0.0,47.0
339279,14.0,15.0,6.0,0.0,500.0,0.0,101.0,302.0,101.0,101.0,...,100.0,29.0,100.0,107.0,329.0,2654.0,455.0,0.0,0.0,58.0
339280,15.0,20.0,7.0,3.0,505.0,0.0,102.0,103.0,0.0,304.0,...,13.0,43.0,71.0,500.0,127.0,3431.0,588.0,0.0,0.0,37.0


In [12]:
#OneHotEncoder(df1)

In [13]:
df1['genhlth'].replace(['Excellent','Very good','Good','Fair','Poor'],
                        [1,2,3,4,5], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [14]:
df_mol1 = df1.select_dtypes(include=numerics)

In [15]:
#df[''].astype('category')

In [16]:
df_mol1

Unnamed: 0,genhlth,physhlth,menthlth,sleptim1,children,height3,alcday5,fruitju1,fruit1,fvbeans,...,grenday_,orngday_,vegeda1_,X_frutsum,X_vegesum,maxvo2_,fc60_,strfreq_,pamiss1_,X_age80
0,3,0.0,0.0,6.0,2.0,510.0,0.0,305.0,301.0,310.0,...,43.0,29.0,43.0,20.0,148.0,2950.0,506.0,0.0,0.0,50.0
1,3,3.0,2.0,9.0,0.0,504.0,220.0,301.0,203.0,202.0,...,29.0,33.0,100.0,46.0,191.0,2765.0,474.0,0.0,0.0,55.0
2,2,2.0,0.0,8.0,0.0,504.0,208.0,202.0,306.0,202.0,...,33.0,17.0,57.0,49.0,136.0,2432.0,417.0,0.0,0.0,64.0
3,3,10.0,2.0,6.0,0.0,600.0,210.0,0.0,302.0,101.0,...,33.0,10.0,100.0,7.0,243.0,2370.0,406.0,0.0,0.0,66.0
4,2,0.0,0.0,8.0,0.0,503.0,0.0,205.0,206.0,0.0,...,43.0,0.0,100.0,157.0,143.0,2987.0,512.0,0.0,0.0,49.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339277,3,0.0,30.0,5.0,0.0,500.0,0.0,0.0,0.0,101.0,...,43.0,14.0,71.0,0.0,228.0,2395.0,411.0,0.0,0.0,65.0
339278,3,1.0,3.0,6.0,1.0,510.0,204.0,320.0,308.0,205.0,...,50.0,71.0,100.0,94.0,292.0,3415.0,585.0,0.0,0.0,47.0
339279,4,14.0,15.0,6.0,0.0,500.0,0.0,101.0,302.0,101.0,...,100.0,29.0,100.0,107.0,329.0,2654.0,455.0,0.0,0.0,58.0
339280,4,15.0,20.0,7.0,3.0,505.0,0.0,102.0,103.0,0.0,...,13.0,43.0,71.0,500.0,127.0,3431.0,588.0,0.0,0.0,37.0


In [17]:
train_Y = df_mol1.iloc[:,0]
print(train_Y.shape)
train_X = df_mol1.iloc[:,1:]
print(train_X.shape)

(339282,)
(339282, 39)


In [18]:
x_train, x_test, y_train, y_test = train_test_split(train_X, train_Y, random_state= 123)

In [19]:
#x_train = x_train.iloc[:,0:]
print(x_train.shape)
#x_test = x_test.iloc[:,0:]
print(x_test.shape)
#y_train = y_train.iloc[0:]
print(y_train.shape)
#y_test = y_test.iloc[0:]
print(y_test.shape)

(254461, 39)
(84821, 39)
(254461,)
(84821,)


In [20]:
from sklearn.linear_model import LogisticRegression
model_Log = LogisticRegression().fit(x_train, y_train)
y_pred_Log = model_Log.predict(x_test)
print(classification_report(y_test,y_pred_Log))
print('The accuracy score is',accuracy_score(y_test,y_pred_Log))

              precision    recall  f1-score   support

           1       0.34      0.04      0.07     15733
           2       0.38      0.71      0.50     29675
           3       0.37      0.41      0.39     25370
           4       0.00      0.00      0.00     10128
           5       0.00      0.00      0.00      3915

    accuracy                           0.38     84821
   macro avg       0.22      0.23      0.19     84821
weighted avg       0.31      0.38      0.30     84821

The accuracy score is 0.37768948727319884


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)

scaled_x_train = scaler.transform(x_train)
scaled_x_test = scaler.transform(x_test)
model_Log = LogisticRegression().fit(scaled_x_train, y_train)
y_pred_Log = model_Log.predict(scaled_x_test)
print(classification_report(y_test,y_pred_Log))
print('The accuracy score is',accuracy_score(y_test,y_pred_Log))

              precision    recall  f1-score   support

           1       0.47      0.16      0.23     15733
           2       0.44      0.69      0.54     29675
           3       0.42      0.40      0.41     25370
           4       0.40      0.21      0.28     10128
           5       0.49      0.41      0.45      3915

    accuracy                           0.43     84821
   macro avg       0.44      0.37      0.38     84821
weighted avg       0.44      0.43      0.41     84821

The accuracy score is 0.43482156541422523


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [22]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)
scaled_x_train = scaler.transform(x_train)
scaler = StandardScaler()
scaler.fit(x_test)
scaled_x_test = scaler.transform(x_test)
model_Log = LogisticRegression().fit(scaled_x_train, y_train)
y_pred_Log = model_Log.predict(scaled_x_test)
print(classification_report(y_test,y_pred_Log))
print('The accuracy score is',accuracy_score(y_test,y_pred_Log))

              precision    recall  f1-score   support

           1       0.47      0.16      0.24     15733
           2       0.44      0.69      0.54     29675
           3       0.42      0.40      0.41     25370
           4       0.40      0.21      0.28     10128
           5       0.49      0.41      0.44      3915

    accuracy                           0.44     84821
   macro avg       0.44      0.37      0.38     84821
weighted avg       0.44      0.44      0.41     84821

The accuracy score is 0.4350691456125252


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [23]:
# Function to perform training with giniIndex.
def train_using_gini(X_train, X_test, y_train):
  
    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(criterion = "gini",
            random_state = 100,max_depth=3, min_samples_leaf=5)
  
    # Performing training
    clf_gini.fit(X_train, y_train)
    return clf_gini

# Function to perform training with entropy.
def tarin_using_entropy(X_train, X_test, y_train):
  
    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(
            criterion = "entropy", random_state = 100,
            max_depth = 3, min_samples_leaf = 5)
  
    # Performing training
    clf_entropy.fit(X_train, y_train)
    return clf_entropy

# Function to make predictions
def prediction(X_test, clf_object):
  
    # Predicton on test with giniIndex
    y_pred = clf_object.predict(X_test)
    print("Predicted values:")
    print(y_pred)
    return y_pred
      
# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
      
    print("Confusion Matrix: ",
        confusion_matrix(y_test, y_pred))
      
    print ("Accuracy : ",
    accuracy_score(y_test,y_pred)*100)
      
    print("Report : ",
    classification_report(y_test, y_pred))

In [24]:
clf_gini = train_using_gini(x_train, x_test, y_train)
   
y_pred_gini = prediction(x_test, clf_gini)
cal_accuracy(y_test, y_pred_gini)

Predicted values:
[2 2 2 ... 2 2 2]
Confusion Matrix:  [[    0 15052   476   185    20]
 [    0 26559  2325   703    88]
 [    0 19599  3779  1697   295]
 [    0  4338  2303  2764   723]
 [    0   380   391  2016  1128]]
Accuracy :  40.35557232289174
Report :                precision    recall  f1-score   support

           1       0.00      0.00      0.00     15733
           2       0.40      0.89      0.56     29675
           3       0.41      0.15      0.22     25370
           4       0.38      0.27      0.32     10128
           5       0.50      0.29      0.37      3915

    accuracy                           0.40     84821
   macro avg       0.34      0.32      0.29     84821
weighted avg       0.33      0.40      0.31     84821



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
clf_entropy = tarin_using_entropy(x_train, x_test, y_train)

y_pred_entropy = prediction(x_test, clf_entropy)
cal_accuracy(y_test, y_pred_entropy)

Predicted values:
[2 2 2 ... 2 2 2]
Confusion Matrix:  [[    0 15083   445   176    29]
 [    0 26736  2118   685   136]
 [    0 19777  3548  1580   465]
 [    0  4412  2103  2519  1094]
 [    0   391   370  1748  1406]]
Accuracy :  40.33081430306174
Report :                precision    recall  f1-score   support

           1       0.00      0.00      0.00     15733
           2       0.40      0.90      0.56     29675
           3       0.41      0.14      0.21     25370
           4       0.38      0.25      0.30     10128
           5       0.45      0.36      0.40      3915

    accuracy                           0.40     84821
   macro avg       0.33      0.33      0.29     84821
weighted avg       0.33      0.40      0.31     84821



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
models = {}
# Support Vector Machines
from sklearn.svm import LinearSVC
models['Support Vector Machines'] = LinearSVC()

# Decision Trees
from sklearn.tree import DecisionTreeClassifier
models['Decision Trees'] = DecisionTreeClassifier()

# Random Forest
from sklearn.ensemble import RandomForestClassifier
models['Random Forest'] = RandomForestClassifier()

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
models['Naive Bayes'] = GaussianNB()

# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
models['K-Nearest Neighbor'] = KNeighborsClassifier()

# XGboost
from sklearn.ensemble import GradientBoostingClassifier
models['XGboost'] = GradientBoostingClassifier()

In [29]:
#from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#accuracy, precision, recall, f1 = {}, {}, {}, {}

for key in models.keys():
    print(models[key])
    # Fit the classifier
    models[key].fit(scaled_x_train, y_train)
    
    # Make predictions
    y_pred_Log = models[key].predict(scaled_x_test)
    
    # Calculate metrics
    print(classification_report(y_test,y_pred_Log))
    #accuracy[key] = accuracy_score(predictions, y_val)
    #precision[key] = precision_score(predictions, y_val)
    #recall[key] = recall_score(predictions, y_val)
    #f1[key] = f1_score(predictions, y_val)

LinearSVC()




LinearSVC()
              precision    recall  f1-score   support

           1       0.43      0.06      0.11     15733
           2       0.43      0.75      0.54     29675
           3       0.42      0.37      0.39     25370
           4       0.37      0.25      0.30     10128
           5       0.48      0.18      0.26      3915

    accuracy                           0.42     84821
   macro avg       0.42      0.32      0.32     84821
weighted avg       0.42      0.42      0.38     84821

DecisionTreeClassifier()
DecisionTreeClassifier()
              precision    recall  f1-score   support

           1       0.29      0.31      0.30     15733
           2       0.41      0.40      0.40     29675
           3       0.35      0.35      0.35     25370
           4       0.26      0.26      0.26     10128
           5       0.34      0.35      0.34      3915

    accuracy                           0.35     84821
   macro avg       0.33      0.33      0.33     84821
weighted avg   