## Health Survey Risk Factors
source: https://www.kaggle.com/datasets/cdc/behavioral-risk-factor-surveillance-system

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [2]:
df=pd.read_csv('brfss2013.csv', encoding= 'unicode_escape', low_memory=False) #'/Users/lingtianhan/Desktop/brfss2013.csv'

In [3]:
df.shape

(491775, 330)

In [4]:
for i in range(len(df.columns)):
    if df.columns[i]=='genhlth':
        print('the index of the general health column:', i)

the index of the general health column: 18


## EDA

In [5]:
## select columns with general health and other predictors
df=df.iloc[:,18:]

In [6]:
## Check missing values
# print('There are %i nan in the dataframe' % df.isna().sum().sum())
df.isnull().sum()/df.shape[0] 

genhlth      0.004036
physhlth     0.022281
menthlth     0.017543
poorhlth     0.494440
hlthpln1     0.003872
               ...   
X_rfseat3    0.079185
X_flshot6    0.697634
X_pneumo2    0.709784
X_aidtst3    0.111016
X_age80      0.000022
Length: 312, dtype: float64

- <0.1 : 339282 * 124 69% row remain
- <0.15 : 250425 * 135 49% row remain
- <0.2 : 209095 * 140 42% row remain

In [7]:
mask=(df.isnull().sum()/df.shape[0])<0.1
features=df.columns[mask]
df1 = df[features]
df1.dropna(inplace=True)
df1.reset_index(drop=True,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [8]:
df1

Unnamed: 0,genhlth,physhlth,menthlth,hlthpln1,persdoc2,medcost,checkup1,sleptim1,bphigh4,bloodcho,...,fc60_,strfreq_,pamiss1_,X_pastrng,X_lmtact1,X_lmtwrk1,X_lmtscl1,X_rfseat2,X_rfseat3,X_age80
0,Good,0.0,0.0,Yes,"Yes, only one",No,Within past year,6.0,No,Yes,...,506.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,50.0
1,Good,3.0,2.0,Yes,"Yes, only one",No,Within past year,9.0,No,Yes,...,474.0,0.0,0.0,Did not meet muscle strengthening recommendations,Told have arthritis and have limited usual act...,Told have arthritis and have limited work,Told have arthritis and social activities limi...,Always or almost always wear seat belt,Always wear seat belt,55.0
2,Very good,2.0,0.0,Yes,"Yes, only one",No,Within past 2 years,8.0,No,Yes,...,417.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,64.0
3,Good,10.0,2.0,Yes,"Yes, only one",No,5 or more years ago,6.0,Yes,Yes,...,406.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,66.0
4,Very good,0.0,0.0,Yes,"Yes, only one",No,Within past year,8.0,Yes,Yes,...,512.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,49.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339277,Good,0.0,30.0,Yes,"Yes, only one",No,Within past year,5.0,No,Yes,...,411.0,0.0,0.0,Did not meet muscle strengthening recommendations,Told have arthritis and no limited usual activ...,Told have arthritis and no limited work,Told have arthritis and social activities not ...,Always or almost always wear seat belt,Always wear seat belt,65.0
339278,Good,1.0,3.0,Yes,"Yes, only one",No,5 or more years ago,6.0,Yes,Yes,...,585.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,47.0
339279,Fair,14.0,15.0,Yes,More than one,No,Within past year,6.0,Yes,Yes,...,455.0,0.0,0.0,Did not meet muscle strengthening recommendations,Told have arthritis and no limited usual activ...,Told have arthritis and no limited work,Told have arthritis and social activities limi...,Always or almost always wear seat belt,Always wear seat belt,58.0
339280,Fair,15.0,20.0,Yes,More than one,Yes,Within past year,7.0,No,Yes,...,588.0,0.0,0.0,Did not meet muscle strengthening recommendations,Not told they have arthritis,Not told they have arthritis,Not told they have arthritis,Always or almost always wear seat belt,Always wear seat belt,37.0


In [10]:
df1.to_csv('df_row_col.csv')

In [23]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339282 entries, 0 to 339281
Columns: 124 entries, genhlth to X_age80
dtypes: float64(39), int64(1), object(84)
memory usage: 321.0+ MB


- 39 numerical columns
- 36 YES/NO
- 

In [11]:
df1.columns[df1.stack().str.contains('Yes').any(level=1)]

  """Entry point for launching an IPython kernel.


Index(['hlthpln1', 'persdoc2', 'medcost', 'bphigh4', 'bloodcho', 'cvdinfr4',
       'cvdcrhd4', 'cvdstrk3', 'asthma3', 'chcscncr', 'chcocncr', 'chccopd1',
       'havarth3', 'addepev2', 'chckidny', 'diabete3', 'veteran3', 'internet',
       'qlactlm2', 'useequip', 'blind', 'decide', 'diffwalk', 'diffdres',
       'diffalon', 'smoke100', 'exerany2', 'flushot6', 'X_rfhype5',
       'X_ltasth1', 'X_casthm1', 'X_rfbmi5', 'X_rfsmok3', 'drnkany5',
       'X_rfbing5', 'X_rfdrhv4'],
      dtype='object')

In [21]:
summary = df1.describe()
summary.head(8)

Unnamed: 0,genhlth,physhlth,menthlth,sleptim1,children,height3,alcday5,fruitju1,fruit1,fvbeans,...,grenday_,orngday_,vegeda1_,X_frutsum,X_vegesum,maxvo2_,fc60_,strfreq_,pamiss1_,X_age80
count,339282.0,339282.0,339282.0,339282.0,339282.0,339282.0,339282.0,339282.0,339282.0,339282.0,...,339282.0,339282.0,339282.0,339282.0,339282.0,339282.0,339282.0,339282.0,339282.0,339282.0
mean,2.487423,3.966939,3.167896,7.045944,0.525168,518.484995,95.277206,134.83756,180.207742,212.278208,...,55.895656,29.636686,81.630832,141.515176,194.997268,2871.529851,492.267789,1195.462014,0.095039,54.532318
std,1.067659,8.403641,7.363573,1.394212,1.021836,37.083732,96.215606,125.669948,95.530028,109.461349,...,63.963995,42.486344,72.79209,144.957423,146.743804,784.705507,134.525859,2160.188781,0.293269,16.431499
min,1.0,0.0,0.0,1.0,0.0,300.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,555.0,95.0,0.0,0.0,18.0
25%,2.0,0.0,0.0,6.0,0.0,504.0,0.0,0.0,101.0,201.0,...,17.0,7.0,43.0,57.0,113.0,2321.0,398.0,0.0,0.0,43.0
50%,2.0,0.0,0.0,7.0,0.0,507.0,101.0,101.0,201.0,204.0,...,43.0,17.0,71.0,107.0,171.0,2765.0,474.0,0.0,0.0,56.0
75%,3.0,3.0,2.0,8.0,1.0,510.0,202.0,301.0,302.0,304.0,...,83.0,43.0,100.0,200.0,243.0,3394.0,582.0,2000.0,0.0,67.0
max,5.0,30.0,30.0,24.0,17.0,709.0,230.0,399.0,399.0,399.0,...,9900.0,9900.0,9900.0,19800.0,19827.0,5010.0,859.0,99000.0,1.0,80.0


In [35]:
obj_col

Index(['hlthpln1', 'persdoc2', 'medcost', 'checkup1', 'bphigh4', 'bloodcho',
       'cvdinfr4', 'cvdcrhd4', 'cvdstrk3', 'asthma3', 'chcscncr', 'chcocncr',
       'chccopd1', 'havarth3', 'addepev2', 'chckidny', 'diabete3', 'veteran3',
       'marital', 'educa', 'employ1', 'weight2', 'internet', 'renthom1', 'sex',
       'qlactlm2', 'useequip', 'blind', 'decide', 'diffwalk', 'diffdres',
       'diffalon', 'smoke100', 'usenow3', 'exerany2', 'seatbelt', 'flushot6',
       'qstver', 'qstlang', 'X_imprace', 'X_dualuse', 'X_rfhlth', 'X_rfhype5',
       'X_cholchk', 'X_ltasth1', 'X_casthm1', 'X_asthms1', 'X_drdxar1',
       'X_prace1', 'X_mrace1', 'X_hispanc', 'X_race', 'X_raceg21', 'X_racegr3',
       'X_race_g1', 'X_ageg5yr', 'X_age65yr', 'X_age_g', 'X_bmi5cat',
       'X_rfbmi5', 'X_chldcnt', 'X_educag', 'X_smoker3', 'X_rfsmok3',
       'drnkany5', 'X_rfbing5', 'X_rfdrhv4', 'X_misfrtn', 'X_misvegn',
       'X_frtresp', 'X_vegresp', 'X_frtlt1', 'X_veglt1', 'X_frt16', 'X_veg23',
       'X_fru

In [31]:
def check_const(col_list):
    # col_list (list of str): list of features' names in df
    # list1, list2: whether all values in features are (and almost are) the same
    
    list1 = []
    list2 = []
    for col in list(col_list):
        if df[col].dtypes == 'int64' or df[col].dtypes == 'float64':
            if summary[col]['min']==summary[col]['max']: # check if all values in column are the same. 
                list1.append(col)
            elif df[col].value_counts().max() > (0.99*df[col].shape[0]): # check if most (>99%) of values in column are the same.
                list2.append(col)
        if df[col].dtypes == 'object':
            if df[col].value_counts().max() > (0.99*df[col].shape[0]): # check if most (>99%) of values in column are the same.
                list2.append(col)
    return(list1, list2)


const_col, al_const_col = check_const(df1.columns)

print('Columns with values are constant: ', const_col)
print('Columns with values are almost the same: ', al_const_col) # ['X_frt16', 'X_veg23']

In [32]:
const_col, al_const_col = check_const(df1.columns)

print('Columns with values are constant: ', const_col)
print('Columns with values are almost the same: ', al_const_col)

Columns with values are constant:  []
Columns with values are almost the same:  ['X_frt16', 'X_veg23']


In [33]:
df['X_frt16'].value_counts()

Included - values are in accepted range    491663
Not included - Values are too high            102
Name: X_frt16, dtype: int64

In [12]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
df1.select_dtypes(include=numerics)

Unnamed: 0,physhlth,menthlth,sleptim1,children,height3,alcday5,fruitju1,fruit1,fvbeans,fvgreen,...,grenday_,orngday_,vegeda1_,X_frutsum,X_vegesum,maxvo2_,fc60_,strfreq_,pamiss1_,X_age80
0,0.0,0.0,6.0,2.0,510.0,0.0,305.0,301.0,310.0,203.0,...,43.0,29.0,43.0,20.0,148.0,2950.0,506.0,0.0,0.0,50.0
1,3.0,2.0,9.0,0.0,504.0,220.0,301.0,203.0,202.0,202.0,...,29.0,33.0,100.0,46.0,191.0,2765.0,474.0,0.0,0.0,55.0
2,2.0,0.0,8.0,0.0,504.0,208.0,202.0,306.0,202.0,310.0,...,33.0,17.0,57.0,49.0,136.0,2432.0,417.0,0.0,0.0,64.0
3,10.0,2.0,6.0,0.0,600.0,210.0,0.0,302.0,101.0,310.0,...,33.0,10.0,100.0,7.0,243.0,2370.0,406.0,0.0,0.0,66.0
4,0.0,0.0,8.0,0.0,503.0,0.0,205.0,206.0,0.0,203.0,...,43.0,0.0,100.0,157.0,143.0,2987.0,512.0,0.0,0.0,49.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339277,0.0,30.0,5.0,0.0,500.0,0.0,0.0,0.0,101.0,203.0,...,43.0,14.0,71.0,0.0,228.0,2395.0,411.0,0.0,0.0,65.0
339278,1.0,3.0,6.0,1.0,510.0,204.0,320.0,308.0,205.0,315.0,...,50.0,71.0,100.0,94.0,292.0,3415.0,585.0,0.0,0.0,47.0
339279,14.0,15.0,6.0,0.0,500.0,0.0,101.0,302.0,101.0,101.0,...,100.0,29.0,100.0,107.0,329.0,2654.0,455.0,0.0,0.0,58.0
339280,15.0,20.0,7.0,3.0,505.0,0.0,102.0,103.0,0.0,304.0,...,13.0,43.0,71.0,500.0,127.0,3431.0,588.0,0.0,0.0,37.0


In [13]:
#OneHotEncoder(df1)

In [14]:
df1['genhlth'].replace(['Excellent','Very good','Good','Fair','Poor'],
                        [1,2,3,4,5], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [15]:
df_mol1 = df1.select_dtypes(include=numerics)

In [None]:
#df[''].astype('category')

In [16]:
df_mol1

Unnamed: 0,genhlth,physhlth,menthlth,sleptim1,children,height3,alcday5,fruitju1,fruit1,fvbeans,...,grenday_,orngday_,vegeda1_,X_frutsum,X_vegesum,maxvo2_,fc60_,strfreq_,pamiss1_,X_age80
0,3,0.0,0.0,6.0,2.0,510.0,0.0,305.0,301.0,310.0,...,43.0,29.0,43.0,20.0,148.0,2950.0,506.0,0.0,0.0,50.0
1,3,3.0,2.0,9.0,0.0,504.0,220.0,301.0,203.0,202.0,...,29.0,33.0,100.0,46.0,191.0,2765.0,474.0,0.0,0.0,55.0
2,2,2.0,0.0,8.0,0.0,504.0,208.0,202.0,306.0,202.0,...,33.0,17.0,57.0,49.0,136.0,2432.0,417.0,0.0,0.0,64.0
3,3,10.0,2.0,6.0,0.0,600.0,210.0,0.0,302.0,101.0,...,33.0,10.0,100.0,7.0,243.0,2370.0,406.0,0.0,0.0,66.0
4,2,0.0,0.0,8.0,0.0,503.0,0.0,205.0,206.0,0.0,...,43.0,0.0,100.0,157.0,143.0,2987.0,512.0,0.0,0.0,49.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339277,3,0.0,30.0,5.0,0.0,500.0,0.0,0.0,0.0,101.0,...,43.0,14.0,71.0,0.0,228.0,2395.0,411.0,0.0,0.0,65.0
339278,3,1.0,3.0,6.0,1.0,510.0,204.0,320.0,308.0,205.0,...,50.0,71.0,100.0,94.0,292.0,3415.0,585.0,0.0,0.0,47.0
339279,4,14.0,15.0,6.0,0.0,500.0,0.0,101.0,302.0,101.0,...,100.0,29.0,100.0,107.0,329.0,2654.0,455.0,0.0,0.0,58.0
339280,4,15.0,20.0,7.0,3.0,505.0,0.0,102.0,103.0,0.0,...,13.0,43.0,71.0,500.0,127.0,3431.0,588.0,0.0,0.0,37.0


In [17]:
train_Y = df_mol1.iloc[:,0]
train_X = df_mol1.iloc[:,1:]

In [18]:
x_train, x_test, y_train, y_test = train_test_split(train_X, train_Y, random_state= 123)

In [36]:
x_train = x_train.iloc[:,1:]
x_test = x_test.iloc[:,1:]
y_train = y_train.iloc[1:]
y_test = y_test.iloc[1:]

In [None]:
from sklearn.linear_model import LogisticRegression
model_Log = LogisticRegression().fit(x_train, y_train)
y_pred_Log = model_Log.predict(x_test)
print(classification_report(y_test,y_pred_Log))
print('The accuracy score is',accuracy_score(y_test,y_pred_Log))

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)

scaled_x_train = scaler.transform(x_train)
scaled_x_test = scaler.transform(x_test)
model_Log = LogisticRegression().fit(scaled_x_train, y_train)
y_pred_Log = model_Log.predict(scaled_x_test)
print(classification_report(y_test,y_pred_Log))
print('The accuracy score is',accuracy_score(y_test,y_pred_Log))

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)
scaled_x_train = scaler.transform(x_train)
scaler = StandardScaler()
scaler.fit(x_test)
scaled_x_test = scaler.transform(x_test)
model_Log = LogisticRegression().fit(scaled_x_train, y_train)
y_pred_Log = model_Log.predict(scaled_x_test)
print(classification_report(y_test,y_pred_Log))
print('The accuracy score is',accuracy_score(y_test,y_pred_Log))

In [None]:
# Function to perform training with giniIndex.
def train_using_gini(X_train, X_test, y_train):
  
    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(criterion = "gini",
            random_state = 100,max_depth=3, min_samples_leaf=5)
  
    # Performing training
    clf_gini.fit(X_train, y_train)
    return clf_gini

# Function to perform training with entropy.
def tarin_using_entropy(X_train, X_test, y_train):
  
    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(
            criterion = "entropy", random_state = 100,
            max_depth = 3, min_samples_leaf = 5)
  
    # Performing training
    clf_entropy.fit(X_train, y_train)
    return clf_entropy

# Function to make predictions
def prediction(X_test, clf_object):
  
    # Predicton on test with giniIndex
    y_pred = clf_object.predict(X_test)
    print("Predicted values:")
    print(y_pred)
    return y_pred
      
# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
      
    print("Confusion Matrix: ",
        confusion_matrix(y_test, y_pred))
      
    print ("Accuracy : ",
    accuracy_score(y_test,y_pred)*100)
      
    print("Report : ",
    classification_report(y_test, y_pred))

In [None]:
clf_gini = train_using_gini(x_train, x_test, y_train)
   
y_pred_gini = prediction(x_test, clf_gini)
cal_accuracy(y_test, y_pred_gini)

In [None]:
clf_entropy = tarin_using_entropy(x_train, x_test, y_train)

y_pred_entropy = prediction(x_test, clf_entropy)
cal_accuracy(y_test, y_pred_entropy)