In [1]:
import numpy as np
import pandas as pd
from numpy import mean
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
dataset = pd.read_csv("BRFSS_Data.csv")

print("Dataset shape: ", dataset.shape)

Dataset shape:  (450016, 358)


In [3]:
# Dataset shape after removing the columns with more than 50% of null values
data = dataset.loc[:, dataset.isnull().mean() < 0.50]

print("Dataset shape after removing the columns with more than 50% of null values: ", data.shape)
data.head()

Dataset shape after removing the columns with more than 50% of null values:  (450016, 183)


Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,SAFETIME,...,_PAINDX1,_PA150R2,_PA300R2,_PA30021,_PASTRNG,_PAREC1,_PASTAE1,_RFSEAT2,_RFSEAT3,_AIDTST3
0,1,1,1302017,1,30,2017,1100,2017000001,2017000001,,...,1,1,1,1,1,1,1,1,1,1.0
1,1,1,1122017,1,12,2017,1100,2017000002,2017000002,,...,1,1,1,1,1,1,1,1,1,1.0
2,1,1,1102017,1,10,2017,1100,2017000003,2017000003,,...,2,3,3,2,2,4,2,1,1,2.0
3,1,1,2082017,2,8,2017,1200,2017000004,2017000004,,...,9,9,9,9,9,9,9,9,9,
4,1,1,1302017,1,30,2017,1100,2017000005,2017000005,,...,2,3,3,2,2,4,2,1,2,2.0


In [4]:
# Mode imputation of missing values in the dataset
data = data.fillna(data.mode().iloc[0])

print("Data shape after mode imputation:", data.shape)
data.head()

Data shape after mode imputation: (450016, 183)


Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,SAFETIME,...,_PAINDX1,_PA150R2,_PA300R2,_PA30021,_PASTRNG,_PAREC1,_PASTAE1,_RFSEAT2,_RFSEAT3,_AIDTST3
0,1,1,1302017,1,30,2017,1100,2017000001,2017000001,1.0,...,1,1,1,1,1,1,1,1,1,1.0
1,1,1,1122017,1,12,2017,1100,2017000002,2017000002,1.0,...,1,1,1,1,1,1,1,1,1,1.0
2,1,1,1102017,1,10,2017,1100,2017000003,2017000003,1.0,...,2,3,3,2,2,4,2,1,1,2.0
3,1,1,2082017,2,8,2017,1200,2017000004,2017000004,1.0,...,9,9,9,9,9,9,9,9,9,2.0
4,1,1,1302017,1,30,2017,1100,2017000005,2017000005,1.0,...,2,3,3,2,2,4,2,1,2,2.0


In [5]:
# Target variable we want to classify using decision tree - BMI
target_raw = data['_BMI5CAT']

# Replace BMI={3,4} with 1 indicating BMI >= 25 and overweight and replace BMI={1,2} with -1 indicating normal
target_replace = {3:1,4:1,1:-1,2:-1} 
target = target_raw.replace(target_replace)

# Add the target variable to the last column of the dataframe
data["TARGET"] = target
data.head()

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,SAFETIME,...,_PA150R2,_PA300R2,_PA30021,_PASTRNG,_PAREC1,_PASTAE1,_RFSEAT2,_RFSEAT3,_AIDTST3,TARGET
0,1,1,1302017,1,30,2017,1100,2017000001,2017000001,1.0,...,1,1,1,1,1,1,1,1,1.0,1.0
1,1,1,1122017,1,12,2017,1100,2017000002,2017000002,1.0,...,1,1,1,1,1,1,1,1,1.0,1.0
2,1,1,1102017,1,10,2017,1100,2017000003,2017000003,1.0,...,3,3,2,2,4,2,1,1,2.0,1.0
3,1,1,2082017,2,8,2017,1200,2017000004,2017000004,1.0,...,9,9,9,9,9,9,9,9,2.0,1.0
4,1,1,1302017,1,30,2017,1100,2017000005,2017000005,1.0,...,3,3,2,2,4,2,1,2,2.0,-1.0


In [6]:
# Drop the actual bmi columns along with other unimportant features
data = data.drop(["_BMI5", "_BMI5CAT", "_RFBMI5","_STATE","FMONTH","IDATE","IMONTH","IDAY","IYEAR","DISPCODE","SEQNO","_PSU"], axis=1)
data.head()

Unnamed: 0,SAFETIME,CTELNUM1,CELLFON5,CADULT,PVTRESD3,CSTATE1,LANDLINE,GENHLTH,PHYSHLTH,MENTHLTH,...,_PA150R2,_PA300R2,_PA30021,_PASTRNG,_PAREC1,_PASTAE1,_RFSEAT2,_RFSEAT3,_AIDTST3,TARGET
0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,88.0,88.0,...,1,1,1,1,1,1,1,1,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,88.0,88.0,...,1,1,1,1,1,1,1,1,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,88.0,88.0,...,3,3,2,2,4,2,1,1,2.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,2.0,4.0,88.0,88.0,...,9,9,9,9,9,9,9,9,2.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,2.0,4.0,14.0,88.0,...,3,3,2,2,4,2,1,2,2.0,-1.0


In [7]:
# Split the data into training set (80%) and testing set (20%) for testing and calculating accuracy
train_df, test_df = train_test_split(data, test_size=0.2)

print('Shape of training data: ', train_df.shape)
print('Shape of testing data: ', test_df.shape)

Shape of training data:  (360012, 172)
Shape of testing data:  (90004, 172)


In [8]:
# Function to calculate error
def cal_error(pred, target):
    return sum(pred != target) / float(len(target))

In [9]:
# ADABOOST Algorithm

def adaboost(train_df_target, train_df, test_df_target, test_df, decision_tree):
    
    n_train = len(train_df)
    n_test = len(test_df)
    
    # Initialize weight distribution
    w = (np.ones(n_train) / n_train)
    pred_train, pred_test = [np.zeros(n_train), np.zeros(n_test)]
    print("The initial weights given to each data point (vector) by the AdaBoost algorithm: ", w, "\n")

    # Execute 2 rounds of AdaBoost
    for i in range(2):
        # Fit a classifier with the specific weights
        decision_tree.fit(train_df, train_df_target, sample_weight = w)
        pred_train_i = decision_tree.predict(train_df)    
        pred_test_i = decision_tree.predict(test_df)
        
        # Misses
        h = [int(x) for x in (pred_train_i != train_df_target)]
        
        # Update the weights
        h_update = [x if x==1 else -1 for x in h]
        
        # Normalize the error
        err_m = np.dot(w,h) / sum(w)
        print("Normalized Error: ", err_m, "\n")

        # Calculate alpha
        alpha_m = 0.5 * np.log( (1 - err_m) / float(err_m))
    
        if (i==0):
            alpha_1 = alpha_m
            h1 = h
            print("Weight alpha_1 assigned to the hypothesis h1: ", alpha_1, "\n")
            
        elif (i==1):
            alpha_2 = alpha_m
            h2 = h
            print("The updated weights given to each data point (vector) by the AdaBoost algorithm: ", w, "\n")
            print("Weight alpha_2 assigned to the hypothesis h2: ", alpha_2, "\n")
            
            # Create a strong classifier H using the weak classifiers h1 and h2
            H = np.multiply(int(alpha_1),h1) + np.multiply(int(alpha_2),h2)
            print("Final weighted hypothesis after two rounds of AdaBoost: ", H, "\n")
            
        # New weights
        w = np.multiply(w, np.exp([float(x) * alpha_m for x in h_update]))
        
        # Add to prediction
        pred_train = [sum(x) for x in zip(pred_train, [x * alpha_m for x in pred_train_i])]
        pred_test = [sum(x) for x in zip(pred_test, [x * alpha_m for x in pred_test_i])]
    
    pred_train, pred_test = np.sign(pred_train), np.sign(pred_test)
    error = cal_error(pred_train, train_df_target), cal_error(pred_test, test_df_target)
    print("Decision Tree Error after 2 rounds of AdaBoost: ", error)
    
    return error

In [10]:
# Select train_df_target, test_df_target
train_df_target = train_df.iloc[:,-1]
test_df_target = test_df.iloc[:,-1]

# Fit a simple decision tree with depth = 1 to form a weak classifier. This tree uses maximum 1 feature for the split
decision_tree = DecisionTreeClassifier(max_depth = 1, max_features = 1, random_state = 1)
#print(decision_tree)

# Call the AdaBoost function
print("---- ADABOOST ----\n")
error = adaboost(train_df_target, train_df, test_df_target, test_df, decision_tree)

---- ADABOOST ----

The initial weights given to each data point (vector) by the AdaBoost algorithm:  [2.77768519e-06 2.77768519e-06 2.77768519e-06 ... 2.77768519e-06
 2.77768519e-06 2.77768519e-06] 

Normalized Error:  0.30217048320484224 

Weight alpha_1 assigned to the hypothesis h1:  0.4184917278151746 

Normalized Error:  0.4887728624098044 

The updated weights given to each data point (vector) by the AdaBoost algorithm:  [4.22115861e-06 1.82782400e-06 4.22115861e-06 ... 1.82782400e-06
 1.82782400e-06 4.22115861e-06] 

Weight alpha_2 assigned to the hypothesis h2:  0.022458050096252298 

Final weighted hypothesis after two rounds of AdaBoost:  [0 0 0 ... 0 0 0] 

Decision Tree Error after 2 rounds of AdaBoost:  (0.30217048320611534, 0.3001533265188214)
