## Importing Required Libraries

In [11]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
import numpy as np
warnings.filterwarnings('ignore')


## Read Data

In [3]:
data = pd.read_csv("heart_disease_health_indicators_BRFSS2015.csv")

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HeartDiseaseorAttack  253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   Diabetes              253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

    - Removing duplicates

In [5]:
# Remove duplicates..
data.drop_duplicates(inplace=True)
data.reset_index(drop=True, inplace=True)

In [6]:
data.duplicated().sum()

0

In [9]:
df = data

    - Remove Outliers based on Z-score

In [7]:
from scipy.stats import zscore

# Remove outliers based on Z-score for the BMI column
def remove_outliers_z_score_column(df, column_name, threshold = 2):
    z_scores = zscore(df[column_name])
    filtered_data = df[np.abs(z_scores) <= threshold]
    return filtered_data

In [12]:
# Apply the function to remove outliers from the 'BMI' column
df_filtered = remove_outliers_z_score_column(df, 'BMI', threshold=2)

In [13]:
# Reset index of the filtered DataFrame
df_filtered.reset_index(drop=True, inplace=True)

In [14]:
df_filtered['BMI'].describe()

count    221401.000000
mean         27.913438
std           5.170629
min          16.000000
25%          24.000000
50%          27.000000
75%          31.000000
max          42.000000
Name: BMI, dtype: float64

## Feature Scaling

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Target column
target_col = 'HeartDiseaseorAttack'

# Columns to scale (all columns except the target)
columns_to_normalize = df_filtered.drop(columns=[target_col]).columns

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the selected columns
df_filtered[columns_to_normalize] = scaler.fit_transform(df_filtered[columns_to_normalize])

# Display the resulting DataFrame
df_filtered

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,0.923077,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.00,0.6,0.5,1.0,0.0,0.666667,0.6,0.285714
1,0.0,0.0,0.0,0.0,0.346154,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.50,0.0,0.0,0.0,0.0,0.500000,1.0,0.000000
2,0.0,1.0,1.0,1.0,0.461538,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.00,1.0,1.0,1.0,0.0,0.666667,0.6,1.000000
3,0.0,1.0,0.0,1.0,0.423077,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.833333,0.4,0.714286
4,0.0,1.0,1.0,1.0,0.307692,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.25,0.1,0.0,0.0,0.0,0.833333,0.8,0.428571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221396,0.0,0.0,0.0,1.0,0.423077,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.00,0.0,0.0,0.0,0.0,0.166667,1.0,0.571429
221397,0.0,1.0,1.0,1.0,0.076923,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.75,0.0,0.0,1.0,0.0,0.833333,0.2,0.428571
221398,0.0,0.0,0.0,1.0,0.461538,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.00,0.0,0.0,0.0,0.0,0.083333,0.8,0.142857
221399,0.0,1.0,0.0,1.0,0.269231,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.50,0.0,0.0,0.0,1.0,0.500000,0.8,0.000000


# Machine Learning Algorithms

**- Define X(Feature Matrix) and y(Target Vector)**

**- Train & Test Split**

In [18]:
# # Feature matrix
# X = df_filtered.drop('HeartDiseaseorAttack', axis=1)

# # target vector
# y = df_filtered['HeartDiseaseorAttack']

In [19]:
# # Train Test Split

# from sklearn.model_selection import train_test_split

# # Splitting the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Preparing Data

In [20]:
# Assuming df_filtered is your dataframe
# Feature matrix
X = df_filtered.drop('HeartDiseaseorAttack', axis=1)
# Target vector
y = df_filtered['HeartDiseaseorAttack']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Defining Models

In [21]:
models = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('KNeighbors Classifier', KNeighborsClassifier()),
    ('GaussianNB', GaussianNB()),
    ('Support Vector Machine', SVC(probability=True, random_state=42)),
    ('Decision Tree Classifier', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('AdaBoost', AdaBoostClassifier(random_state=42)),
    ('LightGBM', LGBMClassifier(random_state=42))
]


## Training and Evaluating Models

In [None]:
best_model = None
best_accuracy = 0.0

# Iterate over the models and evaluate their performance
for name, model in models:
    # Perform cross validation
    scores = cross_val_score(model, X_train, y_train, cv=5)
    # Calculate mean accuracy
    mean_accuracy = scores.mean()
    # Fit the model on the training data
    model.fit(X_train, y_train)
    # Make predictions on the test data
    y_pred = model.predict(X_test)
    # Calculate accuracy score
    accuracy = accuracy_score(y_test, y_pred)
    
    # Print the performance metrics
    print(f"Model: {name}")
    print(f"Cross Validation Accuracy: {mean_accuracy:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}\n")
    
    # Check if the current model has the best accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Retrieve the best model
print(f"Best Model: {best_model}")


## Hyperparameter Tuning Function