In [1]:
# Import necessary libraries
import numpy as np
import pandas  as pd
from sklearn.model_selection import train_test_split
import pickle
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df=pd.read_csv("heart_2020_cleaned.csv")

In [11]:
df.shape

(301717, 17)

In [2]:
duplicateObser = df[df.duplicated()]
LabelsDupObser=duplicateObser.axes[0].tolist()
print('Number of duplicated observations:', duplicateObser.shape[0])
df=df.drop_duplicates()
print(df.shape)

Number of duplicated observations: 18078
(301717, 18)


In [3]:
df.replace("Yes",1,inplace=True)
df.replace("No",0,inplace=True)

target=df["HeartDisease"]
df.drop(["HeartDisease"], axis=1, inplace=True)
df.AgeCategory.unique()
df.replace("18-24",0,inplace=True)
df.replace("25-29",1,inplace=True)
df.replace("30-34",2,inplace=True)
df.replace("35-39",3,inplace=True)
df.replace("40-44",4,inplace=True)
df.replace("45-49",5,inplace=True)
df.replace("50-54",6,inplace=True)
df.replace("55-59",7,inplace=True)
df.replace("60-64",8,inplace=True)
df.replace("65-69",9,inplace=True)
df.replace("70-74",10,inplace=True)
df.replace("75-79",11,inplace=True)
df.replace("80 or older",13,inplace=True)

df.Diabetic.unique()
df.replace("No, borderline diabetes",2,inplace=True)
df.replace("Yes (during pregnancy)",3,inplace=True)

df.GenHealth.unique()
df.replace("Excellent",0,inplace=True)
df.replace("Good",1,inplace=True)
df.replace("Fair",2,inplace=True)
df.replace("Very good",3,inplace=True)
df.replace("Poor",4,inplace=True)

df.Race.unique()
df.replace("White",0,inplace=True)
df.replace("Other",1,inplace=True)
df.replace("Black",2,inplace=True)
df.replace("Hispanic",3,inplace=True)
df.replace("Asian",4,inplace=True)
df.replace("American Indian/Alaskan Native",4,inplace=True)

df.Sex.unique()
df.replace("Female",0,inplace=True)
df.replace("Male",1,inplace=True)

df['BMI'].mask(df['BMI']  < 18.5, 0, inplace=True)
df['BMI'].mask(df['BMI'].between(18.5,25), 1, inplace=True)
df['BMI'].mask(df['BMI'].between(25,30), 2, inplace=True)
df['BMI'].mask(df['BMI']  > 30, 3, inplace=True)

In [4]:
# Split the data into training and testing
X_train,X_test,y_train,y_test = train_test_split(df,target,test_size=50,random_state=2)

# Train a logistic regression model on the training set
LogRegModel=LogisticRegression()
LogRegModel.fit(X_train, y_train)

In [5]:
# Save the model using pickle
import pickle 

with open('LogRegModel.pkl', 'wb') as f:
    pickle.dump(LogRegModel, f)

In [6]:
# Sample test case
test_case = [
    20.34,#BMI
    1,  # Smoking (0 for No, 1 for Yes)
    0,  # AlcoholDrinking (0 for No, 1 for Yes)
    0,  # Stroke (0 for No, 1 for Yes)
    3,  # PhysicalHealth (assuming it's on a scale of 0 to 3)
    3,  # MentalHealth (assuming it's on a scale of 0 to 3)
    0,  # DiffWalking (0 for No, 1 for Yes)
    0,  # Sex (0 for Female, 1 for Male)
    7,  # AgeCategory
    0,  # Race (assuming it's encoded similarly to your web app)
    1,  # Diabetic (assuming it's encoded similarly to your web app)
    1,  # PhysicalActivity (0 for No, 1 for Yes)
    3,  # GenHealth (assuming it's on a scale of 0 to 4)
    1,  # SleepTime (0 for No, 1 for Yes)
    0,  # Asthma (0 for No, 1 for Yes)
    0,  # KidneyDisease (0 for No, 1 for Yes)
    0   # Skincancer (0 for No, 1 for Yes)
]

# Predict probability for the test case
prob = LogRegModel.predict_proba(np.array(test_case).reshape(1, -1))[0][1]  # Probability of heart disease
prediction = LogRegModel.predict(np.array(test_case).reshape(1, -1))[0]     # Predicted class (0 for No, 1 for Yes)

# Display the test case and predicted results
print("Test Case:")
print("BMI:", test_case[0])
print("Smoking:", "Yes" if test_case[1] == 1 else "No")
print("Alcohol Drinking:", "Yes" if test_case[2] == 1 else "No")
# Display other features similarly...

print("\nPredicted Results:")
print("Probability of Heart Disease:", prob)
print("Heart Disease:", "Yes" if prediction == 1 else "No")


Test Case:
BMI: 20.34
Smoking: Yes
Alcohol Drinking: No

Predicted Results:
Probability of Heart Disease: 0.4401355051777548
Heart Disease: No


In [7]:
# Sample test case
test_case_2 = [
    25.34,  # BMI
    0,      # Smoking (0 for No, 1 for Yes)
    0,      # AlcoholDrinking (0 for No, 1 for Yes)
    1,      # Stroke (0 for No, 1 for Yes)
    0,      # PhysicalHealth (assuming it's on a scale of 0 to 3)
    0,      # MentalHealth (assuming it's on a scale of 0 to 3)
    0,      # DiffWalking (0 for No, 1 for Yes)
    0,      # Sex (0 for Female, 1 for Male)
    13,     # AgeCategory
    0,      # Race (assuming it's encoded similarly to your web app)
    2,      # Diabetic (assuming it's encoded similarly to your web app)
    1,      # PhysicalActivity (0 for No, 1 for Yes)
    3,      # GenHealth (assuming it's on a scale of 0 to 4)
    0,      # SleepTime (0 for No, 1 for Yes)
    1,      # Asthma (0 for No, 1 for Yes)
    0,     # KidneyDisease (0 for No, 1 for Yes)
    0
]


# Predict probability for the test case
prob_2 = LogRegModel.predict_proba(np.array(test_case_2).reshape(1, -1))[0][1]  # Probability of heart disease
prediction_2 = LogRegModel.predict(np.array(test_case_2).reshape(1, -1))[0]     # Predicted class (0 for No, 1 for Yes)

# Display the test case and predicted results
print("Test Case:")
print("BMI:", test_case_2[0])
print("Smoking:", "Yes" if test_case_2[1] == 1 else "No")
print("Alcohol Drinking:", "Yes" if test_case_2[2] == 1 else "No")
# Display other features similarly...

print("\nPredicted Results:")
print("Probability of Heart Disease:", prob_2)
print("Heart Disease:", "Yes" if prediction_2 == 1 else "No")


Test Case:
BMI: 25.34
Smoking: No
Alcohol Drinking: No

Predicted Results:
Probability of Heart Disease: 0.9616121684175332
Heart Disease: Yes


In [8]:
from sklearn.model_selection import cross_validate

# Define the logistic regression model
log_reg = LogisticRegression()

# Perform cross-validation
cv_results = cross_validate(log_reg, df, target, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'])

# Extract and print cross-validation results
print("Cross-Validation Results:")
print("Accuracy:", cv_results['test_accuracy'])
print("Precision:", cv_results['test_precision'])
print("Recall:", cv_results['test_recall'])
print("F1 Score:", cv_results['test_f1'])

Cross-Validation Results:
Accuracy: [0.91026448 0.91034734 0.91006413 0.90958355 0.91072701]
Precision: [0.51717734 0.52440409 0.51384275 0.4978022  0.53536453]
Recall: [0.10216434 0.08472401 0.08510638 0.08308877 0.09024211]
F1 Score: [0.17062337 0.14587938 0.14602675 0.14240805 0.15444985]


In [9]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2']                # Penalty norm
}

# Define the logistic regression model
log_reg = LogisticRegression()

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform hyperparameter tuning
grid_search.fit(df, target)

# Print the best hyperparameters found
print("Best hyperparameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_


Best hyperparameters: {'C': 0.01, 'penalty': 'l2'}


In [10]:
from sklearn.metrics import accuracy_score

# Use the best model to make predictions on the test set
y_pred_test = best_model.predict(X_test)

# Calculate accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred_test)

# Print the accuracy on the test set
print("Accuracy on the test set:", test_accuracy)


Accuracy on the test set: 0.88
