## Creating Logistic Regression Model for Heart Disease Dataset

In [6]:
# Import the required modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

## Step 1: Read in the dataset 

In [2]:
# Load the heart-disease CSV into a Pandas DataFrame.
# The path is relative, so the file is expected in the notebook's working directory.
file_path = "Heart_Disease_Prediction.csv"
df_heart = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df_heart.head(5)


Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence


## Step 2: Split the data into X and y and then into testing and training sets.

In [3]:
# Separate the DataFrame into features (X) and target (y)
# X keeps every column except the 'Heart Disease' label
X = df_heart.drop('Heart Disease', axis=1)

# y is the 'Heart Disease' label column on its own
y = df_heart['Heart Disease']


In [4]:
# Split into training and testing sets using train_test_split.
# A fixed random_state makes the shuffle deterministic, so the split (and every
# metric computed from it below) is reproducible after a kernel restart.
# The value 9 matches the random_state already used for the model in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9)

In [7]:
# Create a StandardScaler instance and fit it on the training features only,
# so the scaling statistics come from the training split alone
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training features, then to the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Step 3: Fit a logistic regression classifier.

In [8]:
# Configure the logistic regression classifier with a random_state of 9
logistic_regression_model = LogisticRegression(
    random_state=9,
    max_iter=750,
    solver='lbfgs',
)

# Train on the scaled training data and keep the value returned by fit()
lr_model = logistic_regression_model.fit(X_train_scaled, y_train)

## Step 4: Create the predicted values for the testing and the training data.

In [9]:
# Generate training predictions from the fitted model
training_predictions = lr_model.predict(X_train_scaled)

# Generate testing predictions through the same fitted-model handle (lr_model),
# so both prediction sets are visibly produced by one model object
testing_predictions = lr_model.predict(X_test_scaled)


## Step 5: Print a confusion matrix for the training data.

In [10]:
# Create and save the confusion matrix for the training data.
# confusion_matrix is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[101   9]
 [ 14  78]]


## Step 6: Print a confusion matrix for the testing data.

In [11]:
# Compare the true testing labels with the testing predictions
test_matrix = confusion_matrix(y_true=y_test, y_pred=testing_predictions)

# Show the resulting confusion matrix
print(test_matrix)

[[34  6]
 [ 8 20]]


## Step 7: Print the training classification report.

In [12]:
# Build the classification report (precision / recall / f1 / support)
# for the training labels versus the training predictions
training_report = classification_report(
    y_true=y_train,
    y_pred=training_predictions,
)

# Show the training report
print(training_report)

              precision    recall  f1-score   support

     Absence       0.88      0.92      0.90       110
    Presence       0.90      0.85      0.87        92

    accuracy                           0.89       202
   macro avg       0.89      0.88      0.88       202
weighted avg       0.89      0.89      0.89       202



## Step 8: Print the testing classification report.

In [16]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Calculating the confusion matrix.
# No explicit `labels` are passed, so scikit-learn orders rows/columns by sorted
# label value: "0" in the table below is "Absence" and "1" is "Presence".
cm = confusion_matrix(y_test, testing_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, testing_predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
# Print the report saved above instead of computing the identical report again
print(testing_report)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,34,6
Actual 1,8,20


Accuracy Score : 0.7941176470588235
Classification Report
              precision    recall  f1-score   support

     Absence       0.81      0.85      0.83        40
    Presence       0.77      0.71      0.74        28

    accuracy                           0.79        68
   macro avg       0.79      0.78      0.79        68
weighted avg       0.79      0.79      0.79        68

