In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Read the raw records from the CSV file into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
file_path = 'heart_data.csv'
heart_data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Encode categorical variables as integer codes.
# Each column gets its OWN fitted LabelEncoder, stored in `label_encoders`,
# so the category -> integer mapping of every column stays recoverable
# (via label_encoders[col].classes_ or .inverse_transform). Refitting one
# shared encoder would keep only the mapping of the last column.
categorical_columns = ['gender', 'smoking', 'diabetes', 'exercise']
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    heart_data[column] = label_encoder.fit_transform(heart_data[column])
    label_encoders[column] = label_encoder
# After the loop `label_encoder` is the encoder fitted on 'exercise', which
# matches the state the name had when a single encoder was reused.

In [4]:
# Partition the records by the value of the 'heart_attack' target so that
# each class can be split into train/test on its own in the next step.
is_negative = heart_data['heart_attack'] == 0
is_positive = heart_data['heart_attack'] == 1
class_0 = heart_data[is_negative]
class_1 = heart_data[is_positive]

In [5]:
# Hold out 20% of every class for testing; the fixed seed keeps the split
# reproducible across runs.
split_options = {'test_size': 0.2, 'random_state': 42}
train_class_0, test_class_0 = train_test_split(class_0, **split_options)
train_class_1, test_class_1 = train_test_split(class_1, **split_options)

In [6]:
# Stack the per-class pieces back together (class 0 rows first, then class 1)
# to obtain the final training and testing frames.
train_data = pd.concat(objs=[train_class_0, train_class_1])
test_data = pd.concat(objs=[test_class_0, test_class_1])

In [7]:
# Separate the predictors (every column except the target) from the target.
target_column = 'heart_attack'
X_train, y_train = train_data.drop(columns=[target_column]), train_data[target_column]
X_test, y_test = test_data.drop(columns=[target_column]), test_data[target_column]

In [8]:
# Standardize the features.
# The scaler is fitted on the training features only; the same fitted
# transformation is then applied to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Convert scaled arrays back to DataFrames.
# Besides the column names, the original row index is restored as well:
# y_train / y_test still carry the row labels of the source frame, so a
# fresh 0..n-1 index here would make any index-aligned operation between
# features and target (concat, boolean masks, arithmetic) silently misalign.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

In [10]:
# Fit a logistic regression classifier (library default settings) on the
# standardized training features.
model = LogisticRegression()
model.fit(X=X_train_scaled, y=y_train)

In [11]:
# Predict the class label of every held-out test row.
y_pred = model.predict(X=X_test_scaled)

In [12]:
# Calculate the confusion matrix, accuracy, precision, recall, and F1 score.
#
# labels=[0, 1] pins the matrix to 2x2 (rows = actual 0/1, columns =
# predicted 0/1) even when one class is absent from both y_test and y_pred;
# the labelled DataFrame built from it expects exactly two rows and columns.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
accuracy = accuracy_score(y_test, y_pred)
# When the model predicts no positive samples, precision (and therefore F1)
# is undefined. zero_division=0 reports 0.0 in that case explicitly instead
# of emitting an UndefinedMetricWarning; the numeric result is unchanged.
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Wrap the raw confusion matrix in a labelled DataFrame (rows = actual class,
# columns = predicted class) and print it under a heading.
conf_matrix_df = pd.DataFrame(
    conf_matrix,
    index=['Actual No', 'Actual Yes'],
    columns=['Predicted No', 'Predicted Yes'],
)
print("Confusion Matrix:", conf_matrix_df, sep="\n")


Confusion Matrix:
            Predicted No  Predicted Yes
Actual No              1              0
Actual Yes             1              0


In [14]:
# Collect the scalar scores under their display names, in reporting order.
metric_names = ('Accuracy', 'Precision', 'Recall', 'F1 Score')
metric_values = (accuracy, precision, recall, f1)
metrics = dict(zip(metric_names, metric_values))

In [15]:
# Print a heading followed by one "name: score" line per metric, with each
# score formatted to four decimal places.
report_lines = ["\nMetrics:"] + [f"{name}: {score:.4f}" for name, score in metrics.items()]
print(*report_lines, sep="\n")


Metrics:
Accuracy: 0.5000
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
