In [21]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split as tts

In [11]:
def gather_and_clean_data(path="heart.csv", scale=True):
    """Load the heart-disease CSV, one-hot encode its categoricals, and optionally scale it.

    Parameters
    ----------
    path : str or path-like, default "heart.csv"
        Location of the CSV file; passed straight to ``pd.read_csv``.
    scale : bool, default True
        When True, every column of the encoded frame (the label column
        included) is rescaled with a ``MinMaxScaler`` fit on all rows.
        Pass False to get the encoded but unscaled frame, e.g. when the
        scaler should be fit on the training split only to avoid leaking
        test-set statistics into preprocessing.

    Returns
    -------
    pandas.DataFrame
        The non-categorical columns in their original order, followed by the
        0/1 indicator columns for ``Sex``, ``ChestPainType``, ``RestingECG``,
        ``ST_Slope`` and ``ExerciseAngina`` (first level of each dropped).
        With ``scale=True`` all values are floats produced by the scaler and
        the index is a fresh ``RangeIndex``.

    Raises
    ------
    KeyError
        If any of the five categorical columns is missing from the file.
    """
    # Source column -> prefix used for its indicator columns.
    categorical_prefixes = {
        'Sex': 'Sex',
        'ChestPainType': 'ChestPain',
        'RestingECG': 'ECG',
        'ST_Slope': 'ST_Slope',
        'ExerciseAngina': 'ExerciseAngina',
    }

    raw = pd.read_csv(path)

    # One call encodes all five columns: the originals are removed and the
    # indicators are appended in the order listed above. drop_first avoids a
    # redundant (perfectly collinear) indicator per category.
    encoded = pd.get_dummies(
        raw,
        columns=list(categorical_prefixes),
        prefix=categorical_prefixes,
        drop_first=True,
        dtype=int,
    )

    if not scale:
        return encoded

    # NOTE: the scaler sees every row, so min/max come from the whole file.
    scaler = MinMaxScaler()
    return pd.DataFrame(scaler.fit_transform(encoded), columns=encoded.columns)

In [15]:
# Build the encoded, min-max scaled table and preview its first rows.
data = gather_and_clean_data()
data.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPain_ATA,ChestPain_NAP,ChestPain_TA,ECG_Normal,ECG_ST,ST_Slope_Flat,ST_Slope_Up,ExerciseAngina_Y
0,0.244898,0.7,0.47927,0.0,0.788732,0.295455,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.428571,0.8,0.298507,0.0,0.676056,0.409091,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.183673,0.65,0.46932,0.0,0.267606,0.295455,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.408163,0.69,0.354892,0.0,0.338028,0.465909,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,0.530612,0.75,0.323383,0.0,0.43662,0.295455,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [29]:
# Separate the label column from the feature columns.
X = data.drop("HeartDisease", axis=1)
Y = data['HeartDisease']
X.columns = X.columns.astype(str)  # make sure every column label is a str

# Hold out 30% of the rows for evaluation; fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.3, random_state=1)

# `data` was min-max scaled using the min/max of *all* rows, so statistics from
# the test rows leaked into preprocessing. Re-fit the scaler on the training
# rows only and apply it to both splits. Min-max scaling is affine, so this is
# equivalent to scaling the raw values with training-only statistics; test
# values may now fall slightly outside [0, 1], which is expected.
feature_scaler = MinMaxScaler().fit(X_train)
X_train = pd.DataFrame(
    feature_scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    feature_scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

In [37]:
# Train a logistic-regression classifier with default settings on the
# training split; `fit` returns the fitted estimator itself.
logModel = LogisticRegression().fit(X_train, y_train)
logModel  # display the fitted estimator as the cell output

In [41]:
# Predict a class label for every row of the held-out test features.
y_prediction = logModel.predict(X_test)

In [43]:
# Score the test-set predictions three ways, then print the results.
accuracy = accuracy_score(y_test, y_prediction)
report = classification_report(
    y_test,
    y_prediction,
    # Names are matched to the sorted class labels, i.e. 0 then 1.
    target_names=['No Heart Disease', 'Heart Disease'],
)
# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_prediction)

print("Accuracy on test data:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy on test data: 0.8804347826086957

Classification Report:
                   precision    recall  f1-score   support

No Heart Disease       0.83      0.87      0.85       109
   Heart Disease       0.91      0.89      0.90       167

        accuracy                           0.88       276
       macro avg       0.87      0.88      0.88       276
    weighted avg       0.88      0.88      0.88       276


Confusion Matrix:
 [[ 95  14]
 [ 19 148]]
