In [20]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [21]:
# Read the heart-disease dataset from a CSV file into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
heart_data = pd.read_csv(filepath_or_buffer="Dataset.csv")

In [22]:
# Preview the top of the table (first 5 records)
print(heart_data.head(n=5))

   age  sex  chest_pain  resting_blood_pressure  heartbeat_rate  \
0   63    1           3                     145             105   
1   37    1           2                     130              76   
2   41    0           1                     130             106   
3   56    1           1                     120             107   
4   57    0           0                     120             107   

   fasting_blood_sugar  maximum_heart_rate  exercise_induced_angina  target  
0                    1                 150                        1       1  
1                    0                 187                        0       1  
2                    0                 172                        0       1  
3                    0                 178                        0       1  
4                    1                 163                        1       1  


In [23]:
# Preview the bottom of the table (last 5 records)
print(heart_data.tail(n=5))

     age  sex  chest_pain  resting_blood_pressure  heartbeat_rate  \
303   25    1           0                     120              72   
304   30    0           0                     130              90   
305   40    0           3                     120              72   
306   34    1           1                     120              90   
307   30    0           1                     120              80   

     fasting_blood_sugar  maximum_heart_rate  exercise_induced_angina  target  
303                    0                 150                        0       0  
304                    0                 160                        0       0  
305                    1                 140                        0       0  
306                    1                 170                        1       1  
307                    1                 160                        1       1  


In [24]:
# Report the dataset dimensions as a (row count, column count) tuple
print((len(heart_data.index), len(heart_data.columns)))

(308, 9)


In [25]:
# Summarise the columns: names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   age                      308 non-null    int64
 1   sex                      308 non-null    int64
 2   chest_pain               308 non-null    int64
 3   resting_blood_pressure   308 non-null    int64
 4   heartbeat_rate           308 non-null    int64
 5   fasting_blood_sugar      308 non-null    int64
 6   maximum_heart_rate       308 non-null    int64
 7   exercise_induced_angina  308 non-null    int64
 8   target                   308 non-null    int64
dtypes: int64(9)
memory usage: 21.8 KB
None


In [26]:
# Count the missing (NaN/None) entries in every column
print(heart_data.isna().sum())

age                        0
sex                        0
chest_pain                 0
resting_blood_pressure     0
heartbeat_rate             0
fasting_blood_sugar        0
maximum_heart_rate         0
exercise_induced_angina    0
target                     0
dtype: int64


In [27]:
# Descriptive statistics per numeric column: count, mean, std, min,
# quartiles (25% / 50% / 75%) and max
print(heart_data.describe(percentiles=[0.25, 0.5, 0.75]))

             age         sex  chest_pain  resting_blood_pressure  \
count  308.00000  308.000000  308.000000              308.000000   
mean    54.00000    0.678571    0.974026              131.639610   
std      9.47137    0.467785    1.033309               17.405211   
min     25.00000    0.000000    0.000000               94.000000   
25%     47.00000    0.000000    0.000000              120.000000   
50%     55.00000    1.000000    1.000000              130.000000   
75%     61.00000    1.000000    2.000000              140.000000   
max     77.00000    1.000000    3.000000              200.000000   

       heartbeat_rate  fasting_blood_sugar  maximum_heart_rate  \
count      308.000000           308.000000          308.000000   
mean        95.370130             0.162338          149.805195   
std         20.819513             0.369360           22.337820   
min         63.000000             0.000000           71.000000   
25%         73.750000             0.000000          136.0

In [28]:
# Tally how many rows fall into each class of the label column
# (a quick check of class balance before splitting)
print(heart_data.loc[:, 'target'].value_counts())

target
1    167
0    141
Name: count, dtype: int64


In [29]:
# Separate the label from the predictors:
#   Y -> the 'target' column (the value the model learns to predict)
#   X -> every remaining column (the model inputs)
Y = heart_data['target']
X = heart_data.drop(columns=['target'])

In [30]:
# Show the feature matrix; pandas abbreviates long frames, so only the first
# and last rows are printed
print(X)

     age  sex  chest_pain  resting_blood_pressure  heartbeat_rate  \
0     63    1           3                     145             105   
1     37    1           2                     130              76   
2     41    0           1                     130             106   
3     56    1           1                     120             107   
4     57    0           0                     120             107   
..   ...  ...         ...                     ...             ...   
303   25    1           0                     120              72   
304   30    0           0                     130              90   
305   40    0           3                     120              72   
306   34    1           1                     120              90   
307   30    0           1                     120              80   

     fasting_blood_sugar  maximum_heart_rate  exercise_induced_angina  
0                      1                 150                        1  
1                      0   

In [31]:
# Show the label Series; pandas abbreviates long Series, so only the first
# and last entries are printed
print(Y)

0      1
1      1
2      1
3      1
4      1
      ..
303    0
304    0
305    0
306    1
307    1
Name: target, Length: 308, dtype: int64


In [32]:
# Hold out 20% of the rows as a test set.
# stratify=Y asks for the class proportions of Y to be preserved in both
# partitions; random_state pins the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)
# Sanity check: shapes of the full, training and test feature matrices
print(*(frame.shape for frame in (X, X_train, X_test)))

(308, 8) (246, 8) (62, 8)


In [33]:
# Chain preprocessing and the classifier into a single estimator so that both
# are always fitted and applied together.
pipeline = Pipeline(
    steps=[
        # Step 1: feature scaling
        ('scaler', StandardScaler()),
        # Step 2: logistic regression classifier (library defaults)
        ('model', LogisticRegression()),
    ]
)

In [34]:
# Estimate accuracy on the training partition with 10-fold stratified
# cross-validation; folds are shuffled with a fixed seed for reproducibility.
kfold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=2,
)
cross_val_results = cross_val_score(
    pipeline,
    X_train,
    Y_train,
    scoring='accuracy',
    cv=kfold,
)
# Report the mean of the per-fold accuracy scores
print(f'Cross-validation accuracy: {cross_val_results.mean()}')

Cross-validation accuracy: 0.9754999999999999


In [35]:
# Fit the whole pipeline (scaler, then logistic regression) on the training partition
pipeline.fit(X=X_train, y=Y_train)

In [36]:
# Accuracy on the data the pipeline was fitted on
X_train_prediction = pipeline.predict(X_train)
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)

# Accuracy on the held-out test partition
X_test_prediction = pipeline.predict(X_test)
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)

# Print the two figures together so they can be compared
print(f'Accuracy on training data: {training_data_accuracy}')
print(f'Accuracy on test data: {test_data_accuracy}')

Accuracy on training data: 0.983739837398374
Accuracy on test data: 0.9354838709677419


In [37]:
# Further test-set metrics computed from the predicted labels
precision = precision_score(y_true=Y_test, y_pred=X_test_prediction)
recall = recall_score(y_true=Y_test, y_pred=X_test_prediction)
f1 = f1_score(y_true=Y_test, y_pred=X_test_prediction)

# ROC-AUC needs scores rather than labels: take the second column of
# predict_proba (presumably the probability of class 1 -- verify against
# pipeline.classes_)
roc_auc = roc_auc_score(
    y_true=Y_test,
    y_score=pipeline.predict_proba(X_test)[:, 1],
)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC-AUC Score: {roc_auc}')

Precision: 1.0
Recall: 0.8823529411764706
F1 Score: 0.9375
ROC-AUC Score: 0.9684873949579832


In [38]:
# Persist the fitted pipeline (scaler + classifier) to 'model.pkl'.
# The context manager guarantees the file handle is flushed and closed, even
# if pickling raises, instead of leaving the handle to garbage collection.
# NOTE: only unpickle this file from a trusted source -- pickle.load can
# execute arbitrary code.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)