Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
import pickle
import matplotlib.pyplot as plt

Data Collection and Processing

In [None]:

# Read the heart-disease CSV into a Pandas DataFrame.
# The path is kept in its own variable so it is easy to spot and change.
heart_csv_path = '/content/heart.csv'
heart_data = pd.read_csv(heart_csv_path)

In [None]:
# Preview the first five records of the dataset
first_rows = heart_data.head(n=5)
print(first_rows)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [None]:

# Preview the final five records of the dataset
last_rows = heart_data.tail(n=5)
print(last_rows)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0
302,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0


In [None]:

# Report the dataset dimensions as (rows, columns)
n_rows, n_columns = heart_data.shape
print("Shape of the data:", (n_rows, n_columns))

(303, 14)

In [None]:
# Get information about the dataset (column names, non-null counts, dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
print("Dataset Info:")
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [None]:
# Count the null entries found in every column
missing_per_column = heart_data.isna().sum()
print("Missing values in each column:")
print(missing_per_column)

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) per column
summary_stats = heart_data.describe()
print("Statistical summary:")
print(summary_stats)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [None]:

# Count how many records fall into each target class
target_counts = heart_data['target'].value_counts()
print("Target variable distribution:")
print(target_counts)

1    165
0    138
Name: target, dtype: int64

1 --> Defective Heart

0 --> Healthy Heart

Splitting the Features and Target

In [None]:
# Splitting the Features and Target
# X holds every predictor column, Y holds the 'target' label column.
# `columns=` already names the axis, so the redundant `axis=1` is omitted.
X = heart_data.drop(columns='target')
Y = heart_data['target']

# Show a short preview rather than printing the entire frame and series.
print("Features (X):")
print(X.head())
print("Target (Y):")
print(Y.head())

Splitting the Data into Training data & Test Data

In [None]:
# Hold out 20% of the records as a test set; stratify on Y so both splits keep
# the same class balance, and fix the seed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)
for shape_label, feature_frame in (
    ("Shape of X:", X),
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
):
    print(shape_label, feature_frame.shape)

Cross Validation

In [None]:
# 5-fold cross-validated accuracy of a logistic regression on all features.
# `log_reg` is reused by later cells, so the estimator keeps this name.
log_reg = LogisticRegression(max_iter=1000)
cv_scores = cross_val_score(
    estimator=log_reg,
    X=X,
    y=Y,
    scoring='accuracy',
    cv=5,
)
mean_cv_score = cv_scores.mean()
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", mean_cv_score)


Random Forest

In [None]:
# Base Random Forest classifier with a fixed random_state.
# It is the estimator handed to RandomizedSearchCV in the next code cell, and
# the tuned model from that search later drives the feature selection.
random_forest = RandomForestClassifier(random_state=42)

Randomized Search (Hyperparameter Tuning)

In [None]:
# Candidate hyperparameter values sampled by RandomizedSearchCV
param_dist = dict(
    n_estimators=[10, 50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Try 50 sampled combinations with 5-fold CV on the training split only,
# scoring by accuracy and using every available CPU core.
random_search = RandomizedSearchCV(
    estimator=random_forest,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, Y_train)

best_rf_params = random_search.best_params_
best_rf_score = random_search.best_score_
print("Best Random Forest parameters:", best_rf_params)
print("Best Random Forest score:", best_rf_score)

In [None]:
# Keep only the features the tuned forest ranks as important.
# prefit=True: the estimator is already fitted, so the selector is used as-is.
selector = SelectFromModel(random_search.best_estimator_, prefit=True)
X_train_selected, X_test_selected = (
    selector.transform(feature_split) for feature_split in (X_train, X_test)
)

selected_train_shape = X_train_selected.shape
print("Shape of selected features (training):", selected_train_shape)

Model Training

Logistic Regression

In [None]:
# Fit the logistic regression on the reduced feature set and score it on the
# held-out test split.
log_reg.fit(X_train_selected, Y_train)
log_reg_test_predictions = log_reg.predict(X_test_selected)
log_reg_accuracy = accuracy_score(Y_test, log_reg_test_predictions)
print("Logistic Regression accuracy on selected features:", log_reg_accuracy)

In [None]:

# Random Forest on selected features
# RandomizedSearchCV only clones its estimator, so `random_forest` is still the
# untuned template. Apply the best hyperparameters found by the search before
# fitting; otherwise the model would be trained with scikit-learn's defaults
# and the tuning results would be silently discarded.
random_forest.set_params(**random_search.best_params_)
random_forest.fit(X_train_selected, Y_train)
rf_accuracy = accuracy_score(Y_test, random_forest.predict(X_test_selected))
print("Random Forest accuracy on selected features:", rf_accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [None]:

# Predict heart disease for one patient with the logistic regression model.
# The 13 values follow the column order of X.
input_data = (62, 0, 0, 140, 268, 0, 0, 160, 0, 3.6, 0, 2, 2)

# The model expects a 2-D array: one row per sample.
input_data_as_numpy_array = np.asarray([input_data])

# Reduce the record to the same feature subset the model was trained on.
input_data_selected = selector.transform(input_data_as_numpy_array)

prediction = log_reg.predict(input_data_selected)
print(prediction)

diagnosis_message = (
    'The Person does not have Heart Disease'
    if prediction[0] == 0
    else 'The Person has Heart Disease'
)
print(diagnosis_message)

In [None]:

# Saving the trained Logistic Regression model
# NOTE: only the classifier is stored. It was fitted on selector-transformed
# features, so new inputs must pass through the same `selector` before being
# given to the reloaded model.
filename = 'heart_disease_logreg_model.sav'
# The context manager guarantees the file is flushed and closed, even if
# pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(log_reg, model_file)

In [None]:

# Loading the saved model
# Security: unpickling can execute arbitrary code, so only load model files
# that were produced by this notebook or another trusted source.
# The context manager closes the file handle once loading finishes.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# List every feature column name, one per line, under a heading
print("Feature columns:", *X.columns, sep="\n")

In [None]:
# Overfitting and Underfitting Analysis
# Train the model on the full training split, using the selected features only.
# NOTE(review): log_reg was already fitted on these same arrays in the
# "Logistic Regression" cell, so this refit looks redundant -- confirm.
log_reg.fit(X_train_selected, Y_train)


In [None]:

# Generate class predictions for both the training and the test split
train_predictions, test_predictions = (
    log_reg.predict(selected_features)
    for selected_features in (X_train_selected, X_test_selected)
)


In [None]:

# Accuracy of the predictions against the true labels, per split
train_accuracy = accuracy_score(y_true=Y_train, y_pred=train_predictions)
test_accuracy = accuracy_score(y_true=Y_test, y_pred=test_predictions)

for accuracy_label, accuracy_value in (
    ("Training accuracy:", train_accuracy),
    ("Test accuracy:", test_accuracy),
):
    print(accuracy_label, accuracy_value)


Overfitting Check

In [None]:
# Overfitting Check
# Comparing the two accuracies with exact > / < flags every tiny difference and
# mislabels "test beats train" as underfitting. Instead:
#   * overfitting  -> training accuracy exceeds test accuracy by a clear margin
#   * underfitting -> the model scores poorly even on the data it was trained on
# Both thresholds are heuristics; tune them for the problem at hand.
overfit_gap_tolerance = 0.05      # largest train-test gap still considered normal
min_acceptable_accuracy = 0.70    # below this on training data => underfitting

accuracy_gap = train_accuracy - test_accuracy
if accuracy_gap > overfit_gap_tolerance:
    print("The model is overfitting.")
elif train_accuracy < min_acceptable_accuracy:
    print("The model is underfitting.")
else:
    print("The model is performing well and is neither overfitting nor underfitting.")

In [None]:

# Compute learning-curve data (5-fold CV) to visualise over-/underfitting:
# training and validation scores at increasing training-set sizes.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=log_reg,
    X=X_train_selected,
    y=Y_train,
    cv=5,
)

In [None]:
# Aggregate across the CV folds (axis=1): mean and standard deviation of the
# score at each training-set size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

In [None]:
# Draw the learning curve: mean accuracy lines with a shaded +/- 1 std band
fig, ax = plt.subplots()
ax.plot(train_sizes, train_mean, color="blue", label="Training Accuracy")
ax.plot(train_sizes, test_mean, color="red", label="Cross-validation Accuracy")
ax.fill_between(
    train_sizes,
    train_mean - train_std,
    train_mean + train_std,
    alpha=0.1,
    color="blue",
)
ax.fill_between(
    train_sizes,
    test_mean - test_std,
    test_mean + test_std,
    alpha=0.1,
    color="red",
)

ax.set_title("Learning Curve")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Accuracy")
ax.legend(loc="best")
plt.show()

Building a Predictive System

Saving the trained model