<a href="https://colab.research.google.com/github/anchalvishawakarma/Projects/blob/main/Health_Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import Data Manipulation Libraries
import pandas as pd
import numpy as np

# Import Visualization Libraries
import matplotlib.pyplot as plt #--> vender is matlab
import seaborn as sns

# Import the warnings library and silence warning output
import warnings
warnings.filterwarnings('ignore')

# Customise Descriptive Stats
from collections import OrderedDict

# Import Machine Learning Libraries
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score, mean_squared_error,confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import cross_val_score



In [2]:
# Load the heart-disease CSV straight from GitHub into a DataFrame.
url = 'https://raw.githubusercontent.com/chandanc5525/MachineLearning-Bootcamp/refs/heads/main/heart-disease.csv'
df = pd.read_csv(url)

# Preview five random rows. random_state is pinned so that a
# Restart-and-Run-All displays the same rows every time.
df.sample(5, random_state=42)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
256,58,1,0,128,259,0,0,130,1,3.0,1,2,3,0
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1,0
209,59,1,0,140,177,0,1,162,1,0.0,2,1,3,0
242,64,1,0,145,212,0,0,132,0,2.0,1,2,1,0
56,48,1,0,122,222,0,0,186,0,0.0,2,0,2,1


In [3]:
# Print each column's name, non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# The percentiles are spelled out; they are the same ones describe() uses by default.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [5]:

# Custom descriptive-statistics report: one record per column of df,
# collected in `stats` and turned into the `report` DataFrame.
stats = []
for i in df.columns:
    column = df[i]
    # Compute the quartiles once; they are reused for the IQR.
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    numerical_stats = OrderedDict({
        'Feature': i,
        'Mean': column.mean(),
        'Median': column.median(),
        'Std': column.std(),
        'Max': column.max(),
        'Min': column.min(),
        '25%': q1,
        '75%': q3,
        'IQR': q3 - q1,
        'Skew': column.skew(),
        'Kurtosis': column.kurtosis()
    })
    stats.append(numerical_stats)

# Build the frame once, after the loop: rebuilding it on every iteration is
# wasted work, and `report` would be undefined if df had no columns.
report = pd.DataFrame(stats)

# Transposed so that features are columns and statistics are rows.
report.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
Feature,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
Mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
Median,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
Std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
Max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0
Min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
IQR,13.5,1.0,2.0,20.0,63.5,0.0,1.0,32.5,1.0,1.6,1.0,1.0,1.0,1.0
Skew,-0.202463,-0.791335,0.484732,0.713768,1.143401,1.986652,0.162522,-0.53741,0.742532,1.26972,-0.508316,1.310422,-0.476722,-0.179821


In [6]:
# Number of rows for each distinct `age` value, most frequent first.
df.loc[:, 'age'].value_counts()

Unnamed: 0_level_0,count
age,Unnamed: 1_level_1
58,19
57,17
54,16
59,14
52,13
51,12
62,11
60,11
44,11
56,11


In [7]:
# Number of rows for each `sex` code.
# Optional: append .plot(kind='barh') to draw the counts as a horizontal bar chart.
df.loc[:, 'sex'].value_counts()

Unnamed: 0_level_0,count
sex,Unnamed: 1_level_1
1,207
0,96


In [8]:
# cp: chest pain type. The codebook numbers the categories 1-4:
#   -- Value 1: typical angina
#   -- Value 2: atypical angina
#   -- Value 3: non-anginal pain
#   -- Value 4: asymptomatic
# NOTE(review): this CSV encodes cp as 0-3 (see the counts this cell prints),
# so the numbering above does not match the data as-is. How 0-3 maps onto
# these labels is not established in this notebook -- TODO confirm against
# the dataset source before interpreting the counts.
df['cp'].value_counts()

Unnamed: 0_level_0,count
cp,Unnamed: 1_level_1
0,143
2,87
1,50
3,23


In [9]:
# trestbps: resting blood pressure (in mm Hg on admission to the hospital).
# Number of rows for each distinct reading, most frequent first.
df.loc[:, 'trestbps'].value_counts()


Unnamed: 0_level_0,count
trestbps,Unnamed: 1_level_1
120,37
130,36
140,32
110,19
150,17
138,13
128,12
160,11
125,11
112,9


In [10]:
# chol: serum cholesterol in mg/dl.
# Number of rows for each distinct value, most frequent first.
df.loc[:, 'chol'].value_counts()

Unnamed: 0_level_0,count
chol,Unnamed: 1_level_1
204,6
197,6
234,6
269,5
254,5
...,...
284,1
224,1
167,1
276,1


In [11]:
# fbs: fasting blood sugar > 120 mg/dl (1 = true; 0 = false).
# Number of rows for each flag value.
df.loc[:, 'fbs'].value_counts()

Unnamed: 0_level_0,count
fbs,Unnamed: 1_level_1
0,258
1,45


In [12]:
"""restecg: resting electrocardiographic results
        -- Value 0: normal
        -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
        -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria"""

# Number of rows for each `restecg` code, most frequent first.
df.loc[:, 'restecg'].value_counts()

Unnamed: 0_level_0,count
restecg,Unnamed: 1_level_1
1,152
0,147
2,4


In [13]:
# thalach: maximum heart rate achieved.
# Number of rows for each distinct value, most frequent first.
df.loc[:, 'thalach'].value_counts()

Unnamed: 0_level_0,count
thalach,Unnamed: 1_level_1
162,11
160,9
163,9
152,8
173,8
...,...
202,1
184,1
121,1
192,1


In [14]:
# exang: exercise-induced angina flag. Number of rows for each value.
df.loc[:, 'exang'].value_counts()

Unnamed: 0_level_0,count
exang,Unnamed: 1_level_1
0,204
1,99


In [15]:
# oldpeak: ST depression induced by exercise.
# Number of rows for each distinct value, most frequent first.
df.loc[:, 'oldpeak'].value_counts()

Unnamed: 0_level_0,count
oldpeak,Unnamed: 1_level_1
0.0,99
1.2,17
1.0,14
0.6,14
1.4,13
0.8,13
0.2,12
1.6,11
1.8,10
0.4,9


In [16]:
# slope: slope of the peak exercise ST segment. Number of rows for each code.
df.loc[:, 'slope'].value_counts()

Unnamed: 0_level_0,count
slope,Unnamed: 1_level_1
2,142
1,140
0,21


In [17]:
# ca: number of major vessels. Number of rows for each value.
df.loc[:, 'ca'].value_counts()

Unnamed: 0_level_0,count
ca,Unnamed: 1_level_1
0,175
1,65
2,38
3,20
4,5


In [18]:
# Number of rows for each `thal` code, most frequent first.
df.loc[:, 'thal'].value_counts()

Unnamed: 0_level_0,count
thal,Unnamed: 1_level_1
2,166
3,117
1,18
0,2


In [19]:
# Distribution of the `target` column: number of rows for each value.
df.loc[:, 'target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,165
0,138


In [20]:
# Look at five randomly chosen rows of the data.
df.sample(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
61,54,1,1,108,309,0,1,156,0,0.0,2,0,3,1
166,67,1,0,120,229,0,0,129,1,2.6,1,2,3,0
165,67,1,0,160,286,0,0,108,1,1.5,1,3,2,0
44,39,1,2,140,321,0,0,182,0,0.0,2,0,2,1
268,54,1,0,122,286,0,0,116,1,3.2,1,2,2,0


# **Data Insights**

### **1. Most Affected Age Groups**  
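- Patients in their **50s** are the most represented in the dataset: **58 years** (19 records) is the most common age, followed by **57 years** (17 records) and **54 years** (16 records). These counts cover all 303 records regardless of `target`, so they describe the sample's age distribution rather than heart-disease prevalence; age must be compared against `target` to assess risk.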

---

### **2. Distribution by Sex**  
- **Male (Sex 1)**: **207 records** (about 68%), so men make up roughly two-thirds of the dataset.  
- **Female (Sex 0)**: **96 records** (about 32%). These counts include all patients regardless of `target`, so they reflect the sample's composition rather than disease prevalence by sex.

---

### **3. Most Common Chest Pain Types (cp)**  
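- **Asymptomatic (Value 0)**: Most frequent, with **143 cases** (about 47%), making it challenging for diagnosis due to the lack of symptoms.  
- **Atypical Angina (Value 2)**: Second most common with **87 cases** (about 29%).  
- **Typical Angina (Value 1)**: Moderate frequency, with **50 cases** (about 17%).  
- **Non-Anginal Pain (Value 3)**: Least common, with **23 cases** (about 8%).  
- **Note**: The code-to-label mapping used above is unverified. The codebook quoted in the `cp` cell lists the categories as 1–4 (typical angina, atypical angina, non-anginal pain, asymptomatic), whereas the data is coded 0–3; confirm the mapping against the dataset documentation before relying on these labels.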

---

### **4. Resting Blood Pressure (trestbps)**  
- **Common Range**: Most cases fall within **120–140 mmHg**, with **120 mmHg** (37 cases) being the most frequent.  
- **High BP**: Values above **150 mmHg** are less frequent, with the highest being **200 mmHg** (1 case).  
- **Low BP**: Rare values below **110 mmHg**, e.g., **94 mmHg** (2 cases).  
- **Key Focus**: **120–140 mmHg** is the critical range for monitoring heart disease risk.

---

### **5. Cholesterol Levels (chol)**  
- **Common Range**: Most cases fall between **200–270**, with **204**, **197**, and **234** being the most frequent (**6 cases each**).  
- **Rare Values**: Values such as **131** and **284** appear only **once** each; note that the actual extremes are **126** (minimum) and **564** (maximum), per `df.describe()`.  
- **Key Focus**: Cholesterol levels in the **200–270** range are crucial for monitoring heart disease risk.

---

### **6. Fasting Blood Sugar (fbs)**  
- **Low FBS (Value 0)**: Dominates the dataset with **258 cases**, indicating that most individuals do not have elevated fasting blood sugar levels (>120 mg/dL).  
- **High FBS (Value 1)**: Present in **45 cases** (about 15%), so elevated fasting blood sugar is relatively uncommon in this dataset as a whole (the count is not restricted to heart-disease patients).

---

### **7. Resting ECG (restecg)**  
- **ST-T Wave Abnormality (Value 1)**: Most common, with **152 cases**.  
- **Normal (Value 0)**: Second most frequent, with **147 cases**.  
- **Left Ventricular Hypertrophy (Value 2)**: Rare, with only **4 cases**.

---

### **8. Maximum Heart Rate Achieved (thalach)**  
- **Common Range**: Most cases fall between **150–170 bpm**, with **162 bpm** being the most frequent (**11 cases**).  
- **Rare Values**: Extremes like **90 bpm** and high values such as **184**, **192**, and **202 bpm** are uncommon, with **1 case** each; the overall observed range is **71–202 bpm**, per `df.describe()`.

---

### **9. Exercise-Induced Angina (exang)**  
- **No Angina (Value 0)**: Dominates with **204 cases**, indicating the majority do not experience exercise-induced angina.  
- **Angina Present (Value 1)**: Found in **99 cases**, representing about **one-third** of the dataset.

---

### **10. Oldpeak (ST Depression Induced by Exercise)**  
- **Most Frequent Value**: **0.0** (99 cases), indicating no ST depression in many patients.  
- **Common Range**: Values between **0.0–2.0** dominate, with **1.2** (17 cases) being the most frequent among them.  
- **Rare Extremes**: Values above **3.0** are uncommon.

---

### **11. Slope of the Peak Exercise ST Segment (slope)**  
- **Up-sloping (Value 2)**: Most common, with **142 cases**.  
- **Flat (Value 1)**: Close second, with **140 cases**.  
- **Down-sloping (Value 0)**: Least common, with **21 cases**.

---

### **12. Number of Major Vessels (ca)**  
- **No Vessels (Value 0)**: Most common, with **175 cases**, indicating no major vessel involvement in many patients.  
- **1–2 Vessels**: **65 cases** (1 vessel) and **38 cases** (2 vessels), representing significant subsets.  
- **3–4 Vessels**: Rare, with **20 cases** (3 vessels) and **5 cases** (4 vessels).

---

### **13. Thalassemia (thal)**  
- **Fixed Defect (Value 2)**: Most common, with **166 cases**.  
- **Normal (Value 3)**: Second most frequent, with **117 cases**.  
- **Reversible Defect (Value 1)**: Rare, with **18 cases**.  
- **Unknown/Other (Value 0)**: Extremely rare, with only **2 cases**.

---


In [21]:
# Draw another five random rows to spot-check the data.
df.sample(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
11,48,0,2,130,275,0,1,139,0,0.2,2,0,2,1
288,57,1,0,110,335,0,1,143,1,3.0,1,1,3,0
210,57,1,2,128,229,0,0,150,0,0.4,1,1,3,0
105,68,0,2,120,211,0,0,115,0,1.5,1,0,2,1
46,44,1,2,140,235,0,0,180,0,0.0,2,0,2,1
