## Heart Disease Analysis


### In this Heart Disease Analysis project, I use Python to perform comprehensive data cleaning, add new derived columns, and rename existing columns. I also carry out an extensive Exploratory Data Analysis (EDA).

### The data is sourced from a CSV file, which I process to ensure accuracy and derive meaningful insights.

In [44]:
# Import necessary libraries

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [46]:
# Load the raw dataset; df_main is kept as the untouched original.
df_main = pd.read_csv("Heart_disease_dataset.csv")

# Work on an independent copy. Plain assignment (df = df_main) would only
# bind a second name to the same object, so every later column addition or
# in-place rename on df would silently modify df_main as well.
df = df_main.copy()

print(df) # Print the DataFrame to check its contents

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   1       145   233    1        2      150      0      2.3   
1     67    1   4       160   286    0        2      108      1      1.5   
2     67    1   4       120   229    0        2      129      1      2.6   
3     37    1   3       130   250    0        0      187      0      3.5   
4     41    0   2       130   204    0        2      172      0      1.4   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   45    1   1       110   264    0        0      132      0      1.2   
299   68    1   4       144   193    1        0      141      0      3.4   
300   57    1   4       130   131    0        0      115      1      1.2   
301   57    0   2       130   236    0        2      174      0      0.0   
302   38    1   3       138   175    0        0      173      0      0.0   

     slope      ca  thal  num  
0        3       0     6    0  
1        2       3     

In [47]:
# Preview the first two rows of the DataFrame
df.head(n=2)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,1


In [48]:
# Descriptive statistics of the numeric columns, transposed so that each
# column of the DataFrame becomes one row of the summary
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,303.0,54.438944,9.038662,29.0,48.0,56.0,61.0,77.0
sex,303.0,0.679868,0.467299,0.0,0.0,1.0,1.0,1.0
cp,303.0,3.158416,0.960126,1.0,3.0,3.0,4.0,4.0
trestbps,303.0,131.689769,17.599748,94.0,120.0,130.0,140.0,200.0
chol,303.0,246.693069,51.776918,126.0,211.0,241.0,275.0,564.0
fbs,303.0,0.148515,0.356198,0.0,0.0,0.0,0.0,1.0
restecg,303.0,0.990099,0.994971,0.0,0.0,1.0,2.0,2.0
thalach,303.0,149.607261,22.875003,71.0,133.5,153.0,166.0,202.0
exang,303.0,0.326733,0.469794,0.0,0.0,0.0,1.0,1.0
oldpeak,303.0,1.039604,1.161075,0.0,0.0,0.8,1.6,6.2


In [49]:
df.shape # Tuple of (number of rows, number of columns)

(303, 14)

In [50]:
df.dtypes # Data type (dtype) of each column

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
num           int64
dtype: object

In [8]:
df.columns # Index object holding the column names of the DataFrame

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

In [14]:
# Preview the first three rows of the DataFrame
df.head(n=3)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1


In [51]:
# Count the missing (null) values in each column of the DataFrame
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [16]:
 df.duplicated().sum() # Total number of rows flagged as duplicates by DataFrame.duplicated()

0

In [52]:
df.nunique() # Number of distinct values in each column of the DataFrame

age          41
sex           2
cp            4
trestbps     50
chol        152
fbs           2
restecg       3
thalach      91
exang         2
oldpeak      40
slope         3
ca            5
thal          4
num           2
dtype: int64

In [18]:
# Missing-value count for the 'oldpeak' column on its own
df['oldpeak'].isna().sum()

0

In [53]:
colval = df.columns # Get the list of column names from the DataFrame

# Selecting every column at once already yields the per-column null counts,
# so a single statement is enough. The former `for i in colval: ... break`
# wrapper never used its loop variable and always stopped after one pass.
print(df[colval].isnull().sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64


In [54]:
# Map a numeric diagnosis code to a 'Yes' / 'No' heart-disease label
def is_heart_disease(value):
    """Return 'No' when ``value`` equals 0, and 'Yes' for any other value."""
    return 'No' if value == 0 else 'Yes'

In [21]:
# Add a 'Yes' / 'No' label column derived element-wise from the 'num' column
df['is_heart_disease'] = df['num'].map(is_heart_disease)

In [22]:
df.columns # List the column names again to inspect the current set of columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num', 'is_heart_disease'],
      dtype='object')

In [23]:
# Count the non-null 'num' entries within each 'is_heart_disease' group
heart_disease_count = df['num'].groupby(df['is_heart_disease']).count()
print(heart_disease_count)

is_heart_disease
No     164
Yes    139
Name: num, dtype: int64


In [24]:
# Summarise the 'age' column: smallest, largest and average value
age_column = df['age']
min_age = age_column.min()
max_age = age_column.max()
mean_age = age_column.mean()

# One value per line; the mean is rounded to two decimals for display
print(min_age, max_age, round(mean_age, 2), sep='\n')

29
77
54.44


In [31]:
def age_range(row):
    """Bucket a single age value into a named age range.

    Parameters
    ----------
    row : int or float
        One age value (the function is applied element-wise to a column).

    Returns
    -------
    str
        'Young Age' for ages below 40, 'Middle Age' for ages from 40 up to
        (but not including) 55, and 'Old Age' otherwise.
    """
    # The youngest bucket has no lower bound: with the former `row >= 29`
    # condition any age under 29 fell through to the final branch and was
    # labelled "Old Age".
    if row < 40:
        return 'Young Age'
    if row < 55:
        return 'Middle Age'
    return "Old Age"

In [32]:
# Label every record with its age bucket, then preview the first three rows
df['ageRange'] = df['age'].map(age_range)
df.head(n=3)

Unnamed: 0,age,sex,cp,trestbps (Blood Pressure),chol (cholestrol),fbs (Blood Sugar),restecg,thalach (Heart Rate),exang,oldpeak,slope,ca,thal,num,is_heart_disease,ageRange
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0,No,Old Age
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,1,Yes,Old Age
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1,Yes,Old Age


In [38]:
# Give the abbreviated columns more descriptive labels and rename the
# derived 'ageRange' column; the rename is applied to df in place
column_labels = {
    "trestbps": "trestbps (Blood Pressure)",
    "chol": "chol (cholestrol)",
    "fbs": "fbs (Blood Sugar)",
    "thalach": "thalach (Heart Rate)",
    "ageRange": "Age_Range",
}
df.rename(columns=column_labels, inplace=True)
print(df)

     age  sex  cp  trestbps (Blood Pressure)  chol (cholestrol)  \
0     63    1   1                        145                233   
1     67    1   4                        160                286   
2     67    1   4                        120                229   
3     37    1   3                        130                250   
4     41    0   2                        130                204   
..   ...  ...  ..                        ...                ...   
298   45    1   1                        110                264   
299   68    1   4                        144                193   
300   57    1   4                        130                131   
301   57    0   2                        130                236   
302   38    1   3                        138                175   

     fbs (Blood Sugar)  restecg  thalach (Heart Rate)  exang  oldpeak  slope  \
0                    1        2                   150      0      2.3      3   
1                    0        2    

In [39]:
# Show the first row to check the renamed column headers
df.head(n=1)

Unnamed: 0,age,sex,cp,trestbps (Blood Pressure),chol (cholestrol),fbs (Blood Sugar),restecg,thalach (Heart Rate),exang,oldpeak,slope,ca,thal,num,is_heart_disease,Age_Range
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0,No,Old Age


In [40]:
# Translate the numeric sex code into a readable label
def gender(val):
    """Return 'Male' when ``val`` equals 1, and 'Female' for any other value."""
    return 'Male' if val == 1 else 'Female'

In [42]:
# Derive a readable 'Gender' column from the numeric 'sex' codes,
# then preview the first twenty rows
df['Gender'] = df['sex'].map(gender)
df.head(n=20)

Unnamed: 0,age,sex,cp,trestbps (Blood Pressure),chol (cholestrol),fbs (Blood Sugar),restecg,thalach (Heart Rate),exang,oldpeak,slope,ca,thal,num,is_heart_disease,Age_Range,Gender
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0,No,Old Age,Male
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,1,Yes,Old Age,Male
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1,Yes,Old Age,Male
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0,No,Young Age,Male
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0,No,Middle Age,Female
5,56,1,2,120,236,0,0,178,0,0.8,1,0,3,0,No,Old Age,Male
6,62,0,4,140,268,0,2,160,0,3.6,3,2,3,1,Yes,Old Age,Female
7,57,0,4,120,354,0,0,163,1,0.6,1,0,3,0,No,Old Age,Female
8,63,1,4,130,254,0,2,147,0,1.4,2,1,7,1,Yes,Old Age,Male
9,53,1,4,140,203,1,2,155,1,3.1,3,0,7,1,Yes,Middle Age,Male


In [55]:
# Write the DataFrame to a CSV file, leaving out the row index
df.to_csv(path_or_buf='heart_disease.csv', index=False)