In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [96]:
# Load the raw CSV into `df`; the preview shows that missing entries are stored as the literal string '?'.
df = pd.read_csv('./dataset/hungarian.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,4,140,260,0,1,112,1,3.0,2,?,?,2
1,44,1,4,130,209,0,1,127,0,0.0,?,?,?,0
2,60,1,4,132,218,0,1,140,1,1.5,3,?,?,2
3,55,1,4,142,228,0,1,149,1,2.5,1,?,?,1
4,66,1,3,110,213,1,2,99,1,1.3,2,?,?,0


In [97]:
# Show each column's dtype and non-null count for the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   age       200 non-null    int64 
 1   sex       200 non-null    int64 
 2   cp        200 non-null    int64 
 3   trestbps  200 non-null    object
 4   chol      200 non-null    object
 5   fbs       200 non-null    object
 6   restecg   200 non-null    int64 
 7   thalach   200 non-null    object
 8   exang     200 non-null    object
 9   oldpeak   200 non-null    object
 10  slope     200 non-null    object
 11  ca        200 non-null    object
 12  thal      200 non-null    object
 13  num       200 non-null    int64 
dtypes: int64(5), object(9)
memory usage: 22.0+ KB


In [98]:
# Work on an independent copy so the cleaning steps never modify the raw frame `df`.
# (Plain assignment `heart = df` would only bind a second name to the same object.)
heart = df.copy()

In [99]:
columns_with_missing_values = ['trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

# Swap the '?' placeholder for NaN in all listed columns in one vectorised call.
heart[columns_with_missing_values] = heart[columns_with_missing_values].replace('?', np.nan)


In [100]:
# Count the missing entries in every column of the frame.
heart.isna().sum()

age           0
sex           0
cp            0
trestbps     56
chol          7
fbs           7
restecg       0
thalach      53
exang        53
oldpeak      56
slope       102
ca          198
thal        166
num           0
dtype: int64

#### Handle Missing Values:

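For categorical columns (slope, ca, thal), missing values can be filled with the mode (the most frequent value) or with a specific category chosen from domain knowledge. Caveat: `ca` and `thal` are missing in 198 and 166 of the 200 rows respectively, so after mode imputation these columns are almost constant and carry very little information; dropping them is a reasonable alternative.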

In [101]:
# Fill each categorical column with its most frequent value.
# The result is assigned back to the frame instead of calling fillna(..., inplace=True)
# on `heart[column]`: that in-place call is a chained assignment, which newer pandas
# versions warn about and which does not update `heart` under Copy-on-Write.
# NOTE(review): the missing-value counts show `ca` and `thal` missing in 198 and 166 of
# 200 rows, so after this step they are close to constant; consider dropping them.
for column in ['slope', 'ca', 'thal']:
    heart[column] = heart[column].fillna(heart[column].mode()[0])

For binary columns (fbs, exang), you can fill missing values with the mode (most frequent value) of the respective column.

In [102]:
# Fill each binary column with its most frequent value.
# Assign the filled column back rather than using fillna(..., inplace=True) on a column
# selection, which is a chained assignment that newer pandas versions warn about and
# that leaves `heart` unchanged under Copy-on-Write.
for column in ['fbs', 'exang']:
    heart[column] = heart[column].fillna(heart[column].mode()[0])

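For numeric columns (trestbps, chol, thalach, oldpeak), missing values can be filled with the mean or the median of the respective column. The median is used here because it is less sensitive to extreme values than the mean. Note that filling many entries with a single value keeps the centre of the column but shrinks its spread, so the original distribution is only approximately preserved.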

In [103]:
# Impute each numeric column with its median.
# These columns were read as object dtype (the file used '?' for missing entries), so
# they are converted to numbers first: a median over strings is not well defined and
# newer pandas versions raise TypeError for it. The filled column is assigned back
# instead of using fillna(..., inplace=True) on a column selection, which is a chained
# assignment and does not update `heart` under Copy-on-Write.
for column in ['trestbps', 'chol', 'thalach', 'oldpeak']:
    numeric_values = pd.to_numeric(heart[column], errors='coerce')
    heart[column] = numeric_values.fillna(numeric_values.median())

In [104]:
# Re-count missing entries per column to confirm that none remain.
heart.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

#### Correcting the data types of attributes

In [105]:
# Cast the continuous measurements to a numeric dtype in one pass;
# any value that cannot be parsed as a number becomes NaN (errors='coerce').
numeric_columns = ['trestbps', 'chol', 'thalach', 'oldpeak']
heart[numeric_columns] = heart[numeric_columns].apply(pd.to_numeric, errors='coerce')

In [106]:
# Store the integer-coded categorical features with 'object' dtype so they are treated
# as non-numeric (note: this is not pandas' dedicated 'category' dtype).
categorical_columns = ['sex', 'num', 'cp', 'restecg']
heart[categorical_columns] = heart[categorical_columns].astype('object')


In [107]:
# Inspect the resulting dtype of every column.
heart.dtypes

age           int64
sex          object
cp           object
trestbps    float64
chol        float64
fbs          object
restecg      object
thalach     float64
exang        object
oldpeak     float64
slope        object
ca           object
thal         object
num          object
dtype: object

In [108]:
# Summary statistics for the numeric columns.
# NOTE(review): the output reports a minimum of 0.0 for trestbps and chol — presumably
# placeholder readings rather than real measurements; confirm and treat them as missing if so.
heart.describe()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak
count,200.0,200.0,200.0,200.0,200.0
mean,59.35,132.71,180.05,122.055,1.371
std,7.811697,18.335947,112.221727,18.876269,0.941216
min,35.0,0.0,0.0,69.0,-0.5
25%,55.0,125.75,129.25,112.0,1.0
50%,60.0,130.0,216.0,120.0,1.5
75%,64.0,140.0,254.5,130.0,2.0
max,77.0,190.0,458.0,180.0,4.0


#### Outlier detection

In [109]:
# Numeric features to screen for extreme values.
columns_with_outliers = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Absolute z-score of every value, z = (x - mean) / std, computed column by column.
z_scores = np.abs(stats.zscore(heart[columns_with_outliers]))

# A row is flagged when at least one screened column lies 3 or more
# standard deviations away from that column's mean.
is_outlier_row = (z_scores >= 3).any(axis=1)
outlier_rows = heart[is_outlier_row]

# Show the flagged rows.
print("Outlier Data Points:")
print(outlier_rows)

Outlier Data Points:
     age sex cp  trestbps   chol fbs restecg  thalach exang  oldpeak slope ca  \
24    52   1  3     128.0    0.0   0       1    180.0     0      3.0     1  0   
33    55   1  3       0.0    0.0   0       0    155.0     0      1.5     2  0   
70    55   1  2     110.0  214.0   1       1    180.0     0      1.5     2  0   
96    35   1  3     130.0  161.0   0       1    120.0     1      1.5     2  0   
176   61   1  4     190.0  287.0   1       2    150.0     1      2.0     3  0   

    thal num  
24     7   2  
33     7   3  
70     7   0  
96     7   0  
176    7   4  
