In [2]:
import pandas as pd
from scipy import stats

df = pd.read_csv('heart.csv')

print("Before removing outliers:")
print(df.head())


Before removing outliers:
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  


In [3]:
threshold = 3


numeric_columns = df.select_dtypes(include=['number']).columns


outlier_mask = pd.DataFrame(False, index=df.index, columns=df.columns)


for column in numeric_columns:
    z_scores = stats.zscore(df[column])
    outlier_mask[column] = abs(z_scores) > threshold


outlier_indices = outlier_mask.any(axis=1)


df_no_outliers = df[~outlier_indices]


print("After removing outliers:")
print(df_no_outliers.head())

After removing outliers:
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  


In [4]:
df_no_outliers.shape

(899, 12)

In [5]:
df.shape

(918, 12)

In [7]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [10]:
df.ChestPainType.unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [12]:
df.Sex.unique()

array(['M', 'F'], dtype=object)

In [13]:
df.RestingECG.unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [14]:
df.ExerciseAngina.unique()

array(['N', 'Y'], dtype=object)

In [15]:
df.ST_Slope.unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [16]:
df_no_outliers.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [22]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
text_columns_to_label_encode = ['Sex', 'RestingECG','ExerciseAngina','ST_Slope']
for column in text_columns_to_label_encode:
    df_no_outliers[column] = label_encoder.fit_transform(df_no_outliers[column])
text_columns_to_one_hot_encode = ['ChestPainType']
df_no_outliers = pd.get_dummies(df_no_outliers, drop_first=True)

df_no_outliers.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,1,140,289,0,1,172,0,0.0,2,0,1,0,0
1,49,0,160,180,0,1,156,0,1.0,1,1,0,1,0
2,37,1,130,283,0,2,98,0,0.0,2,0,1,0,0
3,48,0,138,214,0,1,108,1,1.5,1,1,0,0,0
4,54,1,150,195,0,1,122,0,0.0,2,0,0,1,0


In [23]:
df_no_outliers.to_csv('heart_numerical.csv')

In [24]:
X=df_no_outliers.drop('HeartDisease',axis='columns')
y=df_no_outliers.HeartDisease
X.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,1,140,289,0,1,172,0,0.0,2,1,0,0
1,49,0,160,180,0,1,156,0,1.0,1,0,1,0
2,37,1,130,283,0,2,98,0,0.0,2,1,0,0
3,48,0,138,214,0,1,108,1,1.5,1,0,0,0
4,54,1,150,195,0,1,122,0,0.0,2,0,1,0


In [25]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-1.42815446,  0.515943  ,  0.46590022, ...,  2.06332497,
        -0.5349047 , -0.22955001],
       [-0.47585532, -1.93819859,  1.63471366, ..., -0.48465463,
         1.86949191, -0.22955001],
       [-1.7455875 ,  0.515943  , -0.1185065 , ...,  2.06332497,
        -0.5349047 , -0.22955001],
       ...,
       [ 0.3706328 ,  0.515943  , -0.1185065 , ..., -0.48465463,
        -0.5349047 , -0.22955001],
       [ 0.3706328 , -1.93819859, -0.1185065 , ...,  2.06332497,
        -0.5349047 , -0.22955001],
       [-1.63977649,  0.515943  ,  0.34901888, ..., -0.48465463,
         1.86949191, -0.22955001]])

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=30)

In [28]:
X_train.shape

(719, 13)

In [29]:
X_test.shape

(180, 13)

In [30]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)
model_rf.score(X_test, y_test)

0.8611111111111112

In [31]:
from sklearn.decomposition import PCA

pca = PCA(0.95)
X_pca = pca.fit_transform(X)
X_pca

array([[ 93.1295843 , -29.67585012],
       [-16.33846884, -14.80341797],
       [ 82.66876737,  38.91645004],
       ...,
       [-68.22601267,  17.69533611],
       [ 40.02721805, -33.47297974],
       [-20.61248562, -37.61418393]])

In [32]:
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=30)

In [33]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier()
model_rf.fit(X_train_pca, y_train)
model_rf.score(X_test_pca, y_test)

0.6444444444444445