In [1]:
# Import the required libraries and dependencies
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the CSV file into a Pandas DataFrame
df_heart_disease = pd.read_csv("../Resources/Heart_Disease_Prediction.csv")

# Review the DataFrame
df_heart_disease.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence


In [3]:
#  Prepare the data. Use the StandardScaler module and fit_transform function to 
# scale all columns with numerical values
heart_data_scaled = StandardScaler().fit_transform(df_heart_disease[['Age','Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120', 'EKG results', 'Max HR', 'Exercise angina', 'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium']])

# Diplay the first five rows of the scaled data
heart_data_scaled[0:5]

array([[ 1.71209356,  0.6894997 ,  0.87092765, -0.07540984,  1.40221232,
        -0.41702883,  0.98166365, -1.75920811, -0.7012223 ,  1.18101235,
         0.67641928,  2.47268219, -0.87570581],
       [ 1.38213977, -1.45032695, -0.18355874, -0.91675934,  6.0930045 ,
        -0.41702883,  0.98166365,  0.44640927, -0.7012223 ,  0.48115318,
         0.67641928, -0.71153494,  1.18927733],
       [ 0.2822938 ,  0.6894997 , -1.23804513, -0.41194964,  0.21982255,
        -0.41702883, -1.02628472, -0.37529132, -0.7012223 , -0.65611797,
        -0.95423434, -0.71153494,  1.18927733],
       [ 1.05218598,  0.6894997 ,  0.87092765, -0.18758978,  0.25858943,
        -0.41702883, -1.02628472, -1.93219771,  1.4260813 , -0.74360037,
         0.67641928,  0.34987077,  1.18927733],
       [ 2.15203195, -1.45032695, -1.23804513, -0.63630951,  0.37489006,
        -0.41702883,  0.98166365, -1.24023932,  1.4260813 , -0.74360037,
        -0.95423434,  0.34987077, -0.87570581]])

In [4]:
# Create a DataFrame called with the scaled data
# The column names should match those referenced in the StandardScaler step
df_heart_scaled = pd.DataFrame(
    heart_data_scaled,
    columns=['Age','Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120', 'EKG results', 'Max HR', 'Exercise angina', 'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium']
)

# Review the DataFrame
df_heart_scaled.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
0,1.712094,0.6895,0.870928,-0.07541,1.402212,-0.417029,0.981664,-1.759208,-0.701222,1.181012,0.676419,2.472682,-0.875706
1,1.38214,-1.450327,-0.183559,-0.916759,6.093004,-0.417029,0.981664,0.446409,-0.701222,0.481153,0.676419,-0.711535,1.189277
2,0.282294,0.6895,-1.238045,-0.41195,0.219823,-0.417029,-1.026285,-0.375291,-0.701222,-0.656118,-0.954234,-0.711535,1.189277
3,1.052186,0.6895,0.870928,-0.18759,0.258589,-0.417029,-1.026285,-1.932198,1.426081,-0.7436,0.676419,0.349871,1.189277
4,2.152032,-1.450327,-1.238045,-0.63631,0.37489,-0.417029,0.981664,-1.240239,1.426081,-0.7436,-0.954234,0.349871,-0.875706


In [5]:
# Encode (convert to dummy variables) the Heart Disease column
df_heart_dummies = pd.get_dummies(df_heart_disease["Heart Disease"])

# Review the DataFrame
df_heart_dummies.head()

Unnamed: 0,Absence,Presence
0,0,1
1,1,0
2,0,1
3,1,0
4,1,0


In [6]:
# Concatenate the Heart Disease encoded dummies with the scaled data DataFrame
df_heart_scaled = pd.concat([df_heart_scaled, df_heart_dummies], axis=1)

# Display the sample data
df_heart_scaled.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Absence,Presence
0,1.712094,0.6895,0.870928,-0.07541,1.402212,-0.417029,0.981664,-1.759208,-0.701222,1.181012,0.676419,2.472682,-0.875706,0,1
1,1.38214,-1.450327,-0.183559,-0.916759,6.093004,-0.417029,0.981664,0.446409,-0.701222,0.481153,0.676419,-0.711535,1.189277,1,0
2,0.282294,0.6895,-1.238045,-0.41195,0.219823,-0.417029,-1.026285,-0.375291,-0.701222,-0.656118,-0.954234,-0.711535,1.189277,0,1
3,1.052186,0.6895,0.870928,-0.18759,0.258589,-0.417029,-1.026285,-1.932198,1.426081,-0.7436,0.676419,0.349871,1.189277,1,0
4,2.152032,-1.450327,-1.238045,-0.63631,0.37489,-0.417029,0.981664,-1.240239,1.426081,-0.7436,-0.954234,0.349871,-0.875706,1,0


In [7]:
# Initialize the K-Means model with n_clusters=3
model = KMeans(n_clusters=3)

In [8]:
# Fit the model for the df_heart_scaled DataFrame
model.fit(df_heart_scaled)

KMeans(n_clusters=3)

In [9]:
# Predict the model segments (clusters)
heart_clusters = model.predict(df_heart_scaled)

# View the stock segments
print(heart_clusters)

[0 2 1 0 2 1 0 0 0 0 1 0 1 1 2 2 0 0 1 1 0 1 1 1 2 2 2 2 0 2 0 0 2 0 0 0 0
 1 1 1 1 2 1 1 0 1 1 1 0 0 0 1 2 2 2 2 0 2 2 0 1 0 1 2 1 0 1 2 2 1 0 2 1 2
 2 0 0 2 2 1 0 0 0 1 0 1 1 0 2 0 1 2 0 0 1 0 2 1 2 2 2 0 2 0 0 1 1 0 0 2 0
 1 2 2 1 2 1 0 2 0 0 0 0 2 1 2 0 2 1 0 0 0 1 0 1 2 2 0 2 1 0 1 1 1 1 0 1 0
 0 2 1 2 2 2 2 2 0 1 1 0 1 2 1 0 1 2 2 1 2 1 1 0 0 2 1 0 0 1 0 1 1 0 1 2 1
 1 0 1 2 0 1 0 1 1 1 2 2 2 2 0 1 0 0 0 0 1 2 0 0 1 1 1 2 0 1 2 2 1 1 1 0 0
 1 0 2 1 0 0 2 1 0 1 1 0 0 0 2 0 1 1 0 1 2 0 2 0 0 2 1 0 0 1 1 1 1 2 1 0 1
 1 2 0 1 1 1 1 1 2 1 0]


In [10]:
# Create a copy of the concatenated DataFrame
df_heart_scaled_predictions = df_heart_scaled.copy()

In [11]:
# Create a new column in the copy of the concatenated DataFrame with the predicted clusters
df_heart_scaled_predictions["HeartCluster"] = heart_clusters

# Review the DataFrame
df_heart_scaled_predictions.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Absence,Presence,HeartCluster
0,1.712094,0.6895,0.870928,-0.07541,1.402212,-0.417029,0.981664,-1.759208,-0.701222,1.181012,0.676419,2.472682,-0.875706,0,1,0
1,1.38214,-1.450327,-0.183559,-0.916759,6.093004,-0.417029,0.981664,0.446409,-0.701222,0.481153,0.676419,-0.711535,1.189277,1,0,2
2,0.282294,0.6895,-1.238045,-0.41195,0.219823,-0.417029,-1.026285,-0.375291,-0.701222,-0.656118,-0.954234,-0.711535,1.189277,0,1,1
3,1.052186,0.6895,0.870928,-0.18759,0.258589,-0.417029,-1.026285,-1.932198,1.426081,-0.7436,0.676419,0.349871,1.189277,1,0,0
4,2.152032,-1.450327,-1.238045,-0.63631,0.37489,-0.417029,0.981664,-1.240239,1.426081,-0.7436,-0.954234,0.349871,-0.875706,1,0,2
