In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np

# Cluster the records with K-Means (K = 3) on standardized features and
# report the resulting centroids in the original feature units.

# Step 1: Create the DataFrame from the given data
data = {
    'BloodPressure': [72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74, 80, 60, 72, 0, 84, 74, 30],
    'SkinThickness': [35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, 23, 19, 0, 47, 0, 38],
    'Insulin': [0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, 846, 175, 0, 230, 0, 83],
    'BMI': [33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, 0, 37.6, 38.0, 27.1, 30.1, 25.8, 30.0, 45.8, 29.6, 43.3],
    'DiabetesPedigreeFunction': [0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, 0.158, 0.232, 0.191, 0.537, 1.441, 0.398, 0.587, 0.484, 0.551, 0.254, 0.183],
    'Age': [50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, 51, 32, 31, 31, 33],
    'Outcome': [1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0]
}

df = pd.DataFrame(data)

# NOTE(review): BloodPressure, SkinThickness, Insulin and BMI contain 0 values
# that look like missing-value placeholders rather than real measurements --
# confirm with the data source. If so, they should be imputed or dropped before
# clustering; they are left untouched here so the input stays "as given".

# Step 2: Select features for clustering (excluding 'Outcome' column)
# 'Outcome' is the label, so it must not influence the unsupervised clustering.
features = df.drop(columns='Outcome')

# Step 3: Standardize the features (important for KMeans)
# K-Means is distance based; without scaling, wide-ranged columns such as
# Insulin would dominate the distance computation.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Step 4: Apply K-Means clustering (K = 3)
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and warns about it in 1.2/1.3), so leaving it implicit
# makes the result depend on the installed version even with a fixed seed.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(features_scaled)

# Step 5: Get the centroid values (these are in standardized units)
centroids = kmeans.cluster_centers_

# Step 6: Convert centroids back to the original scale
centroids_original_scale = scaler.inverse_transform(centroids)

# Display the new centroids in the original scale
centroids_df = pd.DataFrame(centroids_original_scale, columns=features.columns)
print("New Centroids (Original Scale):")
print(centroids_df)

# Step 7: Display the dataframe with assigned clusters (first five rows only)
print("\nData with Assigned Clusters:")
print(df.head())


New Centroids (Original Scale):
   BloodPressure  SkinThickness  Insulin        BMI  DiabetesPedigreeFunction  \
0   7.105427e-15            0.0      0.0  32.650000                    0.3090   
1   7.914286e+01            0.0      0.0  25.885714                    0.5040   
2   6.100000e+01           32.6    222.7  33.790000                    0.5558   

         Age  
0  30.500000  
1  38.285714  
2  38.800000  

Data with Assigned Clusters:
   BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  Age  \
0             72             35        0  33.6                     0.627   50   
1             66             29        0  26.6                     0.351   31   
2             64              0        0  23.3                     0.672   32   
3             66             23       94  28.1                     0.167   21   
4             40             35      168  43.1                     2.288   33   

   Outcome  Cluster  
0        1        2  
1        0        2  


In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np

# Cluster the records with K-Means (K = 2) on standardized features and
# report the resulting centroids in the original feature units.

# Step 1: Create the DataFrame from the given data
# The frame is rebuilt from scratch so this cell does not depend on (or pick up
# a 'Cluster' column from) any earlier run on the same variable name.
data = {
    'BloodPressure': [72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74, 80, 60, 72, 0, 84, 74, 30],
    'SkinThickness': [35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, 23, 19, 0, 47, 0, 38],
    'Insulin': [0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, 846, 175, 0, 230, 0, 83],
    'BMI': [33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, 0, 37.6, 38.0, 27.1, 30.1, 25.8, 30.0, 45.8, 29.6, 43.3],
    'DiabetesPedigreeFunction': [0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, 0.158, 0.232, 0.191, 0.537, 1.441, 0.398, 0.587, 0.484, 0.551, 0.254, 0.183],
    'Age': [50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, 51, 32, 31, 31, 33],
    'Outcome': [1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0]
}

df = pd.DataFrame(data)

# NOTE(review): BloodPressure, SkinThickness, Insulin and BMI contain 0 values
# that look like missing-value placeholders rather than real measurements --
# confirm with the data source. If so, they should be imputed or dropped before
# clustering; they are left untouched here so the input stays "as given".

# Step 2: Select features for clustering (excluding 'Outcome' column)
# 'Outcome' is the label, so it must not influence the unsupervised clustering.
features = df.drop(columns='Outcome')

# Step 3: Standardize the features (important for KMeans)
# K-Means is distance based; without scaling, wide-ranged columns such as
# Insulin would dominate the distance computation.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Step 4: Apply K-Means clustering (K = 2)
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and warns about it in 1.2/1.3), so leaving it implicit
# makes the result depend on the installed version even with a fixed seed.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(features_scaled)

# Step 5: Get the centroid values (these are in standardized units)
centroids = kmeans.cluster_centers_

# Step 6: Convert centroids back to the original scale
centroids_original_scale = scaler.inverse_transform(centroids)

# Display the new centroids in the original scale
centroids_df = pd.DataFrame(centroids_original_scale, columns=features.columns)
print("New Centroids (Original Scale):")
print(centroids_df)

# Step 7: Display the dataframe with assigned clusters
# Show every feature plus the cluster label; dropping 'Outcome' yields the same
# columns in the same order as listing the six features and 'Cluster' by hand.
print("\nData with Assigned Clusters:")
print(df.drop(columns='Outcome'))


New Centroids (Original Scale):
   BloodPressure  SkinThickness     Insulin      BMI  \
0         10.000      12.666667   27.666667  36.2000   
1         70.875      18.000000  134.000000  29.7375   

   DiabetesPedigreeFunction        Age  
0                  0.267000  31.333333  
1                  0.556438  38.937500  

Data with Assigned Clusters:
    BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  \
0              72             35        0  33.6                     0.627   
1              66             29        0  26.6                     0.351   
2              64              0        0  23.3                     0.672   
3              66             23       94  28.1                     0.167   
4              40             35      168  43.1                     2.288   
5              74              0        0  25.6                     0.201   
6              50             32       88  31.0                     0.248   
7               0             