Import Libraries

In [None]:
import pandas as pd                               # For dataframes
import matplotlib.pyplot as plt                   # For plotting data
import seaborn as sns                             # For plotting data
from sklearn.cluster import KMeans                # For k-Means
from sklearn.model_selection import GridSearchCV  # For grid search
from sklearn.metrics import silhouette_score      # For metrics and scores
from sklearn.preprocessing import StandardScaler  # For standardizing data

Load and prepare data

In [None]:
# Load the penguins dataset from disk
df = pd.read_csv('../data/penguins.csv')

# Pull the class column out of df: pop returns it as a Series (kept in y)
# and removes it from df, leaving only the feature columns
y = df.pop('y')

# Standardize every remaining column, then rebuild a DataFrame from the
# scaled array so the original column names are preserved
feature_names = df.columns
scaled_values = StandardScaler().fit_transform(df)
df = pd.DataFrame(scaled_values, columns=feature_names)

# Preview the first 5 rows of the standardized data
df.head()

Apply K-Means Clustering Algorithm

In [None]:
# Configure k-means and fit it to the data in a single step (fit returns
# the fitted estimator). k is set to 3 because the dataset contains
# 3 species of penguins; random_state is fixed so results are repeatable.
km = KMeans(
    n_clusters=3,
    init='k-means++',
    n_init=10,
    random_state=1,
).fit(df)

# Show the parameters the fitted model was configured with
km.get_params()

Visualize clusters as scatter plots

In [None]:
# Columns shown on the x and y axes of the plot
x_col = 'bill_length_mm'
y_col = 'bill_depth_mm'

# Create a scatter plot of the data in df: color (hue) encodes the class
# variable y, marker style encodes the cluster label assigned by k-means
sns.scatterplot(
    x=x_col,
    y=y_col,
    data=df,
    hue=y,
    style=km.labels_,
    palette=["orange", "green", "blue"])

# Adds cluster centers (i.e., centroids) to the same plot.
# km was fit on df, so the columns of cluster_centers_ follow the column
# order of df. Looking the positions up by name (instead of hard-coding
# 0 and 1) keeps the centroids aligned with the plotted axes even if the
# column order of df changes.
plt.scatter(
    km.cluster_centers_[:, df.columns.get_loc(x_col)],
    km.cluster_centers_[:, df.columns.get_loc(y_col)],
    marker='x',
    s=200,
    c='red')