In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [9]:
df = pd.read_csv("baseball.csv")


def compute_distance(point, centroid, metric="euclidean", p=3):
    diff = np.abs(point - centroid)
    if metric == "euclidean":
        return np.sqrt(np.sum(diff**2))
    elif metric == "manhattan":
        return np.sum(diff)
    return np.power(np.sum(diff**p), 1 / p)


def k_means(data, k, metric="euclidean", max_iter=100):
    centroids = data.sample(k).to_numpy()
    clusters, sse = np.zeros(len(data)), []

    for _ in range(max_iter):
        distances = np.array(
            [
                [compute_distance(row, c, metric) for c in centroids]
                for row in data.to_numpy()
            ]
        )
        clusters = distances.argmin(axis=1)

        sse.append(
            np.sum(
                [
                    np.sum((data.iloc[clusters == i] - centroids[i]) ** 2)
                    for i in range(k)
                ]
            )
        )
        new_centroids = np.array(
            [
                data.iloc[clusters == i].mean().fillna(centroids[i][0]).to_numpy()
                for i in range(k)
            ]
        )

        if np.all(new_centroids == centroids):
            break
        centroids = new_centroids

    return clusters, centroids, sse


sse_results = [k_means(df, k)[2][-1] for k in range(1, 5)]

optimal_k = np.argmin(np.diff(sse_results, 2)) + 2
print(f"Optimal k is: {optimal_k}")

Optimal k is: 3
