# In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import numpy as np
import pandas as pd

# Load the prepared training set and keep only the numeric feature columns.
# TARGET is dropped so the supervised label does not take part in clustering.
df = pd.read_csv('../../data/home-credit/prepared/train.csv')
numeric_features = df.select_dtypes(include=[np.number]).drop(columns=['TARGET'])

# Standardize the data (zero mean, unit variance per column). K-means and the
# silhouette score both rely on Euclidean distances, so without scaling the
# columns with the largest numeric ranges would dominate the clustering.
# X is kept as a DataFrame (same columns and index) so code that uses it
# further down keeps working unchanged.
scaler = StandardScaler()
X = pd.DataFrame(
    scaler.fit_transform(numeric_features),
    columns=numeric_features.columns,
    index=numeric_features.index,
)

# Sweep candidate cluster counts and record, for each k, the two model
# selection metrics: inertia (for the elbow plot) and silhouette score.
K_range = range(2, 11)  # k = 2..10 inclusive
inertias, silhouettes = [], []

for k in K_range:
    # Fixed seed and n_init so repeated runs give the same partition.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    clusters = kmeans.fit_predict(X)

    silhouettes.append(silhouette_score(X, clusters))
    inertias.append(kmeans.inertia_)

# Two diagnostic panels side by side, both as a function of k:
# left = elbow method (inertia), right = silhouette score.
fig, (ax_elbow, ax_silhouette) = plt.subplots(1, 2, figsize=(12, 5))

# Elbow plot
ax_elbow.plot(K_range, inertias, marker='o')
ax_elbow.set_title("Método do Cotovelo (Inércia)")
ax_elbow.set_xlabel("Número de clusters (k)")
ax_elbow.set_ylabel("Inércia")

# Silhouette plot
ax_silhouette.plot(K_range, silhouettes, marker='o', color='orange')
ax_silhouette.set_title("Silhouette Score")
ax_silhouette.set_xlabel("Número de clusters (k)")
ax_silhouette.set_ylabel("Silhouette")

fig.tight_layout()
plt.show()

# Pick the k whose silhouette score is the highest; np.argmax returns the
# first maximum, so ties resolve to the smallest such k.
best_index = np.argmax(silhouettes)
best_k = K_range[best_index]
print(f"Melhor k pelo Silhouette Score: {best_k}")

# Refit the final model with the selected k, using the same seed and n_init
# as in the sweep.
kmeans_final = KMeans(
    n_clusters=best_k,
    n_init=10,
    random_state=42,
)
clusters_final = kmeans_final.fit_predict(X)

print(f"Silhouette Score do modelo final: {silhouette_score(X, clusters_final):.3f}")
