# Clustering - Treinar K-means - Economiza+ MVP

**Objetivo:** Treinar modelo K-means para segmentação de usuários

**Checklist:**
- Normalizar features (StandardScaler)
- Treinar K-means para K=3, 4, 5
- Elbow Curve
- Escolher melhor K
- Salvar modelos

## 1. Imports e Configurações

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import joblib
import warnings
warnings.filterwarnings('ignore')

# Visualization settings.
# Order matters: the matplotlib style sheet is applied first, then seaborn's
# 'husl' palette is set on top of it.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

In [None]:
# Quick sanity check: report which interpreter is running and whether
# scikit-learn can be imported from it.
import sys

print(f"Python: {sys.version}")
print(f"Caminho: {sys.executable}")

# Only the import itself is guarded; the success message lives in the
# else-branch so it runs exactly when the import worked.
try:
    import sklearn
except ImportError as err:
    print(f"‚úó Erro: {err}")
else:
    print(f"‚úì scikit-learn {sklearn.__version__} instalado!")

Python: 3.12.3 (main, Jun 22 2025, 11:00:19) [GCC 13.3.0]
Caminho: /home/celina/.pyenv/versions/economiza-ia-env/bin/python
‚úì scikit-learn 1.8.0 instalado!


: 

## 2. Carregar Features

In [1]:
# Load the processed clustering features.
# The path is relative, so it resolves against the current working directory.
df = pd.read_csv('../data/processed/features_clustering.csv')
print(f"Shape: {df.shape}")
# Trailing expression: the notebook displays the first rows of the frame.
df.head()

NameError: name 'pd' is not defined

In [None]:
# Inspect the loaded frame: list its columns and print the total count of
# missing values summed over every column.
print("Colunas dispon√≠veis:")
print(df.columns.tolist())
print(f"\nValores nulos: {df.isnull().sum().sum()}")

## 3. Normalização com StandardScaler

In [None]:
# Split the identifier column from the clustering features.
# 'usuario_id' is used when present; otherwise the first column is treated
# as the identifier.
if 'usuario_id' in df.columns:
    id_col = 'usuario_id'
else:
    id_col = df.columns[0]

user_ids = df[id_col]
features = df.drop(columns=[id_col])

# shape is (rows, columns) -> (users, features)
n_rows, n_cols = features.shape
print(f"Features para clustering: {n_cols} colunas")
print(f"Usu√°rios: {n_rows}")

In [None]:
# Standardize the features: fit the scaler on the feature table, then
# transform that same table with the fitted scaler.
scaler = StandardScaler()
scaler.fit(features)
features_normalized = scaler.transform(features)

# Report the global mean / standard deviation over the whole scaled array.
print("Normaliza√ß√£o conclu√≠da!")
print(f"Shape: {features_normalized.shape}")
print(f"M√©dia: {np.mean(features_normalized):.4f}")
print(f"Desvio padr√£o: {np.std(features_normalized):.4f}")

## 4. Treinar K-means para K=3, 4, 5

In [None]:
# Train one K-means model per candidate K, keeping each fitted model
# (keyed by K) and its inertia (in the same order as K_values).
K_values = [3, 4, 5]
models = {}
inertias = []

for k in K_values:
    print(f"\nTreinando K-means com K={k}...")
    # A fixed random_state makes the centroid initialization repeatable.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10, max_iter=300)
    kmeans.fit(features_normalized)

    models[k] = kmeans
    inertias.append(kmeans.inertia_)

    print(f"  In√©rcia: {kmeans.inertia_:.2f}")
    print(f"  Itera√ß√µes: {kmeans.n_iter_}")

    # Cluster sizes. Convert the NumPy scalars to plain Python ints before
    # building the dict: under NumPy >= 2 the scalar repr is e.g.
    # `np.int32(0)`, which would make the printed dict unreadable.
    unique, counts = np.unique(kmeans.labels_, return_counts=True)
    print(f"  Distribui√ß√£o: {dict(zip(unique.tolist(), counts.tolist()))}")

## 5. Elbow Curve

In [None]:
# Plot the elbow curve: inertia as a function of the number of clusters.
import os

plt.figure(figsize=(10, 6))
plt.plot(K_values, inertias, 'bo-', linewidth=2, markersize=10)
plt.xlabel('N√∫mero de Clusters (K)', fontsize=12)
plt.ylabel('In√©rcia (Within-Cluster Sum of Squares)', fontsize=12)
plt.title('Elbow Curve - Escolha do K √ìtimo', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.xticks(K_values)

# Label every point with its inertia value, slightly above the marker.
for k, inertia in zip(K_values, inertias):
    plt.annotate(f'{inertia:.1f}',
                 xy=(k, inertia),
                 xytext=(0, 10),
                 textcoords='offset points',
                 ha='center',
                 fontsize=10)

plt.tight_layout()
# savefig does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
os.makedirs('../outputs', exist_ok=True)
plt.savefig('../outputs/elbow_curve.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nElbow Curve salva em: outputs/elbow_curve.png")

## 6. Escolher Melhor K

In [None]:
# Report the absolute and relative inertia drop between each pair of
# consecutive K values.
print("An√°lise da redu√ß√£o de in√©rcia:")
print("="*50)
for k_atual, k_prox, inercia_atual, inercia_prox in zip(
        K_values, K_values[1:], inertias, inertias[1:]):
    reducao = inercia_atual - inercia_prox
    percentual = (reducao / inercia_atual) * 100
    print(f"K={k_atual} ‚Üí K={k_prox}: {reducao:.2f} ({percentual:.1f}% de redu√ß√£o)")

# K is picked by hand from the elbow plot and the drops printed above;
# adjust this value after inspecting them.
best_k = 4
print("\n" + "="*50)
print(f"‚úì K escolhido: {best_k}")
print("="*50)

## 7. Analisar Clusters do Modelo Escolhido

In [None]:
# Look up the selected model and its per-row cluster assignments.
best_model = models[best_k]
cluster_labels = best_model.labels_

# New frame: a copy of the original data with the assignment attached as a
# 'cluster' column (the original `df` is left untouched).
df_clustered = df.assign(cluster=cluster_labels)

# Absolute cluster sizes, ordered by cluster id.
print(f"\nDistribui√ß√£o dos {best_k} clusters:")
print(df_clustered['cluster'].value_counts().sort_index())

# Same distribution expressed as percentages, rounded to two decimals.
print("\nPercentuais:")
shares = df_clustered['cluster'].value_counts(normalize=True) * 100
print(shares.sort_index().round(2))

In [None]:
# Cluster profiles: per-cluster mean of every feature column, computed on
# the values in df_clustered (the copy of the loaded data, not the scaled array).
cluster_profiles = df_clustered.groupby('cluster')[features.columns].mean()
print("\nPerfil m√©dio dos clusters:")
# Trailing expression: the notebook displays the resulting table.
cluster_profiles

In [None]:
# Visualize the clustering: a scatter of the first two normalized feature
# columns (simply columns 0 and 1, not a PCA projection) next to a bar
# chart of the cluster sizes.
import os

plt.figure(figsize=(12, 5))

# Left panel: scatter plot colored by cluster label.
plt.subplot(1, 2, 1)
scatter = plt.scatter(features_normalized[:, 0],
                      features_normalized[:, 1],
                      c=cluster_labels,
                      cmap='viridis',
                      alpha=0.6,
                      edgecolors='black',
                      linewidth=0.5)
plt.xlabel('Feature 1 (normalizada)', fontsize=11)
plt.ylabel('Feature 2 (normalizada)', fontsize=11)
plt.title('Visualiza√ß√£o dos Clusters', fontsize=12, fontweight='bold')
plt.colorbar(scatter, label='Cluster')
plt.grid(True, alpha=0.3)

# Right panel: number of users per cluster, using the same colormap.
plt.subplot(1, 2, 2)
cluster_counts = pd.Series(cluster_labels).value_counts().sort_index()
colors = plt.cm.viridis(np.linspace(0, 1, len(cluster_counts)))
bars = plt.bar(cluster_counts.index, cluster_counts.values, color=colors, edgecolor='black')
plt.xlabel('Cluster', fontsize=11)
plt.ylabel('N√∫mero de Usu√°rios', fontsize=11)
plt.title('Distribui√ß√£o dos Clusters', fontsize=12, fontweight='bold')
plt.grid(True, alpha=0.3, axis='y')

# Write each bar's count just above it.
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height)}',
             ha='center', va='bottom', fontsize=10)

plt.tight_layout()
# savefig does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
os.makedirs('../outputs', exist_ok=True)
plt.savefig('../outputs/cluster_visualization.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nVisualiza√ß√£o salva em: outputs/cluster_visualization.png")

## 8. Salvar Modelos e Artefatos

In [None]:
# Make sure the folder for serialized artifacts exists before writing to it.
import os
os.makedirs('../models', exist_ok=True)

# Persist the fitted scaler.
joblib.dump(scaler, '../models/scaler.pkl')
print("‚úì Scaler salvo em: models/scaler.pkl")

# Persist every trained K-means model under a K-specific file name.
for k in models:
    joblib.dump(models[k], f'../models/kmeans_k{k}.pkl')
    print(f"‚úì Modelo K={k} salvo em: models/kmeans_k{k}.pkl")

# Persist the selected model once more under a fixed file name.
joblib.dump(best_model, '../models/kmeans_best.pkl')
print(f"\n‚úì Modelo final (K={best_k}) salvo em: models/kmeans_best.pkl")

In [None]:
# Write the frame that carries the 'cluster' column to CSV, without the
# pandas index column.
df_clustered.to_csv('../data/processed/usuarios_clustered.csv', index=False)
print("‚úì Dados com clusters salvos em: data/processed/usuarios_clustered.csv")

## 9. Resumo Final

In [None]:
# Final summary of the run: models trained, chosen K, data size, the
# inertia of every model, and the artifacts written by the earlier cells.
print("="*60)
print("RESUMO DO CLUSTERING")
print("="*60)
print(f"\n‚úì Modelos treinados: K={K_values}")
print(f"‚úì Modelo escolhido: K={best_k}")
print(f"‚úì Total de usu√°rios: {len(df)}")
print(f"‚úì Features utilizadas: {features.shape[1]}")
print("\nIn√©rcias:")
for k, inertia in zip(K_values, inertias):
    # Point at the row of the selected model.
    marker = "üëâ" if k == best_k else "  "
    print(f"{marker} K={k}: {inertia:.2f}")
print("\n‚úì Artefatos salvos:")
print("  - models/scaler.pkl")
# Derive the per-K file names from the models actually trained, so the list
# stays correct when K_values is changed.
model_files = ", ".join(f"kmeans_k{k}.pkl" for k in models)
print(f"  - models/{model_files}")
print(f"  - models/kmeans_best.pkl (K={best_k})")
print("  - data/processed/usuarios_clustered.csv")
print("  - outputs/elbow_curve.png")
print("  - outputs/cluster_visualization.png")
print("\n" + "="*60)
print("CLUSTERING CONCLU√çDO! ‚ú®")
print("="*60)