## **Importación de librerías**

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

## **Carga de datos**

In [None]:
# The dataset is looked up one directory above the notebook's working directory.
filename = 'iris_data_challenge.csv'
data_dir = os.path.dirname(os.getcwd())
file_path = os.path.join(data_dir, filename)

# Fail early and report the directory that was actually searched.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File {filename} not found in {data_dir}.")

try:
    df = pd.read_csv(file_path)
except Exception as e:
    # Chain the original exception so its traceback is preserved for debugging.
    raise RuntimeError(f"Error reading the file: {e}") from e

# Keep the raw frame untouched; every later cleaning step works on this copy.
df_clean = df.copy()

## **Análisis exploratorio**

In [None]:
# Rename the listed measurement headers to short snake_case identifiers;
# headers not present in the frame are ignored by rename().
column_aliases = {
    'sepal length (cm)': 'sepal_length',
    'sepal width (cm)': 'sepal_width',
    'petal length (cm)': 'petal_length',
    'petal width (cm)': 'petal_width',
}
df_clean = df_clean.rename(columns=column_aliases)

In [None]:
# Remove rows that exactly repeat an earlier row (pandas keeps the first occurrence).
df_clean = df_clean.drop_duplicates()

In [None]:
# Pairwise correlations, computed only on rows without missing values.
correlation_matrix = df_clean.dropna().corr()

# Draw on an explicit Axes instead of relying on pyplot's implicit current axes.
corr_fig, corr_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='viridis', ax=corr_ax)
corr_ax.set_title("Matriz de Correlación")
plt.show()

In [None]:
def plot_columns_density(df, ncols=2, figsize=None):
    """Plot the density estimate of every column of ``df`` on a grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one column per subplot.
    ncols : int, default 2
        Number of subplot columns in the grid (must be >= 1).
    figsize : tuple of float, optional
        Figure size. When omitted it is 10 wide and 4 per grid row, which is
        (10, 8) for the default 2x2 grid.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The created figure.
    axes : numpy.ndarray
        Flattened array with every Axes of the grid, including hidden unused ones.
    """
    n_plots = len(df.columns)
    # Ceiling division for the row count; never fewer than two rows so frames
    # with up to four columns keep the original 2x2 layout.
    nrows = max(2, -(-n_plots // ncols))
    if figsize is None:
        figsize = (10, 4 * nrows)

    # squeeze=False guarantees a 2-D array even when ncols == 1.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, df.columns):
        df[col].plot.density(ax=ax, color='blue', alpha=0.7)
        ax.set_xlabel(col)
        ax.set_ylabel('Densidad')

    # Hide grid cells that have no column to display.
    for ax in axes[n_plots:]:
        ax.set_visible(False)

    return fig, axes

# Column densities of the data as it stands before imputation and scaling.
fig, axes = plot_columns_density(df_clean)
fig.tight_layout()
plt.show()

## **Preprocesamiento**

### **Imputación de valores nulos**

In [None]:
# Visualise the missing-value patterns of df_clean with missingno's heatmap.
msno.heatmap(df_clean, figsize=(10,6))
plt.show()

In [None]:
# Rows missing two or more values are dropped instead of being imputed.
rows_with_2_or_more_nans = (df_clean.isna().sum(axis=1) > 1)
n_rows_with_2_or_more_nans = rows_with_2_or_more_nans.sum()

# Guard the ratio so the report stays well defined for an empty frame.
total_rows = len(df_clean)
pct_rows_with_2_or_more_nans = (
    n_rows_with_2_or_more_nans / total_rows * 100 if total_rows else 0.0
)
print(
    f"Hay {n_rows_with_2_or_more_nans} filas con 2 o más valores nulos, que "
    f"representa el {pct_rows_with_2_or_more_nans:.3f}% de los datos."
)

# Take an explicit copy so later column assignments act on an independent
# frame rather than on a selection of the previous one.
df_clean = df_clean.loc[~rows_with_2_or_more_nans].copy()

In [None]:
# Fill the gaps in sepal_width with the column mean (mean() skips NaN by default).
sepal_width_mean = df_clean['sepal_width'].mean()
df_clean['sepal_width'] = df_clean['sepal_width'].fillna(sepal_width_mean)

### **Escalado de los datos**

`KNNImputer` requiere que los datos estén en la misma escala, ya que busca los vecinos más cercanos a partir de distancias entre filas.

In [None]:
# Standardise the four measurement columns, overwriting their raw values.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_clean[feature_columns])
df_clean[feature_columns] = scaled_values

In [None]:
# Impute the remaining gaps in these three columns from each row's 5 nearest
# neighbours; sepal_width is not part of the imputer's input.
knn_columns = ['sepal_length', 'petal_length', 'petal_width']
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(df_clean[knn_columns])
df_clean[knn_columns] = imputed_values

Verificamos que las densidades se hayan mantenido similares después de la estandarización y la imputación de valores nulos.

In [None]:
# Column densities after standardisation and imputation, for visual comparison.
fig, axes = plot_columns_density(df_clean)
fig.tight_layout()
plt.show()

In [None]:
# Renumber the rows consecutively from 0, discarding the old index labels.
df_clean = df_clean.reset_index(drop=True)

## **Clustering con K-means**

### **Determinación del número de clusters**

In [None]:
# Fit K-means for k = 2..15 and record each model's inertia
# (within-cluster sum of squares) for the elbow plot.
range_n_clusters = list(range(2, 16))
wcss = []

for num_clusters in range_n_clusters:
    kmeans = KMeans(
        n_clusters=num_clusters,
        random_state=0,
        n_init='auto',
        max_iter=30,
    )
    # fit() returns the fitted estimator, so inertia_ can be read in one step.
    wcss.append(kmeans.fit(df_clean).inertia_)

# Elbow curve drawn through the explicit Axes interface.
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(range_n_clusters, wcss)
elbow_ax.set_title("Método del codo")
elbow_ax.set_xlabel("Número de clusters")
elbow_ax.set_ylabel("Varianza total intra-cluster ")
elbow_ax.set_xticks(range_n_clusters)

elbow_ax.grid()
plt.show()

### **Clustering final**

In [None]:
# Number of clusters used for the final model.
selected_num_clusters = 5

kmeans = KMeans(
    n_clusters=selected_num_clusters,
    random_state=0,
    n_init='auto',
    max_iter=30,
)
# Left as the last expression so the fitted estimator is displayed by the cell.
kmeans.fit(df_clean)

## **Guardar nuevo dataset**

In [None]:
# New frame with the K-means label of every row stored as a categorical column;
# df_clean itself is left unchanged.
df_clusters = (
    df_clean
    .assign(cluster=kmeans.labels_)
    .astype({'cluster': 'category'})
)

In [None]:
# Number of rows assigned to each cluster label.
df_clusters['cluster'].value_counts()

In [None]:
# Write the clustered dataset to the working directory, without the index column.
output_filename = 'iris_data_challenge_with_clusters.csv'
df_clusters.to_csv(output_filename, index=False)