# Preprocesamiento Unificado de spotify_dataset.csv
Este notebook limpia, imputa valores faltantes y genera **tres** versiones de cada feature:
- Escala original (imputada)
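- Min-Max (0–1), calculada tras recortar outliers a los percentiles 1–99
- Z-score, calculado sobre los mismos datos recortados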

Luego guarda las features numéricas en un único archivo `spotify_processed.csv`, añadiendo por cada feature las columnas `<feature>_minmax` y `<feature>_std`.

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import os

In [2]:
# Read the raw dataset from its location relative to the notebook.
df = pd.read_csv(filepath_or_buffer='../data/raw/spotify_dataset.csv')

In [3]:
# Rename the raw headers to the lowercase feature names used downstream.
df.rename(columns={
    'Danceability':'danceability', 'Energy':'energy', 'Positiveness':'valence',
    'Tempo':'tempo', 'Loudness (db)':'loudness', 'Liveness':'liveness',
    'Speechiness':'speechiness', 'Acousticness':'acousticness',
    'Instrumentalness':'instrumentalness', 'Length':'duration',
    'Popularity':'popularity'
}, inplace=True)
num_cols = ['danceability','energy','valence','tempo','loudness','liveness',
            'speechiness','acousticness','instrumentalness','duration','popularity']


def _coerce_numeric(series):
    """Convert *series* to numbers, recovering common textual encodings.

    Values accepted by ``pd.to_numeric`` are returned exactly as before.
    Values it rejects get a second chance instead of silently becoming NaN:

    * ``mm:ss`` / ``hh:mm:ss`` clock strings become a total number of seconds;
    * a number followed by a unit suffix (e.g. ``-6.85db``) keeps the number.

    Anything that matches neither pattern still ends up as NaN.
    """
    parsed = pd.to_numeric(series, errors='coerce')
    # Only entries that held a value but failed direct conversion are retried.
    rejected = parsed.isna() & series.notna()
    if not rejected.any():
        return parsed

    text = series[rejected].astype(str).str.strip()

    # Clock format: optional hours, then minutes and seconds.
    clock = text.str.extract(r'^(?:(\d+):)?(\d+):(\d{1,2})$')
    clock = clock.apply(pd.to_numeric, errors='coerce')
    seconds = clock[0].fillna(0) * 3600 + clock[1] * 60 + clock[2]

    # Plain number followed by a unit made of letters or '%'.
    number = text.str.extract(r'^([+-]?(?:\d+\.?\d*|\.\d+))\s*[A-Za-z%]+$')[0]
    recovered = seconds.fillna(pd.to_numeric(number, errors='coerce'))

    # Positional assignment: the mask selects exactly len(recovered) rows.
    parsed.loc[rejected] = recovered.to_numpy(dtype=float, na_value=float('nan'))
    return parsed


# NOTE(review): presumably 'Length' holds mm:ss strings and 'Loudness (db)'
# carries a unit suffix in the raw CSV - confirm against the file. With a bare
# errors='coerce' such columns would become entirely NaN and be dropped later.
for col in num_cols:
    df[col] = _coerce_numeric(df[col])

In [5]:
# Fill missing values with each column's median.
# Columns with no observed value at all are left out of the imputation.
_all_missing = df[num_cols].isna().all()
valid_num_cols = [name for name in num_cols if not _all_missing[name]]
imputer = SimpleImputer(strategy='median')
_imputed_values = imputer.fit_transform(df[valid_num_cols])
df_imputed = pd.DataFrame(_imputed_values, columns=valid_num_cols)

In [6]:
# Clip outliers: cap every column at its own 1st and 99th percentiles.
_pct_bounds = df_imputed.quantile([0.01, 0.99])
lower = _pct_bounds.loc[0.01]
upper = _pct_bounds.loc[0.99]
df_clipped = df_imputed.clip(lower=lower, upper=upper, axis='columns')

In [9]:
# Build the scaled versions; both scalers are fitted on the clipped data.
mm_scaler = MinMaxScaler().fit(df_clipped)
df_minmax = pd.DataFrame(mm_scaler.transform(df_clipped), columns=valid_num_cols)

std_scaler = StandardScaler().fit(df_clipped)
df_standard = pd.DataFrame(std_scaler.transform(df_clipped), columns=valid_num_cols)

# Assemble the final frame: imputed (unclipped) values in the original scale,
# followed by a *_minmax and a *_std column for every imputed feature.
# Both scaled frames carry exactly the columns in valid_num_cols.
df_final = df_imputed.copy()
for feature in valid_num_cols:
    df_final[f'{feature}_minmax'] = df_minmax[feature]
    df_final[f'{feature}_std'] = df_standard[feature]

# Write everything to a single file, creating the target folder if needed.
output_path = '../data/processed/spotify_processed.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_final.to_csv(output_path, index=False)
print('Archivo guardado: data/processed/spotify_processed.csv')

Archivo guardado: data/processed/spotify_processed.csv
