# Proyecto 1 - Stacking con 12 Modelos Base y con XGB como Selector y Meta-Modelo

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, StackingRegressor, BaggingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNetCV, RidgeCV, LassoCV, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold

In [None]:
# Load the training and test datasets from GitHub.
url_train = (
    'https://raw.githubusercontent.com/davidzarruk/MIAD_ML_NLP_2025/'
    'main/datasets/dataTrain_Spotify.csv'
)
url_test = (
    'https://raw.githubusercontent.com/davidzarruk/MIAD_ML_NLP_2025/'
    'main/datasets/dataTest_Spotify.csv'
)

# The training file is read with a default index, whereas the test file uses
# its first column as the index.
dataTrain = pd.read_csv(url_train)
dataTest = pd.read_csv(url_test, index_col=0)


In [17]:
# Downcast dataTrain's column dtypes in place and report memory use before/after.
# Because dataTrain itself is modified, re-running this cell after a first
# execution finds nothing left to convert and reports a 0% reduction.


def _size_in_mb(frame):
    """Return the deep memory footprint of ``frame`` in megabytes."""
    return frame.memory_usage(deep=True).sum() / 1024**2


# Memory before optimization
mem_before = _size_in_mb(dataTrain)

# Wide dtypes and the narrower dtype each one is replaced with
# (bool -> int8 is the most compact option used here).
narrower_dtype = {'float64': 'float32', 'int64': 'int32', 'bool': 'int8'}

for column in dataTrain.columns:
    current = dataTrain[column].dtype

    if current == 'object':
        # Object columns become categoricals only when fewer than half of
        # their values are distinct.
        distinct_ratio = dataTrain[column].nunique() / len(dataTrain[column])
        if distinct_ratio < 0.5:
            dataTrain[column] = dataTrain[column].astype('category')
        continue

    for wide, narrow in narrower_dtype.items():
        if current == wide:
            dataTrain[column] = dataTrain[column].astype(narrow)
            break

# Memory after optimization
mem_after = _size_in_mb(dataTrain)

print(f'Memoria antes: {mem_before:.2f} MB')
print(f'Memoria después: {mem_after:.2f} MB')
print(f'Reducción: {100 * (mem_before - mem_after) / mem_before:.2f}%')

Memoria antes: 21.21 MB
Memoria después: 21.21 MB
Reducción: 0.00%


In [None]:
# Data preprocessing


def _text_length(value):
    """Return the length of ``value`` after converting it to ``str``."""
    return len(str(value))


# Drop the unneeded 'Unnamed: 0' column from whichever frame still carries it.
for frame in (dataTrain, dataTest):
    if 'Unnamed: 0' in frame.columns:
        frame.drop(columns='Unnamed: 0', inplace=True)

# Integer-encode the categorical text columns. Each encoder is fit on the
# train and test values together, so every label present in either frame
# has a code; the result is stored in a new '<column>_n' column.
for name in ('artists', 'album_name', 'track_genre'):
    all_labels = pd.concat([dataTrain[name], dataTest[name]], axis=0).astype(str)
    label_encoder = LabelEncoder().fit(all_labels)
    encoded_name = name + '_n'
    dataTrain[encoded_name] = label_encoder.transform(dataTrain[name].astype(str))
    dataTest[encoded_name] = label_encoder.transform(dataTest[name].astype(str))

# Engineered features, added identically to both frames.
for frame in (dataTrain, dataTest):
    frame['track_name_length'] = frame['track_name'].apply(_text_length)
    frame['explicit'] = frame['explicit'].astype(int)
    # A zero duration_ms would yield an infinite ratio here.
    frame['tempo_density'] = frame['tempo'].div(frame['duration_ms'])
    frame['energy_danceability'] = frame['energy'].mul(frame['danceability'])
    # 1 when acousticness is above 0.5, otherwise 0.
    frame['acousticness_bin'] = frame['acousticness'].gt(0.5).astype(int)


In [None]:
# Column selection and missing-value handling

# Every column except the identifier/raw-text columns and the target is a feature.
drop_cols = ['track_id', 'track_name', 'artists', 'album_name', 'track_genre']
features = dataTrain.drop(columns=drop_cols + ['popularity']).columns.tolist()

# Turn infinite values into NaN first so they are imputed together with the
# genuinely missing values.
for df in [dataTrain, dataTest]:
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Impute BOTH frames with medians computed on the training data only.
# Filling the test frame with its own medians would apply a different
# transformation to test rows than the one the models were trained with.
# Keys of train_medians that are not columns of a frame (e.g. 'popularity'
# in dataTest) are simply skipped by fillna.
train_medians = dataTrain.median(numeric_only=True)
for df in [dataTrain, dataTest]:
    df.fillna(train_medians, inplace=True)


In [None]:
# Feature scaling
# The target is taken from the training frame. The scaler is fit on the
# training features only, and that same fitted scaler transforms both sets.
y = dataTrain['popularity']
scaler = RobustScaler().fit(dataTrain[features])
train_scaled = scaler.transform(dataTrain[features])
test_scaled = scaler.transform(dataTest[features])

In [None]:
# Feature selection with XGBRegressor
# SelectFromModel fits the XGBoost regressor on the scaled training data and
# the fitted selector is then applied to both the training and test matrices.
# NOTE(review): the selector is fit on every training row (and its target)
# before the train/validation split performed in a later cell, so the local
# validation score may be optimistic.
selector = SelectFromModel(
    XGBRegressor(n_estimators=100, random_state=42)
).fit(train_scaled, y)
X_sel, X_test_sel = (
    selector.transform(matrix) for matrix in (train_scaled, test_scaled)
)


In [None]:
# Hold out 20% of the selected-feature rows for local validation; the split
# is seeded so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_sel,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Base models
# Built as a name -> estimator mapping and flattened into the ordered list of
# (name, estimator) pairs that StackingRegressor expects.
base_models = list({
    # Kernel-based regressor
    'svr': SVR(kernel='rbf', C=10, epsilon=0.2),
    # scikit-learn tree ensembles
    'rf': RandomForestRegressor(n_estimators=200, max_depth=30, random_state=42),
    'gb': GradientBoostingRegressor(n_estimators=200, max_depth=10, random_state=42),
    'et': ExtraTreesRegressor(n_estimators=200, max_depth=30, random_state=42),
    'bag': BaggingRegressor(n_estimators=200, max_samples=0.8, max_features=0.8, random_state=42),
    # Gradient-boosting libraries
    'xgb': XGBRegressor(n_estimators=200, learning_rate=0.075, max_depth=10, random_state=42),
    'lgbm': LGBMRegressor(n_estimators=100, learning_rate=0.075, max_depth=10, random_state=42),
    'catboost': CatBoostRegressor(iterations=100, depth=10, learning_rate=0.075, random_seed=42, verbose=False),
    # Regularized linear models with built-in cross-validation
    'elasticnet': ElasticNetCV(cv=5),
    'ridge': RidgeCV(),
    'lasso': LassoCV(),
    # Nearest-neighbours regressor
    'knn': KNeighborsRegressor(n_neighbors=5),
}.items())



In [None]:
# Model ensembling

# Cross-validation strategy: shuffled, seeded 10-fold splitter handed to the stack.
cv_strategy = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

# Build the stacking model: the base models feed an XGBoost final estimator.
# passthrough=True also gives the final estimator the original input features
# alongside the base-model predictions; n_jobs=-1 uses all available cores.
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=XGBRegressor(
        n_estimators=200,
        learning_rate=0.075,
        max_depth=10,
        random_state=42,
    ),
    cv=cv_strategy,
    passthrough=True,
    n_jobs=-1,
)

In [None]:
# Fit the stack on the training split and score it on the held-out validation split.
stacking_model.fit(X_train, y_train)
y_pred = stacking_model.predict(X_val)

# RMSE is computed as the explicit square root of the MSE. The `squared=False`
# argument of mean_squared_error is deprecated and has been removed in recent
# scikit-learn releases, while this form works on every version.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE validación local: {rmse:.5f}")

RMSE validación local: 11.06196




In [None]:
# Predict on the test set and clip the predictions to the [0, 100] interval.
raw_test_pred = stacking_model.predict(X_test_sel)
test_pred = np.clip(raw_test_pred, 0, 100)

# Assemble the Kaggle submission (test index as ID), write it to CSV without
# the DataFrame index, and preview the first rows.
submission = pd.DataFrame(
    {
        'ID': dataTest.index,
        'popularity': test_pred,
    }
)
submission.to_csv('test_submission_file8.csv', index=False)
submission.head()


Unnamed: 0,ID,popularity
0,0,39.420914
1,1,13.463353
2,2,4.754846
3,3,0.826673
4,4,25.499201
