In [None]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error 
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline

In [None]:
# Read the training data (diamantes1.csv, resolved against the working directory)
diamantes1 = pd.read_csv(filepath_or_buffer="diamantes1.csv")

In [None]:
# Split the training frame into target and predictors.
# Target: the `price` column.
y = diamantes1["price"]
# Predictors: everything except the columns at positions 0 and 7.
# NOTE(review): presumably position 0 is a row-index column and position 7 is
# `price` itself -- confirm against the CSV header.
X = diamantes1.drop(columns=diamantes1.columns[[0, 7]])

In [None]:
# Two alternative preprocessing steps. Both one-hot encode the categorical
# columns; they differ only in how the remaining columns are treated:
#   preprocessor1 -> passed through unchanged
#   preprocessor2 -> standardized with StandardScaler
categorical_dtypes = ['object', 'category']
categorical_features = X.select_dtypes(include=categorical_dtypes).columns
numeric_features = X.select_dtypes(exclude=categorical_dtypes).columns

# Options shared by both encoders: dense output, and categories not seen
# during fit are ignored instead of raising an error.
encoder_options = {'handle_unknown': 'ignore', 'sparse_output': False}

preprocessor1 = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(**encoder_options), categorical_features),
    ('num', 'passthrough', numeric_features),
])

preprocessor2 = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(**encoder_options), categorical_features),
    ('num', StandardScaler(), numeric_features),
])

In [None]:
# Build one pipeline per base learner. Every pipeline couples a preprocessing
# step with a regressor:
#   - linear regression and the decision trees use preprocessor1
#     (numeric columns passed through unchanged);
#   - the KNN regressors use preprocessor2 (numeric columns standardized),
#     since nearest-neighbour distances depend on the scale of each feature.

# Seed for the decision trees. scikit-learn permutes the candidate features at
# every split, so without a fixed seed, splits with identical improvement can
# be resolved differently from one run to the next.
TREE_RANDOM_STATE = 1


def make_knn_pipeline(n_neighbors):
    """Return an unfitted pipeline: preprocessor2 followed by a KNN regressor.

    Parameters
    ----------
    n_neighbors : int
        Number of neighbours used by the KNN regressor.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with steps named 'preprocessor' and 'knn'.
    """
    return Pipeline(steps=[
        ('preprocessor', preprocessor2),
        ('knn', KNeighborsRegressor(n_neighbors=n_neighbors)),
    ])


def make_tree_pipeline(max_depth, random_state=TREE_RANDOM_STATE):
    """Return an unfitted pipeline: preprocessor1 followed by a regression tree.

    Parameters
    ----------
    max_depth : int
        Maximum depth of the decision tree.
    random_state : int, optional
        Seed passed to the tree so that repeated fits give the same model.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with steps named 'preprocess' and 'tree'.
    """
    return Pipeline(steps=[
        ('preprocess', preprocessor1),
        ('tree', DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)),
    ])


# Linear regression on the one-hot encoded, unscaled features
modelo_linreg = Pipeline(steps=[
    ('preprocess', preprocessor1),
    ('linreg', LinearRegression()),
])

# KNN regressors with an increasing number of neighbours
modelo_knn1 = make_knn_pipeline(1)
modelo_knn5 = make_knn_pipeline(5)
modelo_knn10 = make_knn_pipeline(10)
modelo_knn30 = make_knn_pipeline(30)

# Regression trees with an increasing maximum depth
modelo_tree3 = make_tree_pipeline(3)
modelo_tree5 = make_tree_pipeline(5)
modelo_tree10 = make_tree_pipeline(10)
modelo_tree20 = make_tree_pipeline(20)

In [None]:
# Shuffled 10-fold splitter; the fixed seed keeps the fold assignment
# identical across runs.
random_seed = 1
kf = KFold(n_splits=10, shuffle=True, random_state=random_seed)

# Base learners for the stack, as (name, pipeline) pairs
estimators = list(zip(
    ['linreg', 'knn1', 'knn5', 'knn10', 'knn30',
     'tree3', 'tree5', 'tree10', 'tree20'],
    [modelo_linreg, modelo_knn1, modelo_knn5, modelo_knn10, modelo_knn30,
     modelo_tree3, modelo_tree5, modelo_tree10, modelo_tree20],
))

In [None]:
# Stacked ensemble: the base learners' cross-validated predictions (folds given
# by `kf`) feed the final estimator. RidgeCV is used here; LinearRegression or
# LassoCV can be swapped in as the final estimator instead.
meta_learner = RidgeCV()
StackedEnsemble = StackingRegressor(
    estimators=estimators,
    final_estimator=meta_learner,
    cv=kf,
)
StackedEnsemble.fit(X, y)

In [None]:
# Coefficients learned by the fitted final estimator
coefficients = StackedEnsemble.final_estimator_.coef_
# Pair each coefficient with the name of a base learner, in the order in which
# the learners were listed in `estimators`, and print one line per model
model_names = [pair[0] for pair in estimators]
for model_name, coef in zip(model_names, coefficients):
    print(f"Coefficient for {model_name}: {coef}")

In [None]:
# Read the held-out test data (diamantes2.csv, resolved against the working directory)
diamantes2 = pd.read_csv(filepath_or_buffer="diamantes2.csv")

In [None]:
# Split the test frame into target and predictors, using the same column
# positions (0 and 7) that are dropped from the training frame.
# NOTE(review): this assumes diamantes2.csv has the same column layout as
# diamantes1.csv -- confirm.
newy = diamantes2["price"]
newX = diamantes2.drop(columns=diamantes2.columns[[0, 7]])

In [None]:
# Score the fitted ensemble on the test set
y_pred = StackedEnsemble.predict(newX)
# RMSE = square root of the mean squared error between actual and predicted prices
mse = mean_squared_error(y_true=newy, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")