In [None]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer 
from sklearn.tree import DecisionTreeRegressor 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline

In [None]:
# Read the first diamonds dataset; the target and predictors are derived from it below
diamantes1 = pd.read_csv(filepath_or_buffer="diamantes1.csv")

In [None]:
# Split diamantes1.csv into the target and the predictors.
# Target: the `price` column.
y = diamantes1["price"]
# Predictors: every column except the ones at positions 0 and 7.
# NOTE(review): position 7 is presumably `price` and position 0 an index/ID
# column -- confirm against the CSV header, since the drop is positional.
X = diamantes1.drop(columns=diamantes1.columns[[0, 7]])

In [None]:
# Build the preprocessing step.
# Predictors are split by dtype: object/category columns are treated as
# categorical, everything else as numeric.
categorical_features = X.select_dtypes(include=["object", "category"]).columns
numeric_features = X.select_dtypes(exclude=["object", "category"]).columns

# Dense one-hot encoding; handle_unknown="ignore" keeps transform() from
# raising on categories that were not present when the encoder was fitted.
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Only the categorical columns are transformed; the remaining columns are
# passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[("cat", cat_encoder, categorical_features)],
    remainder="passthrough",
)

In [None]:
# Combine the preprocessing step and the decision-tree regressor into a single pipeline.
# Candidate tree depths for the grid search; the `tree__` prefix routes the
# parameter to the pipeline step named 'tree'.
param_grid = {'tree__max_depth':  [3,5,10,20]}

modelo_tree = Pipeline(steps=[
    ('preprocessor',preprocessor),
    # Fix the tree's seed: scikit-learn permutes the features at every split,
    # so ties between equally good splits can otherwise be broken differently
    # from run to run. The value matches the seed used for the KFold splitter.
    ('tree',DecisionTreeRegressor(random_state=1))
])

In [None]:
# 10-fold cross-validation with shuffling; the seed fixes how rows are assigned to folds
random_seed = 1
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=random_seed,
)


# Define RMSE
def rmse(y_true,y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    Computed as the square root of `mean_squared_error(y_true, y_pred)`.
    """
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)

# Wrap rmse as a scorer. greater_is_better=False marks it as a loss, so the
# scorer reports the negated RMSE and the grid search (which maximises the
# score) ends up selecting the smallest error.
scoring = make_scorer(rmse, greater_is_better=False)

# Grid search over param_grid, evaluated on the folds defined by kf
modelo_tree_grid = GridSearchCV(
    estimator=modelo_tree,
    param_grid=param_grid,
    cv=kf,
    scoring=scoring,
)

# Run the search on X and y to estimate the best model
modelo_tree_grid.fit(X, y)

In [None]:
# Tabulate the cross-validation results, one row per max_depth candidate.
# mean_test_score is the negated RMSE (the scorer was created with
# greater_is_better=False), so values closer to zero are better.
resultados = pd.DataFrame(modelo_tree_grid.cv_results_)
resultados[['param_tree__max_depth', 'mean_test_score', 'std_test_score']]

In [None]:
# Print the best parameters & the best score
print("Best Parameters:",modelo_tree_grid.best_params_)
# best_score_ holds the negated RMSE because the scorer was created with
# greater_is_better=False; flip the sign so the value printed under the
# "RMSE" label is the actual (non-negative) error.
print("Best Cross-Validation RMSE:",-modelo_tree_grid.best_score_)

In [None]:
# Read the second diamonds dataset, used below to evaluate the tuned model
diamantes2 = pd.read_csv(filepath_or_buffer="diamantes2.csv")

In [None]:
# Split diamantes2.csv into the target and the predictors.
# Target: the `price` column.
newy = diamantes2["price"]
# Predictors: every column except the ones at positions 0 and 7.
# NOTE(review): position 7 is presumably `price` and position 0 an index/ID
# column -- confirm against the CSV header, since the drop is positional.
newX = diamantes2.drop(columns=diamantes2.columns[[0, 7]])

In [None]:
# Predict on the second dataset with the fitted grid search
y_pred = modelo_tree_grid.predict(newX)
# Compute Mean Squared Error and take its square root
mse = mean_squared_error(newy,y_pred)
# Stored as test_rmse rather than rmse so the rmse() function defined in the
# cross-validation cell is not overwritten by a float.
test_rmse = np.sqrt(mse)
print(f"RMSE: {test_rmse}")