In [1]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer 
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline

In [2]:
# Load the dataset used to fit and cross-validate the model
diamantes1 = pd.read_csv(filepath_or_buffer="diamantes1.csv")

In [3]:
# Separate the target (price) from the predictors in diamantes1.
# NOTE(review): the two excluded columns are picked by position (0 and 7) --
# presumably the saved row index and `price` itself; confirm against the CSV header.
y = diamantes1["price"]
X = diamantes1.drop(columns=diamantes1.columns[[0, 7]])

In [4]:
# Build the preprocessing step: object/category columns are one-hot encoded,
# every other column of X is standardized.
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numeric_features = X.select_dtypes(exclude=['object', 'category']).columns

preprocessor = ColumnTransformer([
    # handle_unknown='ignore': categories not seen during fit do not raise at predict time
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ('num', StandardScaler(), numeric_features),
])

In [5]:
# Chain the preprocessing step and the KNN regressor into a single pipeline
modelo_knn = Pipeline([
    ('preprocessor', preprocessor),
    ('knn', KNeighborsRegressor()),
])

# Candidate numbers of neighbours; the "knn__" prefix addresses the 'knn' pipeline step
param_grid = {'knn__n_neighbors': [1, 5, 10, 30]}

In [7]:
# 10-fold cross-validation; rows are shuffled with a fixed seed so the folds are reproducible
random_seed = 1
kf = KFold(10, shuffle=True, random_state=random_seed)


# Root mean squared error
def rmse(y_true, y_pred):
    """Return the square root of the mean squared error between y_true and y_pred."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

# Wrap rmse as a scorer; greater_is_better=False marks it as a loss,
# so the scores reported by the search are the negated RMSE
scoring = make_scorer(rmse, greater_is_better=False)

# Grid search over param_grid, scoring each candidate on the 10 folds of kf
modelo_knn_grid = GridSearchCV(
    estimator=modelo_knn,
    param_grid=param_grid,
    scoring=scoring,
    cv=kf,
)

# Run the search on the diamantes1 predictors and target
modelo_knn_grid.fit(X, y)

In [10]:
# Tabulate the search results, keeping only the tuned parameter and the
# mean / standard deviation of the per-fold scores (scores are negated RMSE)
resultados = pd.DataFrame(modelo_knn_grid.cv_results_)
resultados[['param_knn__n_neighbors', 'mean_test_score', 'std_test_score']]

Unnamed: 0,param_knn__n_neighbors,mean_test_score,std_test_score
0,1,-888.969485,28.755615
1,5,-786.448009,22.337291
2,10,-814.407807,18.851992
3,30,-910.599191,14.795948


In [11]:
# Print the best parameters & the best score.
# The scorer was created with greater_is_better=False, so best_score_ holds the
# negated RMSE; flip the sign so the value printed under "RMSE" is the actual
# (non-negative) cross-validation RMSE.
print("Best Parameters:", modelo_knn_grid.best_params_)
print("Best Cross-Validation RMSE:", -modelo_knn_grid.best_score_)

Best Parameters: {'knn__n_neighbors': 5}
Best Cross-Validation RMSE: -786.4480092599531


In [12]:
# Load the second dataset, which is only used for prediction (never for fitting)
diamantes2 = pd.read_csv(filepath_or_buffer="diamantes2.csv")

In [13]:
# Separate the target (price) from the predictors in diamantes2, using the
# same positional column drop that was applied to diamantes1.
# NOTE(review): assumes diamantes2 has the same column layout as diamantes1
# (columns 0 and 7 being the ones to exclude) -- confirm against the CSV header.
newy = diamantes2["price"]
newX = diamantes2.drop(columns=diamantes2.columns[[0, 7]])

In [14]:
# Predict the new data with the fitted grid search
y_pred = modelo_knn_grid.predict(newX)
# Compute Mean Squared Error on the new data, then its square root
mse = mean_squared_error(newy, y_pred)
# Stored under its own name: assigning to `rmse` would overwrite the rmse()
# function defined for the scorer and leave a float bound to that name
test_rmse = np.sqrt(mse)
print(f"RMSE: {test_rmse}")

RMSE: 803.371091773606
