In [None]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline

In [None]:
# Read the training table from the CSV file in the working directory.
diamantes1 = pd.read_csv(filepath_or_buffer="diamantes1.csv")

In [None]:
# Define roles for diamantes1.csv
# Target: the `price` column, selected by name.
y = diamantes1.price
# Features: every column except the ones at 0-based positions 0 and 7.
# NOTE(review): position 0 is presumably a row-id column and position 7 is
# presumably `price` -- confirm against the CSV header.
X = diamantes1.drop(diamantes1.columns[[0,7]],axis=1)
# The target is picked by name but the drop above is positional. If the column
# order of the file ever differs, `price` would silently remain among the
# features and every downstream score would be meaningless, so fail loudly.
assert "price" not in X.columns, "target column 'price' leaked into the feature matrix X"

In [None]:
# Build the preprocessing step.
# Split the feature columns by dtype: object/category columns are treated as
# categorical, every other dtype as numeric.
categorical_features = X.select_dtypes(include=['object', 'category']).columns
# numeric_features is not referenced by the transformer below; those columns
# reach the model through remainder='passthrough'.
numeric_features = X.select_dtypes(exclude=['object', 'category']).columns

# One-hot encode the categorical columns (dense output) and pass every other
# column through unchanged. handle_unknown='ignore' keeps transform from
# raising on categories that were not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)],
    remainder='passthrough',
)

In [None]:
# Chain the preprocessing step and the linear regression into one estimator,
# so both are always fitted together on the same rows.
modelo_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('linreg', LinearRegression()),
])

In [None]:
# 10-fold cross-validation of the pipeline, shuffled with a fixed seed so the
# fold assignment is reproducible.
random_seed = 1
kf = KFold(n_splits=10, shuffle=True, random_state=random_seed)
# One negated MSE per fold (scikit-learn scorers follow "greater is better").
scores = cross_val_score(
    estimator=modelo_lr,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=kf,
)
# Flip the sign back, average the fold MSEs, then take the square root.
rmse_lr = np.sqrt((-scores).mean())
rmse_lr

In [None]:
# Fit the pipeline on the full training table; the fitted estimator is the
# value displayed by this cell.
modelo_lr.fit(X=X, y=y)

In [None]:
# Read the second table, used below to score the fitted model.
diamantes2 = pd.read_csv(filepath_or_buffer="diamantes2.csv")

In [None]:
# Define roles for diamantes2.csv
# Target: the `price` column, selected by name.
newy = diamantes2.price
# Features: every column except the ones at 0-based positions 0 and 7.
# NOTE(review): position 0 is presumably a row-id column and position 7 is
# presumably `price` -- confirm against the CSV header.
newX = diamantes2.drop(diamantes2.columns[[0,7]],axis=1)
# The target is picked by name but the drop above is positional. If this file's
# column order differs, `price` would silently remain among the features handed
# to predict(), so fail loudly instead.
assert "price" not in newX.columns, "target column 'price' leaked into the feature matrix newX"

In [None]:
# Predict prices for the second table with the fitted pipeline.
y_pred = modelo_lr.predict(newX)
# Mean squared error between observed and predicted prices, then its square
# root so the error is reported in the same units as the target.
mse = mean_squared_error(y_true=newy, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")