In [None]:
# imports 

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Read the training split of the diamonds dataset into a DataFrame
train_csv_path = '../data/diamonds_train.csv'
data = pd.read_csv(train_csv_path)

In [None]:
# Derived feature 'volume': product of the three dimension columns x, y and z
raw_volume = data['x'] * data['y'] * data['z']

# A zero volume would make the 'density' division below divide by 0, so zeros
# are treated as missing and filled with the median of the remaining volumes
nonzero_volume = raw_volume.replace(0, np.nan)
data['volume'] = nonzero_volume.fillna(nonzero_volume.median())

# Derived feature 'density': carat divided by volume
data['density'] = data['carat'] / data['volume']

In [None]:
# Integer-encode the categorical columns.
# One LabelEncoder is fitted per column and kept in `label_encoders` (keyed by
# column name) so the same mapping can be applied to other data later.
cat_cols = ['cut', 'color', 'clarity']
label_encoders = {col: LabelEncoder().fit(data[col]) for col in cat_cols}

# Overwrite each categorical column with its integer codes
for col, encoder in label_encoders.items():
    data[col] = encoder.transform(data[col])

In [None]:
# Model inputs (features) and the column to predict (target)

feature_cols = ['carat', 'depth', 'table', 'density', 'cut', 'color', 'clarity']
X = data[feature_cols]   # Features
y = data["price"]        # Target

In [None]:
# Tune a random forest with an exhaustive grid search (5-fold cross-validation)

model = RandomForestRegressor(random_state=42)

# Candidate values for each hyperparameter; every combination is evaluated
param_grid = dict(
    n_estimators=[100, 200, 300],           # Number of trees in the forest.
    max_depth=[None, 3, 10],                # Maximum depth of the trees.
    min_samples_split=[2, 10],              # Minimum number of samples required to split an internal node.
    min_samples_leaf=[1, 4],                # Minimum number of samples required to be at a leaf node.
    max_features=[None, 'sqrt', 'log2'],    # Number of features to consider when looking for the best split.
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=3,
)

# Run the search on the full training features and target
grid_search.fit(X, y)

# The scorer is the *negated* RMSE, so the sign is flipped back for reporting
print('\n')
print('Best hyperparameters: ', grid_search.best_params_, '\n')
print('Best score: ', -grid_search.best_score_, '\n')

In [None]:
# Persist the best estimator found by the grid search to disk
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'PrimeModel.pkl')

TEST TIME

In [None]:
# Read the test split of the diamonds dataset into a DataFrame
test_csv_path = '../data/diamonds_test.csv'
test = pd.read_csv(test_csv_path)

In [None]:
# Create 'volume' variable
test['volume'] = test['x'] * test['y'] * test['z']

# To avoid errors when creating 'density' by dividing by 0, zero volumes are
# treated as missing. They are filled with the median volume of the TRAINING
# data (`data`), so the test set is preprocessed with the same statistic the
# model was trained with, rather than one derived from the test set itself.
train_volume_median = data['volume'].median()
test['volume'] = test['volume'].replace(0, np.nan).fillna(train_volume_median)

# Create 'density' variable
test['density'] = test['carat'] / test['volume']

In [None]:
# Encoding categorical variables
# Reuse the encoders that were fitted on the training data (`label_encoders`)
# so the test set gets exactly the same category -> integer mapping the model
# was trained on. The dictionary must NOT be re-initialised here: emptying it
# would discard the fitted encoders and make the lookup below raise KeyError.
cat_cols = ['cut', 'color', 'clarity']
for col in cat_cols:
    test[col] = label_encoders[col].transform(test[col])

In [None]:
# Keep only the columns used as model features, in the same order as in training
model_features = ['carat', 'depth', 'table', 'density', 'cut', 'color', 'clarity']
test = test[model_features]

In [None]:
# Restore the estimator that was serialized earlier with joblib
saved_model_path = 'PrimeModel.pkl'
modelo = joblib.load(saved_model_path)

In [None]:
# Predict a price for every row of the prepared test features
y_pred = modelo.predict(X=test)
print(y_pred)   # text preview of the predicted values
type(y_pred)    # last expression of the cell: shows the type of the predictions object

In [None]:
# Assemble the submission table: one sequential id (starting at 0) per prediction

n_predictions = len(y_pred)
ids = np.arange(n_predictions)
submission_df = pd.DataFrame(data={'id': ids, 'price': y_pred})

In [None]:
# Write the submission table to CSV, leaving out the DataFrame index column
submission_path = '../data/sample_submissions/PrimeModel.csv'
submission_df.to_csv(submission_path, index=False)
