In [1]:
from sklearn.model_selection import train_test_split
from typing import List

import pandas as pd
import numpy as np

# Fixed seed so that the train/test split is reproducible across runs
RANDOM_SEED: int = 42

df = pd.read_csv("../datasets/diamonds/diamonds_encoded.csv")

# Hold out 20% of the rows for testing; the remaining 80% is used for training
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_SEED,
)

# Features: every column except the target 'price'
X_train = train_df.drop(columns='price').to_numpy()
X_test = test_df.drop(columns='price').to_numpy()

# Target: the 'price' column
y_train = train_df['price'].to_numpy()
y_test = test_df['price'].to_numpy()

# Whole dataset again, with the training rows first and the test rows after
X_all = np.concatenate([X_train, X_test], axis=0)
y_all = np.concatenate([y_train, y_test], axis=0)

Fit a simple Linear Regression model on the training data and use it to generate price predictions for the test data.

In [2]:
from sklearn.linear_model import LinearRegression

# Fit a linear model on the training split (fit() returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predicted prices for the held-out test rows
predictions = model.predict(X_test)

Show the predicted prices compared to the actual ones in the test data.

In [3]:
# Put 'price' last so that it sits right next to the prediction column.
# 'predicted_price' is excluded from the selection so that re-running this cell
# produces the same column order instead of moving the old predictions before 'price'.
columns_without_prices = [col for col in test_df.columns if col not in ('price', 'predicted_price')]

# assign() returns a new frame, which avoids the SettingWithCopyWarning raised by
# setting a column on a frame that was itself obtained by selecting from another one.
test_df = test_df[columns_without_prices + ['price']].assign(predicted_price=predictions)
test_df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price,predicted_price
4727,1.51,3,18,1,60.4,59.0,7.30,7.27,4.40,7864,8897.197014
2316,1.00,2,18,2,62.2,59.0,6.32,6.35,3.94,4830,4451.288229
3669,0.40,4,18,2,62.6,52.0,4.73,4.76,2.97,705,-187.257207
1212,1.01,2,18,3,62.1,57.0,6.39,6.45,3.99,5028,5082.900513
3032,0.31,2,21,2,59.1,59.0,4.42,4.44,2.62,544,-134.202061
...,...,...,...,...,...,...,...,...,...,...,...
3527,0.53,1,21,2,64.1,54.0,5.11,5.16,3.29,1332,1301.293787
4544,0.77,2,16,1,61.3,58.0,5.84,5.91,3.60,2001,1339.917417
4031,0.30,4,21,3,62.1,55.0,4.35,4.32,2.69,844,485.552144
4906,0.36,4,21,4,61.8,56.0,4.60,4.56,2.83,1094,1393.179516


Before experimenting with different models, we define the metrics we want to use to evaluate their performance and to find the model whose predictions are closest to the expected values.

- Mean Squared Error (MSE).
- Root Mean Squared Error (RMSE).
- Mean Absolute Error (MAE).
- Coefficient of Determination ($R^2$).

In [4]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_model(test_prices, predictions):
    """Compute regression error metrics for a set of predictions.

    Args:
        test_prices: The expected (ground-truth) values.
        predictions: The values predicted by the model.

    Returns:
        A dict with the keys 'MAE', 'MSE', 'RMSE' and 'R2'.
    """
    squared_error = mean_squared_error(test_prices, predictions)

    return {
        'MAE': mean_absolute_error(test_prices, predictions),
        'MSE': squared_error,
        # RMSE is derived from the MSE computed above
        'RMSE': np.sqrt(squared_error),
        'R2': r2_score(test_prices, predictions),
    }

# Score the predictions currently held in `predictions` against the test targets
evaluate_model(test_prices=y_test, predictions=predictions)


{'MAE': 803.8349828504461,
 'MSE': 1394931.8183011736,
 'RMSE': 1181.07231713438,
 'R2': 0.9053141135021813}

Now that we have a way to compare models' performance let's evaluate a different one: the Decision Tree.

In [5]:
from sklearn.tree import DecisionTreeRegressor

# Shallow tree used as a first baseline. The seed is fixed because scikit-learn
# randomly permutes the features at each split, so ties between equally good
# splits could otherwise be resolved differently from one run to the next.
model = DecisionTreeRegressor(max_depth=2, random_state=RANDOM_SEED)
model.fit(X_train, y_train)

predictions = model.predict(X_test)

metrics = evaluate_model(y_test, predictions)

metrics

{'MAE': 1027.961805365557,
 'MSE': 2802246.8011463573,
 'RMSE': 1673.9912786948316,
 'R2': 0.8097876763071064}

We can tune some hyperparameters, such as `max_depth` (the maximum depth of the tree), to try to obtain better performance.

In [6]:
# Hyperparameter grid. Only 'squared_error' is evaluated; 'friedman_mse',
# 'absolute_error' and 'poisson' are left out of the search.
criterions_to_evaluate = ['squared_error']
max_depths_to_evaluate = [2, 5, 10, 20, 50, 100, 150, 200, 500]
results = []

for max_depth in max_depths_to_evaluate:
    for criterion in criterions_to_evaluate:
        # Fit a tree for this configuration on the training split
        model = DecisionTreeRegressor(
            criterion=criterion,
            max_depth=max_depth,
            random_state=RANDOM_SEED,
        )
        model.fit(X_train, y_train)

        # Score its predictions on the test split
        predictions = model.predict(X_test)
        metrics = evaluate_model(y_test, predictions)

        # One row per configuration: hyperparameters first, then the metrics
        # (evaluate_model reports the coefficient of determination as 'R2')
        metric_values = tuple(metrics[key] for key in ('MAE', 'MSE', 'RMSE', 'R2'))
        results.append((max_depth, criterion) + metric_values)

results_dt_df = pd.DataFrame(
    results,
    columns=['max_depth', 'criterion', 'MAE', 'MSE', 'RMSE', 'R^2'],
)
results_dt_df

Unnamed: 0,max_depth,criterion,MAE,MSE,RMSE,R^2
0,2,squared_error,1027.961805,2802247.0,1673.991279,0.809788
1,5,squared_error,576.473263,1155419.0,1074.904287,0.921572
2,10,squared_error,425.27254,848711.3,921.255273,0.942391
3,20,squared_error,449.625885,846830.5,920.233948,0.942518
4,50,squared_error,432.160321,789295.8,888.423212,0.946424
5,100,squared_error,432.160321,789295.8,888.423212,0.946424
6,150,squared_error,432.160321,789295.8,888.423212,0.946424
7,200,squared_error,432.160321,789295.8,888.423212,0.946424
8,500,squared_error,432.160321,789295.8,888.423212,0.946424


In [7]:
def _print_best_row(results_df: pd.DataFrame, model_name: str, hparameters: List[str], metric: str, best_index) -> None:
    """Print the hyperparameters and the value of `metric` for the row labelled `best_index`."""
    # Select the columns first and cast them to object before extracting the row.
    # Extracting the row directly from an all-numeric frame upcasts it to float,
    # so integer hyperparameters would be printed as e.g. "50.000000".
    best_model = results_df[[*hparameters, metric]].astype(object).loc[best_index]
    print(f"Best performing {model_name} on {metric}:")
    print(best_model.to_string(name=False), end="\n\n")


def find_best_model(results_df: pd.DataFrame, model_name: str, hparameters: List[str], min_metrics = ["MAE", "MSE", "RMSE"], max_metrics = ["R^2"]) -> None:
    """Print, for each metric, the best-scoring row of a results table.

    Args:
        results_df: Table with one row per evaluated configuration; it must
            contain the columns named in `hparameters`, `min_metrics` and
            `max_metrics`.
        model_name: Name of the model, used only in the printed headings.
        hparameters: Hyperparameter columns to print next to each metric.
        min_metrics: Metrics for which the lowest value is the best one.
        max_metrics: Metrics for which the highest value is the best one.

    Returns:
        None. The report is written to standard output.
    """
    # The default lists are only iterated, never modified.
    for metric in min_metrics:
        _print_best_row(results_df, model_name, hparameters, metric, results_df[metric].idxmin())

    for metric in max_metrics:
        _print_best_row(results_df, model_name, hparameters, metric, results_df[metric].idxmax())

# Name the model class explicitly instead of reading it from the `model`
# variable, which holds whichever estimator happened to be fitted last.
find_best_model(results_dt_df, model_name=DecisionTreeRegressor.__name__, hparameters=["max_depth"])

Best performing DecisionTreeRegressor on MAE:
max_depth           10
MAE          425.27254

Best performing DecisionTreeRegressor on MSE:
max_depth               50
MSE          789295.803607

Best performing DecisionTreeRegressor on RMSE:
max_depth            50
RMSE         888.423212

Best performing DecisionTreeRegressor on R^2:
max_depth          50
R^2          0.946424



### K-fold Cross Validation

When comparing different hyperparameters of an ML model, there is a risk of overfitting to the test set, because the parameters may have been tweaked against it until the optimal configuration was found.
One way to avoid this problem is to hold out another portion of the original dataset as a "validation set" and run the experiments on that, before using the test set.

Alternatively, we can perform a K-fold Cross Validation, in which we split the whole dataset into $k$ subsets ("folds") and then, for each of the $k$ folds:
1. the model is trained using the other $k-1$ folds as training data
2. the evaluation is performed on the remaining part of the data (the left out fold)

In [8]:
from sklearn.model_selection import cross_validate

def evaluate_model_cv(model, X, y, cv=10):
    """Cross-validate `model` on (X, y) and print the average of each metric.

    Args:
        model: Estimator passed to `cross_validate`.
        X: Feature matrix.
        y: Target values.
        cv: Cross-validation setting forwarded to `cross_validate` (10 by default).

    Returns:
        A tuple `(cv_results, metrics_mean, metrics)`: the raw output of
        `cross_validate`, a dict with the mean of each metric, and a dict with
        the array of scores of each metric.
    """
    scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']
    cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring)

    # The error scores are returned negated, so flip their sign back
    mae_scores = -cv_results['test_neg_mean_absolute_error']
    mse_scores = -cv_results['test_neg_mean_squared_error']

    metrics = {
        'MAE': mae_scores,
        'MSE': mse_scores,
        'RMSE': np.sqrt(mse_scores),
        'R^2': cv_results['test_r2'],
    }

    # Average every metric over its scores
    metrics_mean = {name: scores.mean() for name, scores in metrics.items()}

    print(pd.Series(metrics_mean).add_prefix('Average ').to_string(name=False))

    return cv_results, metrics_mean, metrics

Let's take as a reference the 2 best performing configurations of the `DecisionTreeRegressor` model:

In [9]:
# Keep only the rows for max_depth 10 and 50, without the 'criterion' column
results_dt_df[results_dt_df['max_depth'].isin([10, 50])].drop(columns='criterion')

Unnamed: 0,max_depth,MAE,MSE,RMSE,R^2
2,10,425.27254,848711.278615,921.255273,0.942391
4,50,432.160321,789295.803607,888.423212,0.946424


We perform the K-fold CV with $k=10$, first with `DecisionTreeRegressor(max_depth=10)` and then with `max_depth=50`.

In [10]:
# First configuration: tree depth limited to 10
model = DecisionTreeRegressor(random_state=RANDOM_SEED, max_depth=10)

cv_results, metrics_mean, metrics = evaluate_model_cv(model, X=X_all, y=y_all)

Average MAE        427.222212
Average MSE     755195.884044
Average RMSE       867.119115
Average R^2          0.952039


In this case the average performance computed during the cross validation is very close to, or even better than, the metrics computed on the model previously trained with the same configuration.

Therefore we are not overfitting!

In [11]:
# One column per metric, built from the dict returned by the last cross validation
cross_validation_metrics_df = pd.DataFrame.from_dict(metrics)
cross_validation_metrics_df

Unnamed: 0,MAE,MSE,RMSE,R^2
0,432.818916,662818.174018,814.136459,0.957101
1,423.080921,694786.514895,833.53855,0.958524
2,425.315258,753932.76885,868.29302,0.947518
3,411.788357,641216.290319,800.759821,0.959223
4,430.721277,709591.606417,842.372605,0.958219
5,433.683874,790835.620657,889.289391,0.949502
6,451.903031,931297.898438,965.03777,0.945114
7,432.123066,794749.419376,891.487195,0.951621
8,401.139723,644707.650934,802.936891,0.955359
9,429.647699,928022.896531,963.33945,0.938214


When performing the cross validation with the second configuration, instead, we can easily notice that this time the average performance is worse than what we obtained before with the same parameters, and that it is also close to, or slightly worse than, the performance of the first configuration.

It is very likely that we were overfitting with this setting!

In [12]:
# Second configuration: tree depth limited to 50
model = DecisionTreeRegressor(random_state=RANDOM_SEED, max_depth=50)

cv_results, metrics_mean, metrics = evaluate_model_cv(model, X=X_all, y=y_all)

Average MAE        449.634669
Average MSE     851410.364128
Average RMSE       920.191985
Average R^2          0.945848


In [13]:
# One column per metric, built from the dict returned by the last cross validation
cross_validation_metrics_df = pd.DataFrame.from_dict(metrics)
cross_validation_metrics_df

Unnamed: 0,MAE,MSE,RMSE,R^2
0,450.697395,811413.0,900.784666,0.947484
1,456.402806,775272.7,880.495705,0.953719
2,444.156313,772210.3,878.754958,0.946245
3,431.390782,748549.5,865.187551,0.952397
4,430.40481,781085.0,883.790146,0.954009
5,455.893788,798568.5,893.62659,0.949008
6,487.128257,1036121.0,1017.900109,0.938937
7,442.366733,810828.7,900.460293,0.950642
8,428.51503,801925.8,895.503097,0.944472
9,469.390782,1178129.0,1085.416736,0.921563


### Bonus: exploration of more complex models

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Hyperparameter grid: every max_depth is combined with every n_estimators
max_depths_to_evaluate = [2, 5, 10, 20, 50, 100, 150, 200]
n_estimators_to_evaluate = [10, 50, 100, 200]

results = []

for max_depth in max_depths_to_evaluate:
    for n_estimators in n_estimators_to_evaluate:
        # Fit a forest for this configuration on the training split
        model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=RANDOM_SEED,
        )
        model.fit(X_train, y_train)

        # Score its predictions on the test split
        predictions = model.predict(X_test)
        metrics = evaluate_model(y_test, predictions)

        # One row per configuration: hyperparameters first, then the metrics
        # (evaluate_model reports the coefficient of determination as 'R2')
        metric_values = tuple(metrics[key] for key in ('MAE', 'MSE', 'RMSE', 'R2'))
        results.append((max_depth, n_estimators) + metric_values)

results_rf_df = pd.DataFrame(
    results,
    columns=['max_depth', 'n_estimators', 'MAE', 'MSE', 'RMSE', 'R^2'],
)
results_rf_df

Unnamed: 0,max_depth,n_estimators,MAE,MSE,RMSE,R^2
0,2,10,989.321315,2717002.0,1648.333085,0.815574
1,2,50,998.607417,2738753.0,1654.917938,0.814098
2,2,100,988.765545,2675775.0,1635.77975,0.818372
3,2,200,985.220411,2659309.0,1630.738746,0.81949
4,5,10,497.377875,929933.2,964.330433,0.936878
5,5,50,489.87945,907769.9,952.769589,0.938382
6,5,100,489.76083,896788.8,946.989309,0.939127
7,5,200,488.698726,902682.4,950.095971,0.938727
8,10,10,360.933619,663318.9,814.443944,0.954975
9,10,50,344.695613,650005.5,806.229191,0.955879


In [15]:
# Name the model class explicitly instead of reading it from the `model`
# variable, which holds whichever estimator happened to be fitted last.
find_best_model(results_rf_df, model_name=RandomForestRegressor.__name__, hparameters=["max_depth", "n_estimators"])

Best performing RandomForestRegressor on MAE:
max_depth        50.000000
n_estimators    100.000000
MAE             339.110762

Best performing RandomForestRegressor on MSE:
max_depth           10.000000
n_estimators       100.000000
MSE             636766.497781

Best performing RandomForestRegressor on RMSE:
max_depth        10.000000
n_estimators    100.000000
RMSE            797.976502

Best performing RandomForestRegressor on R^2:
max_depth        10.000000
n_estimators    100.000000
R^2               0.956777



In [16]:
# Forest with 100 trees and depth limited to 50
model = RandomForestRegressor(n_estimators=100, max_depth=50, random_state=RANDOM_SEED)

cv_results, metrics_mean, metrics = evaluate_model_cv(model, X=X_all, y=y_all)

Average MAE        327.332707
Average MSE     468811.319010
Average RMSE       677.778126
Average R^2          0.970157


In [17]:
# Forest with 100 trees and depth limited to 10
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=RANDOM_SEED)

cv_results, metrics_mean, metrics = evaluate_model_cv(model, X=X_all, y=y_all)

Average MAE        336.426744
Average MSE     473637.014104
Average RMSE       681.668055
Average R^2          0.969847
