# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import optuna
from optuna.visualization import plot_parallel_coordinate
import seaborn as sns
from catboost import CatBoostRegressor
import edatk as eda
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

# Import Data
For additional details on the data source, see https://www.kaggle.com/budnyak/wine-rating-and-price

In [2]:
# Load the red-wine ratings/prices CSV from the working directory and preview the first rows
df = pd.read_csv('red.csv')
df.head()

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016


In [3]:
# Summarise column dtypes and non-null counts before choosing features
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8666 entries, 0 to 8665
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             8666 non-null   object 
 1   Country          8666 non-null   object 
 2   Region           8666 non-null   object 
 3   Winery           8666 non-null   object 
 4   Rating           8666 non-null   float64
 5   NumberOfRatings  8666 non-null   int64  
 6   Price            8666 non-null   float64
 7   Year             8666 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 541.8+ KB


# Feature Selection and Train Test Split
For demonstration purposes, select just a few features, predicting price.

In [4]:
# Target to predict and the handful of predictor columns used for this demonstration
target_col = 'Price'
feature_cols = ['Country', 'Region', 'Winery', 'Rating', 'Year']

# Any selected column stored with the object dtype is treated as a categorical feature
feature_dtypes = df.dtypes.loc[feature_cols]
cat_features = feature_dtypes[feature_dtypes == 'object'].index.tolist()

In [5]:
# Hold out the last 30% of rows as the test set; shuffle=False keeps the file's row order
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.3)

# Separate predictors from the target for each partition
train_x, train_y = train_df[feature_cols], train_df[target_col]
test_x, test_y = test_df[feature_cols], test_df[target_col]

# Default Parameter Catboost Train

In [6]:
# Reference model: CatBoost with only the loss, metric, seed and categorical columns specified
model = CatBoostRegressor(
    cat_features=cat_features,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_state=42,
    verbose=False,
)

# Average the scores returned by edatk's cross_validate_custom, using MAPE as the metric
default_cv_scores = eda.cross_validate_custom(train_x, train_y, model, mean_absolute_percentage_error)
default_train_score = np.mean(default_cv_scores)
print('Training with default parameters results in a training score of {:.3f}.'.format(default_train_score))

Training with default parameters results in a training score of 0.367.


# Optuna Train
Train a gradient-boosted tree (GBT) model whose hyperparameters are tuned with Optuna.

In [7]:
def objective(trial):
    """Optuna objective: score one sampled CatBoost configuration.

    Draws learning_rate, depth and n_estimators from ``trial``, builds a
    CatBoostRegressor with them plus the fixed settings, and returns the mean
    of the scores produced by ``eda.cross_validate_custom`` on the training
    data with MAPE as the metric.
    """
    # Settings held constant across every trial
    fixed = {
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'verbose': False,
        'cat_features': cat_features,
        'random_state': 42,
    }

    # Hyperparameters proposed by the trial
    sampled = {
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2),
        'depth': trial.suggest_int('depth', 2, 12),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
    }

    # Build the regressor and average its scores
    regressor = CatBoostRegressor(**fixed, **sampled)
    cv_scores = eda.cross_validate_custom(train_x, train_y, regressor, mean_absolute_percentage_error)
    return np.mean(cv_scores)

In [8]:
# Create an optuna study (minimize cost) and run the optimizer.
# The sampler is given a fixed seed so the sequence of suggested hyperparameters is
# repeatable; how many trials complete still depends on what fits in the 1200 s timeout.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, timeout=1200)

[32m[I 2021-09-18 21:13:43,437][0m A new study created in memory with name: no-name-de127745-a332-4adb-8a12-3edc44e49bcc[0m
[32m[I 2021-09-18 21:15:48,840][0m Trial 0 finished with value: 0.36781949928392776 and parameters: {'learning_rate': 0.04371199264656335, 'depth': 6, 'n_estimators': 1000}. Best is trial 0 with value: 0.36781949928392776.[0m
[32m[I 2021-09-18 21:17:04,053][0m Trial 1 finished with value: 0.3736021455942898 and parameters: {'learning_rate': 0.053810144189057364, 'depth': 6, 'n_estimators': 600}. Best is trial 0 with value: 0.36781949928392776.[0m
[32m[I 2021-09-18 21:17:14,382][0m Trial 2 finished with value: 0.42232250589936904 and parameters: {'learning_rate': 0.11040314425303185, 'depth': 5, 'n_estimators': 100}. Best is trial 0 with value: 0.36781949928392776.[0m
[32m[I 2021-09-18 21:17:19,116][0m Trial 3 finished with value: 0.45686989397094485 and parameters: {'learning_rate': 0.19044543569044672, 'depth': 2, 'n_estimators': 100}. Best is trial

# Evaluate Training Results
Compare the default parameters against the Optuna-optimized hyperparameters.

In [9]:
# Fetch the study's best trial and report its objective value and hyperparameters
best_trial_optuna = study.best_trial
print(
    'Best score {:.3f}, with params {}'.format(
        best_trial_optuna.value,
        best_trial_optuna.params,
    )
)

Best score 0.364, with params {'learning_rate': 0.060136314568611265, 'depth': 9, 'n_estimators': 900}


In [10]:
# Put the default-parameter score next to the best score found by the Optuna study
print(
    'Default parameters resulted in a score of {:.3f} vs. Optuna hyperparameter optimization score of {:.3f}.'.format(
        default_train_score,
        best_trial_optuna.value,
    )
)

Default parameters resulted in a score of 0.367 vs. Optuna hyperparameter optimization score of 0.364.


In [11]:
# Parallel-coordinate plot of the study's trials, to spot trends between
# hyperparameter values and the objective value
plot_parallel_coordinate(study)

# Evaluate Test Results

In [13]:
# Naive baseline: predict the training-set mean of the target for every test row
mean_train_target = np.mean(train_y)
preds_baseline = np.full(test_y.shape, mean_train_target)
baseline_model_score = mean_absolute_percentage_error(test_y, preds_baseline)
print('Baseline score (mean) is {:.2f}.'.format(baseline_model_score))

Baseline score (mean) is 1.61.


In [14]:
# Refit the default-parameter model on the whole training set, then score it on the test set
simple_model = model.fit(train_x, train_y)
simple_model_preds = simple_model.predict(test_x)
simple_model_score = mean_absolute_percentage_error(test_y, simple_model_preds)
print('Default parameter model score is {:.2f}'.format(simple_model_score))

Default parameter model score is 0.35


In [15]:
# Rerun optimized model on full training set and score on test set.
# The tuned values are merged with the fixed settings into a new dict instead of adding
# keys to best_trial_optuna.params, so the trial object keeps only the tuned hyperparameters.
params = {
    **best_trial_optuna.params,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'verbose': False,
    'cat_features': cat_features,
    'random_state': 42,
}
opt_model = CatBoostRegressor(**params)
opt_model.fit(train_x, train_y)
opt_model_score = mean_absolute_percentage_error(test_y, opt_model.predict(test_x))
print('Optimized model score is {:.2f}.'.format(opt_model_score))

Optimized model score is 0.33.
