# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import optuna
from optuna.visualization import plot_parallel_coordinate
import seaborn as sns
from catboost import CatBoostRegressor
import edatk as eda
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

# Import Data
For additional details on the data source, see https://www.kaggle.com/mysarahmadbhat/wine-tasting

In [20]:
# Load the wine review data from a CSV file in the working directory and preview the first rows
df = pd.read_csv('winemag-data-130k-v2.csv')
df.head()

Unnamed: 0,id,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,Other,Other,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,Other,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
5,5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,Other,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem


In [6]:
# Summarize column dtypes and non-null counts to see where values are missing
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120975 entries, 1 to 129970
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     120975 non-null  int64  
 1   country                120916 non-null  object 
 2   description            120975 non-null  object 
 3   designation            86196 non-null   object 
 4   points                 120975 non-null  int64  
 5   price                  120975 non-null  float64
 6   province               120916 non-null  object 
 7   region_1               101400 non-null  object 
 8   region_2               50292 non-null   object 
 9   taster_name            96479 non-null   object 
 10  taster_twitter_handle  91559 non-null   object 
 11  title                  120975 non-null  object 
 12  variety                120974 non-null  object 
 13  winery                 120975 non-null  object 
dtypes: float64(1), int64(2), object(11)


In [4]:
# Remove rows whose target (price) is missing.
# .copy() makes the filtered result an independent frame, so the column
# assignments performed in later cells do not write into a slice of the
# originally loaded data (which pandas reports as SettingWithCopyWarning).
df = df.loc[~df.price.isna(), :].copy()

# Feature Selection and Train Test Split
For demonstration purposes, select just a few features, predicting price.

In [7]:
# Modelling setup: a small hand-picked feature subset (demonstration only) and the target
target_col = 'price'
feature_cols = [
    'country', 'points', 'province', 'region_1',
    'region_2', 'taster_name', 'variety', 'winery',
]

# Features stored with object dtype are treated as categorical;
# their missing values are replaced by an explicit 'Other' level.
cat_features = list(filter(lambda name: df[name].dtype == 'object', feature_cols))
for col in cat_features:
    df[col] = df[col].fillna('Other')

In [8]:
# Hold out 30% of the rows for testing; shuffle=False keeps the rows in their existing order
train_df, test_df = train_test_split(df, shuffle=False, test_size=0.3)

# Separate the model inputs from the target for each partition
train_x, train_y = train_df[feature_cols], train_df[target_col]
test_x, test_y = test_df[feature_cols], test_df[target_col]

# Default Parameter Catboost Train

In [9]:
# Reference point for the tuning experiment: CatBoost with default hyperparameters.
# The score is the mean of whatever eda.cross_validate_custom returns for the
# training split (presumably one MAPE value per fold - confirm against edatk).
model = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    verbose=False,
    cat_features=cat_features,
    random_state=42,
)
default_cv_scores = eda.cross_validate_custom(train_x, train_y, model, mean_absolute_percentage_error)
default_train_score = np.mean(default_cv_scores)
print('Training with default parameters results in a training score of {:.3f}.'.format(default_train_score))

Training with default parameters results in a training score of 0.298.


# Optuna Train
Train a hyperparameter-optimized gradient-boosted tree (GBT) model, using Optuna to search the CatBoost hyperparameters.

In [11]:
def objective(trial):
    """Optuna objective: score one CatBoost hyperparameter configuration.

    Samples learning_rate, depth and n_estimators from ``trial``, builds a
    CatBoostRegressor from those values plus the fixed settings, and returns
    the mean of the scores produced by eda.cross_validate_custom on the
    training data with mean_absolute_percentage_error as the metric.
    """
    # Hyperparameters proposed by Optuna for this trial
    sampled = {
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2),
        'depth': trial.suggest_int('depth', 2, 12),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
    }

    # Settings held constant across every trial
    fixed = {
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'verbose': False,
        'cat_features': cat_features,
        'random_state': 42,
    }

    regressor = CatBoostRegressor(**fixed, **sampled)
    fold_scores = eda.cross_validate_custom(train_x, train_y, regressor, mean_absolute_percentage_error)
    return np.mean(fold_scores)

In [12]:
# Create an optuna study (minimize cost) and run the optimizer for up to 30 minutes.
# The sampler is seeded so the sequence of suggested hyperparameters is repeatable
# between runs; how many trials complete still depends on how fast they run
# within the timeout.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, timeout=1800)

[32m[I 2021-09-27 15:10:03,100][0m A new study created in memory with name: no-name-4ac556dc-a71f-40c7-ad36-250176c27ef1[0m
[32m[I 2021-09-27 15:24:03,097][0m Trial 0 finished with value: 0.29305524979623054 and parameters: {'learning_rate': 0.06805987625877713, 'depth': 9, 'n_estimators': 850}. Best is trial 0 with value: 0.29305524979623054.[0m
[32m[I 2021-09-27 15:30:27,723][0m Trial 1 finished with value: 0.29477551971365823 and parameters: {'learning_rate': 0.14046624315090983, 'depth': 9, 'n_estimators': 450}. Best is trial 0 with value: 0.29305524979623054.[0m
[32m[I 2021-09-27 15:36:49,911][0m Trial 2 finished with value: 0.3015142789932671 and parameters: {'learning_rate': 0.17778768142416215, 'depth': 4, 'n_estimators': 900}. Best is trial 0 with value: 0.29305524979623054.[0m
[32m[I 2021-09-27 16:02:10,785][0m Trial 3 finished with value: 0.28846074806206023 and parameters: {'learning_rate': 0.0888813729642258, 'depth': 12, 'n_estimators': 800}. Best is trial 3

# Evaluate Training Results
Compare default parameters vs. optimized hyperparameters.

In [13]:
# Pull the best-scoring trial out of the finished study and report its score and parameters
best_trial_optuna = study.best_trial
best_score, best_params = best_trial_optuna.value, best_trial_optuna.params
print('Best score {:.3f}, with params {}'.format(best_score, best_params))

Best score 0.288, with params {'learning_rate': 0.0888813729642258, 'depth': 12, 'n_estimators': 800}


In [14]:
# Put the default-parameter score next to the best Optuna trial score
comparison_scores = (default_train_score, best_trial_optuna.value)
print('Default parameters resulted in a score of {:.3f} vs. Optuna hyperparameter optimization score of {:.3f}.'.format(*comparison_scores))

Default parameters resulted in a score of 0.298 vs. Optuna hyperparameter optimization score of 0.288.


In [15]:
# Visualize results to spot any hyperparameter trends:
# draws a parallel-coordinate plot from the trials recorded in the study
plot_parallel_coordinate(study)

# Evaluate Test Results

In [16]:
# Naive reference: predict the training-set mean price for every test row
preds_baseline = np.mean(train_y) + np.zeros_like(test_y)
baseline_model_score = mean_absolute_percentage_error(test_y, preds_baseline)
print('Baseline score (mean) is {:.2f}.'.format(baseline_model_score))

Baseline score (mean) is 0.79.


In [17]:
# Refit the default-parameter model on the complete training split,
# then measure its MAPE on the held-out test split
simple_model = model.fit(train_x, train_y)
default_test_preds = model.predict(test_x)
simple_model_score = mean_absolute_percentage_error(test_y, default_test_preds)
print('Default parameter model score is {:.2f}'.format(simple_model_score))

Default parameter model score is 0.30


In [18]:
# Rerun optimized model on full training set and score on test set.
# The fixed settings are merged into a new dict rather than assigned into the
# dict returned by best_trial_optuna.params, so the trial object keeps only the
# tuned hyperparameters if it is printed or inspected again afterwards.
params = {
    **best_trial_optuna.params,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'verbose': False,
    'cat_features': cat_features,
    'random_state': 42,
}
opt_model = CatBoostRegressor(**params)
opt_model.fit(train_x, train_y)
opt_model_score = mean_absolute_percentage_error(test_y, opt_model.predict(test_x))
print('Optimized model score is {:.2f}.'.format(opt_model_score))

Optimized model score is 0.29.
