In [1]:
from pathlib import Path

from IPython.display import display
import joblib
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score

In [2]:
import warnings
# Silence UserWarning messages raised from sklearn's encoder module so they do
# not flood the outputs of the cross-validation cells.
# NOTE(review): presumably this targets the warning OneHotEncoder emits when
# handle_unknown='ignore' is combined with drop='first' — confirm.
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn.preprocessing._encoders')

## 1. Obtenção de dados

In [3]:
# Locations of the raw wine table and of the data dictionary describing its columns.
raw_wines_path = '../data/raw/wines_SPA.csv'
data_dictionary_path = '../data/external/dict.csv'

# Load both tables; `df` holds the observations, `df_dict` the column metadata.
df = pd.read_csv(raw_wines_path)
df_dict = pd.read_csv(data_dictionary_path)

In [4]:
# Show the data dictionary: one row per variable with its description, type and subtype.
df_dict

Unnamed: 0,variavel,descricao,tipo,subtipo
0,winery,Vinícola: local onde o vinho foi produzido,qualitativa,nominal
1,wine,vinho,qualitativa,nominal
2,year,ano,quantitativa,discreta
3,rating,avaliação: de 4.2 a 4.9,quantitativa,discreta
4,num_reviews,número de avaliações,quantitativa,discreta
5,country,país de origem,qualitativa,nominal
6,region,região do vinho,qualitativa,nominal
7,price,preço em euros,quantitativa,continua
8,type,tipo do vinho,qualitativa,nominal
9,body,"valor do ""corpo"" do vinho: de 2 a 5",quantitativa,discreta


In [5]:
# The 'year' column is read as text because some rows carry the marker 'N.V.'
# instead of a number. Blank out that marker and parse what remains as numbers,
# so missing years end up as NaN (the discrete pipeline imputes them later).
# Any other non-numeric text still raises, as the previous int cast did.
year_raw = df['year']
df['year'] = pd.to_numeric(year_raw.where(year_raw != 'N.V.'))

## 2. Preparação de dados

In [6]:
# Column to predict.
target_column = 'price'

# Feature groups come from the data dictionary, selected by variable subtype.
is_nominal = df_dict['subtipo'] == 'nominal'
is_discrete = df_dict['subtipo'] == 'discreta'
nominal_columns = df_dict.loc[is_nominal, 'variavel'].tolist()
discrete_columns = df_dict.loc[is_discrete, 'variavel'].tolist()

# Split the frame into the target vector and the feature matrix.
y = df[target_column]
X = df.drop(columns=[target_column])

In [7]:
# Nominal features: fill gaps with the most frequent value, one-hot encode,
# then standardize the resulting indicator columns.
nominal_steps = [
    ('missing', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')),
    ('normalization', StandardScaler()),
]
nominal_preprocessor = Pipeline(steps=nominal_steps)

# Discrete features: fill gaps with the median, then standardize.
discrete_steps = [
    ('missing', SimpleImputer(strategy='median')),
    ('normalization', StandardScaler()),
]
discrete_preprocessor = Pipeline(steps=discrete_steps)

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('nominal', nominal_preprocessor, nominal_columns),
    ('discrete', discrete_preprocessor, discrete_columns),
])

## 3. Seleção de modelos

In [8]:
# --- experiment settings ---
random_state = 42
test_size = .2
n_splits_comparative_analysis = 10  # outer ShuffleSplit repetitions
n_folds_grid_search = 5  # inner KFold folds used by the grid search
max_iter = 1000  # not referenced by any cell visible in this notebook
scoring = 'neg_mean_squared_error'  # metric the grid search optimizes

# Metrics reported by the comparative analysis. Error metrics are negated so
# that a larger value is always better.
metrics = dict(
    neg_mean_squared_error=make_scorer(mean_squared_error, greater_is_better=False),
    neg_mean_absolute_error=make_scorer(mean_absolute_error, greater_is_better=False),
    r2=make_scorer(r2_score),
)

# --- model settings ---
# Hyper-parameter grids, one per candidate model.
knn_grid = {"n_neighbors": range(3, 20, 2), 'weights': ['distance', 'uniform']}
tree_grid = {'criterion': ['squared_error', 'friedman_mse'], 'max_depth': [3, 6, 8]}
forest_grid = {
    'criterion': ['squared_error', 'friedman_mse'],
    'max_depth': [3, 6, 8],
    'n_estimators': [10, 20, 30],
}

# Each entry is (display name, unfitted estimator, grid to search).
models = [
    ('K-Nearest Neighbors', KNeighborsRegressor(), knn_grid),
    ('Decision Tree', DecisionTreeRegressor(random_state=random_state), tree_grid),
    ('Random Forest', RandomForestRegressor(random_state=random_state), forest_grid),
]

In [9]:
# Nested evaluation: an inner K-fold grid search tunes each model, and an outer
# ShuffleSplit estimates how the tuned pipeline performs on held-out data.
cross_validate_grid_search = KFold(n_splits=n_folds_grid_search)
cross_validate_comparative_analysis = ShuffleSplit(
    n_splits=n_splits_comparative_analysis,
    test_size=test_size,
    random_state=random_state,
)

# One summary row per model is collected here and concatenated once after the
# loop, instead of growing a DataFrame with pd.concat on every iteration.
score_frames = []
for model_name, model_object, model_parameters in models:
    print(f"running {model_name}...")
    model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring=scoring,
        n_jobs=-1,
        cv=cross_validate_grid_search,
    )
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_grid_search),
    ])
    scores = cross_validate(
        estimator=approach,
        X=X,
        y=y,
        cv=cross_validate_comparative_analysis,
        scoring=metrics,
    )

    # Average every timing/metric column over the outer splits into a single row.
    scores_df = pd.DataFrame(scores).mean().to_frame().T
    scores_df['model_name'] = model_name
    display(scores_df)

    score_frames.append(scores_df)

# pd.concat rejects an empty list, so fall back to an empty frame when no
# model was evaluated.
results = pd.concat(score_frames, ignore_index=True) if score_frames else pd.DataFrame({})

display(results)

running K-Nearest Neighbors...


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2,model_name
0,12.068237,0.221743,-6650.012096,-13.158093,0.757842,K-Nearest Neighbors


running Decision Tree...


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2,model_name
0,5.487456,0.031977,-11102.858147,-21.468672,0.569711,Decision Tree


running Random Forest...


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2,model_name
0,140.422097,0.05935,-8770.586976,-19.087787,0.670495,Random Forest


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2,model_name
0,12.068237,0.221743,-6650.012096,-13.158093,0.757842,K-Nearest Neighbors
1,5.487456,0.031977,-11102.858147,-21.468672,0.569711,Decision Tree
2,140.422097,0.05935,-8770.586976,-19.087787,0.670495,Random Forest


### 3.1 Resultados gerais

In [10]:
def highlight_best(s, props=''):
    """Return per-cell CSS that marks the best value(s) of a Series.

    The smallest value is best when the Series name ends with 'time';
    otherwise the largest value is best. NaN entries are ignored when
    looking for the best value.

    Args:
        s: Series of values to compare; its ``name`` must be a string.
        props: CSS string assigned to the winning cell(s).

    Returns:
        Array with ``props`` at the position(s) of the best value and ''
        everywhere else.
    """
    lower_is_better = s.name.endswith('time')
    best_value = np.nanmin(s.values) if lower_is_better else np.nanmax(s.values)
    return np.where(s == best_value, props, '')

# Mean of every metric per model, transposed so each row is a metric and each
# column is a model.
summary_by_model = results.groupby('model_name').mean().T

# The best model per row is highlighted and all cells are centered.
best_cell_css = 'color:white;background-color:gray;font-weight: bold;'
centered_cells = [{'selector': 'td', 'props': 'text-align: center;'}]
styled_summary = (
    summary_by_model.style
    .apply(highlight_best, props=best_cell_css, axis=1)
    .set_table_styles(centered_cells)
)

display(styled_summary)

model_name,Decision Tree,K-Nearest Neighbors,Random Forest
fit_time,5.487456,12.068237,140.422097
score_time,0.031977,0.221743,0.05935
test_neg_mean_squared_error,-11102.858147,-6650.012096,-8770.586976
test_neg_mean_absolute_error,-21.468672,-13.158093,-19.087787
test_r2,0.569711,0.757842,0.670495


### 3.2 Persistência do Modelo

In [11]:
# Pick the winning model from the comparative analysis instead of hard-coding
# its name. The scores are "greater is better" (errors are negated), so the
# winner is the row with the largest value of the optimized metric.
best_model_name = results.loc[results[f'test_{scoring}'].idxmax(), 'model_name']
model_name, model_object, model_parameters = next(
    candidate for candidate in models if candidate[0] == best_model_name
)

# Same search configuration as in the comparative analysis, now run on all data.
model_grid_search = GridSearchCV(
    estimator=model_object,
    param_grid=model_parameters,
    scoring=scoring,
    n_jobs=-1,
    cv=cross_validate_grid_search,
)

approach = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model_grid_search),
])

# Fit preprocessing and hyper-parameter search on the full dataset.
approach.fit(X, y)

# Look the search step up by name rather than by position in the pipeline.
print(f"Hiper parâmetros do modelo: {approach.named_steps['model'].best_params_}")

Hiper parâmetros do modelo: {'n_neighbors': 3, 'weights': 'distance'}


In [12]:
# Save the fitted pipeline to disk. The target folder is created first so the
# dump also works when ../models does not exist yet.
model_path = Path('../models/model.joblib')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(approach, model_path)

['../models/model.joblib']