# **Análise comparativa de modelos**

In [1]:
from IPython.display import display, Markdown
import joblib
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor

from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate

import warnings

## **1. Obtenção de dados**

Nessa etapa obtemos novamente os arquivos brutos de dados e o dicionário antes de iniciar o pré-processamento.

In [2]:
df = pd.read_csv("../data/raw/wines_SPA.csv")
df_dict = pd.read_csv("../data/external/dictionary.csv")
df_dict

Unnamed: 0,variavel,descricao,tipo,subtipo
0,winery,Nome da vinícula,Qualitativa,Nominal
1,wine,Nome do vinho,Qualitativa,Nominal
2,year,Ano da safra do vinho,Quantitativa,Contínua
3,rating,Avaliação do vinho (1-5),Quantitativa,Discreta
4,num_reviews,Número de avaliações do vinho,Quantitativa,Discreta
5,country,País de origem do vinho,Qualitativa,Nominal
6,region,Região de origem do vinho,Qualitativa,Nominal
7,price,Preço do vinho em euros,Quantitativa,Contínua
8,type,Tipo do vinho,Qualitativa,Nominal
9,body,Classificação do corpo do vinho (1-5),Quantitativa,Discreta


## **2. Preparação de Dados**

Aqui realizamos a normalização, codificação e o tratamento de dados discrepantes e/ou faltantes dentro do conjunto de dados.

In [3]:
target_column = 'price'

nominal_columns = (
    df_dict
    .query("subtipo == 'Nominal'")
    .variavel
    .to_list()
)

continuous_columns = (
    df_dict
    .query("subtipo == 'Contínua' and variavel != @target_column")
    .variavel
    .to_list()
)

discreet_columns = (
     df_dict
    .query("subtipo == 'Discreta'")
    .variavel
    .to_list()
)

df[df.eq("N.V.")] = np.nan
x = df.drop(columns=[target_column], axis=1)
y = df[target_column]

In [4]:
# tratamento de dados discrepantes
nominal_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='most_frequent')), # tratamento de dados faltantes
    ('encoding', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='infrequent_if_exist')), # codificação de variáveis
    ('normalization', StandardScaler()) # normalização de dados
])

continuous_preprocessor = Pipeline([
    ('missing', IterativeImputer(estimator=LinearRegression())), # tratamento de dados faltantes
    ('normalization', StandardScaler()) # normalização de dados
])

discreet_preprocessor = Pipeline([
    ('missing', IterativeImputer(estimator=LinearRegression())), # tratamento de dados faltantes
    ('normalization', StandardScaler()) # normalização de dados
])

preprocessor = ColumnTransformer([
    ('nominal', nominal_preprocessor, nominal_columns),
    ('continuous', continuous_preprocessor, continuous_columns),
    ('discreet', discreet_preprocessor, discreet_columns)
])

In [5]:
#Aplicando o pré-processamento no conjunto de dados
X_transformed = preprocessor.fit_transform(df)
df_transformed = pd.DataFrame(X_transformed)

## **3. Seleção de Modelos**

Iremos análisar quatro modelos, que serão testados utilizando o método de validação cruzada k-fold. São eles:

* **Logistic Regression (LR)**
* **Support Vector Regression (SVR)**
* **K-Nearest-Neighbors (KNN)**
* **SGDRegressor**

Além disso, cada um desses algoritmos será testado com diferentes hiperparâmetros, a fim de encontrar o melhor modelo e a configuração ideal para ele.

As métricas a serem utilizadas na análise são as seguintes:

* **Erro quadrático médio (mean squared error, MSE):** Uma métrica de risco correspondente ao valor esperado do erro ou perda quadrática.
* **Erro médio absoluto (mean absolute error, MAE):** Calcula o erro absoluto médio.
* **Coeficiente de determinação (R2 score):** Representa a proporção da variância (de y) que foi explicada pelas variáveis independentes do modelo.
* **Erro percentual absoluto médio (mean absolute percentage error, MAPE):** Mede a precisão de um modelo de regressão ao calcular a média dos erros absolutos como uma porcentagem dos valores reais.

### **3.1. Configuração do experimento**

In [6]:
n_splits_comparative_analysis = 10
n_folds_grid_search = 5
test_size = 0.2
random_state = 42
scoring = 'neg_mean_absolute_error'
metrics = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2', 'neg_mean_absolute_percentage_error']

### **3.2. Configuração dos modelos**

In [7]:
models = [
    (
        'Linear Regression',
        LinearRegression(),
        {} 
    ),
    (
        'Support Vector Regression',
         SVR(max_iter=100),
         {"kernel": ["linear", "rbf"], 'C':[1, 10, 100, 1000],'gamma': [0.0001, 0.001, 0.01, 0.1, 1]},
    ),
    (
         'K-Nearest-Neighbors',
         KNeighborsRegressor(),
      {'n_neighbors': [3, 5, 7, 10, 15], 'weights': ['uniform', 'distance'],'p': [1, 2]}
    ),
    (
        'SGDRegressor',
         SGDRegressor(max_iter=100, early_stopping=True),
         {'penalty': [None, 'l2'], 'alpha': [0.0001, 0.01, 0.1], 'l1_ratio': [0.1, 0.9]}
     )
]

### **3.3. Análise Comparativa**

In [8]:
warnings.filterwarnings("ignore")  # Ignora os avisos sobre as categorias desconhecidas

In [9]:
results = pd.DataFrame({})
cross_validate_grid_search = KFold(n_splits=n_folds_grid_search)
cross_validate_comparative_analysis = ShuffleSplit(n_splits=n_splits_comparative_analysis, test_size=test_size, random_state=random_state)

for model_name, model_object, model_parameters in models:
    print(f"Running {model_name}...")
    model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring=scoring,
        n_jobs=1,
        cv=cross_validate_grid_search
    )
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_grid_search)
    ])
    scores = cross_validate(
        estimator=approach,
        X=x,
        y=y,
        cv=cross_validate_comparative_analysis,
        n_jobs=1,
        scoring=metrics
    )
    scores_df = pd.DataFrame(scores)
    scores_df['model_name'] = model_name
    display(scores_df.agg(['mean', 'std']))
    results = pd.concat([results, scores_df], ignore_index=True)

Running Linear Regression...


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2,test_neg_mean_absolute_percentage_error
mean,17.376328,0.090048,-1.5024890000000002e+31,-567050700000000.0,-7.0846e+26,-17544490000000.0
std,3.321295,0.019448,1.0032400000000001e+31,102312500000000.0,7.20692e+26,3436411000000.0


Running Support Vector Regression...


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2,test_neg_mean_absolute_percentage_error
mean,140.192339,0.19899,-25273.592425,-106.583434,0.029959,-3.648213
std,6.639944,0.062854,11989.246877,50.01915,0.348563,2.062539


Running K-Nearest-Neighbors...


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2,test_neg_mean_absolute_percentage_error
mean,192.564879,0.373604,-6661.260276,-13.1745,0.757483,-0.108819
std,28.099024,0.106897,3282.981468,2.160287,0.090482,0.011252


Running SGDRegressor...


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2,test_neg_mean_absolute_percentage_error
mean,18.380991,0.099055,-8.583387e+21,-10366510000.0,-3.391654e+17,-321489200.0
std,0.162376,0.001345,8.34342e+21,5809302000.0,3.439056e+17,191264900.0


## **4. Resultados e discussão**

In [10]:
def highlight_best(s, props=''):
    if s.name[1] != 'std':
        if s.name[0].endswith('time'):
            return np.where(s == np.nanmin(s.values), props, '')
        return np.where(s == np.nanmax(s.values), props, '')

display(Markdown(""))
(
    results
    .groupby('model_name')
    .agg(['mean', 'std']).T
    .style
    .apply(highlight_best, props='color:white;background-color:gray;font-weight: bold;', axis=1)
    .set_table_styles([{'selector': 'td', 'props': 'text-align: center;'}])
)



Unnamed: 0,model_name,K-Nearest-Neighbors,Linear Regression,SGDRegressor,Support Vector Regression
fit_time,mean,192.564879,17.376328,18.380991,140.192339
fit_time,std,28.099024,3.321295,0.162376,6.639944
score_time,mean,0.373604,0.090048,0.099055,0.19899
score_time,std,0.106897,0.019448,0.001345,0.062854
test_neg_mean_squared_error,mean,-6661.260276,-1.5024890432974743e+31,-8.583386978809616e+21,-25273.592425
test_neg_mean_squared_error,std,3282.981468,1.003239870456171e+31,8.343419803052355e+21,11989.246877
test_neg_mean_absolute_error,mean,-13.1745,-567050691615627.0,-10366513828.937042,-106.583434
test_neg_mean_absolute_error,std,2.160287,102312531170685.69,5809301543.405813,50.01915
test_r2,mean,0.757483,-7.084599711668029e+26,-3.3916540920652314e+17,0.029959
test_r2,std,0.090482,7.206919512324389e+26,3.439055959489075e+17,0.348563


Como pode ser visto, o classificador K-Nearest-Neighbors obteve melhores resultados para todas as métricas, portanto, podemos obter os melhores parâmetros deste modelo e salvá-lo em disco para utilização em uma próxima etapa.

### **4.1. Persistência do Modelo**

In [11]:
#Obtem o modelo e os parametros ganhadores
model_name, model_object, model_parameters  = [foo for foo in models if foo[0] == "K-Nearest-Neighbors"][0] 


model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring=scoring,
        n_jobs=1,
        cv=cross_validate_grid_search
    )

approach = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model_grid_search)
])

approach.fit(x, y) #Seleciona o approach

print(f"Hiper parâmetros do modelo: {approach.steps[1][1].best_params_}")

Hiper parâmetros do modelo: {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}


In [12]:
joblib.dump(approach, '../models/model.joblib') # Salva o modelo em disco

['../models/model.joblib']