# **Análise comparativa de modelos**

In [72]:
from IPython.display import display, Markdown
import joblib
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor

from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate

## **1. Obtenção de dados**

Nessa etapa obtemos novamente os arquivos brutos de dados e o dicionário antes de iniciar o pré-processamento.

In [7]:
df = pd.read_csv("../data/raw/wines_SPA.csv")
df_dict = pd.read_csv("../data/external/dictionary.csv")
df_dict

Unnamed: 0,variavel,descricao,tipo,subtipo
0,winery,Nome da vinícula,Qualitativa,Nominal
1,wine,Nome do vinho,Qualitativa,Nominal
2,year,Ano da safra do vinho,Quantitativa,Contínua
3,rating,Avaliação do vinho (1-5),Quantitativa,Discreta
4,num_reviews,Número de avaliações do vinho,Quantitativa,Discreta
5,country,País de origem do vinho,Qualitativa,Nominal
6,region,Região de origem do vinho,Qualitativa,Nominal
7,price,Preço do vinho em euros,Quantitativa,Contínua
8,type,Tipo do vinho,Qualitativa,Nominal
9,body,Classificação do corpo do vinho (1-5),Quantitativa,Discreta


## **2. Preparação de Dados**

Aqui realizamos a normalização, codificação e o tratamento de dados discrepantes e/ou faltantes dentro do conjunto de dados.

In [57]:
target_column = 'price'

nominal_columns = (
    df_dict
    .query("subtipo == 'Nominal'")
    .variavel
    .to_list()
)

continuous_columns = (
    df_dict
    .query("subtipo == 'Contínua' and variavel != @target_column")
    .variavel
    .to_list()
)

discreet_columns = (
     df_dict
    .query("subtipo == 'Discreta'")
    .variavel
    .to_list()
)

x = df.drop(columns=[target_column], axis=1)
y = df[target_column]

In [58]:
def convert_to_numeric(df, columns):
    for col in columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

x = convert_to_numeric(x, continuous_columns + discreet_columns)

# tratamento de dados discrepantes
nominal_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='most_frequent')), # tratamento de dados faltantes
    ('encoding', OneHotEncoder(sparse_output=False, drop='first')), # codificação de variáveis
    ('normalization', StandardScaler()) # normalização de dados
])

continuous_preprocessor = Pipeline([
    ('missing', IterativeImputer(estimator=LinearRegression())), # tratamento de dados faltantes
    ('normalization', StandardScaler()) # normalização de dados
])

discreet_preprocessor = Pipeline([
    ('missing', IterativeImputer(estimator=LinearRegression())), # tratamento de dados faltantes
    ('normalization', StandardScaler()) # normalização de dados
])

preprocessor = ColumnTransformer([
    ('nominal', nominal_preprocessor, nominal_columns),
    ('continuous', continuous_preprocessor, continuous_columns),
    ('discreet', discreet_preprocessor, discreet_columns)
])

## **3. Seleção de Modelos**

Iremos análisar quatro modelos, que serão testados utilizando o método de validação cruzada k-fold. São eles:

* **Logistic Regression (LR)**
* **Support Vector Regression (SVR)**
* **K-Nearest-Neighbors (KNN)**
* **SGDRegressor**

Além disso, cada um desses algoritmos será testado com diferentes hiperparâmetros, a fim de encontrar o melhor modelo e a configuração ideal para ele.

As métricas a serem utilizadas na análise são as seguintes:

* **Erro quadrático médio (mean squared error, MSE):** Uma métrica de risco correspondente ao valor esperado do erro ou perda quadrática.
* **Erro médio absoluto (mean absolute error, MAE):** Calcula o erro absoluto médio.
* **Coeficiente de determinação (R2 score):** Representa a proporção da variância (de y) que foi explicada pelas variáveis independentes do modelo.
* **Erro percentual absoluto médio (mean absolute percentage error, MAPE):** Mede a precisão de um modelo de regressão ao calcular a média dos erros absolutos como uma porcentagem dos valores reais.

### **3.1. Configuração do experimento**

In [66]:
n_splits_comparative_analysis = 10
n_folds_grid_search = 5
test_size = 0.2
random_state = 42
scoring = 'neg_mean_absolute_error'
metrics = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2', 'neg_mean_absolute_percentage_error']

### **3.2. Configuração dos modelos**

In [70]:
models = [
    (
        'Linear Regression',
        LinearRegression(),
        {} 
    ),
    (
        'Support Vector Regression',
         SVR(),
         {"kernel": ["linear", "rbf"], 'C':[1, 10, 100, 1000],'gamma': [0.0001, 0.001, 0.01, 0.1, 1]},
    ),
    (
         'K-Nearest-Neighbors',
         KNeighborsRegressor(),
      {'n_neighbors': [3, 5, 7, 10, 15], 'weights': ['uniform', 'distance'],'p': [1, 2]}
    ),
    (
        'SGDRegressor',
         SGDRegressor(),
         {'penalty': ['none', 'l2', 'l1', 'elasticnet'], 'alpha': [0.0001, 0.001, 0.01, 0.1], 'l1_ratio': [0.1, 0.5, 0.9], 
          'learning_rate': ['constant', 'optimal', 'invscaling'],  'eta0': [0.001, 0.01, 0.1]}
     )
]

### **3.3. Análise Comparativa**

In [73]:
results = pd.DataFrame({})
cross_validate_grid_search = KFold(n_splits=n_folds_grid_search)
cross_validate_comparative_analysis = ShuffleSplit(n_splits=n_splits_comparative_analysis, test_size=test_size, random_state=random_state)

for model_name, model_object, model_parameters in models:
    print(f"Running {model_name}...")
    model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring=scoring,
        n_jobs=-1,
        cv=cross_validate_grid_search
    )
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_grid_search)
    ])
    scores = cross_validate(
        estimator=approach,
        X=x,
        y=y,
        cv=cross_validate_comparative_analysis,
        n_jobs=-1,
        scoring=metrics
    )
    scores_df = pd.DataFrame(scores)
    scores_df['model_name'] = model_name
    display(scores_df.agg(['mean', 'std']))
    results = pd.concat([results, scores_df], ignore_index=True)
    
    error_score='raise'

Running Logistic Regression...


ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/kailanyalves/.cache/pypoetry/virtualenvs/src-hucw_3_i-py3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/kailanyalves/.cache/pypoetry/virtualenvs/src-hucw_3_i-py3.10/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/kailanyalves/.cache/pypoetry/virtualenvs/src-hucw_3_i-py3.10/lib/python3.10/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/home/kailanyalves/.cache/pypoetry/virtualenvs/src-hucw_3_i-py3.10/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/kailanyalves/.cache/pypoetry/virtualenvs/src-hucw_3_i-py3.10/lib/python3.10/site-packages/sklearn/model_selection/_search.py", line 1018, in fit
    self._run_search(evaluate_candidates)
  File "/home/kailanyalves/.cache/pypoetry/virtualenvs/src-hucw_3_i-py3.10/lib/python3.10/site-packages/sklearn/model_selection/_search.py", line 1572, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
  File "/home/kailanyalves/.cache/pypoetry/virtualenvs/src-hucw_3_i-py3.10/lib/python3.10/site-packages/sklearn/model_selection/_search.py", line 995, in evaluate_candidates
    _warn_or_raise_about_fit_failures(out, self.error_score)
  File "/home/kailanyalves/.cache/pypoetry/virtualenvs/src-hucw_3_i-py3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 529, in _warn_or_raise_about_fit_failures
    raise ValueError(all_fits_failed_message)
ValueError: 
All the 15 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/home/kailanyalves/.cache/pypoetry/virtualenvs/src-hucw_3_i-py3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/kailanyalves/.cache/pypoetry/virtualenvs/src-hucw_3_i-py3.10/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/kailanyalves/.cache/pypoetry/virtualenvs/src-hucw_3_i-py3.10/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1231, in fit
    check_classification_targets(y)
  File "/home/kailanyalves/.cache/pypoetry/virtualenvs/src-hucw_3_i-py3.10/lib/python3.10/site-packages/sklearn/utils/multiclass.py", line 219, in check_classification_targets
    raise ValueError(
ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

