<font size="30">Análise Comparativa dos Dados</font>

## 1. Obtenção de Dados

### 1.1 Importação de bibliotecas

In [1]:
import pandas as pd
from IPython.display import display, Markdown, HTML
import joblib
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import classification_report
from sklearn.exceptions import UndefinedMetricWarning

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

### 1.2 Importação dos Dados

Nesta etapa, carregamos novamente os dados brutos e o dicionário de dados antes de iniciar o pré-processamento, pois ambos são utilizados nas etapas seguintes e também são necessários para a visualização.

In [2]:
# Read the raw orange-quality dataset; the path is relative to the notebook's folder.
caminho = '../data/raw/Orange_Quality_Data.csv'
laranjas = pd.read_csv(filepath_or_buffer=caminho)

laranjas

Unnamed: 0,Size (cm),Weight (g),Brix (Sweetness),pH (Acidity),Softness (1-5),HarvestTime (days),Ripeness (1-5),Color,Variety,Blemishes (Y/N),Quality (1-5)
0,7.5,180,12.0,3.2,2.0,10,4.0,Orange,Valencia,N,4.0
1,8.2,220,10.5,3.4,3.0,14,4.5,Deep Orange,Navel,N,4.5
2,6.8,150,14.0,3.0,1.0,7,5.0,Light Orange,Cara Cara,N,5.0
3,9.0,250,8.5,3.8,4.0,21,3.5,Orange-Red,Blood Orange,N,3.5
4,8.5,210,11.5,3.3,2.5,12,5.0,Orange,Hamlin,Y (Minor),4.5
...,...,...,...,...,...,...,...,...,...,...,...
236,8.0,194,10.9,3.6,5.0,13,1.0,Orange-Red,Tangerine,Y (Scars),5.0
237,7.4,275,8.5,3.5,5.0,20,5.0,Light Orange,Minneola (Hybrid),N,4.0
238,7.5,196,15.7,3.0,3.0,13,3.0,Deep Orange,Temple,Y (Minor Insect Damage),5.0
239,7.2,251,9.8,4.3,3.0,23,1.0,Light Orange,Moro (Blood),Y (Minor Insect Damage),3.0


In [3]:
# Read the data dictionary (variable name, description, type and subtype).
dicionario = pd.read_csv(filepath_or_buffer="../data/external/dicionario.csv")
dicionario

Unnamed: 0,variavel,descrição,tipo,subtipo
0,Size (cm),Tamanho da fruta em cm,Quantitativa,Contínua
1,Weight (g),Peso da fruta em g,Quantitativa,Contínua
2,Brix (Sweetness),Nível de doçura,Quantitativa,Contínua
3,pH (Acidity),Nível de acidez em pH,Quantitativa,Contínua
4,Softness (1-5),Maciez de 1-5,Quantitativa,Discreta
5,HarvestTime (days),Dias desde a colheita,Quantitativa,Discreta
6,Ripeness (1-5),Maduração de 1-5,Quantitativa,Discreta
7,Color,Cor da laranja,Qualitativa,Nominal
8,Variety,Variedade da laranja,Qualitativa,Nominal
9,Blemishes (Y/N),Defeito,Qualitativa,Nominal


In [4]:
target_column = 'Quality (1-5)'

# Feature names are taken from the data dictionary and split by declared type.
# The target is filtered out explicitly so it can never end up among the features.
is_feature = dicionario['variavel'] != target_column
is_qualitative = dicionario['tipo'] == 'Qualitativa'
is_quantitative = dicionario['tipo'] == 'Quantitativa'

qualitative_columns = dicionario.loc[is_qualitative & is_feature, 'variavel'].to_list()
quantitative_columns = dicionario.loc[is_quantitative & is_feature, 'variavel'].to_list()

# Feature matrix and target vector.
X = laranjas.drop(columns=target_column)
y = laranjas[target_column]

In [5]:
# Qualitative features: impute with the most frequent value, one-hot encode
# (dropping the first level of each variable), then standardize.
nominal_steps = [
    ('missing', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='infrequent_if_exist')),
    ('normalization', StandardScaler()),
]
nominal_preprocessor = Pipeline(steps=nominal_steps)

# Quantitative features: impute with the column mean, then standardize.
continuous_steps = [
    ('missing', SimpleImputer(strategy='mean')),
    ('normalization', StandardScaler()),
]
continuous_preprocessor = Pipeline(steps=continuous_steps)

# Route each group of columns to its own preprocessing pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('qualitative', nominal_preprocessor, qualitative_columns),
    ('quantitative', continuous_preprocessor, quantitative_columns),
])

In [13]:
# Experiment settings
n_splits_comparative_analysis = 3
n_folds_grid_search = 2
test_size = 0.3
random_state = 42
scoring = 'neg_mean_absolute_error'  # metric optimized by the inner grid search

# Metrics reported for every outer split
metrics = ['neg_mean_absolute_error', 'r2', 'max_error', 'explained_variance']

# Candidate models: (display name, estimator, hyper-parameter grid)
models = [
    ('Linear Regression', LinearRegression(), {}),
    ('Decision Tree', DecisionTreeRegressor(random_state=random_state), {'criterion': ['squared_error', 'absolute_error'], 'max_depth': [3, 6, 8]}),
    ('Random Forest', RandomForestRegressor(random_state=random_state), {'criterion': ['squared_error', 'absolute_error'], 'max_depth': [3, 6, 8], 'n_estimators': [10, 30]}),
    ('K Neighbors', KNeighborsRegressor(), {'n_neighbors': [1, 3, 5, 10]}),
]

# Inner splitter (hyper-parameter selection) and outer splitter (model comparison)
cross_validate_grid_search = KFold(n_splits=n_folds_grid_search)
cross_validate_comparative_analysis = ShuffleSplit(n_splits=n_splits_comparative_analysis, test_size=test_size, random_state=random_state)

# Per-model score frames are collected here and concatenated once after the loop,
# instead of growing a DataFrame with pd.concat on every iteration.
scores_per_model = []

for model_name, model_object, model_parameters in models:
    print(f"Running {model_name}...")

    model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring=scoring,
        n_jobs=2,
        cv=cross_validate_grid_search
    )

    # NOTE(review): the preprocessor is fitted on the whole outer-training split
    # before GridSearchCV splits it again, so the inner validation folds take part
    # in the imputation/scaling statistics used during hyper-parameter selection.
    # Confirm whether the grid search should wrap the full pipeline instead.
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_grid_search)
    ])
    scores = cross_validate(
        estimator=approach,
        X=X,
        y=y,
        cv=cross_validate_comparative_analysis,
        n_jobs=2,
        scoring=metrics
    )

    # One row per outer split; summarize before tagging rows with the model name
    fold_scores = pd.DataFrame(scores)
    df_scores = fold_scores.agg(['mean', 'std'])

    print(f"Results for {model_name}:")
    display(df_scores)

    scores_per_model.append(fold_scores.assign(model_name=model_name))

# Single concatenation; fall back to an empty frame when no model was configured
results = pd.concat(scores_per_model, ignore_index=True) if scores_per_model else pd.DataFrame()

# Show the final results
print("Final results:")
display(results)

Running Linear Regression...
Results for Linear Regression:




Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_r2,test_max_error,test_explained_variance
mean,0.087931,0.032073,-0.582637,0.364138,-2.274667,0.38881
std,0.008684,0.007806,0.057669,0.108898,0.274822,0.107925


Running Decision Tree...




Results for Decision Tree:




Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_r2,test_max_error,test_explained_variance
mean,0.167491,0.022849,-0.684267,-0.070019,-3.333333,0.027052
std,0.01133,0.003416,0.125484,0.57835,0.57735,0.412536


Running Random Forest...




Results for Random Forest:




Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_r2,test_max_error,test_explained_variance
mean,2.092908,0.02691,-0.503737,0.514445,-2.410542,0.529959
std,0.208287,0.005766,0.03209,0.046509,0.601895,0.033232


Running K Neighbors...




Results for K Neighbors:


Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_r2,test_max_error,test_explained_variance
mean,0.131053,0.03012,-0.691096,0.146897,-2.9,0.158655
std,0.003468,0.005632,0.045737,0.055749,0.173205,0.055401


Final results:


Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,test_r2,test_max_error,test_explained_variance,model_name
0,0.079379,0.040685,-0.517761,0.474956,-1.957357,0.504008,Linear Regression
1,0.087674,0.03007,-0.602075,0.360192,-2.43687,0.372381,Linear Regression
2,0.096741,0.025464,-0.628075,0.257267,-2.429775,0.290041,Linear Regression
3,0.156064,0.022401,-0.828767,-0.731875,-4.0,-0.440891,Decision Tree
4,0.167689,0.01968,-0.60274,0.338033,-3.0,0.338209,Decision Tree
5,0.178722,0.026467,-0.621295,0.183786,-3.0,0.183838,Decision Tree
6,2.212484,0.021921,-0.48189,0.517667,-1.731667,0.535248,Random Forest
7,2.21384,0.025586,-0.540579,0.466409,-2.878914,0.494398,Random Forest
8,1.8524,0.033222,-0.488742,0.559261,-2.621044,0.560229,Random Forest
9,0.13037,0.036384,-0.654795,0.125466,-3.0,0.154949,K Neighbors


In [14]:
def highlight_best(s, props=''):
    """Return per-cell CSS marking the best value(s) in one summary row.

    Intended for ``Styler.apply(..., axis=1)`` on a frame whose row labels are
    ``(metric, statistic)`` tuples.

    Parameters
    ----------
    s : pandas.Series
        One row of the summary table; ``s.name`` is ``(metric, statistic)``.
    props : str
        CSS declarations applied to the best cell(s).

    Returns
    -------
    numpy.ndarray
        One style string per element of ``s``: ``props`` for the best value(s)
        and ``''`` elsewhere. Rows whose statistic is ``'std'`` are never
        highlighted and get an explicit array of empty strings rather than an
        implicit ``None``.
    """
    if s.name[1] == 'std':
        return np.full(len(s), '', dtype=object)

    # Timing rows are best when smallest; every other row is best when largest
    # (the rows compared here are treated as greater-is-better).
    if s.name[0].endswith('time'):
        best = np.nanmin(s.values)
    else:
        best = np.nanmax(s.values)
    return np.where(s == best, props, '')

display(Markdown("### 3.3 Resultados gerais e discussão"))

# Mean and std of every score column per model, transposed so that each row is a
# (metric, statistic) pair and each column is a model.
summary_by_model = results.groupby('model_name').agg(['mean', 'std']).T

# Highlight the best model in each row and center the table cells.
summary_styler = summary_by_model.style.apply(
    highlight_best,
    axis=1,
    props='color:white;background-color:gray;font-weight: bold;',
)
summary_styler.set_table_styles([{'selector': 'td', 'props': 'text-align: center;'}])

### 3.3 Resultados gerais e discussão

Unnamed: 0,model_name,Decision Tree,K Neighbors,Linear Regression,Random Forest
fit_time,mean,0.167491,0.131053,0.087931,2.092908
fit_time,std,0.01133,0.003468,0.008684,0.208287
score_time,mean,0.022849,0.03012,0.032073,0.02691
score_time,std,0.003416,0.005632,0.007806,0.005766
test_neg_mean_absolute_error,mean,-0.684267,-0.691096,-0.582637,-0.503737
test_neg_mean_absolute_error,std,0.125484,0.045737,0.057669,0.03209
test_r2,mean,-0.070019,0.146897,0.364138,0.514445
test_r2,std,0.57835,0.055749,0.108898,0.046509
test_max_error,mean,-3.333333,-2.9,-2.274667,-2.410542
test_max_error,std,0.57735,0.173205,0.274822,0.601895


## Configuração do experimento

## Resultados 

## Discussão