In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Load the defect records (the CSV is decoded as Latin-1)
data = pd.read_csv('./datasets/defects_data.csv', encoding='latin1')

# Column to predict
target = 'repair_cost'

# Columns used as model inputs
features = [
    'defect_type',
    'defect_location',
    'severity',
    'inspection_method',
    'product_id',
]

# Separate the target vector (y) from the feature matrix (X)
y = data[target]
X = data[features]

# Preprocessing
# One-hot encode the categorical columns and standardize the numeric column.
categorical_features = ['defect_type', 'defect_location', 'severity', 'inspection_method']
# NOTE(review): product_id looks like an identifier rather than a measured
# quantity; treating it as a continuous feature may only add noise — confirm.
numerical_features = ['product_id']

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category that shows up only in a test split
        # or a cross-validation fold is encoded as all zeros instead of raising
        # at transform time (the encoder's default, 'error', raises ValueError).
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ])

# Chain the preprocessing and a Random Forest regressor into one estimator so
# the encoder/scaler are re-fitted on each training split only.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the full pipeline (preprocessing + Random Forest) on the training rows only
pipeline.fit(train_x, train_y)

# Predict on both partitions so train and test error can be compared
train_pred = pipeline.predict(train_x)
test_pred = pipeline.predict(test_x)

# Score each partition with MAE, MSE and R2 (in that order)
train_mae, train_mse, train_r2 = (
    score(train_y, train_pred)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)
test_mae, test_mse, test_r2 = (
    score(test_y, test_pred)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report the metrics.
# NOTE(review): in the recorded run the train R2 is ~0.82 while the test R2 is
# negative, i.e. the model does worse than predicting the mean on unseen rows —
# the chosen features may carry little signal for repair_cost; confirm.
print(f'Treino - MAE: {train_mae}, MSE: {train_mse}, R2: {train_r2}')
print(f'Teste - MAE: {test_mae}, MSE: {test_mse}, R2: {test_r2}')

# 5-fold cross-validation of the whole pipeline on the full dataset, scored by R2.
# NOTE(review): an integer cv presumably yields unshuffled, contiguous folds for a
# regressor — if the CSV rows are ordered, a shuffled KFold may be fairer; confirm.
cv_scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring='r2', cv=5)
print(f'Validação cruzada - R2: {np.mean(cv_scores)}')

Treino - MAE: 100.75118939181546, MSE: 14769.540190536403, R2: 0.8226243254340891
Teste - MAE: 257.0404050779762, MSE: 97858.01557528839, R2: -0.14204325399024764
Validação cruzada - R2: -0.13874055002025099
