In [1]:
import pickle
import pathlib

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
DATA_DIR = pathlib.Path.cwd().parent / 'aps-ML-nena-tets' / 'data'
clean_data_path = DATA_DIR / 'processed' / 'ames_clean.pkl'

with open(clean_data_path, 'rb') as file:
    data = pickle.load(file)

model_data = data.copy()

In [3]:
print(model_data.head())

  MS.SubClass MS.Zoning  Lot.Frontage  Lot.Area Lot.Shape Land.Contour  \
0          20        RL         141.0   31770.0       IR1          Lvl   
1          20        RH          80.0   11622.0       Reg          Lvl   
2          20        RL          81.0   14267.0       IR1          Lvl   
3          20        RL          93.0   11160.0       Reg          Lvl   
4          60        RL          74.0   13830.0       IR1          Lvl   

  Lot.Config Land.Slope Neighborhood Bldg.Type  ...  Sale.Type Sale.Condition  \
0     Corner        Gtl        NAmes      1Fam  ...  GroupedWD         Normal   
1     Inside        Gtl        NAmes      1Fam  ...  GroupedWD         Normal   
2     Corner        Gtl        NAmes      1Fam  ...  GroupedWD         Normal   
3     Corner        Gtl        NAmes      1Fam  ...  GroupedWD         Normal   
4     Inside        Gtl      Gilbert      1Fam  ...  GroupedWD         Normal   

  SalePrice Condition HasShed  HasAlley Exterior Garage.Age Remod.Ag

# Treinando diversos modelos

In [4]:
X = model_data.drop(columns='SalePrice') 
y = model_data['SalePrice']  

categorical_cols = X.select_dtypes(include=['category']).columns
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
boolean_cols = X.select_dtypes(include=['bool']).columns

X[boolean_cols] = X[boolean_cols].astype(int)

X[categorical_cols] = X[categorical_cols].astype(str)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols), 
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols) 
    ]
)

pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline_lr.fit(X_train, y_train)

y_train_pred = pipeline_lr.predict(X_train)
y_test_pred = pipeline_lr.predict(X_test)

print(f'Treino - MAE: {mean_absolute_error(y_train, y_train_pred)}')
print(f'Treino - RMSE: {np.sqrt(mean_squared_error(y_train, y_train_pred))}')
print(f'Treino - R²: {r2_score(y_train, y_train_pred)}\n')

print(f'Teste - MAE: {mean_absolute_error(y_test, y_test_pred)}')
print(f'Teste - RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred))}')
print(f'Teste - R²: {r2_score(y_test, y_test_pred)}\n')

Treino - MAE: 0.030285174650185293
Treino - RMSE: 0.04541472229192995
Treino - R²: 0.9310654624466183

Teste - MAE: 0.034072410411970774
Teste - RMSE: 0.05932517888682991
Teste - R²: 0.8808017679548166



In [5]:
# Modelos com parâmetros padrão
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Support Vector Regressor": SVR(),
    "Ridge Regressor": Ridge()
}

# Resultados iniciais
baseline_results = {}

for model_name, model in models.items():
    print(f"Training {model_name}...\n")

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),  # Seu preprocessor já definido
        ('model', model)
    ])
    
    # Treinamento e predição
    pipeline.fit(X_train, y_train)
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)
    
    # Métricas
    baseline_results[model_name] = {
        "Train RMSE": np.sqrt(mean_squared_error(y_train, y_train_pred)),
        "Test RMSE": np.sqrt(mean_squared_error(y_test, y_test_pred)),
        "Test R²": r2_score(y_test, y_test_pred),
    }

# Mostrar resultados iniciais
print("### Baseline Results ###\n")
for model_name, metrics in baseline_results.items():
    print(f"### {model_name} ###")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value}")
    print("\n")


Training Linear Regression...

Training Random Forest...

Training Gradient Boosting...

Training Support Vector Regressor...

Training Ridge Regressor...

### Baseline Results ###

### Linear Regression ###
Train RMSE: 0.04541472229192995
Test RMSE: 0.05932517888682991
Test R²: 0.8808017679548166


### Random Forest ###
Train RMSE: 0.022480502467526173
Test RMSE: 0.057519955141931936
Test R²: 0.887945635102393


### Gradient Boosting ###
Train RMSE: 0.03873644767480843
Test RMSE: 0.053033946548600344
Test R²: 0.9047424090177785


### Support Vector Regressor ###
Train RMSE: 0.05287070253236144
Test RMSE: 0.06882234823421182
Test R²: 0.8395828949563684


### Ridge Regressor ###
Train RMSE: 0.04555369092840306
Test RMSE: 0.058516808776451684
Test R²: 0.8840280475938952




In [6]:
# 1. Escolha os modelos mais promissores
promising_models = {
    "Random Forest": {
        "model": RandomForestRegressor(random_state=42),
        "param_grid": {
            'model__n_estimators': [50, 100, 200],
            'model__max_depth': [10, 20, None],
            'model__min_samples_split': [2, 5, 10]
        }
    },
    "Gradient Boosting": {
        "model": GradientBoostingRegressor(random_state=42),
        "param_grid": {
            'model__n_estimators': [50, 100, 200],
            'model__learning_rate': [0.01, 0.1, 0.2],
            'model__max_depth': [3, 5, 7]
        }
    },
    "Support Vector Regressor": {
        "model": SVR(),
        "param_grid": {
            'model__C': [0.1, 1, 10],
            'model__epsilon': [0.1, 0.2, 0.5],
            'model__kernel': ['linear', 'rbf']
        }
    }
}

# 2. Aplicar Grid Search
grid_search_results = {}

for model_name, model_info in promising_models.items():
    print(f"Running Grid Search for {model_name}...\n")
    
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),  
        ('model', model_info["model"])
    ])
    
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=model_info["param_grid"],
        scoring='neg_mean_squared_error',
        cv=5, 
        n_jobs=-1
    )
    
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    
    grid_search_results[model_name] = {
        "Best Params": grid_search.best_params_,
        "Train RMSE": np.sqrt(mean_squared_error(y_train, y_train_pred)),
        "Test RMSE": np.sqrt(mean_squared_error(y_test, y_test_pred)),
        "Test R²": r2_score(y_test, y_test_pred)
    }

# 3. Mostrar os resultados
print("### Grid Search Results ###\n")
for model_name, metrics in grid_search_results.items():
    print(f"### {model_name} ###")
    print(f"Best Params: {metrics['Best Params']}")
    print(f"Train RMSE: {metrics['Train RMSE']}")
    print(f"Test RMSE: {metrics['Test RMSE']}")
    print(f"Test R²: {metrics['Test R²']}\n")

Running Grid Search for Random Forest...

Running Grid Search for Gradient Boosting...

Running Grid Search for Support Vector Regressor...

### Grid Search Results ###

### Random Forest ###
Best Params: {'model__max_depth': 20, 'model__min_samples_split': 5, 'model__n_estimators': 100}
Train RMSE: 0.02527287556739265
Test RMSE: 0.057699604475224694
Test R²: 0.8872445939350458

### Gradient Boosting ###
Best Params: {'model__learning_rate': 0.2, 'model__max_depth': 3, 'model__n_estimators': 200}
Train RMSE: 0.024237288989628153
Test RMSE: 0.05133049231945161
Test R²: 0.9107634933811777

### Support Vector Regressor ###
Best Params: {'model__C': 1, 'model__epsilon': 0.1, 'model__kernel': 'rbf'}
Train RMSE: 0.05287070253236144
Test RMSE: 0.06882234823421182
Test R²: 0.8395828949563684

