# Modelling

## Importando bibliotecas

In [11]:
import pathlib
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

## Importando os dados do notebook limpo

In [12]:
# The project's data folder: "data" under the parent of the current working directory.
DATA_DIR = pathlib.Path.cwd().parent.joinpath('data')
print(str(DATA_DIR))

c:\Users\alema\OneDrive\Documentos\insper\4sem\MachineLearing\ames-project\data


In [13]:
# Path to the pickled, feature-engineered dataset under DATA_DIR/processed.
clean_data_path = DATA_DIR.joinpath('processed', 'feature_engineering.pkl')

In [14]:
# Load the pickled DataFrame stored at clean_data_path.
# NOTE(review): unpickling can execute arbitrary code, so this file should only
# ever be one produced by this project's own pipeline — confirm its provenance.
df = pd.read_pickle(clean_data_path)
# Bare last expression so the notebook renders a preview of the first rows.
df.head()

Unnamed: 0,MS.SubClass_160,MS.SubClass_190,MS.SubClass_20,MS.SubClass_30,MS.SubClass_50,MS.SubClass_60,MS.SubClass_70,MS.SubClass_80,MS.SubClass_85,MS.SubClass_90,...,HasShed,HasAlley,Garage.Age,Remod.Age,SqFtPerRoom,Total_Home_Quality,Total_Bathrooms,GrAreaPerCar,TotalPorchSF,AvgQualCond
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,3.931826,50.0,1.964901,9.0,1.693147,3.706382,4.143135,4.5
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,3.912023,49.0,1.951001,9.0,1.0,6.799056,4.795791,4.5
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,3.970292,52.0,1.66034,10.0,1.346574,7.192934,3.610918,5.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,3.7612,42.0,1.370984,10.0,3.039721,3.827459,0.0,5.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,False,False,2.639057,12.0,1.387106,8.0,2.346574,3.698168,3.555348,4.0


## Separando as features (X) e a variável alvo (y)

In [15]:
# Features are every column except the target; the target is the SalePrice column.
X = df.drop(columns='SalePrice')
y = df.loc[:, 'SalePrice']

## Encontrando o melhor modelo

In [29]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# One seed shared by the data splits and by every stochastic estimator, so that
# re-running the notebook reproduces the same cross-validation scores.
RANDOM_STATE = 42

# First split keeps 70% for training; the held-back 30% is then halved into
# validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE)

# Candidate models as (label, unfitted estimator) pairs.
# The tree-based estimators are randomised internally, so they receive the
# shared seed; without it their scores change on every run.
regressors = [
    ("Linear", LinearRegression()),
    ("Lasso", Lasso()),
    ("Ridge", Ridge()),
    ("DecisionTree", DecisionTreeRegressor(random_state=RANDOM_STATE)),
    ("RandomForest", RandomForestRegressor(random_state=RANDOM_STATE))
]

# Hyper-parameter search spaces, keyed by the labels used in `regressors`.
# "Linear" has no entry because no hyper-parameters are tuned for it here.
param_grid = {
    "Lasso": {"alpha": [0.01, 0.1, 1, 10]},
    "Ridge": {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
    "DecisionTree": {"max_depth": [None, 30, 40, 50], "criterion": ["squared_error"]},
    "RandomForest": {"n_estimators": [100, 150, 200], "max_depth": [None, 10, 20, 30, 40]}
}

In [37]:
def display_scores(scores, name):
    """Print a labelled summary of a set of scores.

    Writes four lines to stdout: the label, the raw scores, their mean and
    their standard deviation, followed by one blank line.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` (e.g. a NumPy array).
        name: Label printed as the heading of the summary.

    Returns:
        None.
    """
    report = [
        f"{name}: ",
        "Scores: " + str(scores),
        "Mean: " + str(scores.mean()),
        # The trailing newline leaves a blank line between consecutive summaries.
        f"Standard deviation: {scores.std()}\n",
    ]
    print("\n".join(report))

def cross_val_evaluate(model, name, X=None, y=None, cv=10):
    """Cross-validate ``model`` and print its per-fold RMSE summary.

    Args:
        model: Estimator passed to ``cross_val_score``.
        name: Label shown in the printed summary.
        X: Feature matrix to cross-validate on. When omitted, the notebook's
            global ``X_train`` is used (the original behaviour).
        y: Target values aligned with ``X``. When omitted, the notebook's
            global ``y_train`` is used (the original behaviour).
        cv: ``cv`` argument forwarded to ``cross_val_score``; defaults to 10.

    Returns:
        None. The summary is printed through ``display_scores``.
    """
    # Fall back to the notebook-level training split only when the caller did
    # not say which data to use, so existing two-argument calls keep working.
    features = X_train if X is None else X
    target = y_train if y is None else y

    scores = cross_val_score(model, features, target, scoring="neg_mean_squared_error", cv=cv)
    # The scorer yields negated MSE values, so flip the sign before the root.
    rmse_scores = np.sqrt(-scores)
    display_scores(rmse_scores, name)

def RMSE_percentage(rmse, y):
    """Express an RMSE as a percentage of the observed range of ``y``.

    Args:
        rmse: Root-mean-squared error, in the same units as ``y``.
        y: Values whose spread (maximum minus minimum) normalises the error.

    Returns:
        ``rmse`` divided by ``max(y) - min(y)``, multiplied by 100. When every
        value in ``y`` is equal, the denominator is zero.
    """
    y_range = np.max(y) - np.min(y)
    fraction_of_range = rmse / y_range
    return fraction_of_range * 100

In [36]:
# Print a cross-validated RMSE summary for every candidate regressor.
for name, model in regressors:
    cross_val_evaluate(model=model, name=name)

Linear: 
Scores: [0.05782382 0.04634177 0.05400217 0.05939831 0.04664693 0.04419352
 0.04830075 0.04476289 0.04673915 0.05560846]
Mean: 0.050381778297261724
Standard deviation: 0.00542995259423805

Lasso: 
Scores: [0.11899857 0.12070626 0.13442687 0.12424038 0.12278046 0.11992811
 0.11785743 0.12176185 0.11498659 0.12938493]
Mean: 0.1225071450059115
Standard deviation: 0.005416652475649625

Ridge: 
Scores: [0.0567517  0.04591113 0.05346192 0.05969458 0.04626652 0.04386995
 0.04812431 0.04426176 0.04646448 0.05533666]
Mean: 0.05001430260422938
Standard deviation: 0.005451732057594576

DecisionTree: 
Scores: [0.07437792 0.08743151 0.09152887 0.09632071 0.08429005 0.08875562
 0.07885338 0.08529271 0.08008774 0.09450021]
Mean: 0.08614387295476236
Standard deviation: 0.006658081771234486

RandomForest: 
Scores: [0.05004846 0.05393463 0.06012618 0.06649246 0.05776233 0.055033
 0.05814668 0.05780704 0.05717196 0.06406199]
Mean: 0.05805847242181054
Standard deviation: 0.0045023810019272645



In [40]:
# Search the Ridge `alpha` grid with 5-fold cross-validation on the training
# split, scoring by negated MSE and using all available cores.
grid_search = GridSearchCV(
    estimator=Ridge(),
    param_grid=param_grid["Ridge"],
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
# Bare last expression so the notebook displays the selected estimator.
grid_search.best_estimator_

In [41]:
# Score the estimator chosen by the grid search on the test split.
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X_test)

# RMSE is the square root of the MSE between test targets and predictions.
final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)

# Report the raw RMSE and the RMSE relative to the range of the test targets.
print(
    f"RMSE: {final_rmse}",
    f"RMSE em porcentagem: {RMSE_percentage(final_rmse, y_test):.2f}%",
    sep="\n",
)

RMSE = 0.053280326844140316
RMSE em porcentagem: 4.52%


In [28]:
# NOTE(review): this entire cell is disabled — every line below is commented
# out. The output stored under it presumably comes from an earlier run.
# Points to resolve before re-enabling it:
#   * the "percentage" values divide each MSE by the number of rows in X,
#     which does not yield a percentage error;
#   * the test split is scored for every candidate model, so it would no
#     longer be an untouched hold-out set for the final evaluation.
# # Loop to train and tune the regressors
# for name, regressor in regressors:
#     if name in param_grid:
#         grid_search = GridSearchCV(regressor, param_grid[name], cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
#         grid_search.fit(X_train, y_train)
#         best_model = grid_search.best_estimator_
#     else:
#         best_model = regressor
#         best_model.fit(X_train, y_train)
    
#     # Evaluation on the validation set
#     y_pred_val = best_model.predict(X_val)
#     mse = mean_squared_error(y_val, y_pred_val)
    
#     print(f"Regressor: {name}")
#     print(f"Melhores Hiperparâmetros: {grid_search.best_params_ if name in param_grid else 'N/A'}")

#     # Check the best models for overfitting
#     y_pred_train = best_model.predict(X_train)
#     y_pred_test = best_model.predict(X_test)
#     mse_train = mean_squared_error(y_train, y_pred_train)
#     mse_test = mean_squared_error(y_test, y_pred_test)
    
#     print(f"MSE no Conjunto de Validação, Treinamento e Teste: {[mse, mse_train, mse_test]}")
#     total_samples = X.shape[0]
#     mse_percentage = [(mse / total_samples) * 100, (mse_train / total_samples) * 100, (mse_test / total_samples) * 100]
#     print(f"MSE no Conjunto de Validação, Treinamento e Teste em Porcentagem: {mse_percentage}")

#    print("\n")

Regressor: Linear
Melhores Hiperparâmetros: N/A
MSE no Conjunto de Validação, Treinamento e Teste: [0.002979053430892817, 0.002022085791427744, 0.0028236648698078785]
MSE no Conjunto de Validação, Treinamento e Teste em Porcentagem: [0.00010354721692362936, 7.02845252494871e-05, 9.814615466833086e-05]


Regressor: Lasso
Melhores Hiperparâmetros: {'alpha': 0.01}
MSE no Conjunto de Validação, Treinamento e Teste: [0.0066403367377674905, 0.005383126480520477, 0.005989073766287816]
MSE no Conjunto de Validação, Treinamento e Teste em Porcentagem: [0.00023080767249800105, 0.00018710901913522685, 0.0002081707947962397]


Regressor: Ridge
Melhores Hiperparâmetros: {'alpha': 10}
MSE no Conjunto de Validação, Treinamento e Teste: [0.0030682181640849546, 0.0021165685302321283, 0.0028387932286184194]
MSE no Conjunto de Validação, Treinamento e Teste em Porcentagem: [0.00010664644296437102, 7.356859681029296e-05, 9.867199265270835e-05]


Regressor: DecisionTree
Melhores Hiperparâmetros: {'criterio