# Modelling

## Importando bibliotecas

In [1]:
import pathlib
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

## Importando os dados do notebook limpo

In [2]:
DATA_DIR = pathlib.Path.cwd().parent / 'data'
print(DATA_DIR)

c:\Users\alema\OneDrive\Documentos\insper\4sem\MachineLearing\ames-project\data


In [3]:
clean_data_path = DATA_DIR / 'processed' / 'feature_engineering.pkl'

In [4]:
df = pd.read_pickle(clean_data_path)
df.head()

Unnamed: 0,MS.SubClass_160,MS.SubClass_190,MS.SubClass_20,MS.SubClass_30,MS.SubClass_50,MS.SubClass_60,MS.SubClass_70,MS.SubClass_80,MS.SubClass_85,MS.SubClass_90,...,HasShed,HasAlley,Garage.Age,Remod.Age,SqFtPerRoom,Total_Home_Quality,Total_Bathrooms,GrAreaPerCar,TotalPorchSF,AvgQualCond
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,3.931826,50.0,1.964901,9.0,1.693147,3.706382,4.143135,4.5
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,3.912023,49.0,1.951001,9.0,1.0,6.799056,4.795791,4.5
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,3.970292,52.0,1.66034,10.0,1.346574,7.192934,3.610918,5.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,3.7612,42.0,1.370984,10.0,3.039721,3.827459,0.0,5.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,False,False,2.639057,12.0,1.387106,8.0,2.346574,3.698168,3.555348,4.0


## Separando os dados de treino e teste

In [5]:
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

## Encontrando o melhor modelo

In [10]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

regressors = [
    ("Linear", LinearRegression()),
    ("Lasso", Lasso()),
    ("Ridge", Ridge()),
    ("DecisionTree", DecisionTreeRegressor()),
    ("RandomForest", RandomForestRegressor())
]

param_grid = {
    "Linear": {},
    "Lasso": {"alpha": [0.001, 0.01, 0.1, 1, 10]},
    "Ridge": {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
    "DecisionTree": {"max_depth": [None, 30, 40, 50], "criterion": ["squared_error"]},
    "RandomForest": {"n_estimators": [100, 150, 200], "max_depth": [None, 10, 20, 30, 40]}
}

In [7]:
def RMSE_percentage(rmse):
    return 100 * (10**rmse - 1)

In [15]:
for name, regressor in regressors:
    grid_search = GridSearchCV(regressor, param_grid[name], cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(name)
    print(f"RMSE: {rmse}")
    print(f"Melhores Hiperparâmetros: {grid_search.best_params_}")
    print(f"RMSE em porcentagem: {RMSE_percentage(rmse):.2f}%\n")

Linear
RMSE: 0.05313816773099989
Melhores Hiperparâmetros: {}
RMSE em porcentagem: 13.02%

Lasso
RMSE: 0.07738910625073671
Melhores Hiperparâmetros: {'alpha': 0.01}
RMSE em porcentagem: 19.51%

Ridge
RMSE: 0.053280326844140316
Melhores Hiperparâmetros: {'alpha': 10}
RMSE em porcentagem: 13.05%

DecisionTree
RMSE: 0.09066835451959425
Melhores Hiperparâmetros: {'criterion': 'squared_error', 'max_depth': 30}
RMSE em porcentagem: 23.22%

RandomForest
RMSE: 0.06058015480177196
Melhores Hiperparâmetros: {'max_depth': 20, 'n_estimators': 100}
RMSE em porcentagem: 14.97%



Modelo Linear apresentou menor RMSE após tunagem, dessa forma, foi escolhido como o modelo final

In [20]:
grid_search = GridSearchCV(LinearRegression(), param_grid=({}), cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
grid_search.best_estimator_

In [21]:
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X_test)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

print(f"RMSE: {final_rmse}")
print(f"RMSE em porcentagem: {RMSE_percentage(final_rmse):.2f}%")

RMSE: 0.05313816773099989
RMSE em porcentagem: 13.02%


## Avaliação do Modelo

Features mais importantes para o modelo:

In [22]:
coefficients = pd.DataFrame(final_model.coef_, X_train.columns, columns=['Coefficient'])
print(coefficients.abs().sort_values(by='Coefficient', ascending=False))

                        Coefficient
Gr.Liv.Area                0.178490
Bsmt.Qual_NA               0.130137
Garage.Type_NoGarage       0.091305
Sale.Condition_AdjLand     0.080786
Exterior_BrkFace           0.059230
...                             ...
Remod.Age                  0.000384
Garage.Finish_Unf          0.000348
Mo.Sold                    0.000317
Bedroom.AbvGr              0.000116
Garage.Area                0.000017

[157 rows x 1 columns]
