In [None]:
import pickle
import pathlib

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
DATA_DIR = pathlib.Path.cwd().parent / 'aps-ML-nena-tets' / 'data'
clean_data_path = DATA_DIR / 'processed' / 'ames_clean.pkl'

with open(clean_data_path, 'rb') as file:
    data = pickle.load(file)

model_data = data.copy()

In [9]:
print(model_data.head())

  MS.SubClass MS.Zoning  Lot.Frontage  Lot.Area Lot.Shape Land.Contour  \
0          20        RL         141.0   31770.0       IR1          Lvl   
1          20        RH          80.0   11622.0       Reg          Lvl   
2          20        RL          81.0   14267.0       IR1          Lvl   
3          20        RL          93.0   11160.0       Reg          Lvl   
4          60        RL          74.0   13830.0       IR1          Lvl   

  Lot.Config Land.Slope Neighborhood Bldg.Type  ...  Sale.Type Sale.Condition  \
0     Corner        Gtl        NAmes      1Fam  ...  GroupedWD         Normal   
1     Inside        Gtl        NAmes      1Fam  ...  GroupedWD         Normal   
2     Corner        Gtl        NAmes      1Fam  ...  GroupedWD         Normal   
3     Corner        Gtl        NAmes      1Fam  ...  GroupedWD         Normal   
4     Inside        Gtl      Gilbert      1Fam  ...  GroupedWD         Normal   

  SalePrice Condition HasShed  HasAlley Exterior Garage.Age Remod.Ag

In [None]:
X = model_data.drop(columns='SalePrice') 
y = model_data['SalePrice']  

categorical_cols = X.select_dtypes(include=['category']).columns
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
boolean_cols = X.select_dtypes(include=['bool']).columns

X[boolean_cols] = X[boolean_cols].astype(int)

X[categorical_cols] = X[categorical_cols].astype(str)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols), 
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols) 
    ]
)

pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline_lr.fit(X_train, y_train)

y_train_pred = pipeline_lr.predict(X_train)
y_test_pred = pipeline_lr.predict(X_test)

print(f'Treino - MAE: {mean_absolute_error(y_train, y_train_pred)}')
print(f'Treino - RMSE: {np.sqrt(mean_squared_error(y_train, y_train_pred))}')
print(f'Treino - R²: {r2_score(y_train, y_train_pred)}\n')

print(f'Teste - MAE: {mean_absolute_error(y_test, y_test_pred)}')
print(f'Teste - RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred))}')
print(f'Teste - R²: {r2_score(y_test, y_test_pred)}\n')

Treino - MAE: 0.030285174650185293
Treino - RMSE: 0.04541472229192995
Treino - R²: 0.9310654624466183

Teste - MAE: 0.034072410411970774
Teste - RMSE: 0.05932517888682991
Teste - R²: 0.8808017679548166

