In [495]:
import pickle
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns


np.random.seed(42)

In [496]:
df = pd.read_pickle('../data/processed/ames_clean.pkl')
df

Unnamed: 0,MS.SubClass,MS.Zoning,Lot.Frontage,Lot.Area,Lot.Shape,Land.Contour,Lot.Config,Land.Slope,Neighborhood,Bldg.Type,...,Sale.Type,Sale.Condition,SalePrice,Condition,HasShed,HasAlley,Exterior,Garage.Age,Remod.Age,House.Age
0,20,RL,141.0,31770.0,IR1,Lvl,Corner,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.332438,Norm,False,False,BrkFace,50.0,50.0,50.0
1,20,RH,80.0,11622.0,Reg,Lvl,Inside,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.021189,Roads,False,False,VinylSd,49.0,49.0,49.0
2,20,RL,81.0,14267.0,IR1,Lvl,Corner,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.235528,Norm,False,False,Wd Sdng,52.0,52.0,52.0
3,20,RL,93.0,11160.0,Reg,Lvl,Corner,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.387390,Norm,False,False,BrkFace,42.0,42.0,42.0
4,60,RL,74.0,13830.0,IR1,Lvl,Inside,Gtl,Gilbert,1Fam,...,GroupedWD,Normal,5.278525,Norm,False,False,VinylSd,13.0,12.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,80,RL,37.0,7937.0,IR1,Lvl,CulDSac,Gtl,Mitchel,1Fam,...,GroupedWD,Normal,5.153815,Norm,False,False,HdBoard,22.0,22.0,22.0
2926,20,RL,68.0,8885.0,IR1,Low,Inside,Mod,Mitchel,1Fam,...,GroupedWD,Normal,5.117271,Norm,False,False,HdBoard,23.0,23.0,23.0
2927,85,RL,62.0,10441.0,Reg,Lvl,Inside,Gtl,Mitchel,1Fam,...,GroupedWD,Normal,5.120574,Norm,True,False,HdBoard,28.0,14.0,14.0
2928,20,RL,77.0,10010.0,Reg,Lvl,Inside,Mod,Mitchel,1Fam,...,GroupedWD,Normal,5.230449,Norm,False,False,HdBoard,31.0,31.0,32.0


In [497]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2877 entries, 0 to 2929
Data columns (total 70 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   MS.SubClass      2877 non-null   category
 1   MS.Zoning        2877 non-null   category
 2   Lot.Frontage     2877 non-null   float64 
 3   Lot.Area         2877 non-null   float64 
 4   Lot.Shape        2877 non-null   category
 5   Land.Contour     2877 non-null   category
 6   Lot.Config       2877 non-null   category
 7   Land.Slope       2877 non-null   category
 8   Neighborhood     2877 non-null   category
 9   Bldg.Type        2877 non-null   category
 10  House.Style      2877 non-null   category
 11  Overall.Qual     2877 non-null   category
 12  Overall.Cond     2877 non-null   category
 13  Roof.Style       2877 non-null   category
 14  Mas.Vnr.Type     2877 non-null   category
 15  Mas.Vnr.Area     2877 non-null   float64 
 16  Exter.Qual       2877 non-null   category
 17  

In [498]:
model_data = df.copy()

categorical_columns = []
ordinal_columns = []
for col in model_data.select_dtypes('category').columns:
    if model_data[col].cat.ordered:
        ordinal_columns.append(col)
    else:
        categorical_columns.append(col)

ordinal_columns

['Lot.Shape',
 'Land.Slope',
 'Overall.Qual',
 'Overall.Cond',
 'Exter.Qual',
 'Exter.Cond',
 'Heating.QC',
 'Electrical',
 'Kitchen.Qual',
 'Functional',
 'Paved.Drive',
 'Fence']

In [499]:
categorical_columns

['MS.SubClass',
 'MS.Zoning',
 'Land.Contour',
 'Lot.Config',
 'Neighborhood',
 'Bldg.Type',
 'House.Style',
 'Roof.Style',
 'Mas.Vnr.Type',
 'Foundation',
 'Bsmt.Qual',
 'Bsmt.Cond',
 'Bsmt.Exposure',
 'BsmtFin.Type.1',
 'BsmtFin.Type.2',
 'Central.Air',
 'Garage.Type',
 'Garage.Finish',
 'Sale.Type',
 'Sale.Condition',
 'Condition',
 'Exterior']

In [500]:
for col in ordinal_columns:
    codes, _ = pd.factorize(df[col], sort=True)
    model_data[col] = codes

In [501]:
model_data[ordinal_columns].info()

<class 'pandas.core.frame.DataFrame'>
Index: 2877 entries, 0 to 2929
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Lot.Shape     2877 non-null   int64
 1   Land.Slope    2877 non-null   int64
 2   Overall.Qual  2877 non-null   int64
 3   Overall.Cond  2877 non-null   int64
 4   Exter.Qual    2877 non-null   int64
 5   Exter.Cond    2877 non-null   int64
 6   Heating.QC    2877 non-null   int64
 7   Electrical    2877 non-null   int64
 8   Kitchen.Qual  2877 non-null   int64
 9   Functional    2877 non-null   int64
 10  Paved.Drive   2877 non-null   int64
 11  Fence         2877 non-null   int64
dtypes: int64(12)
memory usage: 292.2 KB


In [502]:
model_data = pd.get_dummies(model_data, drop_first=True).astype('float64')

In [503]:
model_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2877 entries, 0 to 2929
Columns: 165 entries, Lot.Frontage to Exterior_Other
dtypes: float64(165)
memory usage: 3.6 MB


In [504]:
model_data.head()

Unnamed: 0,Lot.Frontage,Lot.Area,Lot.Shape,Land.Slope,Overall.Qual,Overall.Cond,Mas.Vnr.Area,Exter.Qual,Exter.Cond,BsmtFin.SF.1,...,Exterior_BrkFace,Exterior_CemntBd,Exterior_HdBoard,Exterior_MetalSd,Exterior_Plywood,Exterior_Stucco,Exterior_VinylSd,Exterior_Wd Sdng,Exterior_WdShing,Exterior_Other
0,141.0,31770.0,1.0,0.0,5.0,4.0,112.0,2.0,2.0,639.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,80.0,11622.0,0.0,0.0,4.0,5.0,0.0,2.0,2.0,468.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,81.0,14267.0,1.0,0.0,5.0,5.0,108.0,2.0,2.0,923.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,93.0,11160.0,0.0,0.0,6.0,4.0,0.0,1.0,2.0,1065.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,74.0,13830.0,1.0,0.0,4.0,4.0,0.0,2.0,2.0,791.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [505]:
for cat in categorical_columns:
    dummies = []
    for col in model_data.columns:
        if col.startswith(cat + "_"):
            dummies.append(f'"{col}"')
    dummies_str = ', '.join(dummies)
    print(f'From column "{cat}" we made {dummies_str}\n')

From column "MS.SubClass" we made "MS.SubClass_30", "MS.SubClass_50", "MS.SubClass_60", "MS.SubClass_70", "MS.SubClass_80", "MS.SubClass_85", "MS.SubClass_90", "MS.SubClass_120", "MS.SubClass_160", "MS.SubClass_190", "MS.SubClass_Other"

From column "MS.Zoning" we made "MS.Zoning_RH", "MS.Zoning_RL", "MS.Zoning_RM"

From column "Land.Contour" we made "Land.Contour_HLS", "Land.Contour_Low", "Land.Contour_Lvl"

From column "Lot.Config" we made "Lot.Config_CulDSac", "Lot.Config_FR2", "Lot.Config_FR3", "Lot.Config_Inside"

From column "Neighborhood" we made "Neighborhood_BrDale", "Neighborhood_BrkSide", "Neighborhood_ClearCr", "Neighborhood_CollgCr", "Neighborhood_Crawfor", "Neighborhood_Edwards", "Neighborhood_Gilbert", "Neighborhood_IDOTRR", "Neighborhood_MeadowV", "Neighborhood_Mitchel", "Neighborhood_NAmes", "Neighborhood_NPkVill", "Neighborhood_NWAmes", "Neighborhood_NoRidge", "Neighborhood_NridgHt", "Neighborhood_OldTown", "Neighborhood_SWISU", "Neighborhood_Sawyer", "Neighborhood_Sa

In [506]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X = model_data.drop('SalePrice', axis=1).copy()
y = model_data['SalePrice'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [507]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2301 entries, 1446 to 873
Columns: 164 entries, Lot.Frontage to Exterior_Other
dtypes: float64(164)
memory usage: 2.9 MB


In [508]:
linreg = LinearRegression()
ridge = Ridge()
lasso = Lasso()
elasticNet = ElasticNet()

linreg.fit(X_train, y_train)
ridge.fit(X_train, y_train)
lasso.fit(X_train, y_train)
elasticNet.fit(X_train, y_train)


In [509]:
def error_percent(rmse):
    return f"{100 * (10**rmse - 1):.2f}%"

In [510]:
y_pred_linreg = linreg.predict(X_test)
y_pred_ridge = ridge.predict(X_test)
y_pred_lasso = lasso.predict(X_test)
y_pred_elasticNet = elasticNet.predict(X_test)

rmse_linreg = np.sqrt(mean_squared_error(y_test, y_pred_linreg))
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
rmse_elasticNet = np.sqrt(mean_squared_error(y_test, y_pred_elasticNet))

print("Error percent for Linear Regression: ", error_percent(rmse_linreg))
print("Error percent for Ridge Regression: ", error_percent(rmse_ridge))
print("Error percent for Lasso Regression: ", error_percent(rmse_lasso))
print("Error percent for Elastic Net Regression: ", error_percent(rmse_elasticNet))

Error percent for Linear Regression:  14.37%
Error percent for Ridge Regression:  14.33%
Error percent for Lasso Regression:  26.50%
Error percent for Elastic Net Regression:  24.36%


In [511]:
corr_matrix = model_data.corr()
corr_matrix_target = corr_matrix['SalePrice']
sorted_corr_matrix_target = corr_matrix_target.sort_values(ascending=False)

print("Ordem das colunas mais correlacionadas com SalePrice: ")
for i in range(1, 165):
    print(f"{i}. {sorted_corr_matrix_target.index[i]} -> {sorted_corr_matrix_target[i]}")



Ordem das colunas mais correlacionadas com SalePrice: 
1. Overall.Qual -> 0.8255866359651403
2. Gr.Liv.Area -> 0.7028929040679912
3. Garage.Cars -> 0.6824754546487515
4. Garage.Area -> 0.6593089639956545
5. Total.Bsmt.SF -> 0.6247159839540278
6. X1st.Flr.SF -> 0.6059743338279805
7. Full.Bath -> 0.5790685450560715
8. Foundation_PConc -> 0.5502702978309144
9. TotRms.AbvGrd -> 0.49672296416871453
10. Fireplaces -> 0.4894646192557834
11. Mas.Vnr.Area -> 0.446062177443733
12. Garage.Type_Attchd -> 0.4073593379139668
13. BsmtFin.SF.1 -> 0.4072969820677565
14. MS.SubClass_60 -> 0.3884469898535596
15. Neighborhood_NridgHt -> 0.3790822437792921
16. Exterior_VinylSd -> 0.36350392318161273
17. Lot.Frontage -> 0.3447840799985114
18. Central.Air_Y -> 0.341858078079174
19. Open.Porch.SF -> 0.3346861920813722
20. Wood.Deck.SF -> 0.3326109417090465
21. Sale.Condition_Partial -> 0.3271696728060346
22. Bsmt.Qual_Gd -> 0.31765690807240116
23. Lot.Shape -> 0.31142559563909694
24. Half.Bath -> 0.3062856855

  print(f"{i}. {sorted_corr_matrix_target.index[i]} -> {sorted_corr_matrix_target[i]}")


In [512]:
correlations = model_data.corr()['SalePrice']

threshold = 0.03

low_correlation_columns = correlations[correlations.abs() < threshold].index.tolist()

for col in low_correlation_columns:
    print(f"Column {col} has correlation {correlations[col]}")
    model_data.drop(col, axis=1, inplace=True)


Column BsmtFin.SF.2 has correlation 0.012288888504329342
Column Bsmt.Half.Bath has correlation -0.02638928570318194
Column Misc.Val has correlation -0.010276462279623644
Column Yr.Sold has correlation -0.028055816164518528
Column MS.SubClass_80 has correlation -0.007830959896818342
Column Lot.Config_FR2 has correlation -0.004804627325899428
Column Lot.Config_FR3 has correlation 0.02439601200822354
Column Neighborhood_Mitchel has correlation -0.02920474411800953
Column House.Style_2.5Fin has correlation 0.021875611860970274
Column House.Style_2.5Unf has correlation 0.00313578463815101
Column House.Style_SLvl has correlation -0.01746056211448349
Column Roof.Style_Other has correlation -0.017611332159067374
Column BsmtFin.Type.2_ALQ has correlation 0.01144116556681228
Column BsmtFin.Type.2_LwQ has correlation -0.02997983115956374
Column Sale.Condition_Alloca has correlation -0.008155189622134674
Column Condition_Railroad has correlation -0.007335746430641308
Column Exterior_BrkFace has co

In [513]:
X = model_data.drop('SalePrice', axis=1).copy()
y = model_data['SalePrice'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

linreg = LinearRegression()
ridge = Ridge()
lasso = Lasso()
elasticNet = ElasticNet()

linreg.fit(X_train, y_train)
ridge.fit(X_train, y_train)
lasso.fit(X_train, y_train)
elasticNet.fit(X_train, y_train)

y_pred_linreg = linreg.predict(X_test)

y_pred_ridge = ridge.predict(X_test)
y_pred_lasso = lasso.predict(X_test)
y_pred_elasticNet = elasticNet.predict(X_test)

rmse_linreg = np.sqrt(mean_squared_error(y_test, y_pred_linreg))
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
rmse_elasticNet = np.sqrt(mean_squared_error(y_test, y_pred_elasticNet))

print("Error percent for Linear Regression: ", error_percent(rmse_linreg))
print("Error percent for Ridge Regression: ", error_percent(rmse_ridge))
print("Error percent for Lasso Regression: ", error_percent(rmse_lasso))
print("Error percent for Elastic Net Regression: ", error_percent(rmse_elasticNet))

Error percent for Linear Regression:  14.03%
Error percent for Ridge Regression:  14.03%
Error percent for Lasso Regression:  26.12%
Error percent for Elastic Net Regression:  24.00%


In [514]:
from sklearn.model_selection import GridSearchCV


model_linreg = LinearRegression()

parameters = {'fit_intercept': [True, False],
                'copy_X': [True, False],
                'n_jobs': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

grid = GridSearchCV(model_linreg, parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

grid.fit(X_train, y_train)

best_model = grid.best_estimator_

print("Melhor modelo Linear Regression: ",best_model)

y_pred = best_model.predict(X_train)

rmse = np.sqrt(mean_squared_error(y_train, y_pred))

print("Error percent for Linear Regression: ", error_percent(rmse))

Melhor modelo Linear Regression:  LinearRegression(n_jobs=1)
Error percent for Linear Regression:  12.03%


In [515]:
model_ridge = Ridge()

parameters = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'fit_intercept': [True, False],
                'tol': [0.0001, 0.001, 0.01, 0.1]}
grid = GridSearchCV(model_ridge, parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

grid.fit(X_train, y_train)

best_model = grid.best_estimator_
print("Melhor modelo Ridge: ",best_model)

best_params = grid.best_params_
print("Melhores parametros Ridge: ", best_params)

y_pred = best_model.predict(X_train)

rmse = np.sqrt(mean_squared_error(y_train, y_pred))

print("Error percent for Ridge Regression: ", error_percent(rmse))

Melhor modelo Ridge:  Ridge(alpha=10)
Melhores parametros Ridge:  {'alpha': 10, 'fit_intercept': True, 'tol': 0.0001}
Error percent for Ridge Regression:  12.12%


In [516]:
model_lasso = Lasso()

parameters = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

grid = GridSearchCV(model_lasso, parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

grid.fit(X_train, y_train)

best_model = grid.best_estimator_
print("Melhor modelo Lasso: ",best_model)

best_params = grid.best_params_
print("Melhores parametros Lasso: ", best_params)

y_pred = best_model.predict(X_train)

rmse = np.sqrt(mean_squared_error(y_train, y_pred))

print("Error percent for Lasso Regression: ", error_percent(rmse))

Melhor modelo Lasso:  Lasso(alpha=0.001)
Melhores parametros Lasso:  {'alpha': 0.001}
Error percent for Lasso Regression:  13.62%


In [517]:
model_elasticNet = ElasticNet()

parameters = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}

grid = GridSearchCV(model_elasticNet, parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

grid.fit(X_train, y_train)

best_model = grid.best_estimator_

print("Melhor modelo Elastic Net: ",best_model)

best_params = grid.best_params_

print("Melhores parametros Elastic Net: ", best_params)

y_pred = best_model.predict(X_train)

rmse = np.sqrt(mean_squared_error(y_train, y_pred))

print("Error percent for Elastic Net Regression: ", error_percent(rmse))

Melhor modelo Elastic Net:  ElasticNet(alpha=0.001, l1_ratio=0.1)
Melhores parametros Elastic Net:  {'alpha': 0.001, 'l1_ratio': 0.1}
Error percent for Elastic Net Regression:  12.20%


In [518]:
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

model_xgb = XGBRegressor()

# parameters = {'n_estimators': [100, 500, 1000],
#                 'learning_rate': [0.001, 0.01, 0.1, 1, 10, 100]}

# grid = GridSearchCV(model_xgb, parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

# grid.fit(X_train, y_train)

# best_model = grid.best_estimator_

# print("Melhor modelo XGBoost: ",best_model)

# best_params = grid.best_params_

# print("Melhores parametros XGBoost: ", best_params)

# y_pred = best_model.predict(X_train)

# rmse = np.sqrt(mean_squared_error(y_train, y_pred))

# print("Error percent for XGBoost: ", error_percent(rmse))\

pipe = Pipeline([('scaler', StandardScaler()), ('model', XGBRegressor())])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_train)

rmse = np.sqrt(mean_squared_error(y_train, y_pred))

print("Error percent for XGBoost: ", error_percent(rmse))

# validar p ver se ta overfitando
y_pred = pipe.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Error percent for XGBoost: ", error_percent(rmse))



Error percent for XGBoost:  1.71%
Error percent for XGBoost:  13.47%


  if is_sparse(data):
