In [46]:
import pickle
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

np.random.seed(42)

In [47]:
df = pd.read_csv('ames.csv')
df.head()

Unnamed: 0,MS.SubClass,MS.Zoning,Lot.Frontage,Lot.Area,Lot.Shape,Land.Contour,Lot.Config,Land.Slope,Neighborhood,Bldg.Type,...,Sale.Type,Sale.Condition,SalePrice,Condition,HasShed,HasAlley,Exterior,Garage.Age,Remod.Age,House.Age
0,20,RL,141.0,31770.0,IR1,Lvl,Corner,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.332438,Norm,False,False,BrkFace,50.0,50.0,50.0
1,20,RH,80.0,11622.0,Reg,Lvl,Inside,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.021189,Roads,False,False,VinylSd,49.0,49.0,49.0
2,20,RL,81.0,14267.0,IR1,Lvl,Corner,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.235528,Norm,False,False,Wd Sdng,52.0,52.0,52.0
3,20,RL,93.0,11160.0,Reg,Lvl,Corner,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.38739,Norm,False,False,BrkFace,42.0,42.0,42.0
4,60,RL,74.0,13830.0,IR1,Lvl,Inside,Gtl,Gilbert,1Fam,...,GroupedWD,Normal,5.278525,Norm,False,False,VinylSd,13.0,12.0,13.0


In [48]:
df = pd.get_dummies(df)
df.head()

Unnamed: 0,Lot.Frontage,Lot.Area,Overall.Qual,Overall.Cond,Mas.Vnr.Area,BsmtFin.SF.1,BsmtFin.SF.2,Bsmt.Unf.SF,Total.Bsmt.SF,X1st.Flr.SF,...,Exterior_BrkFace,Exterior_CemntBd,Exterior_HdBoard,Exterior_MetalSd,Exterior_Other,Exterior_Plywood,Exterior_Stucco,Exterior_VinylSd,Exterior_Wd Sdng,Exterior_WdShing
0,141.0,31770.0,6,5,112.0,639.0,0.0,441.0,1080.0,1656.0,...,1,0,0,0,0,0,0,0,0,0
1,80.0,11622.0,5,6,0.0,468.0,144.0,270.0,882.0,896.0,...,0,0,0,0,0,0,0,1,0,0
2,81.0,14267.0,6,6,108.0,923.0,0.0,406.0,1329.0,1329.0,...,0,0,0,0,0,0,0,0,1,0
3,93.0,11160.0,7,5,0.0,1065.0,0.0,1045.0,2110.0,2110.0,...,1,0,0,0,0,0,0,0,0,0
4,74.0,13830.0,5,5,0.0,791.0,0.0,137.0,928.0,928.0,...,0,0,0,0,0,0,0,1,0,0


In [49]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X = df.drop('SalePrice', axis=1).copy()
y = df['SalePrice'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [50]:
linreg = LinearRegression()
ridge = Ridge()
lasso = Lasso()
elasticNet = ElasticNet()

linreg.fit(X_train, y_train)
ridge.fit(X_train, y_train)
lasso.fit(X_train, y_train)
elasticNet.fit(X_train, y_train)


In [51]:
def error_percent(rmse):
    return f"{100 * (10**rmse - 1):.2f}%"

In [52]:
y_pred_linreg = linreg.predict(X_test)
y_pred_ridge = ridge.predict(X_test)
y_pred_lasso = lasso.predict(X_test)
y_pred_elasticNet = elasticNet.predict(X_test)

rmse_linreg = np.sqrt(mean_squared_error(y_test, y_pred_linreg))
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
rmse_elasticNet = np.sqrt(mean_squared_error(y_test, y_pred_elasticNet))

print("Error percent for Linear Regression: ", error_percent(rmse_linreg))
print("Error percent for Ridge Regression: ", error_percent(rmse_ridge))
print("Error percent for Lasso Regression: ", error_percent(rmse_lasso))
print("Error percent for Elastic Net Regression: ", error_percent(rmse_elasticNet))

Error percent for Linear Regression:  11507156270834950607332403543704486605689777495879060553728.00%
Error percent for Ridge Regression:  14.33%
Error percent for Lasso Regression:  26.50%
Error percent for Elastic Net Regression:  24.36%


In [70]:
corr_matrix = df.corr()
corr_matrix_target = corr_matrix['SalePrice']
sorted_corr_matrix_target = corr_matrix_target.sort_values(ascending=False)

print("10 colunas mais correlacionadas com SalePrice: ")
print(sorted_corr_matrix_target.head(10))


10 colunas mais correlacionadas com SalePrice: 
SalePrice           1.000000
Overall.Qual        0.825587
Gr.Liv.Area         0.702893
Garage.Cars         0.682475
Garage.Area         0.659309
Total.Bsmt.SF       0.624716
X1st.Flr.SF         0.605974
Full.Bath           0.579069
Foundation_PConc    0.550270
Bsmt.Qual_Ex        0.503091
Name: SalePrice, dtype: float64
