In which I throw the data into various models and see what produces the best results.

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('df_for_modeling.csv')

In [4]:
y = df['SalePrice']
X = df[['LotArea','OverallQual','OverallCond','YearBuilt','BedroomAbvGr','GarageCars','TotalSF','TotFullBath','TotHalfBath','has_Pool']]

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
cv_scores = cross_val_score(reg, X, y, cv=5)

print(f"Cross-validation scores (R-squared): {cv_scores}")
print(f"Average score: {np.mean(cv_scores)}")

Cross-validation scores (R-squared): [0.82599966 0.85967436 0.81782149 0.79911465 0.81864633]
Average score: 0.8242512992801357


In [11]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': np.linspace(0.1, 1, 10)}

lasso = Lasso()
lasso_cv = GridSearchCV(lasso, param_grid, cv=5)
lasso_cv.fit(X_train, y_train)

y_pred = lasso_cv.predict(X_test)
r2 = lasso_cv.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Winning parameters: {lasso_cv.best_params_}")
print(f"R-squared is: {r2}")
print(f"MSE is: {mse}")

Winning parameters: {'alpha': 0.1}
R-squared is: 0.8394802457521632
MSE is: 933204122.9849731


Interesting ... looks like a linear regression is actually pretty much the way to go, at least compared to a lasso.

In [12]:
from sklearn.linear_model import Ridge

param_grid = {'alpha': np.logspace(-4, 0, 20)}

ridge = Ridge()
ridge_cv = GridSearchCV(ridge, param_grid, cv=5)
ridge_cv.fit(X_train, y_train)

y_pred = ridge_cv.predict(X_test)
r2 = ridge_cv.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Winning parameters: {ridge_cv.best_params_}")
print(f"R-squared is: {r2}")
print(f"MSE is: {mse}")

Winning parameters: {'alpha': 1.0}
R-squared is: 0.8399337843650425
MSE is: 930567412.5971757


In [16]:
from sklearn.linear_model import ElasticNet

param_grid = {'l1_ratio': np.linspace(0.1, 1, 10)}

en = ElasticNet(max_iter=2000)
en_cv = GridSearchCV(en, param_grid, cv=5)
en_cv.fit(X_train, y_train)

y_pred = en_cv.predict(X_test)
r2 = en_cv.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Winning parameters: {en_cv.best_params_}")
print(f"R-squared is: {r2}")
print(f"MSE is: {mse}")

Winning parameters: {'l1_ratio': 1.0}
R-squared is: 0.8395508123748705
MSE is: 932793874.0186436
