In which I throw the data into various models and see what produces the best results.

In [46]:
import pandas as pd
import numpy as np

# Load the pre-built modeling table, keyed by the 'Id' column.
df = pd.read_csv(
    'df_for_modeling.csv',
    index_col='Id',
)

In [47]:
# Predictors: a fixed set of ten columns taken from the modeling table.
X = df[[
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'BedroomAbvGr',
    'GarageCars',
    'TotalSF',
    'TotFullBath',
    'TotHalfBath',
    'has_Pool',
]]

# Target: the sale price column.
y = df['SalePrice']

In [48]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [49]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Baseline: ordinary least squares. The parameter grid is empty, so the search
# has a single candidate and GridSearchCV only cross-validates it (5 folds)
# before the fitted search object is used for prediction below.
reg = LinearRegression()
reg_cv = GridSearchCV(estimator=reg, param_grid={}, cv=5).fit(X_train, y_train)

# Evaluate on the test split.
y_pred = reg_cv.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = reg_cv.score(X_test, y_test)

print(f"R-squared is: {r2}")
print(f"RMSE is: {rmse}, and MAE is: {mae}")

R-squared is: 0.8394723907727396
RMSE is: 30549.1372874985, and MAE is: 21672.98290611072


In [50]:
from sklearn.linear_model import Lasso

# Lasso: search ten evenly spaced alpha values from 0.1 to 1 with 5-fold CV.
param_grid = {'alpha': np.linspace(start=0.1, stop=1, num=10)}

lasso = Lasso()
lasso_cv = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=5).fit(X_train, y_train)

# Evaluate the selected model on the test split.
y_pred = lasso_cv.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = lasso_cv.score(X_test, y_test)

print(f"Winning parameters: {lasso_cv.best_params_}")
print(f"R-squared is: {r2}")
print(f"RMSE is: {rmse}, and MAE is: {mae}")

Winning parameters: {'alpha': 0.1}
R-squared is: 0.8394802457521632
RMSE is: 30548.389859123068, and MAE is: 21672.718810760965


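Interesting ... the lasso's scores are essentially identical to plain linear regression (R-squared of about 0.8395 for both), and the winning alpha (0.1) is the smallest value in the grid, so the L1 penalty isn't buying much here. A plain linear regression looks like pretty much the way to go, at least compared to a lasso.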

In [51]:
from sklearn.linear_model import Ridge

# Ridge: search twenty log-spaced alpha values from 1e-4 to 1 with 5-fold CV.
param_grid = {'alpha': np.logspace(start=-4, stop=0, num=20)}

ridge = Ridge()
ridge_cv = GridSearchCV(estimator=ridge, param_grid=param_grid, cv=5).fit(X_train, y_train)

# Evaluate the selected model on the test split.
y_pred = ridge_cv.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = ridge_cv.score(X_test, y_test)

print(f"Winning parameters: {ridge_cv.best_params_}")
print(f"R-squared is: {r2}")
print(f"RMSE is: {rmse}, and MAE is: {mae}")

Winning parameters: {'alpha': 1.0}
R-squared is: 0.8399337843650425
RMSE is: 30505.203041402227, and MAE is: 21656.20575192284


In [52]:
from sklearn.linear_model import ElasticNet

# Elastic net: only the L1/L2 mixing ratio is searched (ten values, 0.1 to 1);
# alpha is left at the estimator's default. max_iter is set to 2000.
param_grid = {'l1_ratio': np.linspace(start=0.1, stop=1, num=10)}

en = ElasticNet(max_iter=2000)
en_cv = GridSearchCV(estimator=en, param_grid=param_grid, cv=5).fit(X_train, y_train)

# Evaluate the selected model on the test split.
y_pred = en_cv.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = en_cv.score(X_test, y_test)

print(f"Winning parameters: {en_cv.best_params_}")
print(f"R-squared is: {r2}")
print(f"RMSE is: {rmse}, and MAE is: {mae}")

Winning parameters: {'l1_ratio': 1.0}
R-squared is: 0.8395508123748705
RMSE is: 30541.67438138655, and MAE is: 21670.34193294356


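Of all of these, Ridge (with alpha = 1.0) performed best as evaluated by RMSE on the test split, although the margin is small: about 30,505 versus roughly 30,542 to 30,549 for the other three models.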

I need to prepare my test data the same way I have prepared my training data.

In [53]:
# Load the holdout set and rebuild the engineered columns used as model features.
X_holdout = pd.read_csv('test.csv', index_col='Id')

# Square-footage roll-ups: above-ground living area, then living + basement + garage.
# A missing value in any component makes the corresponding sum NaN.
X_holdout['TotalLivAreaSF'] = X_holdout['1stFlrSF'] + X_holdout['2ndFlrSF']
X_holdout['TotalSF'] = X_holdout['TotalLivAreaSF'] + X_holdout['TotalBsmtSF'] + X_holdout['GarageArea']

# Bathroom counts combine basement and above-ground bathrooms.
X_holdout['TotFullBath'] = X_holdout['BsmtFullBath'] + X_holdout['FullBath']
X_holdout['TotHalfBath'] = X_holdout['BsmtHalfBath'] + X_holdout['HalfBath']

# Binary flag: 1 when the house has a pool (non-zero pool area), else 0.
X_holdout['has_Pool'] = np.where(X_holdout['PoolArea'] > 0, 1, 0)

# Take the feature columns straight from the training matrix `X` instead of
# re-typing the list, so the holdout frame always has the same columns, in the
# same order, as the data the model is fitted on.
X_holdout_for_pred = X_holdout[list(X.columns)]

In [54]:
# Print per-column non-null counts and dtypes to check the holdout features for missing values.
X_holdout_for_pred.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 1461 to 2919
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotArea       1459 non-null   int64  
 1   OverallQual   1459 non-null   int64  
 2   OverallCond   1459 non-null   int64  
 3   YearBuilt     1459 non-null   int64  
 4   BedroomAbvGr  1459 non-null   int64  
 5   GarageCars    1458 non-null   float64
 6   TotalSF       1457 non-null   float64
 7   TotFullBath   1457 non-null   float64
 8   TotHalfBath   1457 non-null   float64
 9   has_Pool      1459 non-null   int64  
dtypes: float64(4), int64(6)
memory usage: 125.4 KB


First - it turns out we've been foiled by some `NaN`s in the holdout dataset (test.csv). Rats! The missing values are all in float columns (`GarageCars`, `TotalSF`, `TotFullBath` and `TotHalfBath`). I'm going to have to impute them, and I'm going to use a simple mean imputer.

In [57]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Final model: mean-impute missing feature values, then ridge regression with alpha = 1.0.
imputer = SimpleImputer(strategy='mean')
ridge = Ridge(alpha=1.0)
steps = [
    ('imputation', imputer),
    ('model', ridge),
]
pipeline = Pipeline(steps=steps)

# Fit on the full labelled dataset (X, y), then predict prices for the holdout rows.
preds = pipeline.fit(X, y).predict(X_holdout_for_pred)

In [62]:
# Pair each holdout Id with its predicted sale price.
submission_df = pd.DataFrame(
    {
        'Id': X_holdout_for_pred.index,
        'SalePrice': preds,
    }
)

# Write the two-column submission file without the positional row index.
submission_df.to_csv(
    'price_predictions.csv',
    index=False,
)