In [23]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn import set_config
from sklearn.ensemble import GradientBoostingRegressor

set_config(transform_output='pandas')

In [24]:
df = pd.read_csv('../data/raw/train.csv')
df = pd.read_csv('../data/raw/test.csv')

train_df = pd.read_csv('/Users/kubrademirhan/Documents/kaggle_house_prices/data/raw/train.csv')
test_df = pd.read_csv('/Users/kubrademirhan/Documents/kaggle_house_prices/data/raw/test.csv')

In [25]:
X = train_df.drop(columns=['SalePrice'])
y = train_df['SalePrice']

y_log = np.log1p(y)

In [26]:
#Baseline Score
y.value_counts(normalize=True).values.max().round(2)*100

np.float64(1.0)

In [27]:
# Train/Test Split + evaluation
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X,
    y_log,
    test_size=0.2,
    random_state=42
)

In [28]:
from sklearn.pipeline import make_pipeline

num_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

num_pipe

In [29]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False)
)

cat_pipe

In [30]:
preprocess_pipe = ColumnTransformer(transformers=[
    ('num', num_pipe, make_column_selector(dtype_include=np.number)),
    ('cat', cat_pipe, make_column_selector(dtype_exclude=np.number)),
])

preprocess_pipe

In [31]:
preprocess_pipe.fit(X_train)

In [32]:
preprocess_pipe.transform(X_train)

Unnamed: 0,num__Id,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,...,cat__SaleType_ConLw,cat__SaleType_New,cat__SaleType_Oth,cat__SaleType_WD,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
254,-1.119284,-0.866764,-0.012468,-0.212896,-0.820445,0.372217,-0.455469,-1.346063,-0.597889,1.037269,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1066,0.790464,0.074110,-0.502357,-0.265245,-0.088934,1.268609,0.718609,0.439214,-0.597889,-0.971996,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
638,-0.216152,-0.631546,-0.146074,-0.177841,-0.820445,1.268609,-1.988293,-1.683818,-0.597889,-0.971996,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
799,0.162505,-0.161109,-0.457822,-0.324474,-0.820445,1.268609,-1.107734,-1.683818,0.861522,0.267995,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
380,-0.822944,-0.161109,-0.903175,-0.529035,-0.820445,0.372217,-1.531707,-1.683818,-0.597889,-0.496920,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,0.858669,-0.866764,0.343814,-0.127631,-0.088934,-0.524174,1.142582,1.018222,-0.597889,-0.919694,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1130,0.940986,-0.161109,-0.235145,-0.268313,-1.551955,-2.316957,-1.401254,-1.683818,-0.597889,0.383495,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1294,1.326699,-0.866764,-0.457822,-0.234096,-0.820445,1.268609,-0.520696,0.246211,-0.597889,-0.608062,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
860,0.305971,-0.161109,-0.680498,-0.283376,0.642577,2.165000,-1.727387,0.632217,-0.597889,-0.971996,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [33]:
preprocess_pipe.transform(X_train).isna().sum()

num__Id                       0
num__MSSubClass               0
num__LotFrontage              0
num__LotArea                  0
num__OverallQual              0
                             ..
cat__SaleCondition_AdjLand    0
cat__SaleCondition_Alloca     0
cat__SaleCondition_Family     0
cat__SaleCondition_Normal     0
cat__SaleCondition_Partial    0
Length: 286, dtype: int64

In [34]:
# Modeling

full_pipeline = make_pipeline(preprocess_pipe, Ridge())
full_pipeline

In [35]:
gbr = GradientBoostingRegressor(random_state=42)

full_pipe_gbr = make_pipeline(
    preprocess_pipe,
    gbr
)

full_pipe_gbr

In [36]:
full_pipe_gbr.fit(X_train, y_train_log)

In [37]:
val_pred_log = full_pipe_gbr.predict(X_val)
val_pred = np.expm1(val_pred_log)
y_val = np.expm1(y_val_log)

In [38]:
rmse = np.sqrt(mean_squared_error(y_val, val_pred))
rmse

np.float64(29742.617032616126)

In [39]:
full_pipe_gbr.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x12891a2b0>),
                                   ('cat',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse_output=False))]),
                       

In [41]:
param_grid = {
    'gradientboostingregressor__n_estimators': [300, 600],
    'gradientboostingregressor__learning_rate': [0.03, 0.05],
    'gradientboostingregressor__max_depth': [2, 3],
    'gradientboostingregressor__subsample': [0.7, 1.0]
}

search = GridSearchCV(full_pipe_gbr, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error')
search

In [42]:
search.fit(X_train, y_train_log)

In [43]:
search.best_params_, search.best_score_

({'gradientboostingregressor__learning_rate': 0.03,
  'gradientboostingregressor__max_depth': 3,
  'gradientboostingregressor__n_estimators': 600,
  'gradientboostingregressor__subsample': 0.7},
 np.float64(-0.12633341595580158))

In [44]:
best_model = search.best_estimator_

val_pred_log = best_model.predict(X_val)
val_pred = np.expm1(val_pred_log)
y_val = np.expm1(y_val_log)

In [45]:
rmse = np.sqrt(mean_squared_error(y_val, val_pred))
rmse

np.float64(29087.01412693457)

In [46]:
best_model.fit(X, y_log)

In [47]:
test_pred_log = best_model.predict(test_df)
test_pred = np.expm1(test_pred_log)

In [50]:
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': test_pred
})

In [51]:
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,120923.780859
1,1462,147262.247319
2,1463,178937.934204
3,1464,188369.686644
4,1465,183767.814914


In [52]:
submission_path = 'submissions/submission_gbr_logtarget.csv'
submission.to_csv(submission_path, index=False)

submission_path

'submissions/submission_gbr_logtarget.csv'