In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn import set_config

set_config(transform_output='pandas')

In [9]:
df = pd.read_csv('../data/raw/train.csv')
df = pd.read_csv('../data/raw/test.csv')

train_df = pd.read_csv('/Users/kubrademirhan/Documents/kaggle_house_prices/data/raw/train.csv')
test_df = pd.read_csv('/Users/kubrademirhan/Documents/kaggle_house_prices/data/raw/test.csv')

In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [11]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [12]:
X = train_df.drop(columns=['SalePrice'])
y = np.log1p(train_df['SalePrice'])

In [13]:
#Baseline Score
y.value_counts(normalize=True).values.max().round(2)*100

np.float64(1.0)

In [14]:
# Split + evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

In [15]:
from sklearn.pipeline import make_pipeline

num_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

num_pipe

In [16]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False)
)

cat_pipe

In [17]:
preprocess_pipe = ColumnTransformer(transformers=[
    ('num', num_pipe, make_column_selector(dtype_include=np.number)),
    ('cat', cat_pipe, make_column_selector(dtype_exclude=np.number)),
])

preprocess_pipe

In [18]:
preprocess_pipe.fit(X_train)

In [19]:
preprocess_pipe.transform(X_train)

Unnamed: 0,num__Id,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,...,cat__SaleType_ConLw,cat__SaleType_New,cat__SaleType_Oth,cat__SaleType_WD,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
254,-1.119284,-0.866764,-0.012468,-0.212896,-0.820445,0.372217,-0.455469,-1.346063,-0.597889,1.037269,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1066,0.790464,0.074110,-0.502357,-0.265245,-0.088934,1.268609,0.718609,0.439214,-0.597889,-0.971996,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
638,-0.216152,-0.631546,-0.146074,-0.177841,-0.820445,1.268609,-1.988293,-1.683818,-0.597889,-0.971996,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
799,0.162505,-0.161109,-0.457822,-0.324474,-0.820445,1.268609,-1.107734,-1.683818,0.861522,0.267995,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
380,-0.822944,-0.161109,-0.903175,-0.529035,-0.820445,0.372217,-1.531707,-1.683818,-0.597889,-0.496920,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,0.858669,-0.866764,0.343814,-0.127631,-0.088934,-0.524174,1.142582,1.018222,-0.597889,-0.919694,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1130,0.940986,-0.161109,-0.235145,-0.268313,-1.551955,-2.316957,-1.401254,-1.683818,-0.597889,0.383495,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1294,1.326699,-0.866764,-0.457822,-0.234096,-0.820445,1.268609,-0.520696,0.246211,-0.597889,-0.608062,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
860,0.305971,-0.161109,-0.680498,-0.283376,0.642577,2.165000,-1.727387,0.632217,-0.597889,-0.971996,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [20]:
preprocess_pipe.transform(X_train).isna().sum()

num__Id                       0
num__MSSubClass               0
num__LotFrontage              0
num__LotArea                  0
num__OverallQual              0
                             ..
cat__SaleCondition_AdjLand    0
cat__SaleCondition_Alloca     0
cat__SaleCondition_Family     0
cat__SaleCondition_Normal     0
cat__SaleCondition_Partial    0
Length: 286, dtype: int64

In [21]:
# Modeling

full_pipeline = make_pipeline(preprocess_pipe, Ridge())
full_pipeline

In [22]:
full_pipeline.fit(X_train, y_train)
pred = full_pipeline.predict(X_test)

  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


In [23]:
rmse = np.sqrt(mean_squared_error(y_test, pred))
print('RMSE:', rmse)

RMSE: 0.13134133248132435


In [24]:
# Fine Tuning
full_pipeline.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x129d603a0>),
                                   ('cat',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse_output=False))]),
                       

In [25]:
from sklearn.model_selection import GridSearchCV


grid = {'ridge__alpha': [0.1, 1.0, 10.0, 100.0]}

search = GridSearchCV(full_pipeline, param_grid=grid, cv=5, scoring='neg_root_mean_squared_error')
search

In [26]:
search.fit(X_train, y_train)

  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  r

In [27]:
search.best_params_

{'ridge__alpha': 10.0}

In [28]:
search.best_score_

np.float64(-0.14540907619483606)

In [29]:
best_model = search.best_estimator_
pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))
rmse

  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


np.float64(0.1358608117219996)

In [30]:
from sklearn.linear_model import ElasticNet

full_pipe_en = make_pipeline(
    preprocess_pipe,
    ElasticNet(max_iter=20000, random_state=42)
)

full_pipe_en

In [31]:
param_grid_en = {
    'elasticnet__alpha': [0.01, 0.1, 1.0, 10.0, 50.0],
    'elasticnet__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

search_en = GridSearchCV(
    estimator=full_pipe_en,
    param_grid=param_grid_en,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    error_score='raise',
    verbose=2
)

search_en

In [32]:
search_en.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
 

[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.3; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.3; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.1; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.3; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.3; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.3; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.5; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.5; total time=   0.1s
[CV] END ...elasticnet__alph

  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
 

[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.9; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.9; total time=   0.1s
[CV] END ...elasticnet__alpha=0.01, elasticnet__l1_ratio=0.9; total time=   0.1s
[CV] END ....elasticnet__alpha=0.1, elasticnet__l1_ratio=0.1; total time=   0.1s
[CV] END ....elasticnet__alpha=0.1, elasticnet__l1_ratio=0.1; total time=   0.0s
[CV] END ....elasticnet__alpha=0.1, elasticnet__l1_ratio=0.1; total time=   0.1s
[CV] END ....elasticnet__alpha=0.1, elasticnet__l1_ratio=0.1; total time=   0.1s
[CV] END ....elasticnet__alpha=0.1, elasticnet__l1_ratio=0.3; total time=   0.1s
[CV] END ....elasticnet__alpha=0.1, elasticnet__l1_ratio=0.3; total time=   0.1s
[CV] END ....elasticnet__alpha=0.1, elasticnet__l1_ratio=0.3; total time=   0.1s
[CV] END ....elasticnet__alpha=0.1, elasticnet__l1_ratio=0.3; total time=   0.1s
[CV] END ....elasticnet__alpha=0.1, elasticnet__l1_ratio=0.3; total time=   0.1s
[CV] END ....elasticnet__alp

  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
 

[CV] END ....elasticnet__alpha=1.0, elasticnet__l1_ratio=0.1; total time=   0.1s
[CV] END ....elasticnet__alpha=1.0, elasticnet__l1_ratio=0.1; total time=   0.1s
[CV] END ....elasticnet__alpha=1.0, elasticnet__l1_ratio=0.3; total time=   0.1s
[CV] END ....elasticnet__alpha=1.0, elasticnet__l1_ratio=0.5; total time=   0.0s
[CV] END ....elasticnet__alpha=1.0, elasticnet__l1_ratio=0.3; total time=   0.1s
[CV] END ....elasticnet__alpha=1.0, elasticnet__l1_ratio=0.3; total time=   0.1s
[CV] END ....elasticnet__alpha=1.0, elasticnet__l1_ratio=0.5; total time=   0.0s
[CV] END ....elasticnet__alpha=1.0, elasticnet__l1_ratio=0.7; total time=   0.0s
[CV] END ....elasticnet__alpha=1.0, elasticnet__l1_ratio=0.5; total time=   0.1s
[CV] END ....elasticnet__alpha=1.0, elasticnet__l1_ratio=0.9; total time=   0.0s
[CV] END ....elasticnet__alpha=1.0, elasticnet__l1_ratio=0.7; total time=   0.1s
[CV] END ....elasticnet__alpha=1.0, elasticnet__l1_ratio=0.9; total time=   0.0s
[CV] END ....elasticnet__alp

  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
 

[CV] END ...elasticnet__alpha=10.0, elasticnet__l1_ratio=0.7; total time=   0.1s
[CV] END ...elasticnet__alpha=10.0, elasticnet__l1_ratio=0.5; total time=   0.1s
[CV] END ...elasticnet__alpha=10.0, elasticnet__l1_ratio=0.9; total time=   0.1s
[CV] END ...elasticnet__alpha=10.0, elasticnet__l1_ratio=0.5; total time=   0.0s
[CV] END ...elasticnet__alpha=50.0, elasticnet__l1_ratio=0.1; total time=   0.0s
[CV] END ...elasticnet__alpha=10.0, elasticnet__l1_ratio=0.7; total time=   0.1s
[CV] END ...elasticnet__alpha=50.0, elasticnet__l1_ratio=0.1; total time=   0.0s
[CV] END ...elasticnet__alpha=50.0, elasticnet__l1_ratio=0.1; total time=   0.1s
[CV] END ...elasticnet__alpha=50.0, elasticnet__l1_ratio=0.1; total time=   0.0s
[CV] END ...elasticnet__alpha=10.0, elasticnet__l1_ratio=0.9; total time=   0.1s
[CV] END ...elasticnet__alpha=10.0, elasticnet__l1_ratio=0.9; total time=   0.0s
[CV] END ...elasticnet__alpha=50.0, elasticnet__l1_ratio=0.1; total time=   0.0s
[CV] END ...elasticnet__alph

In [33]:
search_en.best_params_

{'elasticnet__alpha': 0.01, 'elasticnet__l1_ratio': 0.1}

In [34]:
search_en.best_score_

np.float64(-0.14419531837160693)

In [35]:
best_rmse_cv = search_en.best_score_
best_rmse_cv

np.float64(-0.14419531837160693)

In [36]:
best_model = search_en.best_estimator_

pred_test = best_model.predict(X_test)

rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))
rmse_test

  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


np.float64(0.14008457329318774)

In [37]:
best_model.fit(X, y)

In [38]:
test_preds = best_model.predict(test_df)
test_preds

  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


array([11.6191061 , 11.86438099, 12.03373813, ..., 11.98674115,
       11.65979607, 12.36087299])

In [40]:
log_pred = best_model.predict(test_df)
final_pred = np.expm1(log_pred)
final_pred

  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


array([111201.27259772, 142112.45478346, 168338.51268883, ...,
       160610.09219964, 115819.4085775 , 233483.97099134])

In [41]:
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': final_pred
})

submission.to_csv('submission_log_target.csv', index=False)