# Model - Polynomial with log of target

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler, Normalizer, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso

from sklearn import set_config
set_config(transform_output='pandas')

from category_encoders import OrdinalEncoder

from src.helpers import *

In [2]:
df = pd.read_csv('../data/train.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# ---Remove Outliers---

In [4]:
# OutlierRemover is a custom transformer imported from src/helpers.py
# It can remove outliers from multiple columns by specifying the iqr_multipler

# Features where there are significant outliers per EDA
outlier_cols = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','TotalBsmtSF','1stFlrSF','GrLivArea',
                'OpenPorchSF']

# Set the iqr multiplier to 5, to remove only the extreme outliers, in order to keep as much data as possible
outlier_remover = OutlierRemover(columns=outlier_cols,iqr_multiplier=5)

# Fit the dataset to the outlier remover and save to new variable
outlier_remover.fit(df)
df_proc = outlier_remover.transform(df)

# Check how many observations remain
df_proc.count()

Id               1428
MSSubClass       1428
MSZoning         1428
LotFrontage      1178
LotArea          1428
                 ... 
MoSold           1428
YrSold           1428
SaleType         1428
SaleCondition    1428
SalePrice        1428
Length: 81, dtype: int64

# ---Pipeline---

### 1. Filter for features to remove

In [5]:
# Defining the list of columns that will go into the pipeline
# Remove columns that should not be included in the pipeline

# Remove SalePrice b/c target feature
# Remove Id b/c no predictive meaning
# Remove Street b/c low variance
# Remove Utilities b/c low variance

all_columns = df.columns.tolist()
columns_remove_from_filter = ['SalePrice','Id','Street','Utilities']

columns_to_keep = [item for item in all_columns if item not in columns_remove_from_filter]

In [6]:
# Preprocessor to filter unwanted columns, or unexpected columns.

preprocessor_filter = ColumnTransformer([
  ('passthrough','passthrough',columns_to_keep)
],remainder='drop', verbose_feature_names_out=False)

### 2. Imputing null variables

In [7]:
# Preprocessor for imputing all null variables

cols_impute_na = ['Alley', 'GarageType', 'Fence','BsmtQual', 'BsmtCond', 'BsmtExposure',
                  'BsmtFinType1','BsmtFinType2', 'FireplaceQu', 'GarageFinish', 'GarageQual','GarageCond',
                  'PoolQC', 'MiscFeature']
cols_impute_none = ['MasVnrType']
cols_impute_most_freq = ['Electrical','MSZoning','Exterior1st','Exterior2nd','KitchenQual','Functional']
cols_impute_zero = ['LotFrontage', 'MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath',
                    'GarageCars','GarageArea','GarageYrBlt']

preprocessor_imputer = ColumnTransformer([
  ('impute_na', SimpleImputer(strategy='constant', fill_value='NA'), cols_impute_na),
  ('impute_none', SimpleImputer(strategy='constant', fill_value='None'), cols_impute_none),
  ('impute_zero', SimpleImputer(strategy='constant', fill_value=0), cols_impute_zero),
  ('impute_most_freq', SimpleImputer(strategy='most_frequent'), cols_impute_most_freq)
], remainder='passthrough', verbose_feature_names_out=False)

### 3. Mapping Ordinal Features

In [8]:
# Defining maps for Ordinal categories

dict_na_ex_6 = {'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5}
dict_na_gd_5 = {'NA':0,'No':1,'Mn':2,'Av':3,'Gd':4}
dict_bsmt = {'NA':0,'Unf':1,'LwQ':2,'Rec':3,'BLQ':4,'ALQ':5,'GLQ':6}
dict_garage = {'NA':0,'Unf':1,'RFn':2,'Fin':3}
dict_fence = {'NA':0, 'MnWw':1, 'GdWo':2, 'MnPrv':3, 'GdPrv':4}


ordinal_cat_map = [
  {'col':'BsmtQual','mapping':dict_na_ex_6},
  {'col':'BsmtCond','mapping':dict_na_ex_6},
  {'col':'BsmtExposure','mapping':dict_na_gd_5},
  {'col':'BsmtFinType1','mapping':dict_bsmt},
  {'col':'BsmtFinType2','mapping':dict_bsmt},
  {'col':'FireplaceQu','mapping':dict_na_ex_6},
  {'col':'GarageFinish','mapping':dict_garage},
  {'col':'GarageQual','mapping':dict_na_ex_6},
  {'col':'GarageCond','mapping':dict_na_ex_6},
  {'col':'PoolQC','mapping':dict_na_ex_6},
  {'col': 'ExterQual', 'mapping': dict_na_ex_6},
  {'col': 'ExterCond', 'mapping': dict_na_ex_6},
  {'col': 'HeatingQC', 'mapping': dict_na_ex_6},
  {'col': 'KitchenQual', 'mapping': dict_na_ex_6},
  {'col': 'Fence', 'mapping': dict_fence}
  ]

### 4. Encoding and Scaling

In [9]:
standard_cols = ['LotFrontage','GarageArea']

robust_cols = ['LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF',
               'GrLivArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','ScreenPorch','PoolArea','MiscVal', 'LowQualFinSF',
               '3SsnPorch']

minmax_cols = ['YearBuilt','YearRemodAdd']


preprocessor_scalers = ColumnTransformer([
  ('standard_scaler', StandardScaler(), standard_cols),
  ('robust_scaler', RobustScaler(), robust_cols),
  ('minmax_scaler', MinMaxScaler(), minmax_cols)
], remainder='passthrough', verbose_feature_names_out=False)

scaler_pipe = Pipeline([
  ('preprocessor_scalers', preprocessor_scalers),
  ('poly', PolynomialFeatures())
])


In [10]:
# Preprocessor for encoding and scaling

ohe_cols = ['MSSubClass','MSZoning','Alley','LotShape','LandContour','LotConfig','LandSlope',
            'Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st',
            'Exterior2nd','MasVnrType','Foundation','Heating', 'CentralAir','Electrical','Functional',
            'GarageType','PavedDrive','MiscFeature','MoSold','YrSold','SaleType','SaleCondition']

oe_cols = ['OverallQual','OverallCond','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure',
           'BsmtFinType1','BsmtFinType2','HeatingQC','FireplaceQu', 'GarageFinish','GarageQual', 'GarageCond',
           'PoolQC','KitchenQual','Fence']


preprocessor_encode_scale = ColumnTransformer([
  ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ohe_cols),
  ('oe', OrdinalEncoder(mapping=ordinal_cat_map), oe_cols)],
  remainder=scaler_pipe, verbose_feature_names_out=False)

# --- Train/Val Split, Fit & Transform ---

In [11]:
# Train/Val split, target feature is SalePrice

X = df.drop(columns='SalePrice')
y = df['SalePrice'].copy()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# take the log of the target to normalize the distribution

log_y_train = np.log(y_train)
log_y_val = np.log(y_val)

In [13]:
# Main pipeline, include all preprocessors

main_pipe = Pipeline([
  ('preprocessor_filter', preprocessor_filter),
  ('preprocessor_imputer', preprocessor_imputer),
  ('preprocessor_encode_scale', preprocessor_encode_scale)
])

In [14]:
# Fit and transform X_train and X_val

# main_pipe.fit(X_train)

# X_train_proc = main_pipe.transform(X_train)
# X_val_proc = main_pipe.transform(X_val)

In [15]:
# Checking the number of columns after pipeline

# len(X_train_proc.columns)

In [16]:
# creating a pipeline for polynomial transformer using GridSearch

poly_pipe = Pipeline([
  ('main_pipe', main_pipe),
  ('model',Lasso())
])

In [19]:
poly_pipe.get_params()

{'memory': None,
 'steps': [('main_pipe',
   Pipeline(steps=[('preprocessor_filter',
                    ColumnTransformer(transformers=[('passthrough', 'passthrough',
                                                     ['MSSubClass', 'MSZoning',
                                                      'LotFrontage', 'LotArea',
                                                      'Alley', 'LotShape',
                                                      'LandContour', 'LotConfig',
                                                      'LandSlope', 'Neighborhood',
                                                      'Condition1', 'Condition2',
                                                      'BldgType', 'HouseStyle',
                                                      'OverallQual', 'OverallCond',
                                                      'YearBuilt', 'YearRemodAdd',
                                                      'RoofStyle', 'RoofMatl',
                        

# --- Model Evaluation ---

In [20]:

from sklearn.model_selection import GridSearchCV

params = {'main_pipe__preprocessor_encode_scale__remainder__poly__degree': range(1,4), 'model__alpha':[1000,100,10,1,0.1,.001,.0001]}
poly_gs = GridSearchCV(poly_pipe, params, verbose=2)

poly_gs.fit(X_train, log_y_train)


Fitting 5 folds for each of 21 candidates, totalling 105 fits
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=1, model__alpha=1000; total time=   0.1s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=1, model__alpha=1000; total time=   0.0s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=1, model__alpha=1000; total time=   0.0s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=1, model__alpha=1000; total time=   0.0s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=1, model__alpha=1000; total time=   0.0s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=1, model__alpha=100; total time=   0.0s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=1, model__alpha=100; total time=   0.0s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=1, model__alpha=100; total time=   0.0s
[CV] END main_pipe__preprocessor_encode_scale

  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=1, model__alpha=0.0001; total time=   0.2s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=1, model__alpha=0.0001; total time=   0.2s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=1, model__alpha=0.0001; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=1, model__alpha=0.0001; total time=   0.3s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=1000; total time=   0.1s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=1000; total time=   0.1s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=1000; total time=   0.1s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=1000; total time=   0.1s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=1000; total time=   0.1s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=100; total time=   0.1s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=100; total time=   0.1s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=100; total time= 

  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=10; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=1; total time=   0.3s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=1; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=1; total time=   0.3s
[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=1; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=1; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=0.1; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=0.1; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=0.1; total time=   0.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=0.1; total time=   0.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=0.001; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=0.001; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=0.001; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=0.001; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=0.001; total time=   0.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=0.0001; total time=   0.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=0.0001; total time=   0.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=0.0001; total time=   0.8s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=0.0001; total time=   0.7s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=2, model__alpha=0.0001; total time=   0.7s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=1000; total time=   2.8s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=1000; total time=   3.2s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=1000; total time=   2.9s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=1000; total time=   3.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=1000; total time=   2.9s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=100; total time=   3.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=100; total time=   2.9s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=100; total time=   3.2s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=100; total time=   3.7s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=100; total time=   3.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=10; total time=   3.1s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=10; total time=   3.1s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=10; total time=   3.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=10; total time=   3.1s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=10; total time=   3.1s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=1; total time=   3.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=1; total time=   3.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=1; total time=   3.3s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=1; total time=   3.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=1; total time=   3.2s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=0.1; total time=   3.5s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=0.1; total time=   3.4s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=0.1; total time=   3.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=0.1; total time=   3.7s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=0.1; total time=   3.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=0.001; total time=   4.7s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=0.001; total time=   4.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=0.001; total time=   4.7s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=0.001; total time=   5.0s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=0.001; total time=   4.8s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=0.0001; total time=   5.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=0.0001; total time=   5.6s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=0.0001; total time=   5.1s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=0.0001; total time=   6.1s


  model = cd_fast.enet_coordinate_descent(


[CV] END main_pipe__preprocessor_encode_scale__remainder__poly__degree=3, model__alpha=0.0001; total time=   6.1s


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [21]:
linear_results_train = evaluate_regression(poly_gs, X_train, log_y_train, 'poly_gs')
linear_results_val = evaluate_regression(poly_gs, X_val, log_y_val, 'poly_gs')

In [22]:
linear_results_train

Unnamed: 0,MAE,MSE,RMSE,MAPE,R2,adj_r2
poly_gs,0.080813,0.014244,0.119349,0.006753,0.906561,0.899685


In [23]:
linear_results_val

Unnamed: 0,MAE,MSE,RMSE,MAPE,R2,adj_r2
poly_gs,0.093486,0.01883,0.137222,0.007876,0.899097,0.86084


In [27]:
poly_gs.best_params_

{'main_pipe__preprocessor_encode_scale__remainder__poly__degree': 1,
 'model__alpha': 0.001}

In [26]:
log_val_preds = poly_gs.predict(X_val)
val_preds = np.exp(log_val_preds)
pred_log = np.log(val_preds)
y_log = np.log(y_val)
root_mean_squared_error(y_log, pred_log)

0.1372218029682547

# --- Predicting test ---

In [None]:
# import the test data

test_data = pd.read_csv('../data/test.csv')

In [None]:
# save the id column to be added back later to the predictions

test_id = test_data.loc[:,'Id'].copy()
test_id = pd.DataFrame(test_id)

In [None]:
# transform test data using the pipeline

test_proc = main_pipe.transform(test_data)

In [None]:
# predict test (keeping in mind it is still log)

log_test_pred = lasso.predict(test_proc)
log_test_pred = pd.DataFrame(log_test_pred)

In [None]:
# convert the test predictions back using exponential

test_pred = np.exp(log_test_pred)

In [None]:
# adding the id's back to the test predictions

combined_test = test_id.join(test_pred)

In [None]:
# change the column names to match the submission format

combined_test.columns = ['Id','SalePrice']

In [None]:
# check the final submission

combined_test

Unnamed: 0,Id,SalePrice
0,1461,120401.108792
1,1462,158593.943409
2,1463,175283.337671
3,1464,195848.450337
4,1465,203615.033128
...,...,...
1454,2915,80454.418070
1455,2916,80412.021112
1456,2917,162911.652723
1457,2918,117614.923731


In [None]:
# save results to csv

# combined_test.to_csv('../data/test_preds5_lasso_log.csv', index=False)

In [None]:
# save results to csv - submission

# combined_test.to_csv('../data/submission5.csv', index=False)

In [None]:
# calculate kaggle ranking, what percentage I am in

1557/3639

0.42786479802143446