In [63]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor

In [48]:
# Read the training data; the path is relative to the notebook's working directory.
df = pd.read_csv("../../data/train.csv")

# The regression target is the natural log of the sale price.
y = np.log(df['SalePrice'])

# Features are every column except the target and the 'Id' column.
X = df.drop(columns=['SalePrice', 'Id']).copy()

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=9527
)

In [56]:
# Share of missing values per training column, highest first (top 20 kept for
# inspection). Only the training split is examined here.
missing_rates = (X_train.isnull().sum() / X_train.shape[0]).sort_values(ascending=False)[:20]

# The five columns with the highest missing rate are excluded from modelling.
cols_drop = list(missing_rates[:5].index)

# Split the remaining columns by dtype, preserving the original column order.
# Numeric dtypes go to the numeric pipeline; everything else is treated as
# categorical. Testing for "numeric" rather than for dtype == 'object' keeps
# string/category-typed columns out of the mean-imputation + scaling branch,
# where they cannot be processed.
num_cols = [
    col for col, dtype in X_train.dtypes.items()
    if col not in cols_drop and pd.api.types.is_numeric_dtype(dtype)
]
cat_cols = [
    col for col in X_train.columns
    if col not in cols_drop and col not in num_cols
]

# Sanity check: every column is dropped, numeric, or categorical -- exactly once.
assert len(cat_cols) + len(num_cols) + len(cols_drop) == X_train.shape[1], \
    "column partition does not cover X_train exactly"

In [57]:
# Define the preprocessing steps and the model. Nothing is fitted in this cell.

# Numeric columns: replace missing values with the column mean, then standardise.
# The imputer's arguments are passed by keyword: they are keyword-only in
# current scikit-learn releases, and this matches the categorical imputer below.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: replace missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' lets the encoder accept
# categories at predict time that were not present when it was fitted.
categoric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan,
                              strategy='most_frequent')),
    ("ohe", OneHotEncoder(handle_unknown='ignore'))
])

# Route each column list to its transformer; remainder='drop' discards any
# column that appears in neither num_cols nor cat_cols.
preprocessor = ColumnTransformer(remainder='drop', transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categoric_transformer, cat_cols)
])

model = LinearRegression()

# Full pipeline: preprocessing followed by the regressor, so the same
# transformations are applied at fit and predict time.
ppl = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', model)])

In [58]:
# Fit the preprocessing steps and the regressor on the training split only.
ppl.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                               

In [62]:
# The model was trained on log prices, so exponentiate both sides to compare
# predictions and targets in the original price units.
predicted_price = np.exp(ppl.predict(X_test))
actual_price = np.exp(y_test)

# Signed error per held-out row: prediction minus actual.
predicted_price - actual_price

873    -9052.844080
714    45935.029626
280   -20878.282201
934    26895.804334
310     4139.769951
           ...     
20     12029.912583
468     2194.002997
124   -20062.531549
487    -7145.551072
905   -11104.890582
Name: SalePrice, Length: 438, dtype: float64