In [115]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.kernel_ridge import KernelRidge
from scipy import stats
from pandas.plotting import scatter_matrix
import subprocess
import lightgbm as lgb
import xgboost as xgb
%matplotlib inline

In [149]:
# Save Kaggle submission file
def submission_df(y_pred):
    """Wrap predictions in a one-column ``SalePrice`` frame ready for submission.

    The frame reuses the index of the frame returned by ``load_x_test()`` so
    every prediction is labelled with the id of its test row.
    """
    test_index = load_x_test().index
    return pd.DataFrame(y_pred, index=test_index, columns=["SalePrice"])

def save_submission_file(y_pred, filename):
    """Write the predictions to ``"./" + filename`` as a CSV submission file.

    Saving is best-effort: a failure is reported on stdout instead of being
    raised, and the underlying error is included so it can be diagnosed.

    Parameters
    ----------
    y_pred : predictions accepted by ``submission_df``.
    filename : str
        File name, resolved relative to the current working directory.
    """
    df = submission_df(y_pred)
    path = "./" + filename

    try:
        df.to_csv(path)
    except Exception as error:
        # Keep the non-raising behaviour, but surface the reason instead of hiding it.
        print(f"Couldn't save submission: {error}")
    else:
        print("Submission saved.")
        
# Submit score to Kaggle
def submit_score_to_kaggle(y_pred, filename, message):
    """Save the predictions to ``filename`` and submit that file with the Kaggle CLI.

    The CLI's standard output is always printed. Anything it wrote to standard
    error is printed too, and a non-zero exit status is reported, so a failed
    submission is no longer silent.

    Parameters
    ----------
    y_pred : predictions accepted by ``save_submission_file``.
    filename : str
        Submission file name (written to and read from the working directory).
    message : str
        Description attached to the submission (``-m`` option).
    """
    save_submission_file(y_pred, filename)

    # Argument list (no shell), so filename/message need no quoting.
    command = [
        "kaggle",
        "competitions",
        "submit",
        "-c",
        "house-prices-advanced-regression-techniques",
        "-f",
        filename,
        "-m",
        message,
    ]
    completed_process = subprocess.run(command, capture_output=True, text=True)

    print(completed_process.stdout)

    if completed_process.stderr:
        print(completed_process.stderr)
    if completed_process.returncode != 0:
        print("kaggle exited with status %d" % completed_process.returncode)
    
def load_train_data(split=True):
    """Read ``./train.csv`` (indexed by ``Id``) and announce completion on stdout.

    Returns ``(features, target)`` when ``split`` is true, where ``target`` is
    the ``SalePrice`` column and ``features`` is every other column; otherwise
    returns the whole frame.
    """
    target = "SalePrice"
    frame = pd.read_csv("./train.csv", index_col="Id")
    print("load_train_data: done")

    if not split:
        return frame

    feature_columns = [name for name in frame.columns if name != target]
    return frame[feature_columns], frame[target]
    
def load_x_test():
    """Read the test features from ``./test.csv``, indexed by the ``Id`` column."""
    test_features = pd.read_csv("./test.csv", index_col="Id")
    return test_features

def load_y_true():
    """Read the reference answers from ``./solution.csv``, indexed by ``Id``."""
    return pd.read_csv("./solution.csv", index_col="Id")

def load_test_data(split=True):
    """Load the test features together with their reference answers.

    Returns ``(X_test, y_test)`` when ``split`` is true; otherwise a single
    frame with the answer column(s) appended column-wise to the features.
    Announces completion on stdout.
    """
    # Reuse the dedicated loader instead of repeating its read_csv call.
    X_test = load_x_test()
    y_test = load_y_true()
    print("load_test_data: done")

    if split:
        return X_test, y_test
    return pd.concat([X_test, y_test], axis="columns")
    
def split_features_target(df, target="SalePrice"):
    """Split ``df`` into (every column except ``target``, the ``target`` column)."""
    feature_columns = [name for name in df.columns if name != target]
    features_frame = df[feature_columns]
    target_column = df[target]
    return features_frame, target_column

def root_mean_squared_log_error(y_true, y_pred, transform_negative_predictions=False):
    """Square root of ``mean_squared_log_error(y_true, y_pred)``.

    When ``transform_negative_predictions`` is true, every prediction below 0
    is replaced by 0 before scoring; otherwise ``y_pred`` is passed through
    untouched.
    """
    scored_predictions = (
        [max(value, 0) for value in y_pred]
        if transform_negative_predictions
        else y_pred
    )

    # Equivalent to sqrt(mean((log(pred + 1) - log(true + 1)) ** 2)).
    msle = mean_squared_log_error(y_true, scored_predictions)
    return np.sqrt(msle)

# Short alias for the metric above.
rmsle = root_mean_squared_log_error

def kaggle_score(y_pred, transform_negative_predictions=False, y_log_transformed=False):
    """Score predictions against the answers returned by ``load_y_true()``.

    With ``y_log_transformed`` the predictions are first passed through
    ``np.exp``. ``transform_negative_predictions`` is forwarded unchanged to
    ``root_mean_squared_log_error``, whose result is returned.
    """
    y_true = load_y_true()
    predictions = np.exp(y_pred) if y_log_transformed else y_pred
    return root_mean_squared_log_error(
        y_true,
        predictions,
        transform_negative_predictions=transform_negative_predictions,
    )

def print_kaggle_score(y_pred):
    """Print the score of ``y_pred`` as computed by ``kaggle_score``, to five decimals."""
    # kaggle_score loads the reference answers itself, so no separate load is needed here.
    score = kaggle_score(y_pred)
    print("The score is %.5f" % score)
    
# Cross-validation scorer built on RMSLE. Lower is better, hence
# greater_is_better=False; negative predictions are replaced by 0 before scoring.
rmsle_scorer = make_scorer(
    root_mean_squared_log_error,
    greater_is_better=False,
    transform_negative_predictions=True,
)

def get_pipe(model):
    """Build the unfitted preprocessing + estimator pipeline shared by all models.

    Numeric columns are median-imputed and standardised; object-dtype columns
    are imputed with their most frequent value and one-hot encoded (categories
    not seen during fitting are ignored). The result is a ``Pipeline`` with the
    steps ``"preprocessing"`` and ``"model"``.
    """
    numeric_steps = [
        ("impute_missing_numeric_values", SimpleImputer(strategy="median")),
        ("standard_scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("impute_missing_categorical_values", SimpleImputer(strategy="most_frequent")),
        # NOTE: this step one-hot encodes despite being called "standard_scaler";
        # the name is kept because it is part of the pipeline's parameter keys.
        ("standard_scaler", OneHotEncoder(handle_unknown="ignore")),
    ]

    column_transformer = ColumnTransformer(
        transformers=[
            (
                "numeric",
                Pipeline(steps=numeric_steps),
                make_column_selector(dtype_include=np.number),
            ),
            (
                "categorical",
                Pipeline(steps=categorical_steps),
                make_column_selector(dtype_include=object),
            ),
        ],
        n_jobs=-1,
    )

    return Pipeline(steps=[("preprocessing", column_transformer), ("model", model)])

def fit_evaluate(model, feature_engineered=False, ft_instructions=None):
    """Fit ``model`` inside the shared pipeline and score it on the test set.

    Parameters
    ----------
    model : estimator
        Wrapped with ``get_pipe`` before fitting.
    feature_engineered : bool
        When true, data comes from ``load_engineered_data``; otherwise the raw
        train/test files are used.
    ft_instructions : container of step names, optional
        Forwarded to ``load_engineered_data``. Only consulted when
        ``feature_engineered`` is true.

    Returns
    -------
    dict
        ``"model"`` (class name of the estimator), ``"kaggle_score"``,
        ``"y_pred"`` (raw pipeline output, on the log scale if the target was
        log-transformed) and ``"fitted_estimator"`` (the fitted pipeline).
    """
    if feature_engineered:
        X_train, y_train, X_test = load_engineered_data(ft_instructions=ft_instructions)
    else:
        X_train, y_train = load_train_data()
        X_test, _ = load_test_data()

    pipe = get_pipe(model)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Predictions are on the log scale only if the target really was
    # log-transformed, i.e. the "y_log" step was requested. Undoing a log that
    # was never applied would exponentiate plain prices and wreck the score.
    y_is_log = bool(
        feature_engineered
        and ft_instructions is not None
        and "y_log" in ft_instructions
    )
    score = kaggle_score(y_pred, y_log_transformed=y_is_log)

    return {
        "model": type(model).__name__,
        "kaggle_score": score,
        "y_pred": y_pred,
        "fitted_estimator": pipe,
    }

def compare_models(models, feature_engineered=False, ft_instructions=None):
    """Run ``fit_evaluate`` on every model and return the results in input order.

    With ``feature_engineered`` set, each call receives
    ``feature_engineered=True`` and the given ``ft_instructions``; otherwise
    ``fit_evaluate`` is called with the model alone.
    """
    extra_arguments = (
        {"feature_engineered": True, "ft_instructions": ft_instructions}
        if feature_engineered
        else {}
    )
    return [fit_evaluate(candidate, **extra_arguments) for candidate in models]

def compute_cv_scores(model):
    """Cross-validate ``model`` (inside the shared pipeline) on the training data.

    Returns the array produced by ``cross_val_score`` with ``rmsle_scorer``.
    That scorer is built with ``greater_is_better=False``, so negate the values
    to read them as RMSLE.
    """
    # Only the training set is needed; the test set is no longer loaded here.
    X_train, y_train = load_train_data()
    pipe = get_pipe(model)
    return cross_val_score(pipe, X_train, y_train, scoring=rmsle_scorer)

# Candidate regressors passed to compare_models. The order matters: later cells
# index into the results list by position.
models = [
    RandomForestRegressor(random_state=42),
    # LinearRegression(),
    Ridge(),
    RidgeCV(),
    KernelRidge(),
    LassoCV(),
    ElasticNet(),
    # Seeded like the random forest so repeated runs give the same scores.
    SGDRegressor(random_state=42),
    lgb.LGBMRegressor(),
    xgb.XGBRegressor(),
]

In [117]:
# submit_score_to_kaggle(np.exp(results[8]["y_pred"]),
#                       "submission_LGBMRegressor_y_log.csv",
#                       "Second try with lgb, the only difference is y_train was log-transformed.")

In [118]:
# Example of cross val scoring with this setup
# rf = RandomForestRegressor(random_state=42)
# scores = compute_cv_scores(rf)
# print(f"mean score: {np.mean(-scores)}, all cv scores:{-scores}")

## Transform the data

In [141]:
def y_log(y_train):
    """Return the element-wise natural logarithm of the target values."""
    log_target = np.log(y_train)
    return log_target

def order_categoricals(X):
    """Return a copy of ``X`` with ordered category labels replaced by integer codes.

    Only the columns listed below are affected, and within them only the
    labels that appear in the corresponding mapping; everything else is left
    as is. ``X`` itself is not modified.
    """
    gravel_or_paved = {"Grvl": 1, "Pave": 2}
    quality = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
    # NOTE(review): the "No" keys only match the literal string "No". Whether
    # missing entries have been relabelled to "No" before this call is not
    # visible here - confirm, otherwise those entries stay missing.
    quality_or_absent = {"No": 0, **quality}
    basement_finish = {
        "No": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6,
    }

    ordinal_codes = {
        "Alley": gravel_or_paved,
        "BsmtCond": quality_or_absent,
        "BsmtExposure": {"No": 0, "Mn": 1, "Av": 2, "Gd": 3},
        "BsmtFinType1": basement_finish,
        "BsmtFinType2": basement_finish,
        "BsmtQual": quality_or_absent,
        "ExterCond": quality,
        "ExterQual": quality,
        "FireplaceQu": quality_or_absent,
        "Functional": {
            "Sal": 1, "Sev": 2, "Maj2": 3, "Maj1": 4,
            "Mod": 5, "Min2": 6, "Min1": 7, "Typ": 8,
        },
        "GarageCond": quality_or_absent,
        "GarageQual": quality_or_absent,
        "HeatingQC": quality,
        "KitchenQual": quality,
        "LandSlope": {"Sev": 1, "Mod": 2, "Gtl": 3},
        "LotShape": {"IR3": 1, "IR2": 2, "IR1": 3, "Reg": 4},
        "PavedDrive": {"N": 0, "P": 1, "Y": 2},
        "PoolQC": {"No": 0, "Fa": 1, "TA": 2, "Gd": 3, "Ex": 4},
        "Street": gravel_or_paved,
        "Utilities": {"ELO": 1, "NoSeWa": 2, "NoSewr": 3, "AllPub": 4},
    }
    return X.replace(ordinal_codes)

def add_simplification_features(X_train, X_test):
    """Add coarser ``Simpl`` + column-name versions of the ordinal columns, in place.

    Both frames are modified directly and nothing is returned. Each new column
    is the source column with its integer codes collapsed into fewer levels;
    values that are not keys of the mapping are carried over unchanged. The
    source columns are expected to hold integer codes already (as produced by
    ``order_categoricals``) for the mappings to have any effect.

    Raises
    ------
    KeyError
        If one of the source columns is missing from either frame.
    """
    ten_to_three = {
        1: 1, 2: 1, 3: 1,         # bad
        4: 2, 5: 2, 6: 2,         # average
        7: 3, 8: 3, 9: 3, 10: 3,  # good
    }
    # Despite the five source levels only two output levels are produced.
    five_to_two = {
        1: 1, 2: 1, 3: 1,  # bad / average
        4: 2, 5: 2,        # good
    }
    eight_to_four = {
        1: 1, 2: 1,        # bad
        3: 2, 4: 2,        # major
        5: 3, 6: 3, 7: 3,  # minor
        8: 4,              # typical
    }
    # Likewise two output levels for the six basement-finish codes.
    six_to_two = {
        1: 1, 2: 1, 3: 1,  # unfinished / rec room
        4: 2, 5: 2, 6: 2,  # living quarters
    }

    # (source column, mapping) in the order the new columns are appended.
    simplifications = (
        ("OverallQual", ten_to_three),
        ("OverallCond", ten_to_three),
        ("PoolQC", five_to_two),
        ("GarageCond", five_to_two),
        ("GarageQual", five_to_two),
        ("FireplaceQu", five_to_two),
        ("Functional", eight_to_four),
        ("KitchenQual", five_to_two),
        ("HeatingQC", five_to_two),
        ("BsmtFinType1", six_to_two),
        ("BsmtFinType2", six_to_two),
        ("BsmtCond", five_to_two),
        ("BsmtQual", five_to_two),
        ("ExterCond", five_to_two),
        ("ExterQual", five_to_two),
    )

    for df in (X_train, X_test):
        for column, mapping in simplifications:
            df["Simpl" + column] = df[column].replace(mapping)
        
def add_totalizer_features(X_train, X_test):
    """Append combined "grade", "score", total-area and presence columns, in place.

    Both frames are modified directly and nothing is returned. The function
    reads several ``Simpl...`` columns, so the simplification features must
    already be present, and it multiplies quality columns, so those need to be
    numeric for the products to be meaningful.
    """
    # (new column, left factor, right factor), in creation order.
    products = (
        ("OverallGrade", "OverallQual", "OverallCond"),       # overall quality of the house
        ("GarageGrade", "GarageQual", "GarageCond"),          # overall quality of the garage
        ("ExterGrade", "ExterQual", "ExterCond"),             # overall quality of the exterior
        ("KitchenScore", "KitchenAbvGr", "KitchenQual"),
        ("FireplaceScore", "Fireplaces", "FireplaceQu"),
        ("GarageScore", "GarageArea", "GarageQual"),
        ("PoolScore", "PoolArea", "PoolQC"),
        ("SimplOverallGrade", "SimplOverallQual", "SimplOverallCond"),
        ("SimplExterGrade", "SimplExterQual", "SimplExterCond"),
        ("SimplPoolScore", "PoolArea", "SimplPoolQC"),
        ("SimplGarageScore", "GarageArea", "SimplGarageQual"),
        ("SimplFireplaceScore", "Fireplaces", "SimplFireplaceQu"),
        ("SimplKitchenScore", "KitchenAbvGr", "SimplKitchenQual"),
    )
    # (new 0/1 column, column tested for a value above zero), in creation order.
    presence_flags = (
        ("haspool", "PoolArea"),
        ("has2ndfloor", "2ndFlrSF"),
        ("hasgarage", "GarageArea"),
        ("hasbsmt", "TotalBsmtSF"),
        ("hasfireplace", "Fireplaces"),
    )
    masonry_codes = {"BrkCmn" : 1, "BrkFace" : 1, "CBlock" : 1, "Stone" : 1, "None" : 0}
    off_plan_codes = {"Abnorml" : 0, "Alloca" : 0, "AdjLand" : 0,
                      "Family" : 0, "Normal" : 0, "Partial" : 1}

    def positive_flag(value):
        return 1 if value > 0 else 0

    for frame in (X_train, X_test):
        for new_column, left, right in products:
            frame[new_column] = frame[left] * frame[right]

        # Total number of bathrooms (half baths count 0.5).
        frame["TotalBath"] = (
            frame["BsmtFullBath"] + (0.5 * frame["BsmtHalfBath"])
            + frame["FullBath"] + (0.5 * frame["HalfBath"])
        )
        # Living area plus basement.
        frame["AllSF"] = frame["GrLivArea"] + frame["TotalBsmtSF"]
        # First plus second floor.
        frame["AllFlrsSF"] = frame["1stFlrSF"] + frame["2ndFlrSF"]
        # All porch areas plus the wood deck.
        frame["AllPorchSF"] = (
            frame["OpenPorchSF"] + frame["EnclosedPorch"] + frame["3SsnPorch"]
            + frame["ScreenPorch"] + frame["WoodDeckSF"]
        )
        # 1 for any listed masonry veneer type, 0 for "None".
        frame["HasMasVnr"] = frame.MasVnrType.replace(masonry_codes)
        # 1 only for the "Partial" sale condition.
        frame["BoughtOffPlan"] = frame.SaleCondition.replace(off_plan_codes)
        frame["YrBltAndRemod"] = frame["YearBuilt"] + frame["YearRemodAdd"]
        frame["TotalSF"] = frame["TotalBsmtSF"] + frame["1stFlrSF"] + frame["2ndFlrSF"]
        frame["Total_sqr_footage"] = (
            frame["BsmtFinSF1"] + frame["BsmtFinSF2"] + frame["1stFlrSF"] + frame["2ndFlrSF"]
        )

        for new_column, source in presence_flags:
            frame[new_column] = frame[source].apply(positive_flag)
        
def add_polynomial_features(X_train, X_test):
    """Append square, cube and square-root versions of ten selected columns, in place.

    For every base column ``c`` the columns ``c_2``, ``c_3`` and ``c_sqrt`` are
    added to both frames. Nothing is returned.
    """
    base_columns = (
        "OverallQual",
        "AllSF",
        "AllFlrsSF",
        "GrLivArea",
        "SimplOverallQual",
        "ExterQual",
        "GarageCars",
        "TotalBath",
        "KitchenQual",
        "GarageScore",
    )

    for frame in (X_train, X_test):
        for column in base_columns:
            values = frame[column]
            frame[column + "_2"] = values ** 2
            frame[column + "_3"] = values ** 3
            frame[column + "_sqrt"] = np.sqrt(values)
            
def add_log_features(X_train, X_test):
    """Append a ``log1p`` version (suffix ``_log``) of each listed column, in place.

    Both frames are modified directly and nothing is returned. ``TotalSF`` is
    among the inputs, so it has to exist in the frames before this is called.
    """
    source_columns = (
        "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
        "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
        "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
        "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
        "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF",
        "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
        "YearRemodAdd", "TotalSF",
    )

    for frame in (X_train, X_test):
        for column in source_columns:
            frame[column + "_log"] = np.log1p(frame[column])

def feature_engineer(data, ft_instructions=None):
    """Apply the requested feature-engineering steps to a train/test data set.

    Parameters
    ----------
    data : tuple
        ``(X_train, y_train, X_test)``.
    ft_instructions : container of step names, optional
        A step runs when its name is contained in this object (for a dict that
        means the key is present, whatever its value). Recognised names, in
        execution order: ``"y_log"``, ``"order_categoricals"``,
        ``"add_simplification_features"``, ``"add_totalizer_features"``,
        ``"add_polynomial_features"``, ``"add_log_features"``. ``None`` means
        no steps.

        Later steps read columns created by earlier ones (the totalizer step
        reads ``Simpl...`` columns, the polynomial and log steps read totalizer
        columns), so requesting a step without its predecessors fails.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test)`` after the requested steps. The input
        frames are never modified.
    """
    X_train, y_train, X_test = data
    # The default used to crash on the first membership test.
    steps = ft_instructions if ft_instructions is not None else ()

    if "y_log" in steps:
        # Log-transform y_train
        y_train = y_log(y_train)

    if "order_categoricals" in steps:
        X_train = order_categoricals(X_train)
        X_test = order_categoricals(X_test)
    else:
        # The add_* helpers write columns in place; copy so the caller's
        # frames stay untouched, as they do on the branch above.
        X_train, X_test = X_train.copy(), X_test.copy()

    if "add_simplification_features" in steps:
        add_simplification_features(X_train, X_test)

    if "add_totalizer_features" in steps:
        add_totalizer_features(X_train, X_test)

    if "add_polynomial_features" in steps:
        add_polynomial_features(X_train, X_test)

    if "add_log_features" in steps:
        add_log_features(X_train, X_test)

    return X_train, y_train, X_test

def load_engineered_data(ft_instructions=None):
    """Load the raw train/test data and run it through ``feature_engineer``.

    Returns the ``(X_train, y_train, X_test)`` triple produced by
    ``feature_engineer`` for the given ``ft_instructions``. The test-set
    answers are loaded but not used.
    """
    raw_X_train, raw_y_train = load_train_data()
    raw_X_test, _ = load_test_data()

    raw_data = (raw_X_train, raw_y_train, raw_X_test)
    X_train, y_train, X_test = feature_engineer(raw_data, ft_instructions=ft_instructions)
    return X_train, y_train, X_test

## Evaluating feature engineering impact

In [120]:
# X_train, y_train = load_train_data()
# X_test, _ = load_test_data()

# # y_train = y_log(y_train)
# X_train = order_categoricals(X_train)
# X_test = order_categoricals(X_test)

# model = RandomForestRegressor()
# pipe = get_pipe(model)
# pipe.fit(X_train, y_train)    
# y_pred = pipe.predict(X_test)

# np.sort(y_pred)

# score = kaggle_score(y_pred, y_log_transformed=True)

In [121]:
# No feature engineering
#results = compare_models(models, feature_engineered=False)
#results

In [122]:
# ft_instructions = {
#     "y_log": True
# }
# results = compare_models(models, feature_engineered=True, ft_instructions=ft_instructions)
# results

In [133]:
# Steps: log-transform the target and replace ordered category labels with integer codes.
ft_instructions = dict.fromkeys(("y_log", "order_categoricals"), True)
results = compare_models(models, feature_engineered=True, ft_instructions=ft_instructions)
results

load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done


[{'model': 'RandomForestRegressor',
  'kaggle_score': 0.14598928124918212,
  'y_pred': array([11.73399268, 11.94003773, 12.11798064, ..., 11.9468015 ,
         11.64755451, 12.36062206])},
 {'model': 'Ridge',
  'kaggle_score': 0.14172924822618732,
  'y_pred': array([11.67581985, 11.92327905, 12.06540916, ..., 11.98277   ,
         11.64675014, 12.30735373])},
 {'model': 'RidgeCV',
  'kaggle_score': 0.13455135630372234,
  'y_pred': array([11.6312517 , 11.91317045, 12.03709099, ..., 11.95916254,
         11.64971265, 12.33092812])},
 {'model': 'KernelRidge',
  'kaggle_score': 0.16738800901178202,
  'y_pred': array([11.70327424, 11.86258431, 12.06913926, ..., 11.96326104,
         11.71512901, 12.35705814])},
 {'model': 'LassoCV',
  'kaggle_score': 0.1328939189149408,
  'y_pred': array([11.65748388, 11.90985259, 12.05391393, ..., 12.00426402,
         11.68730082, 12.35507038])},
 {'model': 'ElasticNet',
  'kaggle_score': 0.41637366760951144,
  'y_pred': array([12.0240509, 12.0240509, 12.

In [132]:
# Steps: log target, integer-coded categories, plus the coarser "Simpl" quality columns.
ft_instructions = dict.fromkeys(
    ("y_log", "order_categoricals", "add_simplification_features"),
    True,
)
results = compare_models(models, feature_engineered=True, ft_instructions=ft_instructions)
results

load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done


[{'model': 'RandomForestRegressor',
  'kaggle_score': 0.14639924848063146,
  'y_pred': array([11.7351833 , 11.9341645 , 12.11718869, ..., 11.95182489,
         11.63407876, 12.3495651 ])},
 {'model': 'Ridge',
  'kaggle_score': 0.1407874473073016,
  'y_pred': array([11.68367067, 11.91446662, 12.07569246, ..., 11.99354933,
         11.66295435, 12.31294374])},
 {'model': 'RidgeCV',
  'kaggle_score': 0.13413926912035126,
  'y_pred': array([11.64137565, 11.90523879, 12.04631661, ..., 11.97339152,
         11.66681574, 12.33685116])},
 {'model': 'KernelRidge',
  'kaggle_score': 0.16719986489353866,
  'y_pred': array([11.71832662, 11.84694121, 12.07954181, ..., 11.96550278,
         11.73549606, 12.36360486])},
 {'model': 'LassoCV',
  'kaggle_score': 0.1322638661291602,
  'y_pred': array([11.65678435, 11.90651468, 12.05997392, ..., 12.01474246,
         11.68911524, 12.3559875 ])},
 {'model': 'ElasticNet',
  'kaggle_score': 0.41637366760951144,
  'y_pred': array([12.0240509, 12.0240509, 12.0

In [138]:
# Steps: log target, integer-coded categories, "Simpl" columns, plus the
# combined grade/score/total-area columns.
ft_instructions = dict.fromkeys(
    (
        "y_log",
        "order_categoricals",
        "add_simplification_features",
        "add_totalizer_features",
    ),
    True,
)
results = compare_models(models, feature_engineered=True, ft_instructions=ft_instructions)
results

load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done


[{'model': 'RandomForestRegressor',
  'kaggle_score': 0.138712548328831,
  'y_pred': array([11.78699949, 11.98412446, 12.07516864, ..., 12.01253465,
         11.6302134 , 12.33603119])},
 {'model': 'Ridge',
  'kaggle_score': 0.1911016730097794,
  'y_pred': array([11.69044588, 11.92309405, 12.09474908, ..., 12.03374006,
         11.66797855, 12.32626468])},
 {'model': 'RidgeCV',
  'kaggle_score': 0.2029865480462114,
  'y_pred': array([11.66584599, 11.92483381, 12.0721491 , ..., 12.02142747,
         11.68192706, 12.34493738])},
 {'model': 'KernelRidge',
  'kaggle_score': 0.186974709967823,
  'y_pred': array([11.71077477, 11.82594843, 12.09405644, ..., 11.99165478,
         11.72807191, 12.37157547])},
 {'model': 'LassoCV',
  'kaggle_score': 0.20487741221691252,
  'y_pred': array([11.67197944, 11.91615369, 12.07518646, ..., 12.03275375,
         11.69225193, 12.35557855])},
 {'model': 'ElasticNet',
  'kaggle_score': 0.41637366760951144,
  'y_pred': array([12.0240509, 12.0240509, 12.02405

In [140]:
# Steps: log target, integer-coded categories, "Simpl" columns, combined
# columns, plus square/cube/square-root versions of selected columns.
ft_instructions = dict.fromkeys(
    (
        "y_log",
        "order_categoricals",
        "add_simplification_features",
        "add_totalizer_features",
        "add_polynomial_features",
    ),
    True,
)
results = compare_models(models, feature_engineered=True, ft_instructions=ft_instructions)
results

load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done


  model = cd_fast.enet_coordinate_descent(


load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done


[{'model': 'RandomForestRegressor',
  'kaggle_score': 0.13865219369039805,
  'y_pred': array([11.78228181, 11.98201328, 12.07799669, ..., 12.00889234,
         11.62020881, 12.32166327])},
 {'model': 'Ridge',
  'kaggle_score': 0.14461401381560612,
  'y_pred': array([11.65693993, 11.96373572, 12.13146378, ..., 12.05211249,
         11.69960528, 12.31993642])},
 {'model': 'RidgeCV',
  'kaggle_score': 0.12797769148430285,
  'y_pred': array([11.64224153, 11.96654369, 12.1111276 , ..., 12.04378108,
         11.69178079, 12.33533097])},
 {'model': 'KernelRidge',
  'kaggle_score': 0.15714337864695488,
  'y_pred': array([11.67327329, 11.86084317, 12.12922929, ..., 12.0059282 ,
         11.76319202, 12.36469014])},
 {'model': 'LassoCV',
  'kaggle_score': 0.12460243637503508,
  'y_pred': array([11.66243153, 11.95940421, 12.10856083, ..., 12.04682007,
         11.69308973, 12.33956486])},
 {'model': 'ElasticNet',
  'kaggle_score': 0.41637366760951144,
  'y_pred': array([12.0240509, 12.0240509, 12

In [150]:
# Steps: every available step - log target, integer-coded categories, "Simpl"
# columns, combined columns, polynomial columns and log1p columns.
ft_instructions = dict.fromkeys(
    (
        "y_log",
        "order_categoricals",
        "add_simplification_features",
        "add_totalizer_features",
        "add_polynomial_features",
        "add_log_features",
    ),
    True,
)
results = compare_models(models, feature_engineered=True, ft_instructions=ft_instructions)
results

load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done


[{'model': 'RandomForestRegressor',
  'kaggle_score': 0.1385494368222794,
  'y_pred': array([11.7837215 , 11.98775351, 12.08737762, ..., 12.01212653,
         11.63092447, 12.32810437]),
  'fitted_estimator': Pipeline(steps=[('preprocessing',
                   ColumnTransformer(n_jobs=-1,
                                     transformers=[('numeric',
                                                    Pipeline(steps=[('impute_missing_numeric_values',
                                                                     SimpleImputer(strategy='median')),
                                                                    ('standard_scaler',
                                                                     StandardScaler())]),
                                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7f86a7698070>),
                                                   ('categorical',
                                                    Pipel

In [148]:
#y_pred = np.exp(results[-5]["y_pred"])
#submit_score_to_kaggle(y_pred, "LassoCV_feature_engineering", "LassoCV with feature engineering")

Submission saved.
Successfully submitted to House Prices: Advanced Regression Techniques


In [153]:
# Inspect the fitted LassoCV pipeline. Looking it up by name (rather than by
# position in the results list) keeps this cell correct if the model list changes.
lasso_result = next(result for result in results if result["model"] == "LassoCV")
lasso_result["fitted_estimator"].get_params()

{'memory': None,
 'steps': [('preprocessing', ColumnTransformer(n_jobs=-1,
                     transformers=[('numeric',
                                    Pipeline(steps=[('impute_missing_numeric_values',
                                                     SimpleImputer(strategy='median')),
                                                    ('standard_scaler',
                                                     StandardScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7f86a899d760>),
                                   ('categorical',
                                    Pipeline(steps=[('impute_missing_categorical_values',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('standard_scaler',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                