In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.kernel_ridge import KernelRidge
from scipy import stats
from pandas.plotting import scatter_matrix
import subprocess
import lightgbm as lgb
import xgboost as xgb
%matplotlib inline

In [2]:
# Save Kaggle submission file
def submission_df(y_pred):
    """Wrap predictions in a one-column ``SalePrice`` frame for submission.

    The frame is indexed by the ``Id`` index of the test set returned by
    ``load_x_test()``, so ``y_pred`` must have one entry per test row.
    """
    test_index = load_x_test().index
    return pd.DataFrame(y_pred, index=test_index, columns=["SalePrice"])

def save_submission_file(y_pred, filename):
    """Write the predictions to ``./<filename>`` as a submission CSV.

    Saving is best-effort: a failure is reported on stdout (including the
    reason) instead of being raised, so the calling cell keeps running.

    Parameters
    ----------
    y_pred : array-like
        Predicted sale prices, one per test row.
    filename : str
        Name of the CSV file, relative to the current working directory.
    """
    df = submission_df(y_pred)
    path = "./" + filename

    try:
        df.to_csv(path)
    except Exception as error:
        # Keep the original message but also say why the write failed;
        # otherwise a bad path or permission problem is impossible to diagnose.
        print("Couldn’t save submission.", f"({type(error).__name__}: {error})")
    else:
        print("Submission saved.")
        
# Submit score to Kaggle
def submit_score_to_kaggle(y_pred, filename, message):
    """Save the predictions to ``filename`` and submit that file with the Kaggle CLI.

    Parameters
    ----------
    y_pred : array-like
        Predicted sale prices, one per test row.
    filename : str
        Submission CSV name; written by ``save_submission_file`` and passed
        to ``kaggle competitions submit -f``.
    message : str
        Submission description passed to ``-m``.
    """
    save_submission_file(y_pred, filename)

    command = [
        "kaggle",
        "competitions",
        "submit",
        "-c",
        "house-prices-advanced-regression-techniques",
        "-f",
        filename,
        "-m",
        message,
    ]
    completed_process = subprocess.run(command, capture_output=True, text=True)

    print(completed_process.stdout)

    # Output is captured, so a failing CLI call would otherwise go unnoticed:
    # surface the exit code and whatever was written to stderr.
    if completed_process.returncode != 0:
        print(f"kaggle exited with code {completed_process.returncode}")
        print(completed_process.stderr)
    
def load_train_data(split=True):
    """Read ``./train.csv`` (indexed by ``Id``).

    Returns ``(features, target)`` when ``split`` is true, where ``target`` is
    the ``SalePrice`` column and ``features`` is every other column; otherwise
    returns the whole frame.
    """
    label = "SalePrice"
    frame = pd.read_csv("./train.csv", index_col="Id")
    print("load_train_data: done")

    if not split:
        return frame

    feature_columns = [name for name in frame.columns if name != label]
    return frame[feature_columns], frame[label]
    
def load_x_test():
    """Read the test features from ``./test.csv``, indexed by ``Id``."""
    test_features = pd.read_csv("./test.csv", index_col="Id")
    return test_features

def load_y_true():
    """Read the reference targets from ``./solution.csv``, indexed by ``Id``."""
    return pd.read_csv("./solution.csv", index_col="Id")

def load_test_data(split=True):
    """Load the test features together with their reference targets.

    Features come from ``./test.csv`` and targets from ``load_y_true()``.
    Returns ``(X_test, y_test)`` when ``split`` is true, otherwise a single
    frame with the targets concatenated column-wise onto the features.
    """
    features = pd.read_csv("./test.csv", index_col="Id")
    targets = load_y_true()
    print("load_test_data: done")

    if not split:
        return pd.concat([features, targets], axis="columns")
    return features, targets
    
def split_features_target(df, target="SalePrice"):
    """Split ``df`` into (all columns except ``target``, the ``target`` column)."""
    feature_columns = [name for name in df.columns if name != target]
    features_part = df[feature_columns]
    target_part = df[target]
    return features_part, target_part

def root_mean_squared_log_error(y_true, y_pred, transform_negative_predictions=False):
    """Return the square root of scikit-learn's mean squared log error.

    When ``transform_negative_predictions`` is true, every prediction below
    zero is replaced by 0 before scoring.
    """
    # Same as:
    # np.sqrt(np.mean(np.power(np.log(np.array(y_pred_tr) + 1) - np.log(np.array(y_true) + 1), 2)))
    scored_predictions = (
        [max(prediction, 0) for prediction in y_pred]
        if transform_negative_predictions
        else y_pred
    )
    msle = mean_squared_log_error(y_true, scored_predictions)
    return np.sqrt(msle)

# Short alias for interactive use.
rmsle = root_mean_squared_log_error

def kaggle_score(y_pred, transform_negative_predictions=False, y_log_transformed=False):
    """Score predictions against ``load_y_true()`` with RMSLE.

    Set ``y_log_transformed`` when ``y_pred`` is on the log scale; it is then
    mapped back with ``np.exp`` before scoring.
    ``transform_negative_predictions`` is forwarded to
    ``root_mean_squared_log_error``.
    """
    y_true = load_y_true()
    predictions = np.exp(y_pred) if y_log_transformed else y_pred
    return root_mean_squared_log_error(
        y_true,
        predictions,
        transform_negative_predictions=transform_negative_predictions,
    )

def print_kaggle_score(y_pred):
    """Print the RMSLE of ``y_pred`` (on the original price scale) to 5 decimals.

    The reference targets are loaded inside ``kaggle_score``; the extra
    ``load_y_true()`` call that used to sit here only re-read the CSV into an
    unused variable and has been dropped.
    """
    score = kaggle_score(y_pred)
    print("The score is %.5f" % score)
    
# Custom RMSLE (root mean squared log error) scorer for cross-validation.
# greater_is_better=False marks it as a loss, and negative predictions are
# clipped to 0 so the logarithm stays defined.
rmsle_scorer = make_scorer(
    root_mean_squared_log_error,
    greater_is_better=False,
    transform_negative_predictions=True,
)

def get_pipe(model):
    """Build the preprocessing + ``model`` pipeline.

    Numeric columns are median-imputed and standard-scaled; object columns are
    imputed with the most frequent value and one-hot encoded (categories unseen
    at fit time are ignored). The returned pipeline has the steps
    ``"preprocessing"`` and ``"model"``.
    """
    numeric_steps = [
        ('impute_missing_numeric_values', SimpleImputer(strategy="median")),
        ('standard_scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('impute_missing_categorical_values', SimpleImputer(strategy="most_frequent")),
        # NOTE(review): this step holds the one-hot encoder although it is
        # named 'standard_scaler'; the name is kept so parameter paths stay stable.
        ('standard_scaler', OneHotEncoder(handle_unknown="ignore")),
    ]

    select_numeric = make_column_selector(dtype_include=np.number)
    select_categorical = make_column_selector(dtype_include=object)

    column_transformer = ColumnTransformer(
        [
            ('numeric', Pipeline(numeric_steps), select_numeric),
            ('categorical', Pipeline(categorical_steps), select_categorical),
        ],
        n_jobs=-1,
    )

    return Pipeline([
        ("preprocessing", column_transformer),
        ("model", model),
    ])

def fit_evaluate(model, feature_engineered=False, ft_instructions=None):
    """Fit ``model`` inside the standard pipeline and score it on the test set.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor.
    feature_engineered : bool
        When true, train/test data come from ``load_engineered_data``;
        otherwise the raw CSV data is used.
    ft_instructions : container of str, optional
        Feature-engineering steps, forwarded to ``load_engineered_data``.

    Returns
    -------
    dict
        ``model`` (class name), ``kaggle_score`` (RMSLE), ``y_pred`` (raw
        pipeline predictions, log scale if the target was log-transformed)
        and ``estimator`` (the fitted pipeline).
    """
    if feature_engineered:
        X_train, y_train, X_test = load_engineered_data(ft_instructions=ft_instructions)
    else:
        X_train, y_train = load_train_data()
        X_test, _ = load_test_data()

    pipe = get_pipe(model)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Predictions are on the log scale only when the "y_log" step was actually
    # requested. Exponentiating just because feature engineering is on would
    # produce a wrong score for instruction sets without "y_log".
    y_log_transformed = (
        bool(feature_engineered)
        and ft_instructions is not None
        and "y_log" in ft_instructions
    )
    score = kaggle_score(y_pred, y_log_transformed=y_log_transformed)

    result = {
        "model": type(model).__name__,
        "kaggle_score": score,
        "y_pred": y_pred,
        "estimator": pipe,
    }
    return result

def compare_models(models, feature_engineered=False, ft_instructions=None):
    """Run ``fit_evaluate`` on every model and return the result dicts in order.

    ``ft_instructions`` is forwarded only when ``feature_engineered`` is true.
    """
    if feature_engineered:
        extra_arguments = {"feature_engineered": True, "ft_instructions": ft_instructions}
    else:
        extra_arguments = {}

    return [fit_evaluate(candidate, **extra_arguments) for candidate in models]

def compute_cv_scores(model):
    """Cross-validate ``model`` (inside the standard pipeline) on the raw training data.

    Returns the per-fold values of ``rmsle_scorer``. That scorer is built with
    ``greater_is_better=False``, so negate the values to read them as RMSLE.
    Only the training set is needed, so the test files are no longer loaded here.
    """
    X_train, y_train = load_train_data()
    pipe = get_pipe(model)
    return cross_val_score(pipe, X_train, y_train, scoring=rmsle_scorer)

# Candidate regressors compared by compare_models(). Later cells pick results
# by position, so keep the order stable when editing this list.
models = [
    RandomForestRegressor(random_state=42),
#     LinearRegression(),
    Ridge(),
    RidgeCV(),
    KernelRidge(),
    LassoCV(),
    ElasticNet(),
    # Seeded like the random forest: SGD shuffles the training data, so an
    # unseeded instance gives a different score on every run.
    SGDRegressor(random_state=42),
    lgb.LGBMRegressor(),
    xgb.XGBRegressor()
]

In [117]:
# submit_score_to_kaggle(np.exp(results[8]["y_pred"]),
#                       "submission_LGBMRegressor_y_log.csv",
#                       "Second try with lgb, the only difference is y_train was log-transformed.")

In [118]:
# Example of cross val scoring with this setup
# rf = RandomForestRegressor(random_state=42)
# scores = compute_cv_scores(rf)
# print(f"mean score: {np.mean(-scores)}, all cv scores:{-scores}")

## Transform the data

In [3]:
def y_log(y_train):
    """Return the natural logarithm of the target values."""
    log_target = np.log(y_train)
    return log_target

def order_categoricals(X):
    """Return a copy of ``X`` with ordinal categorical labels replaced by integer ranks.

    Columns missing from ``X`` and labels missing from a column's mapping are
    left untouched; ``X`` itself is not modified.
    """
    # Scales shared by several columns.
    surface = {"Grvl" : 1, "Pave" : 2}
    quality = {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}
    quality_or_absent = {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}
    basement_finish = {"No" : 0, "Unf" : 1, "LwQ" : 2, "Rec" : 3, "BLQ" : 4,
                       "ALQ" : 5, "GLQ" : 6}

    rank_by_column = {
        "Alley" : surface,
        "BsmtCond" : quality_or_absent,
        "BsmtExposure" : {"No" : 0, "Mn" : 1, "Av" : 2, "Gd" : 3},
        "BsmtFinType1" : basement_finish,
        "BsmtFinType2" : basement_finish,
        "BsmtQual" : quality_or_absent,
        "ExterCond" : quality,
        "ExterQual" : quality,
        "FireplaceQu" : quality_or_absent,
        "Functional" : {"Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod" : 5,
                        "Min2" : 6, "Min1" : 7, "Typ" : 8},
        "GarageCond" : quality_or_absent,
        "GarageQual" : quality_or_absent,
        "HeatingQC" : quality,
        "KitchenQual" : quality,
        "LandSlope" : {"Sev" : 1, "Mod" : 2, "Gtl" : 3},
        "LotShape" : {"IR3" : 1, "IR2" : 2, "IR1" : 3, "Reg" : 4},
        "PavedDrive" : {"N" : 0, "P" : 1, "Y" : 2},
        "PoolQC" : {"No" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4},
        "Street" : surface,
        "Utilities" : {"ELO" : 1, "NoSeWa" : 2, "NoSewr" : 3, "AllPub" : 4},
    }
    return X.replace(rank_by_column)

def add_simplification_features(X_train, X_test):
    """Add coarser ``Simpl<column>`` versions of the ordinal quality columns, in place.

    Both frames must already hold integer-coded quality columns (see
    ``order_categoricals``). Values not listed in a mapping are kept as they
    are. Nothing is returned; the new columns are written into ``X_train``
    and ``X_test``.
    """
    # 1-10 rating -> 1 (bad), 2 (average), 3 (good)
    ten_to_three = {
        1 : 1, 2 : 1, 3 : 1,
        4 : 2, 5 : 2, 6 : 2,
        7 : 3, 8 : 3, 9 : 3, 10 : 3
    }

    # NOTE(review): despite its name this mapping produces only two buckets
    # (1-3 -> 1, 4-5 -> 2). Values kept as they were; confirm the intent.
    five_to_three = {
        1 : 1, 2 : 1, 3 : 1,
        4 : 2, 5 : 2
    }

    # 1-8 functionality -> 1 (bad), 2 (major), 3 (minor), 4 (typical)
    eight_to_four = {
        1 : 1, 2 : 1,
        3 : 2, 4 : 2,
        5 : 3, 6 : 3, 7 : 3,
        8 : 4
    }

    # NOTE(review): also only two buckets (1-3 -> 1, 4-6 -> 2), not three.
    six_to_three = {
        1 : 1, 2 : 1, 3 : 1,
        4 : 2, 5 : 2, 6 : 2
    }

    # (source column, mapping) in the order the new columns are created.
    # FireplaceQu used to be assigned twice; the redundant repeat is gone.
    simplifications = (
        ("OverallQual", ten_to_three),
        ("OverallCond", ten_to_three),
        ("PoolQC", five_to_three),
        ("GarageCond", five_to_three),
        ("GarageQual", five_to_three),
        ("FireplaceQu", five_to_three),
        ("Functional", eight_to_four),
        ("KitchenQual", five_to_three),
        ("HeatingQC", five_to_three),
        ("BsmtFinType1", six_to_three),
        ("BsmtFinType2", six_to_three),
        ("BsmtCond", five_to_three),
        ("BsmtQual", five_to_three),
        ("ExterCond", five_to_three),
        ("ExterQual", five_to_three),
    )

    for df in (X_train, X_test):
        for column, mapping in simplifications:
            df["Simpl" + column] = df[column].replace(mapping)
        
def add_totalizer_features(X_train, X_test):
    """Add combined grade/score, total-area and presence-flag columns, in place.

    Needs the integer-coded quality columns and the ``Simpl*`` columns to be
    present in both frames already. Returns nothing.
    """
    # (new column, left factor, right factor), in creation order.
    product_features = [
        ("OverallGrade", "OverallQual", "OverallCond"),        # overall quality of the house
        ("GarageGrade", "GarageQual", "GarageCond"),           # overall quality of the garage
        ("ExterGrade", "ExterQual", "ExterCond"),              # overall quality of the exterior
        ("KitchenScore", "KitchenAbvGr", "KitchenQual"),
        ("FireplaceScore", "Fireplaces", "FireplaceQu"),
        ("GarageScore", "GarageArea", "GarageQual"),
        ("PoolScore", "PoolArea", "PoolQC"),
        ("SimplOverallGrade", "SimplOverallQual", "SimplOverallCond"),
        ("SimplExterGrade", "SimplExterQual", "SimplExterCond"),
        ("SimplPoolScore", "PoolArea", "SimplPoolQC"),
        ("SimplGarageScore", "GarageArea", "SimplGarageQual"),
        ("SimplFireplaceScore", "Fireplaces", "SimplFireplaceQu"),
        ("SimplKitchenScore", "KitchenAbvGr", "SimplKitchenQual"),
    ]

    # (flag column, source column): 1 where the source is strictly positive, else 0.
    presence_flags = [
        ("haspool", "PoolArea"),
        ("has2ndfloor", "2ndFlrSF"),
        ("hasgarage", "GarageArea"),
        ("hasbsmt", "TotalBsmtSF"),
        ("hasfireplace", "Fireplaces"),
    ]

    veneer_codes = {"BrkCmn" : 1, "BrkFace" : 1, "CBlock" : 1, "Stone" : 1, "None" : 0}
    off_plan_codes = {"Abnorml" : 0, "Alloca" : 0, "AdjLand" : 0,
                      "Family" : 0, "Normal" : 0, "Partial" : 1}

    def is_positive(value):
        return 1 if value > 0 else 0

    for frame in (X_train, X_test):
        for new_column, left, right in product_features:
            frame[new_column] = frame[left] * frame[right]

        # Total number of bathrooms; half baths count as 0.5.
        frame["TotalBath"] = (
            frame["BsmtFullBath"] + (0.5 * frame["BsmtHalfBath"])
            + frame["FullBath"] + (0.5 * frame["HalfBath"])
        )
        # Living area plus basement.
        frame["AllSF"] = frame["GrLivArea"] + frame["TotalBsmtSF"]
        # First plus second floor.
        frame["AllFlrsSF"] = frame["1stFlrSF"] + frame["2ndFlrSF"]
        # All porch types plus the wood deck.
        frame["AllPorchSF"] = (
            frame["OpenPorchSF"] + frame["EnclosedPorch"] + frame["3SsnPorch"]
            + frame["ScreenPorch"] + frame['WoodDeckSF']
        )
        # Has masonry veneer or not.
        frame["HasMasVnr"] = frame["MasVnrType"].replace(veneer_codes)
        # House completed before sale or not.
        frame["BoughtOffPlan"] = frame["SaleCondition"].replace(off_plan_codes)

        frame['YrBltAndRemod'] = frame['YearBuilt'] + frame['YearRemodAdd']
        frame['TotalSF'] = frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']
        frame['Total_sqr_footage'] = (
            frame['BsmtFinSF1'] + frame['BsmtFinSF2'] + frame['1stFlrSF'] + frame['2ndFlrSF']
        )

        for flag_column, source_column in presence_flags:
            frame[flag_column] = frame[source_column].apply(is_positive)
        
def add_polynomial_features(X_train, X_test):
    """Add ``<col>_2``, ``<col>_3`` and ``<col>_sqrt`` columns for selected features, in place.

    Several of the base columns (e.g. ``AllSF``, ``TotalBath``, ``GarageScore``)
    are themselves created by ``add_totalizer_features`` /
    ``add_simplification_features`` and must already exist. Returns nothing.
    """
    base_columns = (
        "OverallQual", "AllSF", "AllFlrsSF", "GrLivArea", "SimplOverallQual",
        "ExterQual", "GarageCars", "TotalBath", "KitchenQual", "GarageScore",
    )

    for frame in (X_train, X_test):
        for name in base_columns:
            frame[name + '_2'] = frame[name] ** 2
            frame[name + '_3'] = frame[name] ** 3
            frame[name + '_sqrt'] = np.sqrt(frame[name])
            
def add_log_features(X_train, X_test):
    """Add a ``<col>_log`` column holding ``log(1 + x)`` for selected numeric features, in place.

    ``TotalSF`` is produced by ``add_totalizer_features`` and must already
    exist. Returns nothing.
    """
    log_features = (
        'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
        'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
        'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
        'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
        'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'YearRemodAdd', 'TotalSF',
    )

    for frame in (X_train, X_test):
        for name in log_features:
            frame[name + '_log'] = np.log1p(frame[name])

def feature_engineer(data, ft_instructions=None):
    """Apply the requested feature-engineering steps.

    Parameters
    ----------
    data : tuple
        ``(X_train, y_train, X_test)``.
    ft_instructions : container of str, optional
        Steps to apply. A step runs when its name is *contained* in
        ``ft_instructions`` (for a dict: when the key is present, whatever its
        value). ``None`` means "no steps". Recognised names, applied in this
        order: ``y_log``, ``order_categoricals``,
        ``add_simplification_features``, ``add_totalizer_features``,
        ``add_polynomial_features``, ``add_log_features``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test)`` after the transformations. The input
        frames are never modified.
    """
    X_train, y_train, X_test = data

    # The default used to crash on the first membership test.
    if ft_instructions is None:
        ft_instructions = ()

    # The add_* helpers insert columns in place; work on copies so the
    # caller's frames stay untouched even when order_categoricals (which
    # returns new frames) is not part of the instructions.
    X_train = X_train.copy()
    X_test = X_test.copy()

    if "y_log" in ft_instructions:
        # Log-transform y_train
        y_train = y_log(y_train)

    if "order_categoricals" in ft_instructions:
        X_train = order_categoricals(X_train)
        X_test = order_categoricals(X_test)

    if "add_simplification_features" in ft_instructions:
        add_simplification_features(X_train, X_test)

    if "add_totalizer_features" in ft_instructions:
        add_totalizer_features(X_train, X_test)

    if "add_polynomial_features" in ft_instructions:
        add_polynomial_features(X_train, X_test)

    if "add_log_features" in ft_instructions:
        add_log_features(X_train, X_test)

    return X_train, y_train, X_test

def load_engineered_data(ft_instructions=None):
    """Load the raw train/test data and run ``feature_engineer`` on it.

    Returns ``(X_train, y_train, X_test)``; the test targets are discarded.
    """
    train_features, train_target = load_train_data()
    test_features, _ = load_test_data()

    raw_data = (train_features, train_target, test_features)
    X_train, y_train, X_test = feature_engineer(raw_data, ft_instructions=ft_instructions)
    return X_train, y_train, X_test

## Evaluating feature engineering impact

In [None]:
# Enable every feature-engineering step and compare all candidate models on
# the engineered data.
ft_instructions = dict(
    y_log=True,
    order_categoricals=True,
    add_simplification_features=True,
    add_totalizer_features=True,
    add_polynomial_features=True,
    add_log_features=True,
)
results = compare_models(
    models,
    feature_engineered=True,
    ft_instructions=ft_instructions,
)
results

In [148]:
#y_pred = np.exp(results[-5]["y_pred"])
#submit_score_to_kaggle(y_pred, "LassoCV_feature_engineering", "LassoCV with feature engineering")

Submission saved.
Successfully submitted to House Prices: Advanced Regression Techniques


In [None]:
# Inspect the fitted LassoCV pipeline. Look the result up by model name: the
# former positional index (-5) silently pointed at a different model whenever
# the `models` list was edited.
lassocv_result = next(entry for entry in results if entry["model"] == "LassoCV")
lassocv_result["estimator"].get_params()

In [4]:
# Baseline: LassoCV with default settings on the fully engineered data.
ft_instructions = dict(
    y_log=True,
    order_categoricals=True,
    add_simplification_features=True,
    add_totalizer_features=True,
    add_polynomial_features=True,
    add_log_features=True,
)

fit_evaluate(
    LassoCV(),
    feature_engineered=True,
    ft_instructions=ft_instructions,
)

load_train_data: done
load_test_data: done


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


{'model': 'LassoCV',
 'kaggle_score': 0.1225717876008984,
 'y_pred': array([11.65378216, 11.97600571, 12.12935428, ..., 12.05428699,
        11.70156304, 12.34094756]),
 'fitted_estimator': Pipeline(steps=[('preprocessing',
                  ColumnTransformer(n_jobs=-1,
                                    transformers=[('numeric',
                                                   Pipeline(steps=[('impute_missing_numeric_values',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('standard_scaler',
                                                                    StandardScaler())]),
                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7fb86f809c10>),
                                                  ('categorical',
                                                   Pipeline(steps=[('impute_missing_

In [7]:
# Fit a plain Lasso with default hyper-parameters on the engineered data and
# keep the fitted pipeline. Relies on `ft_instructions` defined in an earlier cell.
result = fit_evaluate(Lasso(), feature_engineered=True, ft_instructions=ft_instructions)
estimator = result["estimator"]

load_train_data: done
load_test_data: done


In [17]:
# The last pipeline step is the ("model", <regressor>) pair; take the regressor.
lasso = estimator.steps[-1][1]

In [18]:
# Show the hyper-parameters the default Lasso was fitted with.
lasso.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'normalize': False,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [23]:
ft_instructions = {
    "y_log": True,
    "order_categoricals": True,
    "add_simplification_features": True,
    "add_totalizer_features": True,
    "add_polynomial_features": True,
    "add_log_features": True,
}

# LassoCV over a linear alpha grid with repeated 10-fold CV (3 repeats, seeded).
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# The grid starts at 0.01 instead of 0: alpha=0 removes the L1 penalty
# altogether, which scikit-learn discourages for coordinate descent.
# NOTE: later cells find the best alpha around 1e-3, i.e. below this grid's
# resolution, so this search is expected to score worse than those.
model = LassoCV(alphas=np.arange(0.01, 1, 0.01), cv=cv, n_jobs=-1)
result = fit_evaluate(model, feature_engineered=True, ft_instructions=ft_instructions)
result

load_train_data: done
load_test_data: done


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

{'model': 'LassoCV',
 'kaggle_score': 0.13335346064767442,
 'y_pred': array([11.66152863, 11.9676956 , 12.09989703, ..., 12.06820517,
        11.69909684, 12.32711182]),
 'estimator': Pipeline(steps=[('preprocessing',
                  ColumnTransformer(n_jobs=-1,
                                    transformers=[('numeric',
                                                   Pipeline(steps=[('impute_missing_numeric_values',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('standard_scaler',
                                                                    StandardScaler())]),
                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7fb86fcc96d0>),
                                                  ('categorical',
                                                   Pipeline(steps=[('impute_missing_catego

In [24]:
# Reference run: LassoCV with its default alpha path and default CV.
ft_instructions = dict(
    y_log=True,
    order_categoricals=True,
    add_simplification_features=True,
    add_totalizer_features=True,
    add_polynomial_features=True,
    add_log_features=True,
)

model = LassoCV()
result = fit_evaluate(
    model,
    feature_engineered=True,
    ft_instructions=ft_instructions,
)
result

load_train_data: done
load_test_data: done


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


{'model': 'LassoCV',
 'kaggle_score': 0.1225717876008984,
 'y_pred': array([11.65378216, 11.97600571, 12.12935428, ..., 12.05428699,
        11.70156304, 12.34094756]),
 'estimator': Pipeline(steps=[('preprocessing',
                  ColumnTransformer(n_jobs=-1,
                                    transformers=[('numeric',
                                                   Pipeline(steps=[('impute_missing_numeric_values',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('standard_scaler',
                                                                    StandardScaler())]),
                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7fb86fce90d0>),
                                                  ('categorical',
                                                   Pipeline(steps=[('impute_missing_categor

## Lasso with alpha=0.0011 yields best score

In [63]:
# Plain Lasso with the manually tuned alpha; max_iter is raised well above the
# default of 1000.
ft_instructions = dict(
    y_log=True,
    order_categoricals=True,
    add_simplification_features=True,
    add_totalizer_features=True,
    add_polynomial_features=True,
    add_log_features=True,
)

model = Lasso(alpha=0.0011, max_iter=100000)
result = fit_evaluate(
    model,
    feature_engineered=True,
    ft_instructions=ft_instructions,
)
result

load_train_data: done
load_test_data: done


In [64]:
# Re-score the stored log-scale predictions; kaggle_score applies np.exp itself
# when told the predictions are log-transformed.
kaggle_score(result["y_pred"], y_log_transformed=True)

0.1210916191026897

In [53]:
#submit_score_to_kaggle(np.exp(result["y_pred"]), "Lasso_manually_tuned.csv", "Feature-engineered, manually alpha-tuned Lasso.")

Submission saved.
Successfully submitted to House Prices: Advanced Regression Techniques


In [61]:
# LassoCV with a much denser automatic alpha path (1000 values) and a higher
# iteration cap.
ft_instructions = dict(
    y_log=True,
    order_categoricals=True,
    add_simplification_features=True,
    add_totalizer_features=True,
    add_polynomial_features=True,
    add_log_features=True,
)

model = LassoCV(n_alphas=1000, max_iter=100000)
result = fit_evaluate(
    model,
    feature_engineered=True,
    ft_instructions=ft_instructions,
)
result

load_train_data: done
load_test_data: done


{'model': 'LassoCV',
 'kaggle_score': 0.12242425705898889,
 'y_pred': array([11.65287283, 11.97609803, 12.12912204, ..., 12.05447644,
        11.70147293, 12.34088951]),
 'estimator': Pipeline(steps=[('preprocessing',
                  ColumnTransformer(n_jobs=-1,
                                    transformers=[('numeric',
                                                   Pipeline(steps=[('impute_missing_numeric_values',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('standard_scaler',
                                                                    StandardScaler())]),
                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7fb8700dd3d0>),
                                                  ('categorical',
                                                   Pipeline(steps=[('impute_missing_catego

In [66]:
# Coarse search: LassoCV over a hand-picked alpha grid from 1e-4 to 1, 10-fold CV.
ft_instructions = dict(
    y_log=True,
    order_categoricals=True,
    add_simplification_features=True,
    add_totalizer_features=True,
    add_polynomial_features=True,
    add_log_features=True,
)

coarse_alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1]
lasso = LassoCV(alphas=coarse_alphas, max_iter=50000, cv=10)
result = fit_evaluate(
    lasso,
    feature_engineered=True,
    ft_instructions=ft_instructions,
)
result

load_train_data: done
load_test_data: done


{'model': 'LassoCV',
 'kaggle_score': 0.12186062759345946,
 'y_pred': array([11.65192801, 11.97691013, 12.12862332, ..., 12.05609475,
        11.70139064, 12.34271295]),
 'estimator': Pipeline(steps=[('preprocessing',
                  ColumnTransformer(n_jobs=-1,
                                    transformers=[('numeric',
                                                   Pipeline(steps=[('impute_missing_numeric_values',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('standard_scaler',
                                                                    StandardScaler())]),
                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7fb8700df190>),
                                                  ('categorical',
                                                   Pipeline(steps=[('impute_missing_catego

In [68]:
# Fine search: LassoCV restricted to a narrow alpha grid between 0.0006 and 0.0011.
ft_instructions = dict(
    y_log=True,
    order_categoricals=True,
    add_simplification_features=True,
    add_totalizer_features=True,
    add_polynomial_features=True,
    add_log_features=True,
)

fine_alphas = [0.0006, 0.0008, 0.0010, 0.0011]
lasso = LassoCV(alphas=fine_alphas, max_iter=50000, cv=10)
result = fit_evaluate(
    lasso,
    feature_engineered=True,
    ft_instructions=ft_instructions,
)
result

load_train_data: done
load_test_data: done


{'model': 'LassoCV',
 'kaggle_score': 0.12186062759345946,
 'y_pred': array([11.65192801, 11.97691013, 12.12862332, ..., 12.05609475,
        11.70139064, 12.34271295]),
 'estimator': Pipeline(steps=[('preprocessing',
                  ColumnTransformer(n_jobs=-1,
                                    transformers=[('numeric',
                                                   Pipeline(steps=[('impute_missing_numeric_values',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('standard_scaler',
                                                                    StandardScaler())]),
                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7fb8700df280>),
                                                  ('categorical',
                                                   Pipeline(steps=[('impute_missing_catego

In [69]:
# Alpha selected by the cross-validation in the previous LassoCV fit.
lasso.alpha_

0.0006

In [71]:
# Plain Lasso with the manually chosen alpha=0.0011 on the engineered data.
ft_instructions = dict(
    y_log=True,
    order_categoricals=True,
    add_simplification_features=True,
    add_totalizer_features=True,
    add_polynomial_features=True,
    add_log_features=True,
)

lasso = Lasso(alpha=0.0011, max_iter=50000)
result = fit_evaluate(
    lasso,
    feature_engineered=True,
    ft_instructions=ft_instructions,
)
result

load_train_data: done
load_test_data: done


{'model': 'Lasso',
 'kaggle_score': 0.1210916191026897,
 'y_pred': array([11.65109779, 11.98516858, 12.12314576, ..., 12.06809036,
        11.70891412, 12.35907915]),
 'estimator': Pipeline(steps=[('preprocessing',
                  ColumnTransformer(n_jobs=-1,
                                    transformers=[('numeric',
                                                   Pipeline(steps=[('impute_missing_numeric_values',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('standard_scaler',
                                                                    StandardScaler())]),
                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7fb8700dfcd0>),
                                                  ('categorical',
                                                   Pipeline(steps=[('impute_missing_categoric

## XGB tuning; best kaggle_score: 0.12609330307170352

In [9]:
# Grid search over XGBoost hyper-parameters on the fully engineered data.
ft_instructions = dict(
    y_log=True,
    order_categoricals=True,
    add_simplification_features=True,
    add_totalizer_features=True,
    add_polynomial_features=True,
    add_log_features=True,
)

model = xgb.XGBRegressor()

# Only learning_rate, max_depth and n_estimators vary (6 * 8 * 4 = 192
# candidates); the remaining entries are fixed to a single value.
parameters = dict(
    nthread=[4],  # when using hyperthreading, xgboost may become slower
    learning_rate=[0.01, 0.03, 0.05, 0.07, 0.1, 0.3],  # the so-called `eta` value
    max_depth=np.arange(2, 10, 1),
    min_child_weight=[4],
    subsample=[0.7],
    colsample_bytree=[0.7],
    n_estimators=np.arange(60, 220, 40),
)

# 5-fold CV over the whole grid, using all available cores.
xgb_grid = GridSearchCV(model, parameters, cv=5, n_jobs=-1, verbose=True)

result = fit_evaluate(
    xgb_grid,
    feature_engineered=True,
    ft_instructions=ft_instructions,
)
result

load_train_data: done
load_test_data: done
Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   36.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  5.8min finished


[18:01:11] DEBUG: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/gbm/gbtree.cc:147: Using tree method: 2
[18:01:11] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:01:11] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:01:11] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:01:11] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:01:11] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 0 extra nodes, 0 pruned nodes, max_depth=0
[18:01:11] INFO: /Use

[18:01:11] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=4
[18:01:11] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=4
[18:01:11] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 18 extra nodes, 0 pruned nodes, max_depth=5
[18:01:11] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 20 extra nodes, 0 pruned nodes, max_depth=5
[18:01:11] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 16 extra nodes, 0 pruned nodes, max_depth=5
[18:01:11] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 20 extra nod

[18:01:12] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 26 extra nodes, 0 pruned nodes, max_depth=5
[18:01:12] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 42 extra nodes, 0 pruned nodes, max_depth=5
[18:01:12] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 26 extra nodes, 0 pruned nodes, max_depth=5
[18:01:12] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 44 extra nodes, 0 pruned nodes, max_depth=5
[18:01:12] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 26 extra nodes, 0 pruned nodes, max_depth=5
[18:01:12] INFO: /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/tree/updater_prune.cc:101: tree pruning end, 26 extra nod

{'model': 'GridSearchCV',
 'kaggle_score': 0.12609330307170352,
 'y_pred': array([11.752572, 12.026872, 12.098705, ..., 12.022032, 11.672254,
        12.340769], dtype=float32),
 'estimator': Pipeline(steps=[('preprocessing',
                  ColumnTransformer(n_jobs=-1,
                                    transformers=[('numeric',
                                                   Pipeline(steps=[('impute_missing_numeric_values',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('standard_scaler',
                                                                    StandardScaler())]),
                                                   <sklearn.compose._column_transformer.make_column_selector object at 0x7fd853e7fcd0>),
                                                  ('categorical',
                                                   Pipeline(steps=[('impute_missin

In [None]:
# Switch on every option in the feature-engineering instruction set.
ft_instructions = {
    "y_log": True,
    "order_categoricals": True,
    "add_simplification_features": True,
    "add_totalizer_features": True,
    "add_polynomial_features": True,
    "add_log_features": True,
}

model = xgb.XGBRegressor()

# Narrowed search: 3 learning rates x 3 depths = 9 candidates, each with 500 trees.
# Single-valued entries are held fixed for every candidate.
parameters = {
    # Hyper-threading can make xgboost slower.
    # NOTE(review): the grid search below also runs with n_jobs=-1, so every
    # worker starts 4 xgboost threads — check this does not oversubscribe the CPU.
    'nthread': [4],
    'objective': ['reg:squarederror'],
    'learning_rate': [.03, 0.05, .07],  # a.k.a. `eta`
    'max_depth': [5, 6, 7],
    'min_child_weight': [4],
    # Warnings only. The previous value 3 is xgboost's debug level and prints
    # per-tree DEBUG/INFO lines for every one of the 9 x 3 x 500 boosting rounds,
    # burying the result under log output.
    'verbosity': [1],
    'subsample': [0.7],
    'colsample_bytree': [0.7],
    'n_estimators': [500],
}

# Exhaustive 3-fold search over the grid, using every available core.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score rather than the competition metric — confirm intended.
xgb_grid = GridSearchCV(
    model,
    parameters,
    cv=3,
    n_jobs=-1,
    verbose=True,
)

# Fit and score through the notebook's shared helper, then display the result.
result = fit_evaluate(xgb_grid, feature_engineered=True, ft_instructions=ft_instructions)
result