In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from scipy import stats
from pandas.plotting import scatter_matrix
import subprocess
%matplotlib inline

## Save Kaggle submission file
---

In [23]:
def submission_df(y_pred):
    """Wrap predictions in a one-column ``SalePrice`` frame for submission.

    The index is taken from the frame returned by ``load_x_test()``, so
    ``y_pred`` needs one entry per row of that frame, in the same order.
    """
    test_index = load_x_test().index
    return pd.DataFrame(y_pred, columns=["SalePrice"], index=test_index)

def save_submission_file(y_pred, filename):
    """Write predictions to ``./<filename>`` as a CSV submission file.

    Saving is best-effort: a failure is reported on stdout instead of being
    raised, so the calling cell keeps running.

    Args:
        y_pred: Predictions, passed to ``submission_df`` to build the frame.
        filename: Name of the file to create in the current directory.
    """
    df = submission_df(y_pred)
    path = "./" + filename

    try:
        df.to_csv(path)
    except Exception as error:
        # Still best-effort, but include the cause so a failed save can be
        # diagnosed instead of being silently swallowed.
        print(f"Couldn’t save submission: {error!r}")
    else:
        print("Submission saved.")

## Submit score to Kaggle
---

In [24]:
def submit_score_to_kaggle(y_pred, filename, message):
    """Save the predictions and submit them with the ``kaggle`` command-line tool.

    The submission goes to the
    ``house-prices-advanced-regression-techniques`` competition.

    Args:
        y_pred: Predictions to submit; written to disk via
            ``save_submission_file`` first.
        filename: Name of the submission file, used both for saving and for
            the ``-f`` argument of the CLI.
        message: Submission description passed via ``-m``.
    """
    save_submission_file(y_pred, filename)

    # Arguments are passed as a list (no shell), so ``filename`` and
    # ``message`` need no quoting.
    command = [
        "kaggle",
        "competitions",
        "submit",
        "-c",
        "house-prices-advanced-regression-techniques",
        "-f",
        filename,
        "-m",
        message,
    ]
    completed_process = subprocess.run(command, capture_output=True, text=True)

    print(completed_process.stdout)

    # stderr is captured too; without this a failed submission would print
    # nothing useful and look like a success.
    if completed_process.returncode != 0:
        print(f"kaggle exited with status {completed_process.returncode}")
        if completed_process.stderr:
            print(completed_process.stderr)

In [11]:
def load_train_data(split=True):
    """Read ``./train.csv`` (indexed by ``Id``) and optionally split off the target.

    Args:
        split: When true, return ``(features, target)`` where ``target`` is
            the ``SalePrice`` column and ``features`` is every other column.
            When false, return the whole frame.
    """
    label = "SalePrice"
    frame = pd.read_csv("./train.csv", index_col="Id")
    print("load_train_data: done")

    if not split:
        return frame

    feature_names = [name for name in frame.columns if name != label]
    return frame[feature_names], frame[label]
    
def load_x_test():
    """Read the test-set features from ``./test.csv``, indexed by ``Id``."""
    x_test = pd.read_csv("./test.csv", index_col="Id")
    return x_test

def load_y_true():
    """Read the reference answers from ``./solution.csv``, indexed by ``Id``."""
    return pd.read_csv("./solution.csv", index_col="Id")

def load_test_data(split=True):
    """Load the test features together with the reference answers.

    Args:
        split: When true, return ``(X_test, y_test)`` where ``y_test`` is the
            frame returned by ``load_y_true()``. When false, return both
            concatenated column-wise into one frame.
    """
    features = load_x_test()
    answers = load_y_true()
    print("load_test_data: done")

    if not split:
        return pd.concat([features, answers], axis="columns")

    return features, answers
    
def split_features_target(df, target="SalePrice"):
    """Split a frame into ``(features, target)``.

    Args:
        df: Frame that contains a column named ``target``.
        target: Name of the target column.

    Returns:
        A frame with every column except ``target`` (original order kept),
        and the ``target`` column itself.
    """
    feature_names = [name for name in df.columns if name != target]
    feature_frame = df[feature_names]
    target_column = df[target]
    return feature_frame, target_column

def root_mean_squared_log_error(y_true, y_pred, transform_negative_predictions=False):
    """Root mean squared logarithmic error between ``y_true`` and ``y_pred``.

    Args:
        y_true: Reference values.
        y_pred: Predicted values.
        transform_negative_predictions: When true, predictions below zero are
            replaced by 0 before scoring. Useful for models that can predict
            negative prices, since scikit-learn's ``mean_squared_log_error``
            is documented to reject negative values.

    Returns:
        The square root of ``mean_squared_log_error(y_true, y_pred)``.
    """
    if transform_negative_predictions:
        # Element-wise clip; unlike a Python loop with max(), this also works
        # when the predictions are 2-D (multi-output).
        y_pred_tr = np.maximum(np.asarray(y_pred), 0)
    else:
        y_pred_tr = y_pred

    # same as np.sqrt(np.mean(np.power(np.log(np.array(y_pred_tr) + 1) - np.log(np.array(y_true) + 1), 2)))
    return np.sqrt(mean_squared_log_error(y_true, y_pred_tr))

# Short alias for root_mean_squared_log_error.
rmsle = root_mean_squared_log_error

def kaggle_score(y_pred, transform_negative_predictions=False):
    """Score predictions against the answers returned by ``load_y_true()``.

    Args:
        y_pred: Predictions for the test set.
        transform_negative_predictions: Forwarded to
            ``root_mean_squared_log_error``.

    Returns:
        The root mean squared log error of ``y_pred``.
    """
    return root_mean_squared_log_error(
        load_y_true(),
        y_pred,
        transform_negative_predictions=transform_negative_predictions,
    )

def print_kaggle_score(y_pred, transform_negative_predictions=False):
    """Print the score of ``y_pred`` as computed by ``kaggle_score``.

    Args:
        y_pred: Predictions for the test set.
        transform_negative_predictions: Forwarded to ``kaggle_score``; the
            default keeps the previous behaviour.
    """
    # kaggle_score loads the reference answers itself, so they are not
    # loaded a second time here.
    score = kaggle_score(
        y_pred,
        transform_negative_predictions=transform_negative_predictions,
    )
    print("The score is %.5f" % score)
    
# RMSLE (root mean squared log error) scorer for scikit-learn model selection.
# greater_is_better=False marks the metric as a loss, so the scorer reports the
# negated RMSLE (flip the sign to read it as an error).
# transform_negative_predictions=True is forwarded to
# root_mean_squared_log_error so negative predictions are clipped to 0.
rmsle_scorer = make_scorer(root_mean_squared_log_error, greater_is_better=False, transform_negative_predictions=True)

## Fit multiple models at once

In [9]:
def get_pipe(model):
    """Build a preprocessing + model pipeline.

    Preprocessing is a ``ColumnTransformer`` with two branches:

    * ``numeric`` (columns with a numeric dtype): median imputation followed
      by standard scaling.
    * ``categorical`` (columns with ``object`` dtype): most-frequent
      imputation followed by one-hot encoding; categories not seen during
      fitting are ignored.

    Args:
        model: Estimator used as the final ``"model"`` step.

    Returns:
        An unfitted ``Pipeline`` with steps ``"preprocessing"`` and ``"model"``.
    """
    numeric_pipe = Pipeline([
        ('impute_missing_numeric_values', SimpleImputer(strategy="median")),
        ('standard_scaler', StandardScaler())
    ])

    categorical_pipe = Pipeline([
        ('impute_missing_categorical_values', SimpleImputer(strategy="most_frequent")),
        # Step was previously mislabelled 'standard_scaler' (copy-paste from
        # the numeric branch); name it after what it actually does.
        ('one_hot_encoder', OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessing = ColumnTransformer(
        [
            ('numeric', numeric_pipe, make_column_selector(dtype_include=np.number)),
            ('categorical', categorical_pipe, make_column_selector(dtype_include=object))
        ],
        n_jobs=-1
    )

    pipe = Pipeline([
        ("preprocessing", preprocessing),
        ("model", model)
    ])

    return pipe

In [21]:
def fit_evaluate(model):
    """Fit ``model`` on the training data and score it on the test data.

    The model is wrapped with ``get_pipe``, fitted on the output of
    ``load_train_data()`` and used to predict the features from
    ``load_test_data()``.

    Returns:
        dict with the model's class name (``"model"``), its
        ``kaggle_score`` and the raw predictions (``"y_pred"``).
    """
    train_features, train_target = load_train_data()
    fitted = get_pipe(model).fit(train_features, train_target)

    test_features, _ = load_test_data()
    predictions = fitted.predict(test_features)

    return {
        "model": type(model).__name__,
        "kaggle_score": kaggle_score(predictions),
        "y_pred": predictions,
    }

In [22]:
# Candidate regressors, each fitted and scored with fit_evaluate.
# Keep the order stable: later cells index into `results` by position.
models = [
    RandomForestRegressor(random_state=42),
    LinearRegression(),
    Ridge(),
    RidgeCV(),
    LassoCV(),
    ElasticNet(),
    # Seeded like the random forest so the comparison is reproducible on
    # Restart & Run All (SGD shuffles the training data between epochs).
    SGDRegressor(random_state=42)
]

results = []

for model in models:
    result = fit_evaluate(model)
    results.append(result)

results

load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done
load_train_data: done
load_test_data: done


[{'model': 'RandomForestRegressor',
  'kaggle_score': 0.14686040645660278,
  'y_pred': array([128603.  , 157906.5 , 184263.  , ..., 156011.24, 115149.  ,
         222536.77])},
 {'model': 'LinearRegression',
  'kaggle_score': 0.21188772535392925,
  'y_pred': array([113068.2944287 , 151525.9021414 , 187273.11868348, ...,
         179460.84154476, 107920.69599968, 221929.30751364])},
 {'model': 'Ridge',
  'kaggle_score': 0.19020278483904776,
  'y_pred': array([106292.31790174, 146202.81484171, 177286.01635399, ...,
         161384.03011237, 100905.76470667, 222615.27847833])},
 {'model': 'RidgeCV',
  'kaggle_score': 0.15000408656340605,
  'y_pred': array([103486.21749693, 146937.86224909, 173533.91012636, ...,
         157744.46566913, 100555.1504879 , 226342.20641187])},
 {'model': 'LassoCV',
  'kaggle_score': 0.13857216558665356,
  'y_pred': array([111422.99640262, 153481.3975303 , 177455.7154778 , ...,
         164361.33813388, 107104.62341241, 224445.70090939])},
 {'model': 'ElasticN

In [29]:
# submit_score_to_kaggle(results[4]["y_pred"], "submission_lasso_cv_pipe", "First submission after setting up pipelines, and fitting multiple models at once.")


Submission saved.
Successfully submitted to House Prices: Advanced Regression Techniques


In [13]:
# Cross-validate a random forest on the training data using the RMSLE scorer.
X_train, y_train = load_train_data()
# NOTE(review): X_test is not used in this cell — presumably loaded for later cells; confirm.
X_test, _ = load_test_data()
rf = RandomForestRegressor(random_state=42)
pipe = get_pipe(rf)
# No `cv` argument is given, so cross_val_score uses its default splitting.
scores = cross_val_score(pipe, X_train, y_train, scoring=rmsle_scorer)
# rmsle_scorer was built with greater_is_better=False, so the scores come back
# negated; flip the sign to report them as errors.
print(f"mean score: {np.mean(-scores)}, all cv scores:{-scores}")

load_train_data: done
load_test_data: done
mean score: 0.14594647920349874, all cv scores:[0.13972054 0.15315697 0.14414612 0.1371902  0.15551856]
