In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from scipy import stats
from pandas.plotting import scatter_matrix
import subprocess
%matplotlib inline

In [18]:
def load_train_data(split=True):
    """Read ``./train.csv`` (indexed by its ``Id`` column).

    Parameters
    ----------
    split : bool, default True
        When True, return ``(features, target)`` where ``target`` is the
        ``SalePrice`` column and ``features`` is every other column.
        When False, return the whole frame untouched.

    Prints ``load_train_data: done`` once the file has been read.
    """
    label = "SalePrice"
    frame = pd.read_csv("./train.csv", index_col="Id")
    print("load_train_data: done")

    if not split:
        return frame

    # Everything that is not the label column is treated as a feature.
    predictors = [name for name in frame.columns if name != label]
    return frame[predictors], frame[label]
    
def load_x_test():
    """Read ``./test.csv`` and return it as a DataFrame indexed by ``Id``."""
    test_features = pd.read_csv("./test.csv", index_col="Id")
    return test_features

def load_y_true():
    """Read ``./solution.csv`` and return it as a DataFrame indexed by ``Id``."""
    return pd.read_csv("./solution.csv", index_col="Id")

def load_test_data(split=True):
    """Load the test features together with their reference targets.

    Features come from ``./test.csv`` (indexed by ``Id``); targets come from
    ``load_y_true()``.

    Parameters
    ----------
    split : bool, default True
        When True, return the pair ``(features, targets)``.
        When False, return one frame with the targets concatenated
        column-wise onto the features.

    Prints ``load_test_data: done`` once both inputs have been read.
    """
    features = pd.read_csv("./test.csv", index_col="Id")
    targets = load_y_true()
    print("load_test_data: done")

    if not split:
        return pd.concat([features, targets], axis="columns")
    return features, targets
    
def split_features_target(df, target="SalePrice"):
    """Split ``df`` into ``(features, target_column)``.

    ``features`` keeps every column whose label differs from ``target``, in
    the original order; the second element is ``df[target]``.
    """
    feature_names = [name for name in df.columns if name != target]
    return df[feature_names], df[target]

def root_mean_squared_log_error(y_true, y_pred, transform_negative_predictions=False):
    """Root mean squared logarithmic error between ``y_true`` and ``y_pred``.

    Computed as the square root of ``mean_squared_log_error(y_true, y_pred)``.

    Parameters
    ----------
    y_true : array-like
        Reference values.
    y_pred : array-like
        Predicted values.
    transform_negative_predictions : bool, default False
        When True, every prediction below zero is replaced by zero before
        the error is computed. When False the predictions are passed through
        unchanged.

    Returns
    -------
    float
        The RMSLE.
    """
    if transform_negative_predictions:
        # Element-wise clip at zero. Unlike a Python-level ``max()`` loop this
        # works for any array-like shape (1-D, column vectors, pandas objects).
        y_pred = np.maximum(y_pred, 0)

    # same as np.sqrt(np.mean(np.power(np.log(np.array(y_pred) + 1) - np.log(np.array(y_true) + 1), 2)))
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

rmsle = root_mean_squared_log_error

def kaggle_score(y_pred, transform_negative_predictions=False):
    """Score ``y_pred`` against the reference targets from ``load_y_true()``.

    Returns the value of ``root_mean_squared_log_error`` for those targets and
    ``y_pred``; ``transform_negative_predictions`` is forwarded unchanged.
    """
    return root_mean_squared_log_error(
        load_y_true(),
        y_pred,
        transform_negative_predictions=transform_negative_predictions,
    )

def print_kaggle_score(y_pred, transform_negative_predictions=False):
    """Print the Kaggle score of ``y_pred`` with five decimal places.

    The score is obtained from ``kaggle_score``, which loads the reference
    targets itself, so they are not loaded again here.

    Parameters
    ----------
    y_pred : array-like
        Predictions to score.
    transform_negative_predictions : bool, default False
        Forwarded to ``kaggle_score``.
    """
    score = kaggle_score(
        y_pred, transform_negative_predictions=transform_negative_predictions
    )
    print("The score is %.5f" % score)
    
# RMSLE wrapped as a scikit-learn scorer so it can be passed as `scoring=`.
# Lower is better, hence greater_is_better=False; negative predictions are
# clipped to zero via transform_negative_predictions=True.
rmsle_scorer = make_scorer(
    root_mean_squared_log_error,
    greater_is_better=False,
    transform_negative_predictions=True,
)

In [10]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_pipe = Pipeline([
    ('impute_missing_numeric_values', SimpleImputer(strategy="median")),
    ('standard_scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. The encoder step is named for what it is (it was previously
# labelled 'standard_scaler', a copy-paste leftover from the numeric pipe).
categorical_pipe = Pipeline([
    ('impute_missing_categorical_values', SimpleImputer(strategy="most_frequent")),
    ('one_hot_encoder', OneHotEncoder(handle_unknown="ignore"))
])

# Route columns to the matching sub-pipeline by dtype: numeric dtypes go to
# numeric_pipe, object-dtype columns go to categorical_pipe.
preprocessing = ColumnTransformer([
        ('numeric', numeric_pipe, make_column_selector(dtype_include=np.number)),
        ('categorical', categorical_pipe, make_column_selector(dtype_include=object))
    ],
    n_jobs=-1)

## Random Forest pipe

### Train on entire set

In [17]:
# Load both data sets up front (train first, then test).
X_train, y_train = load_train_data()
X_test, _ = load_test_data()

# Shared preprocessing followed by a seeded random forest.
rf = RandomForestRegressor(random_state=42)
pipe = Pipeline(steps=[
    ("preprocessing", preprocessing),
    ("random_forest", rf),
])
pipe.fit(X_train, y_train)

# Predict on the test set and show its Kaggle score as the cell output.
y_pred = pipe.predict(X_test)
kaggle_score(y_pred)

load_train_data: done
load_test_data: done


0.14686040645660278

### Train with k-fold cross validation

In [19]:
# Cross-validation uses the training data only, so the test set is not loaded
# in this cell (it used to be read into X_test and never used).
X_train, y_train = load_train_data()
rf = RandomForestRegressor(random_state=42)
pipe = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forest", rf)
])
# rmsle_scorer is built with greater_is_better=False, so the scores are
# negated before being reported.
scores = cross_val_score(pipe, X_train, y_train, scoring=rmsle_scorer)
print(f"mean score: {np.mean(-scores)}, all cv scores:{-scores}")

load_train_data: done
load_test_data: done


array([0.13972054, 0.15315697, 0.14414612, 0.1371902 , 0.15551856])

## Linear Regression pipe

In [16]:
X_train, y_train = load_train_data()
linear_regression = LinearRegression()

# Same preprocessing as the random-forest pipe; the estimator step is named
# after the model it actually holds (it was previously labelled
# "random_forest", a copy-paste leftover).
pipe = Pipeline([
    ("preprocessing", preprocessing),
    ("linear_regression", linear_regression)
])

pipe.fit(X_train, y_train)

X_test, _ = load_test_data()

# Predict on the test set and show its Kaggle score as the cell output.
y_pred = pipe.predict(X_test)
kaggle_score(y_pred)

load_train_data: done
load_test_data: done


0.21188772535392925

### For reference

In [None]:

# From /Users/andy/repos/housing, or https://github.com/jvanelteren/housing
def get_pipeline(model, impute_cat='default', impute_num = 'default', scale='default',onehot='default',remove_outliers='default'):
    """Build a preprocessing-plus-model ``Pipeline``.

    Input columns are split by dtype into a numeric branch and a categorical
    branch, merged by a ``DFColumnTransformer``, and the model step is placed
    on top.

    Every optional argument follows the same three-way convention:

    * ``'default'`` - use the stock step for that slot
      (``impute_cat``: ``DFSimpleImputer(strategy='constant', fill_value='None')``;
      ``onehot``: ``DFGetDummies()``;
      ``impute_num``: ``DFSimpleImputer(strategy='mean')``;
      ``scale``: ``DFStandardScaler()``;
      ``remove_outliers``: ``model`` itself);
    * any other truthy value - use that object as the step;
    * a falsy value - leave the step out entirely. For ``remove_outliers``
      this means the returned pipeline has no ``'model'`` step.

    NOTE(review): ``memory`` is not a parameter; it is read from the
    surrounding module scope and must be defined there before calling.
    """
    omitted = object()  # sentinel: "no step here", distinct from a None step

    def choose(option, build_default):
        # 'default' -> build the stock step lazily; truthy -> use as given.
        if option == 'default':
            return build_default()
        if option:
            return option
        return omitted

    def keep(candidates):
        # Drop the slots that resolved to "no step", preserving order.
        return [(label, step) for label, step in candidates if step is not omitted]

    # Categorical branch: imputation, then conversion to numeric columns.
    categorical_transformer = Pipeline(steps=keep([
        ('impute_cat', choose(impute_cat, lambda: DFSimpleImputer(strategy='constant',fill_value='None'))),
        ('cat_to_num', choose(onehot, lambda: DFGetDummies())),
    ]))

    # Numeric branch: imputation, then scaling.
    numeric_transformer = Pipeline(steps=keep([
        ('impute_num', choose(impute_num, lambda: DFSimpleImputer(strategy='mean'))),
        ('scale_num', choose(scale, lambda: DFStandardScaler())),
    ]))

    # Numeric dtypes go to the numeric branch, everything else to the
    # categorical branch.
    col_trans = DFColumnTransformer(transformers=[
        ('numeric', numeric_transformer, make_column_selector(dtype_include=np.number)),
        ('category', categorical_transformer, make_column_selector(dtype_exclude=np.number)),
        ])

    preprocessor = Pipeline(steps=[('col_trans', col_trans)],memory=memory)

    # With a truthy non-default remove_outliers, that object takes the 'model'
    # slot (e.g. DFOutlierExtractor(model, corruption=0.005)).
    final_pipe = keep([
        ('preprocess', preprocessor),
        ('model', choose(remove_outliers, lambda: model)),
    ])

    return Pipeline(steps=final_pipe)