# Baseline Models

Baseline models built with `DummyRegressor` and `permutation_test_score`.

In [None]:
#Imports

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing
from sklearn.dummy import DummyRegressor

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import permutation_test_score
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Shared cross-validation splitter: 10 random splits, each holding out 20% of
# the rows; the fixed seed keeps the splits identical for every model below.
shuffle_split_cv = ShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=0,
)

Let's load the data and split it into training and test sets.

In [None]:
# Load the California housing data as pandas objects (features frame, target series).
features, labels = fetch_california_housing(as_frame=True, return_X_y=True)

# Hold out a test set; the seed makes the split reproducible across runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    random_state=42,
)

## `LinearRegression` 

In [None]:
# Standardize the features, then fit an ordinary least-squares model.
lin_reg_pipeline = Pipeline([("feature_scaling", StandardScaler()),
                             ("lin_reg", LinearRegression())])

# Score with mean absolute error so these values are directly comparable with
# the dummy-regressor and permutation-test errors, which are also MAE and are
# plotted on the same "Mean absolute error" axis later in the notebook.
lin_reg_cv_results = cross_validate(lin_reg_pipeline, train_features,
                                    train_labels, cv=shuffle_split_cv,
                                    scoring="neg_mean_absolute_error",
                                    n_jobs=2)

# scikit-learn reports the negated error (higher is better); flip the sign to
# get one positive error value per cross-validation split.
lin_reg_errors = pd.Series(-lin_reg_cv_results["test_score"],
                           name="Linear regression error")

## `DummyRegressor`

In [None]:
def dummy_regressor_baseline(strategy, constant_val=None, quantile_val=None):
    """Cross-validate a ``DummyRegressor`` and return its per-split errors.

    Parameters
    ----------
    strategy : str
        Forwarded to ``DummyRegressor(strategy=...)``.
    constant_val : optional
        Forwarded as the ``constant`` argument of ``DummyRegressor``.
    quantile_val : optional
        Forwarded as the ``quantile`` argument of ``DummyRegressor``.

    Returns
    -------
    pandas.Series
        The negated ``"test_score"`` entries returned by ``cross_validate``
        when scoring with ``"neg_mean_absolute_error"``, named
        ``"Dummy regressor error"``.

    Notes
    -----
    Reads the module-level ``train_features``, ``train_labels`` and
    ``shuffle_split_cv`` rather than taking them as arguments.
    """
    dummy_model = DummyRegressor(
        strategy=strategy,
        constant=constant_val,
        quantile=quantile_val,
    )

    cv_results = cross_validate(
        dummy_model,
        train_features,
        train_labels,
        cv=shuffle_split_cv,
        scoring="neg_mean_absolute_error",
        n_jobs=2,
    )

    # The scorer is a negated error, so flip the sign to get positive errors.
    errors = -cv_results["test_score"]
    return pd.Series(errors, name="Dummy regressor error")

In [None]:
# One series of cross-validated errors per DummyRegressor strategy.
baseline_median_cv_results_error = dummy_regressor_baseline(strategy="median")
baseline_mean_cv_results_error = dummy_regressor_baseline(strategy="mean")
baseline_constant_cv_results_error = dummy_regressor_baseline(
    strategy="constant", constant_val=2
)
baseline_quantile_cv_results_error = dummy_regressor_baseline(
    strategy="quantile", quantile_val=0.55
)

Let's compare the performance of these dummy regressors.

In [None]:
# Put the four error series side by side, one column per strategy. The input
# series all share the same name, so the column labels are supplied explicitly
# through `keys`.
dummy_error_df = pd.concat(
    [
        baseline_median_cv_results_error,
        baseline_mean_cv_results_error,
        baseline_constant_cv_results_error,
        baseline_quantile_cv_results_error,
    ],
    axis=1,
    keys=["Median cv", "Mean cv", "Constant cv", "Quantile cv"],
)

In [None]:
# Overlay the error distributions of the four dummy strategies.
ax = dummy_error_df.plot.hist(bins=50, density=True, edgecolor="black")
# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
# The California housing target is expressed in units of $100,000 and is not
# rescaled in this notebook, so the errors carry the same unit. The dollar sign
# is escaped so matplotlib does not treat it as a math-text delimiter.
ax.set_xlabel(r"Mean absolute error (\$100k)")
_ = ax.set_title("Distribution of the testing errors")

## `permutation_test_score`

`permutation_test_score` permutes the target to generate randomised data and computes an empirical p-value against the null hypothesis that the features and the target are independent.

In [None]:
# Fit the same pipeline on 30 copies of the data with shuffled targets.
# The call returns the score on the real targets, the scores obtained on the
# permuted targets, and the resulting p-value.
score, permutation_score, pvalue = permutation_test_score(
    lin_reg_pipeline,
    train_features,
    train_labels,
    cv=shuffle_split_cv,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
    n_permutations=30,
)

# The scorer is a negated error; flip the sign to get positive errors.
permutation_errors = pd.Series(-permutation_score, name="Permuted error")

print(permutation_score)

## Model Comparison

In [None]:
# Gather the error series of the three approaches as columns. The series have
# different lengths (one value per CV split vs. one per permutation), so the
# shorter columns are padded with NaN, which the histogram ignores.
error_df = pd.concat(
    [lin_reg_errors, baseline_median_cv_results_error, permutation_errors],
    axis=1,
)

ax = error_df.plot.hist(bins=50, density=True, edgecolor="black")
ax.legend(loc="best")
# The California housing target is expressed in units of $100,000 and is not
# rescaled in this notebook, so the errors carry the same unit. The dollar sign
# is escaped so matplotlib does not treat it as a math-text delimiter.
ax.set_xlabel(r"Mean absolute error (\$100k)")
_ = ax.set_title("Distribution of the testing errors")