# Linear Regression for house price prediction

1. Linear regression (with normal equation and iterative optimisation procedure)
2. Polynomial regression
3. Regularised regression model - ridge and lasso 

In [None]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import loguniform
from scipy.stats import uniform

from sklearn.datasets import fetch_california_housing
from sklearn.dummy import DummyRegressor

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import SGDRegressor


from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.pipeline import Pipeline

## Common setup

In [None]:
# Seed NumPy's global random generator so NumPy-driven randomness is repeatable.
np.random.seed(306)

# Cross-validation strategy shared by the cells below: 10 random shuffles,
# each holding out 20% of the rows passed in as the validation fold.
cv = ShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=42,
)

## Data loading and splitting

In [None]:
# Load the California housing data as pandas objects: a feature frame and a
# target series.
features, labels = fetch_california_housing(return_X_y=True, as_frame=True)

# First split: complete training set vs. held-out test set
# (train_test_split's default split proportion is used).
(com_train_features,
 test_features,
 com_train_labels,
 test_labels) = train_test_split(features, labels, random_state=42)

# Second split: carve a dev set out of the complete training set.
(train_features,
 dev_features,
 train_labels,
 dev_labels) = train_test_split(com_train_features,
                                com_train_labels,
                                random_state=42)

## Linear Regression with normal equation

In [None]:
# Baseline model: standardise the features, then fit ordinary least squares.
lin_reg_pipeline = Pipeline(
    steps=[
        ("feature_scaling", StandardScaler()),
        ("lin_reg", LinearRegression()),
    ]
)

# Evaluate with the shared ShuffleSplit strategy. The scorer is the *negated*
# MAE (scikit-learn scorers follow "greater is better"), so the sign is
# flipped below to obtain errors.
lin_reg_cv_results = cross_validate(
    lin_reg_pipeline,
    com_train_features,
    com_train_labels,
    cv=cv,
    scoring="neg_mean_absolute_error",
    return_train_score=True,
    return_estimator=True,
)

lin_reg_train_error = -lin_reg_cv_results["train_score"]
lin_reg_test_error = -lin_reg_cv_results["test_score"]

# Report mean +/- standard deviation of the error across the CV folds.
print(
    "Mean absolute error of linear regression model on the train set:\n"
    f"{lin_reg_train_error.mean():.3f} +/- {lin_reg_train_error.std():.3f}"
)

print(
    "Mean absolute error of linear regression model on the test set:\n"
    f"{lin_reg_test_error.mean():.3f} +/- {lin_reg_test_error.std():.3f}"
)

## Linear regression with SGD

In [None]:
# Linear regression fitted with stochastic gradient descent on standardised
# features.
#
# `max_iter` is sized so that (epochs x rows of com_train_features) is about
# 1e6. SGDRegressor requires an integer here, while np.ceil returns a float,
# so the value is cast explicitly; otherwise recent scikit-learn versions
# reject the parameter during validation.
sgd_reg_pipeline = Pipeline([
    ("feature_scaling", StandardScaler()),
    ("sgd", SGDRegressor(
        max_iter=int(np.ceil(1e6 / com_train_features.shape[0])),
        early_stopping=True,       # stop when the validation score stalls
        eta0=1e-4,                 # step size
        tol=1e-5,                  # minimum improvement that counts as progress
        learning_rate='constant',  # keep the step size fixed at eta0
        validation_fraction=0.1,   # share of training data held out for early stopping
        n_iter_no_change=5,        # patience (epochs) before stopping
        average=10,                # averaged SGD, starting after 10 samples
        random_state=42)),
])

# The scorer is the negated MAE, so the sign is flipped below to get errors.
sgd_reg_cv_results = cross_validate(sgd_reg_pipeline,
                                    com_train_features,
                                    com_train_labels,
                                    cv=cv,
                                    scoring="neg_mean_absolute_error",
                                    return_train_score=True,
                                    return_estimator=True)

sgd_reg_train_error = -1 * sgd_reg_cv_results['train_score']
sgd_reg_test_error = -1 * sgd_reg_cv_results['test_score']

# Report mean +/- standard deviation of the error across the CV folds.
print(f"Mean absolute error of linear regression model on the train set:\n"
      f"{sgd_reg_train_error.mean():.3f} +/- {sgd_reg_train_error.std():.3f}")

print(f"Mean absolute error of linear regression model on the test set:\n"
      f"{sgd_reg_test_error.mean():.3f} +/- {sgd_reg_test_error.std():.3f}")

## Polynomial regression

In [None]:
# Degree-2 polynomial regression: expand the features, standardise the
# expanded columns, then fit ordinary least squares.
poly_reg_pipeline = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=2)),
        ("feature_scaling", StandardScaler()),
        ("lin_reg", LinearRegression()),
    ]
)

# The scorer is the negated MAE, so the sign is flipped below to get errors.
poly_reg_cv_results = cross_validate(
    poly_reg_pipeline,
    com_train_features,
    com_train_labels,
    cv=cv,
    scoring="neg_mean_absolute_error",
    return_train_score=True,
    return_estimator=True,
)

poly_reg_train_error = -poly_reg_cv_results["train_score"]
poly_reg_test_error = -poly_reg_cv_results["test_score"]

# Report mean +/- standard deviation of the error across the CV folds.
print(
    "Mean absolute error of linear regression model on the train set:\n"
    f"{poly_reg_train_error.mean():.3f} +/- {poly_reg_train_error.std():.3f}"
)

print(
    "Mean absolute error of linear regression model on the test set:\n"
    f"{poly_reg_test_error.mean():.3f} +/- {poly_reg_test_error.std():.3f}"
)

Let's use only interaction terms in polynomial regression.

In [None]:
# Same polynomial pipeline, but restricted to interaction terms
# (interaction_only=True), i.e. without pure powers of a single feature.
# NOTE: this rebinds `poly_reg_pipeline` and the related result variables.
poly_reg_pipeline = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=2, interaction_only=True)),
        ("feature_scaling", StandardScaler()),
        ("lin_reg", LinearRegression()),
    ]
)

# The scorer is the negated MAE, so the sign is flipped below to get errors.
poly_reg_cv_results = cross_validate(
    poly_reg_pipeline,
    com_train_features,
    com_train_labels,
    cv=cv,
    scoring="neg_mean_absolute_error",
    return_train_score=True,
    return_estimator=True,
)

poly_reg_train_error = -poly_reg_cv_results["train_score"]
poly_reg_test_error = -poly_reg_cv_results["test_score"]

# Report mean +/- standard deviation of the error across the CV folds.
print(
    "Mean absolute error of linear regression model on the train set:\n"
    f"{poly_reg_train_error.mean():.3f} +/- {poly_reg_train_error.std():.3f}"
)

print(
    "Mean absolute error of linear regression model on the test set:\n"
    f"{poly_reg_test_error.mean():.3f} +/- {poly_reg_test_error.std():.3f}"
)

Let's figure out which polynomial degree is best suited.

In [None]:

# Sweep the polynomial degree and compare train vs. validation error.
# NOTE: this reuses `poly_reg_pipeline` as it was last defined by an earlier
# cell, so any other settings on its "poly" step stay in effect.
degree = [1, 2, 3, 4, 5]
train_scores, test_scores = validation_curve(
    poly_reg_pipeline,
    com_train_features,
    com_train_labels,
    param_name="poly__degree",
    param_range=degree,
    cv=cv,
    scoring="neg_mean_absolute_error",
    n_jobs=2,
)

# Scores are negated MAE; flip the sign to get errors.
# Both arrays have one row per degree and one column per CV split.
train_errors, test_errors = -train_scores, -test_scores

plt.plot(degree, train_errors.mean(axis=1), 'b-x', label='Training error')
plt.plot(degree, test_errors.mean(axis=1), 'r-x', label='Test error')
plt.legend()
plt.xlabel("degree")
# The California housing target is expressed in units of $100,000. The dollar
# sign is escaped so matplotlib does not treat the text as mathtext.
plt.ylabel(r"Mean absolute error (\$100k)")
_ = plt.title("Validation curve for polynomial regression")

Best degree = 2

## Ridge regression

In [None]:
# Ridge regression on standardised degree-2 interaction features, with a
# fixed regularisation strength alpha=0.5.
ridge_reg_pipeline = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=2, interaction_only=True)),
        ("feature_scaling", StandardScaler()),
        ("ridge", Ridge(alpha=0.5)),
    ]
)

# The scorer is the negated MAE, so the sign is flipped below to get errors.
ridge_reg_cv_results = cross_validate(
    ridge_reg_pipeline,
    com_train_features,
    com_train_labels,
    cv=cv,
    scoring="neg_mean_absolute_error",
    return_train_score=True,
    return_estimator=True,
)

ridge_reg_train_error = -ridge_reg_cv_results["train_score"]
ridge_reg_test_error = -ridge_reg_cv_results["test_score"]

# Report mean +/- standard deviation of the error across the CV folds.
print(
    "Mean absolute error of linear regression model on the train set:\n"
    f"{ridge_reg_train_error.mean():.3f} +/- {ridge_reg_train_error.std():.3f}"
)

print(
    "Mean absolute error of linear regression model on the test set:\n"
    f"{ridge_reg_test_error.mean():.3f} +/- {ridge_reg_test_error.std():.3f}"
)

# Hyperparameter tuning (HPT) for ridge regularisation rate

In [None]:
# Candidate regularisation strengths: 20 values log-spaced over [1e-4, 1].
alpha_list = np.logspace(-4, 0, num=20)

# RidgeCV picks alpha internally, using the shared CV splitter and negated MAE.
# NOTE: this rebinds `ridge_reg_pipeline`; the "poly" step here uses full
# degree-2 features (no interaction_only).
ridge_reg_pipeline = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=2)),
        ("feature_scaling", StandardScaler()),
        (
            "ridge_cv",
            RidgeCV(
                alphas=alpha_list,
                cv=cv,
                scoring="neg_mean_absolute_error",
            ),
        ),
    ]
)

# Pipeline.fit returns the fitted pipeline itself, so despite its name
# `ridge_reg_cv_results` is the pipeline; index [-1] is the RidgeCV step.
ridge_reg_cv_results = ridge_reg_pipeline.fit(
    com_train_features,
    com_train_labels,
)

# best_score_ is a negated MAE; print it as-is, then sign-flipped as an error.
print(
    "The score with the best alpha is: ",
    f"{ridge_reg_cv_results[-1].best_score_:.3f}",
)

print(
    "The error with the best alpha is: ",
    f"{-ridge_reg_cv_results[-1].best_score_:.3f}",
)

In [None]:
# Regularisation strength selected by the final (RidgeCV) step of the fitted
# pipeline.
print(
    "The best value of alpha is: ",
    ridge_reg_cv_results[-1].alpha_,
)

## `RidgeCV` with cross validation
exercise

## Ridge HPT through `GridSearchCV`

In [None]:
# Pipeline whose polynomial degree and ridge alpha are tuned jointly below.
ridge_grid_pipeline = Pipeline(
    steps=[
        ("poly", PolynomialFeatures(degree=2)),
        ("feature_scaling", StandardScaler()),
        ("ridge", Ridge()),
    ]
)

# Search space: 3 degrees x 20 log-spaced alphas in [1e-4, 1].
# Keys use the "<step name>__<parameter>" convention to reach pipeline steps.
param_grid = {
    'poly__degree': (1, 2, 3),
    'ridge__alpha': np.logspace(-4, 0, num=20),
}

# Exhaustive search scored by negated MAE with the shared CV splitter;
# train scores are kept so train/validation errors can be compared later.
ridge_grid_search = GridSearchCV(
    ridge_grid_pipeline,
    param_grid=param_grid,
    n_jobs=2,
    cv=cv,
    scoring="neg_mean_absolute_error",
    return_train_score=True,
)

ridge_grid_search.fit(com_train_features, com_train_labels)

In [None]:
# Summarise the best grid-search candidate (row `best_index_` of cv_results_).
#
# Mean scores are negated MAE, so they are sign-flipped to obtain errors.
# Standard deviations are already non-negative spreads and must NOT be
# negated. They are also scalars, so calling .std() on them would always
# give 0; they are printed directly instead.
mean_train_error = -1 * ridge_grid_search.cv_results_['mean_train_score'][ridge_grid_search.best_index_]
mean_test_error = -1 * ridge_grid_search.cv_results_['mean_test_score'][ridge_grid_search.best_index_]
std_train_error = ridge_grid_search.cv_results_['std_train_score'][ridge_grid_search.best_index_]
std_test_error = ridge_grid_search.cv_results_['std_test_score'][ridge_grid_search.best_index_]

print(f"Best Mean absolute error of polynomial ridge regression model on the train set:\n"
      f"{mean_train_error:.3f} +/- {std_train_error:.3f}")

print(f"Mean absolute error of polynomial ridge regression model on the test set:\n"
      f"{mean_test_error:.3f} +/- {std_test_error:.3f}")

In [None]:
# Hyperparameter combination with the best mean cross-validated score.
print(
    "The best parameter value is:",
    ridge_grid_search.best_params_,
)