<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Huber-Regressor" data-toc-modified-id="Huber-Regressor-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Huber Regressor</a></span></li><li><span><a href="#RANSAC-Regressor-(best)" data-toc-modified-id="RANSAC-Regressor-(best)-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>RANSAC Regressor (best)</a></span></li></ul></div>

# Huber Regressor
Linear regression model that is robust to outliers.

The Huber Regressor optimizes the squared loss for the samples where |(y - X'w) / sigma| < epsilon and the absolute loss for the samples where |(y - X'w) / sigma| > epsilon, where w and sigma are parameters to be optimized. The parameter sigma makes sure that if y is scaled up or down by a certain factor, one does not need to rescale epsilon to achieve the same robustness. Note that this does not take into account the fact that the different features of X may be of different scales.

This makes sure that the loss function is not heavily influenced by the outliers while not completely ignoring their effect.



In [1]:
import numpy as np
from sklearn.linear_model import HuberRegressor, LinearRegression, RANSACRegressor
from sklearn.datasets import make_regression

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [10, 8]
plt.style.use('fivethirtyeight')
%matplotlib inline

In [2]:
# Seeded generator used to draw the replacement values below.
rng = np.random.RandomState(0)

# Synthetic linear problem: 200 samples, 2 features, noise level 4.0.
# coef=True additionally returns the ground-truth coefficients.
X, y, coef = make_regression(
    n_samples=200,
    n_features=2,
    noise=4.0,
    coef=True,
    random_state=0,
)

# Overwrite the first four samples with values drawn uniformly from [10, 20);
# these rows serve as the outliers for the robustness comparison.
# Features are drawn before targets so the random stream order is fixed.
X[:4] = rng.uniform(low=10, high=20, size=(4, 2))
y[:4] = rng.uniform(low=10, high=20, size=4)

# Display the four overwritten feature rows.
X[:4]

array([[15.48813504, 17.15189366],
       [16.02763376, 15.44883183],
       [14.23654799, 16.45894113],
       [14.37587211, 18.91773001]])

In [3]:
# Targets of the first four samples, i.e. the rows overwritten with uniform draws above.
y[:4]

array([19.63662761, 13.83441519, 17.91725038, 15.2889492 ])

In [4]:
# Fit the Huber model on the full data set (overwritten outlier rows included).
huber = HuberRegressor()
huber.fit(X, y)

# R^2 on that same data, so the outlier rows count against the score.
huber.score(X, y)

-7.284608623514573

In [5]:
# Predict for the first row only (one of the overwritten outlier rows);
# slicing rather than indexing keeps the input two-dimensional.
huber.predict(X[:1])

array([806.72000092])

In [7]:
# RANSAC fits on randomly drawn subsets of the data, so pin the seed;
# without it the fitted coefficients and this score change from run to run.
ransac = RANSACRegressor(random_state=0).fit(X, y)

# R^2 on the full data set (overwritten outlier rows included).
ransac.score(X, y)

-9.314258529562249

In [9]:
# Ordinary least squares on the same contaminated data, as a non-robust baseline.
linear = LinearRegression()
linear.fit(X, y)

# Print the ground truth next to each estimator's coefficients.
coefficient_report = (
    ("True coefficients:", coef),
    ("Linear Regression coefficients:", linear.coef_),
    ("Huber coefficients:", huber.coef_),
    ("RANSAC coefficients:", ransac.estimator_.coef_),
)
for caption, weights in coefficient_report:
    print(caption, weights)


True coefficients: [20.4923687  34.16981149]
Linear Regression coefficients: [-1.92210833  7.02266092]
Huber coefficients: [17.79064252 31.01066091]
RANSAC coefficients: [20.64730585 33.80386683]


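# RANSAC Regressor (best)
RANSAC (RANdom SAmple Consensus) repeatedly fits the model on random subsets of the data and keeps the fit supported by the largest set of inliers. It is labelled "best" here because, in the coefficient comparison above, its estimates were the closest to the true coefficients.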

In [None]:
import numpy as np
from matplotlib import pyplot as plt

from sklearn import linear_model, datasets


n_samples = 1000
n_outliers = 50

# Single informative feature with noise; coef=True also returns the true slope.
X, y, coef = datasets.make_regression(
    n_samples=n_samples,
    n_features=1,
    n_informative=1,
    noise=10,
    coef=True,
    random_state=0,
)

# Add outlier data: overwrite the first n_outliers samples with points
# scattered around x = 3, y = -3. The feature noise is drawn before the
# target noise so the seeded random stream is consumed in a fixed order.
np.random.seed(0)
x_noise = np.random.normal(size=(n_outliers, 1))
y_noise = np.random.normal(size=n_outliers)
X[:n_outliers] = 3 + 0.5 * x_noise
y[:n_outliers] = -3 + 10 * y_noise

# Fit line using all data (outliers included), as the non-robust baseline.
lr = linear_model.LinearRegression()
lr.fit(X, y)

# Robustly fit linear model with RANSAC algorithm.
# RANSAC samples random subsets, so the seed is pinned; otherwise the
# inlier mask, coefficients and plot differ between runs.
ransac = linear_model.RANSACRegressor(random_state=0)
ransac.fit(X, y)

# Boolean mask of the samples RANSAC classified as inliers, and its complement.
inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

# Predict data of estimated models on a column of x values that starts at
# the smallest feature value and advances in unit steps below the largest.
line_X = np.arange(X.min(), X.max()).reshape(-1, 1)
line_y, line_y_ransac = lr.predict(line_X), ransac.predict(line_X)

# Compare estimated coefficients
print("Estimated coefficients (true, linear regression, RANSAC):")
print(coef, lr.coef_, ransac.estimator_.coef_)

lw = 2

# Data points, split by the RANSAC inlier/outlier masks.
for mask, colour, tag in (
    (inlier_mask, 'yellowgreen', 'Inliers'),
    (outlier_mask, 'gold', 'Outliers'),
):
    plt.scatter(X[mask], y[mask], color=colour, marker='.', label=tag)

# Fitted lines: plain least squares first, then RANSAC.
for fitted, colour, tag in (
    (line_y, 'navy', 'Linear regressor'),
    (line_y_ransac, 'cornflowerblue', 'RANSAC regressor'),
):
    plt.plot(line_X, fitted, color=colour, linewidth=lw, label=tag)

plt.legend(loc='lower right')
plt.xlabel("Input")
plt.ylabel("Response")
plt.show()
