# Exploring OLS and WLS using NumPy and matplotlib
Author: Björn Dahlgren, 2016-04-09

In [None]:
import numpy as np
import matplotlib.pyplot as plt
def rnd(shape, positive=True):
    """Draw standard-normal random numbers of the requested shape.

    Parameters
    ----------
    shape : int or tuple of int
        Forwarded as ``size`` to ``np.random.normal``.
    positive : bool, optional
        If true (default) the absolute value of every sample is returned,
        otherwise the signed samples are returned unchanged.

    Returns
    -------
    Samples produced by ``np.random.normal(size=shape)``, optionally passed
    through ``np.abs``.
    """
    samples = np.random.normal(size=shape)
    return np.abs(samples) if positive else samples
# Ground-truth coefficients [intercept, slope] = [e, -pi]; get_data() builds y from them.
true_beta = [np.exp(1), -np.pi]
%matplotlib inline

In [None]:
def get_data(N=20, rel_minerr=1e-2, abs_maxerr=.2, xmin=0, xmax=1,
             heteroscedastic=0, homoscedastic=1):
    """Generate a noisy straight line based on the module-level ``true_beta``.

    Parameters
    ----------
    N : int or float
        Number of equidistant x values. Non-integer values are truncated
        with ``int`` because ``np.linspace`` requires an integer count.
    rel_minerr : float
        Offset added to the (rescaled) half-normal error scale, so that
        ``err`` never drops to zero.
    abs_maxerr : float
        Overall factor applied to the noise terms.
    xmin, xmax : float
        End points of the x grid.
    heteroscedastic : number, optional
        Factor on the noise term added to the slope (default 0: disabled).
    homoscedastic : number, optional
        Factor on the noise term added to the intercept (default 1: enabled).

    Returns
    -------
    x, y, err : numpy.ndarray
        The x grid, the noisy responses and the per-point error scale.
        Note that ``err`` is the relative scale of the noise; the noise
        actually added to ``y`` is additionally multiplied by ``abs_maxerr``.
    """
    # Callers may pass floats (e.g. values taken from np.logspace);
    # np.linspace only accepts an integer number of samples.
    x = np.linspace(xmin, xmax, int(N))
    err = rnd(x.shape)*(1 - rel_minerr) + rel_minerr
    # Both noise vectors are always drawn (in this order) so that the random
    # stream does not depend on which noise terms are switched on.
    slope_noise = abs_maxerr*rnd(x.shape, False)*err
    offset_noise = abs_maxerr*rnd(x.shape, False)*err
    y = ((true_beta[1] + slope_noise*heteroscedastic)*x +
         true_beta[0] + offset_noise*homoscedastic)
    return x, y, err

In [None]:
# Draw one synthetic data set (default N=20); plot() below reads x, y and err as globals.
x, y, err = get_data()

In [None]:
def plot(*args):
    """Plot the global data set with error bars and overlay straight lines.

    Reads the module-level ``x``, ``y`` and ``err``. Every positional
    argument is a coefficient pair ``(intercept, slope)`` and is drawn as
    the line ``intercept + x*slope`` on the current axes.
    """
    plt.errorbar(x, y, yerr=err, ls='None', marker='.')
    for coeffs in args:
        line = coeffs[0] + x*coeffs[1]
        plt.plot(x, line)

In [None]:
# Three panels side by side: data with the true line, the error scales, and the error-scaled data.
plt.figure(figsize=(14, 3))
plt.subplot(1, 3, 1)
plot(true_beta)  # data with error bars plus the line defined by true_beta
plt.subplot(1, 3, 2)
plt.plot(err, '.')  # err plotted against sample index
plt.subplot(1, 3, 3)
# x and the intercept-subtracted y, each divided by err.
# NOTE(review): yerr is still the unscaled err although y has been divided by err -- confirm this is intended.
_ = plt.errorbar(x/err, (y-true_beta[0])/err, yerr=err, marker='.', ls='None')

In [None]:
def LS(x, y, w=1):  # w == 1 => OLS, w != 1 => WLS
    """Fit the straight line ``y = beta[0] + beta[1]*x`` by least squares.

    The weighted problem is solved by scaling the design matrix and the
    response with ``sqrt(w)`` and solving the resulting ordinary problem.

    Parameters
    ----------
    x : array_like
        Independent variable (one value per observation).
    y : array_like
        Observed responses, same length as ``x``.
    w : scalar or array_like, optional
        Weights. A scalar (default 1) gives ordinary least squares, a 1-D
        array of per-observation weights (e.g. ``1/sigma**2``) gives
        weighted least squares.

    Returns
    -------
    beta : numpy.ndarray
        Estimated ``[intercept, slope]``.
    vcv : numpy.ndarray
        2x2 variance-covariance matrix of ``beta``, i.e.
        ``SSR/(n - 2) * inv(X.T X)`` of the (weighted) problem. Not finite
        for ``n <= 2`` observations.
    R2 : float
        Coefficient of determination, computed from the *weighted*
        residuals and the weighted response centred on its plain mean.
        For WLS see the cautionary note cited below.

    References
    -----------
    Wikipedia & standard texts on least squares.
    A note about R2 in WLS:
        Willett, John B., and Judith D. Singer. "Another cautionary note about R 2:
        Its use in weighted least-squares regression analysis."
        The American Statistician 42.3 (1988): 236-238.
    """
    x = np.asarray(x)
    sqrtw = np.sqrt(w)
    y = np.asarray(y) * sqrtw
    X = np.ones((x.size, 2))
    X[:, 1] = x
    if np.ndim(sqrtw) == 1:
        # Column vector so that every row of X is scaled by its own weight.
        sqrtw = sqrtw[:, np.newaxis]
    X *= sqrtw

    # rcond=None selects NumPy's current default cut-off and avoids the
    # FutureWarning emitted by NumPy 1.14-1.26 when rcond is omitted.
    beta = np.linalg.lstsq(X, y, rcond=None)[0]
    eps = X.dot(beta) - y
    SSR = eps.T.dot(eps)  # sum of squared residuals
    vcv = SSR/(x.size - 2)*np.linalg.inv(X.T.dot(X))
    TSS = np.sum(np.square(y - np.mean(y)))  # total sum of squares
    R2 = 1 - SSR/TSS
    return beta, vcv, R2

In [None]:
def model(x, beta):
    """Evaluate the linear model ``beta[0] + beta[1]*x``.

    Builds the two-column design matrix ``[1, x]`` (one row per element of
    ``x``) and returns its product with ``beta``.
    """
    design = np.empty((x.size, 2))
    design[:, 0] = 1.0  # intercept column
    design[:, 1] = x
    return np.dot(design, beta)

In [None]:
# Fit the same data with OLS (default w=1) and WLS (weights err**-2), then overlay both fits on the true line.
beta, vcv, R2 = LS(x, y)
beta_w, vcv_w, R2_w = LS(x, y, err**-2)
plot(true_beta, beta, beta_w)
# Each line prints: coefficients, square roots of the covariance diagonal, R2.
print(beta, np.diag(vcv)**0.5, R2)
print(beta_w, np.diag(vcv_w)**0.5, R2_w)

In [None]:
# Fit the same x, y (and the same err**-2 weights) with statsmodels' OLS and WLS.
import statsmodels.api as sm
ols_model = sm.OLS(y, sm.add_constant(x))
wls_model = sm.WLS(y, sm.add_constant(x), weights=err**-2)
ols_res = ols_model.fit()
wls_res = wls_model.fit()
# Each line prints the result's params, bse and rsquared attributes.
print(ols_res.params, ols_res.bse, ols_res.rsquared)
print(wls_res.params, wls_res.bse, wls_res.rsquared)

## Pooling multiple regressions

In [None]:
def weighted_average(obs, s2):
    """Inverse-variance weighted average along the first axis.

    Parameters
    ----------
    obs : array_like
        Observations; axis 0 enumerates the independent observations
        (e.g. shape ``(n_obs, n_params)`` or ``(n_obs,)``).
    s2 : array_like
        Variance of every entry in ``obs`` (same shape); the weights are
        ``1/s2``.

    Returns
    -------
    avg : numpy.ndarray or scalar
        Weighted mean over axis 0.
    var : numpy.ndarray or scalar
        Variance of the weighted mean estimated from the weighted scatter
        of the observations,
        ``sum(w*(obs - avg)**2) / ((n_obs - 1) * sum(w))``.
        Not finite when only one observation is given.
    """
    obs = np.asarray(obs)
    s2 = np.asarray(s2)
    avg, sum_of_w = np.average(obs, axis=0, weights=1/s2, returned=True)
    # The degrees of freedom come from the number of observations (axis 0),
    # not from the number of averaged quantities.
    n_obs = obs.shape[0]
    var = np.sum(np.square(obs - avg)/s2, axis=0)/((n_obs - 1) * sum_of_w)
    return avg, var

In [None]:
def weighted_average_plot(obs, s2, xlbl=r'$\beta_0$', ylbl=r'$\beta_1$', ttl=r'$y(x) = \beta_0 + \beta_1 \cdot x$',
                         label_cb=None):
    """Scatter-plot pairs of estimates together with their weighted average.

    Parameters
    ----------
    obs : numpy.ndarray
        Column 0 is drawn on the x axis, column 1 on the y axis.
    s2 : numpy.ndarray
        Variances matching ``obs``; their square roots are the error bars.
    xlbl, ylbl, ttl : str
        Axis labels and title.
    label_cb : callable, optional
        Called as ``label_cb(avg, var)``; its return value labels the marker
        of the weighted average. No label is set when it is ``None``.
    """
    # Individual estimates (squares) with error bars in both directions.
    plt.errorbar(obs[:, 0], obs[:, 1], xerr=s2[:, 0]**0.5, yerr=s2[:, 1]**0.5,
                 marker='s', ls='None', alpha=.5)
    plt.xlabel(xlbl)
    plt.ylabel(ylbl)
    plt.title(ttl)

    # Pooled estimate (large red circle).
    avg, var = weighted_average(obs, s2)
    if label_cb is None:
        lbl = None
    else:
        lbl = label_cb(avg, var)
    pooled_style = dict(marker='o', c='r', linewidth=2, markersize=10, label=lbl)
    plt.errorbar(avg[0], avg[1], xerr=var[0]**0.5, yerr=var[1]**0.5, **pooled_style)

In [None]:
# Fit several independent synthetic data sets of different sizes with WLS and
# pool the resulting coefficient estimates with weighted_average_plot().
Ns = [32, 17, 43, 29, 31, 37]
beta = np.empty((len(Ns), 2))
s2 = np.empty_like(beta)
for idx, N in enumerate(Ns):
    x, y, err = get_data(N)  # each run uses its own sample size from Ns
    beta[idx, :], cov, R2 = LS(x, y, err**-2)
    s2[idx, :] = np.diag(cov)  # variances of intercept and slope
weighted_average_plot(beta, s2)
# Zoom in on a +/-0.1 window around the true coefficients.
plt.gca().set_xlim([true_beta[0]-.1, true_beta[0]+.1])
plt.gca().set_ylim([true_beta[1]-.1, true_beta[1]+.1])
# _ = plt.legend(numpoints=1)  # only useful when weighted_average_plot is given a label_cb
# Mark the true (intercept, slope) with a green diamond.
_ = plt.plot(*true_beta, c='g', marker='d')

## Visualizing OLS vs. WLS

In [None]:
# Fit freshly generated data with both OLS and WLS for many sample sizes.
# Index 0 along the first axis of R2/beta/cov is OLS (w=1), index 1 is WLS.
# Sample sizes must be integers for get_data/np.linspace, hence astype(int).
all_N = np.logspace(1, 3, 20000).astype(int)
R2 = np.empty((2, all_N.size))
beta = np.empty((2, all_N.size, 2))
cov = np.empty((2, all_N.size, 2, 2))
for idx_N, N in enumerate(all_N):
    x, y, err = get_data(N)
    for idx_m, w in enumerate([1, err**-2]):
        # Store every result at the current sample-size index idx_N.
        beta[idx_m, idx_N, :], cov[idx_m, idx_N, :, :], R2[idx_m, idx_N] = LS(x, y, w)

In [None]:
# 2x2 figure: the top row shows |beta_i - true value| against all_N, the bottom row
# shows the diagonal covariance entry of beta_i; OLS and WLS share every panel.
plt.figure(figsize=(14, 8))
for idx_beta, true_val in enumerate(true_beta):
    for idx_m, lbl in enumerate(['OLS', 'WLS']):
        # Top row (subplots 1 and 2): absolute deviation from the true coefficient.
        ax = plt.subplot(2, 2, idx_beta+1)
        style = dict(c='bg'[idx_m], ls='None', marker='.',)  # blue for OLS, green for WLS
        plt.plot(all_N, np.abs(beta[idx_m, :, idx_beta] - true_val),
                     #yerr=cov[idx_m, :, idx_beta, idx_beta]**0.5,
                alpha=0.1, **style)
        plt.plot(np.nan, np.nan, label=lbl, **style)  # avoid alpha in legend
        
        # Bottom row (subplots 3 and 4): estimated variance of the coefficient.
        ax = plt.subplot(2, 2, idx_beta+3)
        plt.plot(all_N, cov[idx_m, :, idx_beta, idx_beta],
                     #yerr=cov[idx_m, :, idx_beta, idx_beta]**0.5,
                alpha=0.1, **style)
        plt.plot(np.nan, np.nan, label=lbl, **style)  # avoid alpha in legend
        
    # Axis scale, legend and labels for the top-row panel of this coefficient.
    ax = plt.subplot(2, 2, idx_beta+1)
    ax.set_xscale('log')
    ax.legend(numpoints=1, loc='best', frameon=False)
    ax.set_xlabel(r'$n_{obs}$', fontsize=16)
    ax.set_ylabel(r'$| \beta_%d - \hat{\beta}_%d |$' % (idx_beta, idx_beta), fontsize=16)
    
    # Same for the bottom-row panel.
    ax = plt.subplot(2, 2, idx_beta+3)
    ax.set_xscale('log')
    ax.legend(numpoints=1, loc='best', frameon=False)
    ax.set_xlabel(r'$n_{obs}$', fontsize=16)
    ax.set_ylabel(r'$var(\hat{\beta}_%d)$' % idx_beta, fontsize=16)
