# Exploring OLS and WLS using NumPy and matplotlib
Author: Björn Dahlgren, 2016-04-09

In [None]:
import numpy as np
import matplotlib.pyplot as plt
def rnd(shape, positive=True):
    """Draw standard-normal random numbers of the requested shape.

    Parameters
    ----------
    shape : int or tuple of int
        Passed on as ``size`` to ``np.random.normal``.
    positive : bool, optional
        If truthy (default) the absolute value of every sample is returned,
        otherwise the signed samples are returned unchanged.

    Returns
    -------
    ndarray
        Array of samples with the given shape.
    """
    samples = np.random.normal(size=shape)
    return np.abs(samples) if positive else samples
    
%matplotlib inline

In [None]:
def get_data(N=20, rel_minerr=1e-2, abs_maxerr=2, xmin=0, xmax=10,
             heteroscedastic=0, homoscedastic=1):
    """Generate noisy samples of the line ``y = e + pi*x``.

    Parameters
    ----------
    N : int or float, optional
        Number of equidistant x values. Non-integer values are truncated
        with ``int`` because ``np.linspace`` requires an integer count.
    rel_minerr : float, optional
        Constant added to the scaled random error magnitudes; for values
        below 1 it acts as a lower bound on ``err``.
    abs_maxerr : float, optional
        Overall scale factor applied to the random perturbations of ``y``.
    xmin, xmax : float, optional
        End points of the x grid (both included).
    heteroscedastic : float, optional
        Multiplier for the perturbation of the slope (default 0 disables it).
    homoscedastic : float, optional
        Multiplier for the additive perturbation of ``y`` (default 1).

    Returns
    -------
    x : ndarray
        The x grid.
    y : ndarray
        Perturbed values of ``e + pi*x``.
    err : ndarray
        Per-point error magnitudes used to scale the perturbations.
    """
    x = np.linspace(xmin, xmax, int(N))
    err = rnd(x.shape)*(1 - rel_minerr) + rel_minerr
    # Both perturbations are always drawn (slope first, then offset) so that
    # the sequence of random numbers consumed does not depend on the switches.
    slope_noise = abs_maxerr*rnd(x.shape, False)*err
    offset_noise = abs_maxerr*rnd(x.shape, False)*err
    y = ((np.pi + slope_noise*heteroscedastic)*x +
         np.exp(1) + offset_noise*homoscedastic)
    return x, y, err

In [None]:
# Draw one data set with the default settings. x, y and err are kept as
# notebook-level globals because plot() reads them directly.
x, y, err = get_data()

In [None]:
def plot(*args):
    """Plot the global data set together with any number of straight lines.

    The points ``(x, y)`` are drawn on the current axes with error bars of
    half-length ``3*err``; ``x``, ``y`` and ``err`` are read from the
    enclosing (module) scope.

    Parameters
    ----------
    *args : sequences
        Each argument holds line coefficients: element 0 is the intercept
        and element 1 is the slope.
    """
    plt.errorbar(x, y, yerr=3*err, ls='None', marker='.')
    for coeffs in args:
        intercept = coeffs[0]
        slope = coeffs[1]
        plt.plot(x, intercept + x*slope)

In [None]:
# Three side-by-side panels giving a first look at the generated data.
plt.figure(figsize=(14, 3))
# Panel 1: data with error bars and the line with intercept e and slope pi
# (the constants get_data uses when generating y).
plt.subplot(1, 3, 1)
plot([np.exp(1), np.pi])
# Panel 2: the per-point err values in sample order.
plt.subplot(1, 3, 2)
plt.plot(err, '.')
# Panel 3: x and y, each divided element-wise by err.
plt.subplot(1, 3, 3)
plt.plot(x/err, y/err, '.')

In [None]:
def LS(x, y, w=1):  # w == 1 => OLS, w != 1 => WLS
    """Fit the straight line ``y = beta[0] + beta[1]*x`` by least squares.

    The weighted problem is solved by scaling both ``y`` and the design
    matrix with ``sqrt(w)`` and solving the resulting ordinary problem.

    Parameters
    ----------
    x : array_like, 1-D
        Independent variable.
    y : array_like, 1-D
        Observations, same length as ``x``.
    w : scalar or 1-D array_like, optional
        Weights. The default ``1`` gives ordinary least squares; an array
        (e.g. ``1/sigma**2``) gives weighted least squares.

    Returns
    -------
    beta : ndarray, shape (2,)
        Estimated intercept and slope.
    vcv : ndarray, shape (2, 2)
        Estimated variance-covariance matrix of ``beta``, i.e.
        ``SSR/(n - 2) * inv(X'WX)``; needs more than two observations
        to be finite.
    R2 : float
        Coefficient of determination ``1 - SSR/TSS``. ``TSS`` is the weighted
        sum of squares about the *weighted* mean of ``y``, so that the
        reference model (weighted intercept only) is nested in the fitted
        one. For ``w == 1`` this is the usual centred R2.

    References
    -----------
    Wikipedia & standard texts on least squares.
    A note about R2 in WLS:
        Willett, John B., and Judith D. Singer. "Another cautionary note about R 2:
        Its use in weighted least-squares regression analysis."
        The American Statistician 42.3 (1988): 236-238.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    w = np.asarray(w)

    sqrtw = np.sqrt(w)
    yw = y * sqrtw
    X = np.ones((x.size, 2))
    X[:, 1] = x
    if sqrtw.ndim == 1:
        # Column vector so that every row of X is scaled by its own weight.
        sqrtw = sqrtw.reshape((sqrtw.size, 1))
    X *= sqrtw

    # rcond=None selects the machine-precision based cutoff and avoids the
    # FutureWarning emitted by NumPy versions where it is not yet the default.
    beta = np.linalg.lstsq(X, yw, rcond=None)[0]
    eps = X.dot(beta) - yw
    SSR = eps.dot(eps)  # (weighted) sum of squared residuals
    vcv = SSR/(x.size - 2)*np.linalg.inv(X.T.dot(X))

    # Total sum of squares: weighted, and centred on the weighted mean.
    weights = w*np.ones(y.shape)
    y_bar = np.sum(weights*y)/np.sum(weights)
    TSS = np.sum(weights*np.square(y - y_bar))
    R2 = 1 - SSR/TSS
    return beta, vcv, R2

In [None]:
def model(x, beta):
    """Evaluate the straight-line model at the points ``x``.

    Parameters
    ----------
    x : ndarray
        Points at which to evaluate; ``x.size`` determines the row count.
    beta : array_like
        Coefficients; element 0 multiplies the constant column and element 1
        multiplies ``x``.

    Returns
    -------
    ndarray
        Product of the two-column design matrix ``[1, x]`` with ``beta``.
    """
    design = np.empty((x.size, 2))
    design[:, 0] = 1.0  # constant (intercept) column
    design[:, 1] = x
    return np.dot(design, beta)

In [None]:
# Fit the same data twice: unweighted (w defaults to 1) and weighted by err**-2.
beta, vcv, R2 = LS(x, y)
beta_w, vcv_w, R2_w = LS(x, y, err**-2)
# Overlay the generating line (intercept e, slope pi) and both fitted lines.
plot([np.exp(1), np.pi], beta, beta_w)
# Per fit: coefficients, square roots of the vcv diagonal, and R2.
print(beta, np.diag(vcv)**0.5, R2)
print(beta_w, np.diag(vcv_w)**0.5, R2_w)

In [None]:
# Cross-check with statsmodels: the same unweighted and err**-2 weighted fits,
# printed in the same order (parameters, standard errors, R2) as for LS.
import statsmodels.api as sm
# add_constant supplies the intercept column of the design matrix.
ols_model = sm.OLS(y, sm.add_constant(x))
wls_model = sm.WLS(y, sm.add_constant(x), weights=err**-2)
ols_res = ols_model.fit()
wls_res = wls_model.fit()
print(ols_res.params, ols_res.bse, ols_res.rsquared)
print(wls_res.params, wls_res.bse, wls_res.rsquared)

In [None]:
# Generating line (intercept e, slope pi) next to the two statsmodels fits.
plot([np.exp(1), np.pi], ols_res.params, wls_res.params)

In [None]:
# Refit OLS and WLS on freshly generated data for many sample sizes,
# log-spaced between 10 and 1000.
all_N = np.logspace(1, 3, 20000)
# First axis of every result array selects the method: 0 = OLS, 1 = WLS.
R2 = np.empty((2, all_N.size))
beta = np.empty((2, all_N.size, 2))
cov = np.empty((2, all_N.size, 2, 2))
for idx_N, N in enumerate(all_N):
    # logspace yields floats, but the number of samples has to be an integer.
    x, y, err = get_data(int(N))
    for idx_m, w in enumerate([1, err**-2]):
        beta[idx_m, idx_N, :], cov[idx_m, idx_N, :, :], R2[idx_m, idx_N] = LS(x, y, w)

In [None]:
# 2x2 figure: one column per coefficient (intercept, slope).
# Top row: absolute deviation of the estimate from the generating value.
# Bottom row: the corresponding diagonal element of the estimated covariance.
plt.figure(figsize=(14, 8))
for idx_beta, true_val in enumerate([np.exp(1), np.pi]):
    for idx_m, lbl in enumerate(['OLS', 'WLS']):
        # Top-row panel for this coefficient.
        ax = plt.subplot(2, 2, idx_beta+1)
        # Blue markers for OLS, green for WLS; markers only, no connecting line.
        style = dict(c='bg'[idx_m], ls='None', marker='.',)
        plt.plot(all_N, np.abs(beta[idx_m, :, idx_beta] - true_val),
                     #yerr=cov[idx_m, :, idx_beta, idx_beta]**0.5,
                alpha=0.1, **style)
        plt.plot(np.nan, np.nan, label=lbl, **style)  # avoid alpha in legend
        
        # Bottom-row panel for this coefficient.
        ax = plt.subplot(2, 2, idx_beta+3)
        plt.plot(all_N, cov[idx_m, :, idx_beta, idx_beta],
                     #yerr=cov[idx_m, :, idx_beta, idx_beta]**0.5,
                alpha=0.1, **style)
        plt.plot(np.nan, np.nan, label=lbl, **style)  # avoid alpha in legend
        
    # Axis scale, legend and labels for the top-row panel.
    ax = plt.subplot(2, 2, idx_beta+1)
    ax.set_xscale('log')
    ax.legend(numpoints=1, loc='best', frameon=False)
    ax.set_xlabel(r'$n_{obs}$', fontsize=16)
    ax.set_ylabel(r'$| \beta_%d - \hat{\beta}_%d |$' % (idx_beta, idx_beta), fontsize=16)
    
    # Axis scale, legend and labels for the bottom-row panel.
    ax = plt.subplot(2, 2, idx_beta+3)
    ax.set_xscale('log')
    ax.legend(numpoints=1, loc='best', frameon=False)
    ax.set_xlabel(r'$n_{obs}$', fontsize=16)
    ax.set_ylabel(r'$var(\hat{\beta}_%d)$' % idx_beta, fontsize=16)
