# Update

The univariate approach is a strong approximation to the sklearn model, but the model needs a rebuild that aims to reduce the ordinary least squares:

$ \hat{\beta}=\left( \mathbf{X}^T\mathbf{X} \right)^{-1}\mathbf{X}^T\mathbf{y} $

- *$\hat{\beta}$=* OLS estimator
- *$\mathbf{X}$=* Matrix regressor variable (features)
- *$\mathbf{y}$=* Response value vector

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# write a function for the OLS

def beta_hat(X, y):
    # ensure X, y are numpy arrays
    X = np.array(X)
    y = np.array(y)
    return np.linalg.inv(X.T @ X) @ X.T @ y

In [16]:
# Create the design matrix X, basically adding a column of ones for the intercept
def design_matrix(X):
    return np.hstack((np.ones((X.shape[0], 1)), X))

# so, if we have a dataset with features X and target y, we can compute the coefficients
X = np.array([[2, 2, 5, 4, 4], [4, 3, 1, 2, 5], [3, 4, 5, 1, 2]]) # Example feature matrix
y = np.array([1, 4, 3]) # Example target vector

# we have 3 observations and 5 features
print(f"Beta hat: {beta_hat(design_matrix(X), y)}")

Beta hat: [-18.         0.90625    0.75      -1.109375   0.         0.140625]


In [None]:
# we might as well combine the two functions into one...
def ols(X, y):
    """
    OLS coefficients estimation.
    
    Takes a feature matrix X and a target vector y, ensures they are correct sizes and shape,
    adds a column of ones to X for the intercept, and returns the OLS coefficients.

    The output is:
    the intercept, beta_0; followed by the coefficients for each feature, beta_1, ..., beta_p.
    """

    # ensure X, y are numpy arrays
    X = np.array(X)
    y = np.array(y)

    # if X is 1D, reshape
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    # ensure y is 1D. if not, then flatten
    if y.ndim > 1:
        y = y.flatten()

    # check if the number of observations in X and y match
    # this could be caused by flattening in cases where y is not a friendly shape
    if X.shape[0] != y.shape[0]:
        raise ValueError("X and y must match.")

    # column of ones
    X = np.hstack((np.ones((X.shape[0], 1)), X))
    return np.linalg.inv(X.T @ X) @ X.T @ y

# try again and see if we get the same result
print(f"Beta hat: {ols(X, y)}")

Beta hat: [-18.         0.90625    0.75      -1.109375   0.         0.140625]


In [19]:
# now to predict y based on a new situation X
def predict(X_test, beta):
    # ensure X_test is numpy
    X_test = np.array(X_test)

    # if X_test is 1D, reshape
    if X_test.ndim == 1:
        X_test = X_test.reshape(-1, 1)

    # add a column of ones for the intercept
    X_test = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

    return X_test @ beta

In [20]:
# try predicting with a new set of features
X_test = np.array([[3, 2, 4, 5, 1], [1, 3, 2, 4, 5]]) # Example new feature matrix

predictions = predict(X_test, ols(X, y))
print(f"Predictions: {predictions}")

Predictions: [-18.078125 -16.359375]
