## For an unsolvable system of linear equations, Ax = b, there exists a best solution, x_hat, that is found by solving the equation p = Ax_hat, where p is the vector resulting from the projection of b onto the column space of A. 
## Since e = b-p is orthogonal to the column space of A (i.e., belongs to the left-nullspace of A), it follows that A'(b-Ax_hat) = 0 and A'Ax_hat = A'b, allowing us to solve for x_hat.

In [4]:
import pandas as pd
import numpy as np

In [5]:
seed = 311
rng = np.random.default_rng(seed) 

In [6]:
df = pd.DataFrame(rng.integers(0,100,size=(100, 4)), columns=['x1','x2','x3','x4'])
df

Unnamed: 0,x1,x2,x3,x4
0,34,15,23,56
1,8,64,4,70
2,46,13,6,79
3,60,98,43,8
4,2,18,33,8
...,...,...,...,...
95,0,68,99,27
96,33,59,21,33
97,7,79,49,84
98,6,57,0,43


In [7]:
# https://colab.research.google.com/drive/1TOOjC0VDQyEr-KE6762z5KJWj1-cc3pL?usp=sharing

correlations = rng.uniform(low=-1.0, high=1.0, size=4)
correlations

array([-0.16206972, -0.84142446,  0.76057477, -0.11313913])

In [8]:
# https://stackoverflow.com/questions/42902938/create-correlated-pandas-series

from scipy.stats import pearsonr
from scipy.optimize import minimize

df['y'] = (minimize(lambda x: abs(
                                 (correlations[0] - pearsonr(df['x1'], x)[0])
                                  + (correlations[1] - pearsonr(df['x2'], x)[0])
                                  + (correlations[2] - pearsonr(df['x3'], x)[0])
                                  + (correlations[3] - pearsonr(df['x4'], x)[0])
                                 ), rng.random(len(df))).x) * 100

display(df)

Unnamed: 0,x1,x2,x3,x4,y
0,34,15,23,56,37.137474
1,8,64,4,70,81.414166
2,46,13,6,79,60.273728
3,60,98,43,8,84.242199
4,2,18,33,8,103.286051
...,...,...,...,...,...
95,0,68,99,27,41.289700
96,33,59,21,33,10.294394
97,7,79,49,84,82.080967
98,6,57,0,43,98.852285


In [9]:
train_df = df.iloc[:int(0.8*len(df)),:]
test_df = df.iloc[int(0.8*len(df)):,:]
print(len(train_df)+len(test_df))

100


In [10]:
from lin_reg import LR
intercept, coefficients, train_sse = LR().fit(train_df,'y')
print(intercept, coefficients, train_sse)

82.85258681960056 [-0.16573585 -0.16288678 -0.13519709 -0.19693179] 75147.0354303233


In [11]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(train_df[['x1','x2','x3','x4']].to_numpy(), train_df['y'].to_numpy())
print(reg.intercept_)
reg.coef_

82.85258681960069


array([-0.16573585, -0.16288678, -0.13519709, -0.19693179])

In [22]:
print('Intercepts')
print(f'sklearn: {reg.intercept_}')
print(f'lin_reg: {intercept}')
print(f'difference: {reg.intercept_ - intercept}')
print('')
print('Coefficients')
print(f'sklearn: {reg.coef_}')
print(f'lin_reg: {coefficients}')
print(f'difference: {reg.coef_ - coefficients}')

Intercepts
sklearn: 82.85258681960069
lin_reg: 82.85258681960056
difference: 1.2789769243681803e-13

Coefficients
sklearn: [-0.16573585 -0.16288678 -0.13519709 -0.19693179]
lin_reg: [-0.16573585 -0.16288678 -0.13519709 -0.19693179]
difference: [ 0. -0. -0. -0.]


In [21]:
sklearn_preds = reg.predict(test_df.drop("y", axis=1).to_numpy())
lin_reg_preds = LR().predict(test_df, "y", coefficients, intercept)[0]

print(f'sklearn preds: \n{sklearn_preds}')
print('')
print(f'lin_reg preds: \n{lin_reg_preds}')
print('')
print(f'difference: {sklearn_preds - lin_reg_preds}')

sklearn preds: 
[41.38134633 69.8479561  58.49094641 35.18045119 57.82500064 55.92384195
 52.2825636  59.28107676 49.02670402 47.88050552 49.42097641 50.86531667
 40.68310198 70.23255198 52.00282131 53.07461631 58.43509612 45.65745296
 64.10555849 39.70074405]

lin_reg preds: 
[41.38134633 69.8479561  58.49094641 35.18045119 57.82500064 55.92384195
 52.2825636  59.28107676 49.02670402 47.88050552 49.42097641 50.86531667
 40.68310198 70.23255198 52.00282131 53.07461631 58.43509612 45.65745296
 64.10555849 39.70074405]

difference: [-0.  0.  0. -0.  0.  0.  0.  0.  0.  0. -0.  0.  0.  0.  0. -0.  0. -0.
  0. -0.]
