## For an unsolvable system of linear equations, Ax = b, there exists a best solution, x_hat, that is found by 
## solving the equation p = Ax_hat, where p is the vector resulting from the projection of b onto the column 
## space of A. Since e = b-p is orthogonal to the column space of A (i.e., belongs to the left-nullspace of A), 
## it follows that A'(b-Ax_hat) = 0 and A'Ax_hat = A'b, allowing us to solve for x_hat.

In [28]:
import pandas as pd
import numpy as np

In [29]:
seed = 311
rng = np.random.default_rng(seed) 

In [30]:
df = pd.DataFrame(rng.integers(0,100,size=(100, 4)), columns=['x1','x2','x3','x4'])
df

Unnamed: 0,x1,x2,x3,x4
0,34,15,23,56
1,8,64,4,70
2,46,13,6,79
3,60,98,43,8
4,2,18,33,8
...,...,...,...,...
95,0,68,99,27
96,33,59,21,33
97,7,79,49,84
98,6,57,0,43


In [31]:
correlations = rng.uniform(low=-1.0, high=1.0, size=4)
correlations

array([-0.16206972, -0.84142446,  0.76057477, -0.11313913])

In [32]:
# https://stackoverflow.com/questions/42902938/create-correlated-pandas-series

from scipy.stats import pearsonr
from scipy.optimize import minimize

# data = pd.DataFrame({'Country A': [10, 11, 10, 9]})

# data['Country B'] = minimize(lambda x: abs(0.8 - pearsonr(data['Country A'], x)[0]), 
#                              np.random.rand(len(data['Country A']))).x

# df['y1'] = (minimize(lambda x: abs(correlations[0] - pearsonr(df['x1'], x)[0]),
#                                   rng.random(len(df))).x) * 100

# df['y2'] = (minimize(lambda x: abs(correlations[1] - pearsonr(df['x2'], x)[0]),
#                                   rng.random(len(df))).x) * 100

# df['y3'] = (minimize(lambda x: abs(correlations[2] - pearsonr(df['x3'], x)[0]),
#                                   rng.random(len(df))).x) * 100

# df['y4'] = (minimize(lambda x: abs(correlations[3] - pearsonr(df['x4'], x)[0]),
#                                   rng.random(len(df))).x) * 100

df['y'] = (minimize(lambda x: abs(
                                 (correlations[0] - pearsonr(df['x1'], x)[0])
                                  + (correlations[1] - pearsonr(df['x2'], x)[0])
                                  + (correlations[2] - pearsonr(df['x3'], x)[0])
                                  + (correlations[3] - pearsonr(df['x4'], x)[0])
                                 ), rng.random(len(df))).x) * 100

display(df)

Unnamed: 0,x1,x2,x3,x4,y
0,34,15,23,56,37.137474
1,8,64,4,70,81.414166
2,46,13,6,79,60.273728
3,60,98,43,8,84.242199
4,2,18,33,8,103.286051
...,...,...,...,...,...
95,0,68,99,27,41.289700
96,33,59,21,33,10.294394
97,7,79,49,84,82.080967
98,6,57,0,43,98.852285


In [33]:
from lin_reg import LR
intercept, coefficients, train_sse = LR().fit(df,'y')
print(intercept, coefficients, train_sse)

68.91167632039958 [-0.17432897 -0.1168181  -0.04999463 -0.06386287] 95844.61724311876


In [34]:
correlations

array([-0.16206972, -0.84142446,  0.76057477, -0.11313913])

In [35]:
'''
>>> import numpy as np
>>> from sklearn.linear_model import LinearRegression
>>> X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
>>> # y = 1 * x_0 + 2 * x_1 + 3
>>> y = np.dot(X, np.array([1, 2])) + 3
>>> reg = LinearRegression().fit(X, y)
>>> reg.score(X, y)
1.0
>>> reg.coef_
array([1., 2.])
>>> reg.intercept_
3.0...
>>> reg.predict(np.array([[3, 5]]))
array([16.])
'''
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(df[['x1','x2','x3','x4']].to_numpy(), df['y'].to_numpy())
print(reg.intercept_)
reg.coef_

68.91167632039965


array([-0.17432897, -0.1168181 , -0.04999463, -0.06386287])

In [39]:
print('Intercepts')
print(f'sklearn: {reg.intercept_}')
print(f'lin_reg: {intercept}')
print('')
print('Coefficients')
print(f'sklearn: {reg.coef_}')
print(f'lin_reg: {coefficients}')

Intercepts
sklearn: 68.91167632039965
lin_reg: 68.91167632039958

Coefficients
sklearn: [-0.17432897 -0.1168181  -0.04999463 -0.06386287]
lin_reg: [-0.17432897 -0.1168181  -0.04999463 -0.06386287]


In [37]:
# To Do: add predict method, do train test split with data 