In [8]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model


![Alt text](image-3.png)

## Read data

In [9]:
# Parse the comma-separated fuel data file by hand and derive the regression
# variables (Tax, Dlic, Income, logMiles, Fuel) for every row.
new_datas = []
with open("../data/fuel.txt", "r") as f:
    f.readline()  # skip the header row
    # Strip line terminators and ignore blank lines (e.g. an empty line at the
    # end of the file); a blank line would otherwise raise IndexError below.
    datas = [line.strip() for line in f if line.strip()]

for line in datas:
    temp = line.split(",")

    temp.pop(0) # remove the 'State' Column

    # The remaining fields are read by position; index 4 is not used here.
    driver = float(temp[0])
    fuelC = float(temp[1])
    income = float(temp[2])
    mile = float(temp[3])
    pop = int(temp[5])
    tax = float(temp[6])

    # Scale the raw counts by 1000 and divide by the population column.
    fuel = 1000 * fuelC / pop
    dlic = 1000 * driver / pop
    # Base-2 logarithm of the miles column.
    log_mile = np.log2(mile)

    new_datas.append([tax, dlic, income, log_mile, fuel])

# One row per parsed line; the response variable 'Fuel' is the last column.
df = pd.DataFrame(new_datas, columns=["Tax", "Dlic", "Income", "logMiles", "Fuel"])
df.head(5)

Unnamed: 0,Tax,Dlic,Income,logMiles,Fuel
0,18.0,1031.380067,23471.0,16.52711,690.264418
1,8.0,1031.641062,30064.0,13.734286,514.279223
2,18.0,908.597153,25578.0,15.753556,621.475071
3,21.7,946.570576,22257.0,16.582436,655.292668
4,18.0,844.703336,32275.0,17.364708,573.912855


- Get the X (predictor columns) and Y (response column, `Fuel`) data

In [10]:
# Predictors: every column of the frame except the response 'Fuel'.
X = df.drop("Fuel", axis="columns")
# Response as a 2-D row vector of shape (1, N).
Y = np.array(df["Fuel"], ndmin=2)
Y



array([[690.26441757, 514.2792226 , 621.47507144, 655.29266793,
        573.91285495, 616.61151167, 549.99260782, 626.02393371,
        317.49239716, 586.34609606, 750.90741721, 426.34936956,
        628.42794733, 526.23766215, 666.53646256, 647.00162472,
        600.90240949, 659.74131389, 633.73476444, 584.09261655,
        602.28617335, 543.23206817, 642.97059452, 672.91914994,
        683.50195471, 689.36611392, 666.59775913, 617.69053989,
        614.89398503, 689.65212114, 597.64026149, 646.52727397,
        374.16406957, 645.44182635, 666.18874657, 572.07563999,
        657.06051765, 556.34551224, 518.3286299 , 482.32693716,
        711.73305769, 697.05277706, 638.23106095, 681.10013659,
        591.49994589, 691.02271186, 681.03112144, 576.06974956,
        562.4109333 , 581.79371695, 842.79175243]])

In [11]:
def qr_decomposition(X):
    """Full QR factorisation of ``X`` using Householder reflections.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Matrix to factorise. It is converted to a float array, so integer
        input is accepted; the caller's data is never modified.

    Returns
    -------
    Q : ndarray of shape (m, m)
        Orthogonal matrix.
    R : ndarray of shape (m, n)
        Upper-triangular (upper-trapezoidal when m != n) matrix such that
        ``Q @ R`` equals ``X`` up to rounding error.
    """
    # Work on a float copy: the input stays untouched and integer input is not
    # truncated when the reflected (non-integer) values are written back.
    R = np.array(X, dtype=float)
    m, n = R.shape

    # Accumulates H_k ... H_1 applied to the identity; its transpose is Q.
    Q = np.identity(m)

    # Each step needs a non-empty sub-column, so stop at min(m, n).
    for i in range(min(m, n)):
        # The reflector must be built from the *current* reduced column of R.
        # Building it from the original X leaves R non-triangular.
        x = R[i:, i]
        norm_x = np.linalg.norm(x)
        if norm_x == 0:
            # Nothing to eliminate on or below the diagonal in this column.
            continue

        # Pick the sign that avoids cancellation in x[0] - rho. np.sign() is
        # not used because it returns 0 for x[0] == 0, which would make
        # rho == 0 and the scaling below a division by zero.
        sign = 1.0 if x[0] >= 0 else -1.0
        rho = -sign * norm_x

        e = np.zeros(x.shape[0])
        e[0] = 1

        # Householder vector, scaled so that v[0] == 1 (a new array, not a
        # view of R, so updating R below cannot corrupt it).
        v = (x - rho * e) / (x[0] - rho)
        beta = 2 / (v @ v)

        # Apply H = I - beta * v v^T to all columns at once as a rank-1
        # update instead of forming the k x k outer product per column.
        R[i:, :] -= beta * np.outer(v, v @ R[i:, :])
        Q[i:, :] -= beta * np.outer(v, v @ Q[i:, :])

    return Q.transpose(), R


def linear_regression(X, Y):
    """Fit an ordinary least-squares model with an intercept via QR.

    A column of ones is prepended to ``X``, the resulting design matrix is
    factorised with ``qr_decomposition`` and the coefficients are obtained as
    ``pinv(R) @ Q.T @ Y.T``.

    Parameters
    ----------
    X : 2-D array-like with one row per sample and one column per predictor.
    Y : response values; the notebook passes a ``(1, N)`` row vector, which is
        transposed here before the final product.

    Returns
    -------
    The fitted coefficients; the first one belongs to the prepended column of
    ones (the intercept), followed by one coefficient per column of ``X``.
    """
    intercept_column = np.ones((X.shape[0], 1))
    design = np.concatenate((intercept_column, X), axis=1)

    Q, R = qr_decomposition(design)

    # design = Q R with orthogonal Q, so pinv(R) @ Q.T maps the responses to
    # the least-squares coefficients.
    solver = np.linalg.pinv(R) @ Q.T
    return solver @ Y.T

In [12]:
# Fit the QR-based model. The coefficients come back as a single column, so
# transpose to one row before converting to plain Python lists.
w = linear_regression(X, Y).T.tolist()
line = ['Intercept', 'Tax', "Dlic", "Income", 'LogMiles']
# Pair every label with its fitted coefficient.
res = [(label, weight) for label, weight in zip(line, w[0])]
for label, weight in res:
    print("{: >20}: {: >10}".format(label, weight))

           Intercept: 154.19284457730757
                 Tax: -4.227983208329615
                Dlic: 0.47187121344198474
              Income: -0.006135330970417853
            LogMiles: 18.545274506048013


## Method 2: scikit-learn `LinearRegression`

In [24]:
# Note: X and Y are rebound here; Y is now a pandas Series rather than the
# (1, N) array that was passed to linear_regression.
X = df.drop("Fuel", axis="columns")
Y = df.loc[:, "Fuel"]

# fit_intercept=True makes scikit-learn estimate the intercept itself, so no
# explicit column of ones is prepended to X.
regrressor = linear_model.LinearRegression(fit_intercept=True).fit(X, Y)

print(regrressor.intercept_, regrressor.coef_, sep="\n")


154.1928445773043
[-4.22798321e+00  4.71871213e-01 -6.13533097e-03  1.85452745e+01]
