In [20]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [21]:
# Load the cleaned car dataset. The file's first column appears to be a saved
# row index (it would otherwise show up as a spurious "Unnamed: 0" column), so
# it is used as the DataFrame index instead of being loaded as a data column.
df = pd.read_csv('cleaned_car_data.csv', index_col=0)

# Preview the first five rows.
df.head(5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,horsepower-binned
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,mpfi,3.47,2.68,9.0,111,5000,21,27,13495,Low
1,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,mpfi,3.47,2.68,9.0,111,5000,21,27,16500,Low
2,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,mpfi,2.68,3.47,9.0,154,5000,19,26,16500,Medium
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,mpfi,3.19,3.4,10.0,102,5500,24,30,13950,Low
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,mpfi,3.19,3.4,8.0,115,5500,18,22,17450,Low


# Multiple Linear Regression
- using multiple independent variables to predict one dependent variable.

The equation is given by

$$
\hat{Y} = a + b_1 X_1 + b_2 X_2 + b_3 X_3 + b_4 X_4
$$

Let's use the following variables as predictors (independent variables):

In [22]:
# Unfitted linear-regression estimator; it is trained in a later cell.
lm = LinearRegression()

# Target (dependent variable) to predict.
y = df['price']

# Predictors (independent variables) used to explain the price.
x = df[[
    'horsepower',
    'curb-weight',
    'engine-size',
    'highway-mpg',
]]

In [23]:
# Train the linear model on the selected predictors and the price target.
lm.fit(X=x, y=y)

LinearRegression()

In [24]:
# Predict prices for the same rows that were passed to fit (in-sample).
y_predict = lm.predict(x)

# Show the first five predicted prices.
y_predict[:5]

array([13699.07700462, 13699.07700462, 19052.71346719, 10620.61524404,
       15520.90025344])

In [25]:
# Fitted intercept: the constant term `a` of the linear model.
lm.intercept_

-15811.863767729232

In [26]:
# Fitted slope coefficients, one per predictor, in the column order of `x`.
lm.coef_

array([53.53022809,  4.70805253, 81.51280006, 36.1593925 ])

We get the final linear model:

- <b>Price</b> = -15811.863767729232 + 53.53022809 x <b>horsepower</b> + 4.70805253 x <b>curb-weight</b> + 81.51280006 x <b>engine-size</b> + 36.1593925 x <b>highway-mpg</b>

# Multivariate Polynomial function
$$
\hat{Y} = a + b_1 X_1 + b_2 X_2 + b_3 X_1 X_2 + b_4 X_1^2 + b_5 X_2^2
$$

To perform the polynomial transform, we use `PolynomialFeatures`.

In [27]:
# Degree-2 polynomial expansion of the predictors.
pr = PolynomialFeatures(degree=2)

# Learn the feature layout from x, then generate the expanded feature matrix.
pr_t = pr.fit(x).transform(x)

# (rows, expanded feature count) after the transformation.
pr_t.shape

(201, 15)

Before the transformation

In [28]:
# (rows, columns) of the original predictor frame, for comparison with the
# shape of the polynomial-expanded features.
x.shape

(201, 4)

# Pipeline
- A Pipeline simplifies the steps of processing data.
- Data goes into the pipeline, which performs its steps in order and returns the result.

In [16]:
# Ordered (name, estimator) steps for the pipeline: standardize the features,
# expand them to degree-2 polynomial terms (without a bias column), then fit
# a linear regression on the result.
Input = [
    ('scale', StandardScaler()),
    ('polynomial', PolynomialFeatures(degree=2, include_bias=False)),
    ('model', LinearRegression()),
]

If `include_bias=True` (the default), the output includes a bias column: the feature in which all polynomial powers are zero, i.e. a column of ones.

Inputting the list into the Pipeline constructor

In [17]:
# Assemble the pipeline from the list of (name, estimator) steps.
pipe = Pipeline(steps=Input)

# Display the pipeline's configuration.
pipe

Pipeline(steps=[('scale', StandardScaler()),
                ('polynomial', PolynomialFeatures(include_bias=False)),
                ('model', LinearRegression())])

We then convert `x` to float to avoid errors and fit the data to the pipeline

In [18]:
# Cast every predictor column to floating point before it reaches the scaler.
x = x.astype('float64')

# Run scale -> polynomial -> model, fitting each step in order.
pipe.fit(X=x, y=y)

Pipeline(steps=[('scale', StandardScaler()),
                ('polynomial', PolynomialFeatures(include_bias=False)),
                ('model', LinearRegression())])

After fitting, we can predict values by passing our independent variables (`x`) to the pipeline

In [19]:
# Predict prices with the fitted pipeline for the same rows it was fitted on.
ypipe = pipe.predict(x)

# Show the first four predicted prices.
ypipe[:4]

array([13102.93329646, 13102.93329646, 18226.43450275, 10391.09183955])