# Linear Regression

## Part - 1

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
plt.style.use('seaborn')

#### Auto MPG Data Set
Source: [UCI Machine Learning Repository - Auto MPG Data Set](https://archive.ics.uci.edu/ml/datasets/Auto+MPG)

In [3]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
auto_df = pd.read_csv(url, delim_whitespace=True)
auto_df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']

In [4]:
auto_df.horsepower = pd.to_numeric(auto_df.horsepower, errors='coerce')
auto_df.car_name = auto_df.car_name.astype('string')

In [5]:
auto_df = auto_df[auto_df.horsepower.notnull()]
len(auto_df)

391

### Multiple Linear Regression

#### Create a multi-dimensional numpy array from pandas dataframe

Include `horseposwer`, `displacement`, and `weight` columns from auto_df

In [6]:
x_df = auto_df[['horsepower', 'displacement', 'weight']]
x_df.insert(0, 'beta_0', 1)
x_df.head()

Unnamed: 0,beta_0,horsepower,displacement,weight
0,1,165.0,350.0,3693.0
1,1,150.0,318.0,3436.0
2,1,150.0,304.0,3433.0
3,1,140.0,302.0,3449.0
4,1,198.0,429.0,4341.0


In [7]:
X = x_df.to_numpy()
ym = auto_df.mpg
X[:5,]

array([[1.000e+00, 1.650e+02, 3.500e+02, 3.693e+03],
       [1.000e+00, 1.500e+02, 3.180e+02, 3.436e+03],
       [1.000e+00, 1.500e+02, 3.040e+02, 3.433e+03],
       [1.000e+00, 1.400e+02, 3.020e+02, 3.449e+03],
       [1.000e+00, 1.980e+02, 4.290e+02, 4.341e+03]])

$\beta = np.linalg.solve(X^TX, X^Tym)$

In [8]:
beta = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, ym))
beta

array([ 4.48678383e+01, -4.17458038e-02, -5.66362495e-03, -5.35915276e-03])

In [9]:
ym_bar = ym.mean()
ym_hat = np.dot(X, beta)

#### R-Squared

In [10]:
sse_dm = ym - ym_hat
sst_dm = ym - ym_bar
r2 = 1 - sse_dm.dot(sse_dm)/sst_dm.dot(sst_dm)
r2

0.7066247353661022