### Read the CSV file:

In [1]:
import pandas as pd
import numpy as np
# Load the Auto MPG table from the working directory and preview the first ten rows.
csv_path = "auto-mpg.csv"
autompg = pd.read_csv(csv_path)
print(autompg.head(10))

    mpg  cylinders  displacement horsepower  weight  acceleration  model year  \
0  18.0          8         307.0        130    3504          12.0          70   
1  15.0          8         350.0        165    3693          11.5          70   
2  18.0          8         318.0        150    3436          11.0          70   
3  16.0          8         304.0        150    3433          12.0          70   
4  17.0          8         302.0        140    3449          10.5          70   
5  15.0          8         429.0        198    4341          10.0          70   
6  14.0          8         454.0        220    4354           9.0          70   
7  14.0          8         440.0        215    4312           8.5          70   
8  14.0          8         455.0        225    4425          10.0          70   
9  15.0          8         390.0        190    3850           8.5          70   

   origin                   car name  
0       1  chevrolet chevelle malibu  
1       1          buick skyla

### Train/Test Data Split

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 123}
train_set, test_set = train_test_split(autompg, **split_options)
print('Train size: ', train_set.shape[0], 'Test size: ', test_set.shape[0])

Train size:  318 Test size:  80


### Train and Evaluate a Linear Regression Model

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Baseline model: ordinary least squares predicting mpg from cylinders and weight.
feature_cols = ['cylinders', 'weight']
target_col = 'mpg'

X, y = train_set[feature_cols], train_set[target_col]
X_test, y_test = test_set[feature_cols], test_set[target_col]

# fit() returns the estimator itself, so construction and fitting can be chained.
lr_model = LinearRegression().fit(X, y)

y_pred = lr_model.predict(X)
y_test_pred = lr_model.predict(X_test)

# Compute each MSE once; RMSE is its square root.
train_mse = mean_squared_error(y, y_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

print('Results for linear regression on training data')
print('  Default settings')
print('Internal parameters:')
for caption, value in (
    ('   Bias is ', lr_model.intercept_),
    ('   Coefficients', lr_model.coef_),
    ('   Score', lr_model.score(X, y)),
):
    print(caption, value)

for caption, value in (
    ('MAE is   ', mean_absolute_error(y, y_pred)),
    ('RMSE is  ', np.sqrt(train_mse)),
    ('MSE is   ', train_mse),
    ('R^2      ', r2_score(y, y_pred)),
):
    print(caption, value)

print()
print('Results for linear regression on test data')

for caption, value in (
    ('MAE is   ', mean_absolute_error(y_test, y_test_pred)),
    ('RMSE is  ', np.sqrt(test_mse)),
    ('MSE is   ', test_mse),
    ('R^2      ', r2_score(y_test, y_test_pred)),
):
    print(caption, value)

Results for linear regression on training data
  Default settings
Internal parameters:
   Bias is  46.37040163155278
   Coefficients [-0.61080829 -0.00653599]
   Score 0.6995694274153458
MAE is    3.224227443492472
RMSE is   4.29986369578714
MSE is    18.488827802348244
R^2       0.6995694274153458

Results for linear regression on test data
MAE is    3.464656343580139
RMSE is   4.316753176508568
MSE is    18.63435798689681
R^2       0.6805115080359773


In [4]:
# Show the observed min-max range of each predictor over the full dataset.
for label, column in (('Weight: ', 'weight'), ('Cylinders: ', 'cylinders')):
    values = autompg[column]
    print(label, values.min(), '-', values.max())

Weight:  1613 - 5140
Cylinders:  3 - 8


### Pipeline 1: Imputed and Scaled Linear Regression Model

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Pipeline 1: median imputation -> standardization -> ordinary least squares,
# using cylinders and weight to predict mpg.
X = train_set[['cylinders', 'weight']]
y = train_set['mpg']

X_test = test_set[['cylinders', 'weight']]
y_test = test_set['mpg']

imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
scale = StandardScaler()

lr_model = LinearRegression()

stages = [('imp_median', imp_median),
          ('scale', scale),
          ('lr_model', lr_model)
          ]

pipe_model = Pipeline(stages)

pipe_model.fit(X, y)

y_pred = pipe_model.predict(X)
train_mse = mean_squared_error(y, y_pred)

print('Results for pipeline linear regression on training data')
# "Bias" is probed by predicting on an all-zero row: the pipeline is affine in
# the raw features, so this is the intercept expressed in original units.
# (lr_model's own intercept_/coef_ refer to the standardized features instead.)
# The probe row is a DataFrame with the training column names; a bare list
# makes recent scikit-learn versions warn that X lacks valid feature names.
zero_row = pd.DataFrame([[0, 0]], columns=X.columns)
print('   Bias is ', pipe_model.predict(zero_row))
print('   Score', pipe_model.score(X, y))

print('MAE is  ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(train_mse))
print('MSE is ', train_mse)
print('R^2    ', r2_score(y, y_pred))

y_test_pred = pipe_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)

print()
print('Results for pipeline linear regression on test data')

print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(test_mse))
print('MSE is ', test_mse)
print('R^2    ', r2_score(y_test, y_test_pred))

Results for pipeline linear regression on training data
   Bias is  [46.37040163]
   Score 0.6995694274153458
MAE is   3.224227443492473
RMSE is  4.29986369578714
MSE is  18.488827802348244
R^2     0.6995694274153458

Results for pipeline linear regression on test data
MAE is   3.4646563435801396
RMSE is  4.316753176508568
MSE is  18.634357986896813
R^2     0.6805115080359772




### Pipeline 2: Degree-3 Polynomial Linear Regression Model

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Pipeline 2: median imputation -> degree-3 polynomial expansion ->
# standardization -> ordinary least squares, using cylinders and weight
# to predict mpg.
X = train_set[['cylinders', 'weight']]
y = train_set['mpg']

X_test = test_set[['cylinders', 'weight']]
y_test = test_set['mpg']

imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
poly3 = PolynomialFeatures(degree=3, include_bias=False)
scale = StandardScaler()

lr_model = LinearRegression()

stages = [('imp_median', imp_median),
          ('poly3', poly3),
          ('scale', scale),
          ('lr_model', lr_model)
          ]

pipe_model = Pipeline(stages)

pipe_model.fit(X, y)

y_pred = pipe_model.predict(X)
train_mse = mean_squared_error(y, y_pred)

print('Results for pipeline linear regression on training data')
# "Bias" is probed by predicting on an all-zero row: with include_bias=False
# every polynomial term of that row is zero, so the prediction is the constant
# term of the fitted cubic in original units. Zero lies outside the observed
# feature ranges, so this is an extrapolated value, not a realistic mpg.
# The probe row is a DataFrame with the training column names; a bare list
# makes recent scikit-learn versions warn that X lacks valid feature names.
zero_row = pd.DataFrame([[0, 0]], columns=X.columns)
print('   Bias is ', pipe_model.predict(zero_row))
print('   Score', pipe_model.score(X, y))

print('MAE is  ', mean_absolute_error(y, y_pred))
print('RMSE is ', np.sqrt(train_mse))
print('MSE is ', train_mse)
print('R^2    ', r2_score(y, y_pred))

y_test_pred = pipe_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)

print()
print('Results for pipeline linear regression on test data')

print('MAE is  ', mean_absolute_error(y_test, y_test_pred))
print('RMSE is ', np.sqrt(test_mse))
print('MSE is ', test_mse)
print('R^2    ', r2_score(y_test, y_test_pred))

Results for pipeline linear regression on training data
   Bias is  [-56.20592138]
   Score 0.7443735079842804
MAE is   2.93013365028998
RMSE is  3.966299965430602
MSE is  15.731535415774793
R^2     0.7443735079842804

Results for pipeline linear regression on test data
MAE is   3.061382942619044
RMSE is  4.038216018241966
MSE is  16.307188609986
R^2     0.7204111297614427




### Results

Internal parameters for regression models to predict mpg on the autompg data.

| Model | Training Features | Bias | Coefficients | Score |
|:---|:---|:---|:---|:---|
|Linear Regression|Cylinders, Weight|46.37|-0.6108, -0.0065|0.6996|

Basic results for our regression models to predict mpg on the autompg data.

| Model | Training Features | Set | RMSE | R² |
|:---|:---|:---|:---|:---|
|Linear Regression|Cylinders, Weight|Training|4.30|0.6996|
|Linear Regression|Cylinders, Weight|Test|4.32|0.6805|
|Pipeline 1|Cylinders, Weight|Training|4.30|0.6996|
|Pipeline 1|Cylinders, Weight|Test|4.32|0.6805|
|Pipeline 2|Cylinders, Weight (degree-3 polynomial terms)|Training|3.97|0.7444|
|Pipeline 2|Cylinders, Weight (degree-3 polynomial terms)|Test|4.04|0.7204|