# Applied Machine Learning Final
Beth Harvey

April 25, 2023

In [1]:
# Import modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from watermark import watermark

In [2]:
# Document versions: print the timestamp, Python/IPython versions and machine
# details reported by watermark so the run environment is recorded with the results
print(watermark())

Last updated: 2023-04-26T19:31:12.095607-05:00

Python implementation: CPython
Python version       : 3.9.12
IPython version      : 8.2.0

Compiler    : Clang 12.0.0 
OS          : Darwin
Release     : 22.4.0
Machine     : x86_64
Processor   : i386
CPU cores   : 8
Architecture: 64bit



In [3]:
# Load dataset as DataFrame (the path is relative, so auto-mpg.csv must be in
# the notebook's working directory)
df = pd.read_csv('auto-mpg.csv')

In [4]:
# View data: preview the first rows to check that the columns loaded as expected
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [5]:
# Inspect column dtypes and non-null counts; note that horsepower loads as
# object rather than a numeric dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [13]:
# Convert horsepower to a numeric dtype; errors='coerce' turns entries that
# cannot be parsed into NaN instead of raising
df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')

# Build a separate frame without rows containing nulls for the plain linear
# regression; df itself keeps those rows
df2 = df.dropna()
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   car name      392 non-null    object 
dtypes: float64(4), int64(4), object(1)
memory usage: 30.6+ KB


### Select Features

In [7]:
# Predictors are horsepower and weight; the target is mpg
X = df2.loc[:, ['horsepower', 'weight']]
y = df2.loc[:, 'mpg']

### Train Test Split

In [8]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Linear Regression Model

In [9]:
# Fit a linear regression on the training split (fit returns the estimator)
lr_model = LinearRegression().fit(X_train, y_train)

# Predict the target for both splits up front
y_train_pred = lr_model.predict(X_train)
y_test_pred = lr_model.predict(X_test)

# Training report: fitted parameters first, then error metrics.
# MSE is computed once and reused for the RMSE line.
train_mse = mean_squared_error(y_train, y_train_pred)
print('Linear Regression Model Training Performance')
print('Bias:', lr_model.intercept_)
print('Coefficients:', lr_model.coef_)
print('Score:', lr_model.score(X_train, y_train))
print('MAE:', mean_absolute_error(y_train, y_train_pred))
print('RMSE:', np.sqrt(train_mse))
print('MSE:', train_mse)
print('R^2:', r2_score(y_train, y_train_pred))
print()

# Test report: the same error metrics on the held-out rows
test_mse = mean_squared_error(y_test, y_test_pred)
print('Linear Regression Model Test Performance')
print('MAE:', mean_absolute_error(y_test, y_test_pred))
print('RMSE:', np.sqrt(test_mse))
print('MSE:', test_mse)
print('R^2:', r2_score(y_test, y_test_pred))

Linear Regression Model Training Performance
Bias: 46.586741438997876
Coefficients: [-0.05214672 -0.00587153]
Score: 0.7149822041359517
MAE: 3.234368596258616
RMSE: 4.240819490930014
MSE: 17.984549954651904
R^2: 0.7149822041359517

Linear Regression Model Test Performance
MAE: 3.5056538974903217
RMSE: 4.218029885247154
MSE: 17.791776112838125
R^2: 0.651419028085443


### Pipeline Model 1

In [10]:
# Take the pipeline inputs from df rather than df2: df still holds the rows
# with missing values, which the pipelines' imputers are meant to fill
X_pipe = df.loc[:, ['horsepower', 'weight']]
y_pipe = df.loc[:, 'mpg']

# Train test split with the same test size and seed as the plain linear regression
(X_pipe_train, X_pipe_test,
 y_pipe_train, y_pipe_test) = train_test_split(
    X_pipe, y_pipe, test_size=0.2, random_state=42)

In [11]:
# Pipeline 1 components: median imputation -> standard scaler -> linear regression
imp_med = SimpleImputer(missing_values=np.nan, strategy='median')
scale = StandardScaler()
lr_pipe = LinearRegression()

# Stages run in list order; each tuple is (step name, estimator)
stages1 = [
    ('imp_med', imp_med),
    ('scale', scale),
    ('lr_pipe', lr_pipe),
]

# Create the pipeline and fit it on the raw training arrays (fit returns the pipeline)
pipe1 = Pipeline(stages1).fit(X_pipe_train.values, y_pipe_train.values)

# Predictions for both splits
y_train_pred1 = pipe1.predict(X_pipe_train.values)
y_test_pred1 = pipe1.predict(X_pipe_test.values)

# Training report. 'Bias' is the pipeline's prediction for an all-zero raw
# input (horsepower = 0, weight = 0), printed as a one-element array.
train_mse1 = mean_squared_error(y_pipe_train.values, y_train_pred1)
print('Pipeline 1 Model Train Performance')
print('Bias:', pipe1.predict([[0, 0]]))
print('Score:', pipe1.score(X_pipe_train.values, y_pipe_train.values))
print('MAE:', mean_absolute_error(y_pipe_train.values, y_train_pred1))
print('RMSE:', np.sqrt(train_mse1))
print('MSE:', train_mse1)
print('R^2:', r2_score(y_pipe_train.values, y_train_pred1))
print()

# Test report on the held-out rows
test_mse1 = mean_squared_error(y_pipe_test.values, y_test_pred1)
print('Pipeline 1 Model Test Performance')
print('MAE:', mean_absolute_error(y_pipe_test.values, y_test_pred1))
print('RMSE:', np.sqrt(test_mse1))
print('MSE:', test_mse1)
print('R^2:', r2_score(y_pipe_test.values, y_test_pred1))

Pipeline 1 Model Train Performance
Bias: [46.36665563]
Score: 0.6983291042713098
MAE: 3.3180603665679422
RMSE: 4.3489798330389
MSE: 18.913625588179062
R^2: 0.6983291042713098

Pipeline 1 Model Test Performance
MAE: 3.1286426894126533
RMSE: 3.8300395125447864
MSE: 14.669202667654304
R^2: 0.7271680690680923


### Pipeline Model 2

In [12]:
# Pipeline 2: mean imputation -> degree-3 polynomial features -> standard
# scaler -> linear regression, fit and evaluated on the same split as pipeline 1.

# Imputer with mean strategy
imp_mean = SimpleImputer(missing_values = np.nan, strategy = 'mean')

# Polynomial Features with degree = 3 (no constant column; the regression
# fits its own intercept)
poly3 = PolynomialFeatures(degree = 3, include_bias = False)

# Use fresh StandardScaler and LinearRegression instances instead of reusing
# the `scale` and `lr_pipe` objects from pipeline 1. Pipeline does not clone
# its steps, so fitting pipe2 with the shared objects would refit them on the
# polynomial features and leave pipe1 unusable.

# Define stages (step names kept the same as in pipeline 1)
stages2 = [('imp_mean', imp_mean),
           ('poly3', poly3),
           ('scale', StandardScaler()),
           ('lr_pipe', LinearRegression())]

# Create and fit pipeline
pipe2 = Pipeline(stages2)
pipe2.fit(X_pipe_train.values, y_pipe_train.values)

# Evaluate performance on training data
y_train_pred2 = pipe2.predict(X_pipe_train.values)

print('Pipeline 2 Model Train Performance')
# 'Bias' is the pipeline's prediction for an all-zero raw input
print('Bias:', pipe2.predict([[0, 0]]))
print('Score:', pipe2.score(X_pipe_train.values, y_pipe_train.values))
print('MAE:', mean_absolute_error(y_pipe_train.values, y_train_pred2))
print('RMSE:', np.sqrt(mean_squared_error(y_pipe_train.values, y_train_pred2)))
print('MSE:', mean_squared_error(y_pipe_train.values, y_train_pred2))
print('R^2:', r2_score(y_pipe_train.values, y_train_pred2))
print()

# Evaluate performance on test data
y_test_pred2 = pipe2.predict(X_pipe_test.values)

print('Pipeline 2 Model Test Performance')
print('MAE:', mean_absolute_error(y_pipe_test.values, y_test_pred2))
print('RMSE:', np.sqrt(mean_squared_error(y_pipe_test.values, y_test_pred2)))
print('MSE:', mean_squared_error(y_pipe_test.values, y_test_pred2))
print('R^2:', r2_score(y_pipe_test.values, y_test_pred2))

Pipeline 2 Model Train Performance
Bias: [58.99122203]
Score: 0.7388254707108981
MAE: 2.977255278111605
RMSE: 4.046561057129744
MSE: 16.37465638907899
R^2: 0.7388254707108981

Pipeline 2 Model Test Performance
MAE: 2.7290610901574226
RMSE: 3.4738349399634396
MSE: 12.067529190110793
R^2: 0.7755564930754641


### Comparison

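| Model | Training Features | RMSE Train | R2 Train (%) | RMSE Test | R2 Test (%) |
|:---|:---|:---|:---|:---|:---|
|Linear Regression|Horsepower, Weight|4.24|71.50|4.22|65.14|
|Pipeline 1|Horsepower, Weight|4.35|69.83|3.83|72.72|
|Pipeline 2|Horsepower, Weight (expanded to degree-3 polynomial terms)|4.05|73.88|3.47|77.56|

**Note:** R2 is reported as a percentage (the printed score multiplied by 100). The Linear Regression model was fit and evaluated on the 392 rows that remain after dropping missing values, while both pipelines used all 398 rows with missing horsepower imputed. The train/test splits therefore differ, so the Linear Regression row is not directly comparable with the two pipeline rows.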