# Import packages & dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the automobile CSV into a DataFrame
data = pd.read_csv('Automobile_data.csv')
# Show the column names as a plain Python list
data.columns.tolist()

['symboling',
 'normalized-losses',
 'make',
 'fuel-type',
 'aspiration',
 'num-of-doors',
 'body-style',
 'drive-wheels',
 'engine-location',
 'wheel-base',
 'length',
 'width',
 'height',
 'curb-weight',
 'engine-type',
 'num-of-cylinders',
 'engine-size',
 'fuel-system',
 'bore',
 'stroke',
 'compression-ratio',
 'horsepower',
 'peak-rpm',
 'city-mpg',
 'highway-mpg',
 'price']

In [3]:
# Column labels of the loaded frame, displayed as a pandas Index
data.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

In [4]:
# Per-column dtypes; `object` columns were not parsed as numbers by read_csv
data.dtypes

symboling              int64
normalized-losses     object
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object

In [5]:
# Summary of the frame: index range, non-null count and dtype per column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    205 non-null object
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         205 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null object
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 205 non-null object
stroke               205 non-null object
compression-ratio    205 non-null float64
horsepower           205 non-nul

In [6]:
# Both columns were read as text; convert them to numbers.
# errors='coerce' turns any non-numeric entry into NaN instead of raising.
for column in ('horsepower', 'price'):
    data[column] = pd.to_numeric(data[column], errors='coerce')

# Discard rows where either converted column ended up missing
data.dropna(subset=['price', 'horsepower'], inplace=True)

# Statistically visualize the dataset

In [7]:
# Pearson correlation measures the strength of the linear relationship between
# two variables as a number between -1 and 1:
#   towards -1 -> negatively related, towards 1 -> positively related, 0 -> no linear relation.
# The 2nd returned number is the p-value: the probability of observing a correlation
# at least this strong if the two variables were actually uncorrelated.
# The smaller it is, the stronger the evidence that the variables are linearly related.

# `scipy.stats.stats` is a deprecated private namespace; import from the public one.
from scipy.stats import pearsonr
pearsonr(data['horsepower'], data['price'])

(0.8105330821322063, 1.1891278276946011e-47)

In [8]:
# Graph the datapoints using bokeh to visualize linear relationship

from bokeh.io import output_notebook
from bokeh.plotting import ColumnDataSource, figure, show

# enable notebook output
output_notebook()

# The keys used here ('x', 'y', 'make') are the column names that the glyph
# and the hover tooltips refer to below.
source = ColumnDataSource(data=dict(
    x=data['horsepower'],
    y=data['price'],
    make=data['make'],
))

# '@name' reads the hovered point's value from the data source.
# ('$x' / '$y' would instead report the mouse cursor position in data space,
# which is not the car's actual horsepower / price.)
tooltips = [
    ('make', '@make'),
    ('horsepower', '@x'),
    ('price', '@y{$0}')
]

# NOTE(review): Bokeh 3.x spells these keywords width/height -- adjust when upgrading.
p = figure(plot_width=600, plot_height=400, tooltips=tooltips)
p.xaxis.axis_label = 'Horsepower'
p.yaxis.axis_label = 'Price'

# add a circle renderer with a size, color, and alpha
p.circle('x', 'y', source=source, size=8, color='blue', alpha=0.5)

# show the results
show(p)

# Compare Simple vs. Ridge vs. Lasso Regressions

## 1 dependent and 1 independent variable

In [28]:
# Create training and testing set (75% / 25%).
# A fixed random_state makes the shuffle reproducible, so the coefficients and
# metrics below are the same on every Restart & Run All.
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.25, random_state=42)

In [29]:
# Build the three estimators to compare, each with its default settings
from sklearn import linear_model

lr, rr, lar = (
    linear_model.LinearRegression(),
    linear_model.Ridge(),
    linear_model.Lasso(),
)

In [30]:
# One independent variable (horsepower) and one dependent variable (price).

# The regression models expect the features as a 2d array of shape
# (n_samples, n_features). Selecting with a list of column names keeps that
# 2d shape -- values [1, 2, 3] become [ [1], [2], [3] ] -- and more feature
# columns can be added to the list later. The target stays 1d.
training_x, training_y = train[['horsepower']].values, train['price'].values
test_x, test_y = test[['horsepower']].values, test['price'].values

In [31]:
def slope_intercept(model):
    """Print the slope and intercept of a fitted single-feature linear model.

    `model.coef_` holds the single coefficient inside an array (e.g. [1] or
    [ [1] ]). `np.squeeze` removes the length-one dimensions and `.item()`
    converts the remaining single-element array into a plain Python number.
    (`.item()` replaces `np.asscalar`, which has been removed from NumPy.)

    Parameters
    ----------
    model : fitted estimator exposing `coef_` (exactly one element) and
        `intercept_`.

    Raises
    ------
    ValueError
        If `coef_` contains more than one element.
    """
    slope = np.squeeze(model.coef_).item()
    intercept = model.intercept_
    print(' slope:', slope, ' intercept:', intercept)

In [32]:
# Perform regression: fit every model on the same training data and print its line.
# The plain linear model must be fitted here as well -- it is evaluated further
# down, and predicting with an unfitted estimator raises an error.
lr.fit(training_x, training_y)
slope_intercept(lr)

rr.fit(training_x, training_y)
slope_intercept(rr)

lar.fit(training_x, training_y)
slope_intercept(lar)

 slope: 172.591980742373  intercept: -4628.025949940315
 slope: 172.591246100388  intercept: -4627.948768157541
 slope: 172.59134651753112  intercept: -4627.959318023033


In [21]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def predict_metrics(model, x, y):
    """Predict on `x` with `model` and score the predictions against `y`.

    Returns a tuple `(mae, mse, r2)`: mean absolute error, mean squared error
    and R^2 score, in that order.
    """
    predictions = model.predict(x)
    return (
        mean_absolute_error(y, predictions),
        mean_squared_error(y, predictions),
        r2_score(y, predictions),
    )

# Score each single-feature model on the training and test sets,
# in the order: linear, ridge, lasso.
for fitted in (lr, rr, lar):
    training_mae, training_mse, training_r2 = predict_metrics(fitted, training_x, training_y)
    test_mae, test_mse, test_r2 = predict_metrics(fitted, test_x, test_y)
    print('training mean error:', training_mae, 'training mse:', training_mse, 'training r2:', training_r2)
    print('test mean error:', test_mae, 'test mse:', test_mse, 'test r2:', test_r2)

training mean error: 3476.666998336201 training mse: 23899739.81650969 training r2: 0.6637955902919701
test mean error: 3076.9911220278236 test mse: 16106489.217319828 test r2: 0.5879616055048792
training mean error: 3476.6681884178784 training mse: 23899739.81754456 training r2: 0.6637955902774122
test mean error: 3076.987006638723 test mse: 16106401.506836267 test r2: 0.5879638493263811
training mean error: 3476.667975560021 training mse: 23899739.81720747 training r2: 0.6637955902821542
test mean error: 3076.987742716694 test mse: 16106417.194582822 test r2: 0.5879634480003183


## Several independent variables and 1 dependent variable

In [22]:
cols = ['horsepower', 'engine-size', 'peak-rpm', 'length', 'width', 'height']

# preprocess the data as before (coerce to number; non-numeric entries become NaN)
for col in cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Remove rows with missing data in ANY of the columns used below, not just
# price/horsepower: coercion can introduce NaN in every column of `cols`,
# and those NaNs would otherwise leak into the correlations and the models.
data.dropna(subset=['price'] + cols, inplace=True)

# Let's see how strongly each column is correlated to price
for col in cols:
    print(col, pearsonr(data[col], data['price']))

horsepower (0.8105330821322063, 1.1891278276946011e-47)
engine-size (0.8738869517981516, 1.2650674479074428e-63)
peak-rpm (-0.10164886620219901, 0.15311824317199588)
length (0.6939647745646871, 6.39831060305001e-30)
width (0.7538710519013427, 8.679834788813268e-38)
height (0.13499022754460993, 0.05730390719825449)


In [46]:
# split train and test data as before, now with several feature columns
# (peak-rpm and height are left out: their correlation with price is weak)
model_cols = ['horsepower', 'engine-size', 'length', 'width']
# one column per feature -> 2d array of shape (n_rows, len(model_cols))
multi_x = np.column_stack(tuple(data[col] for col in model_cols))
# fixed random_state so the split, and every number derived from it, is reproducible
multi_train_x, multi_test_x, multi_train_y, multi_test_y = \
    train_test_split(multi_x, data['price'], test_size=0.25, random_state=42)

In [47]:
# Fresh estimators for the multi-feature comparison (default settings, as before)
lr_multi_model, rr_multi_model, lar_multi_model = (
    linear_model.LinearRegression(),
    linear_model.Ridge(),
    linear_model.Lasso(),
)

In [48]:
def multi_intercept_coeffs(model, feature_names=None):
    """Print the intercept and the per-feature coefficients of a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing `intercept_` and `coef_`.
    feature_names : sequence of str, optional
        Names to pair with `model.coef_`, in the same order as the columns the
        model was fitted on. Defaults to the notebook-level `model_cols` list.
    """
    if feature_names is None:
        # original behaviour: use the column list defined in the notebook
        feature_names = model_cols
    multi_intercept = model.intercept_
    # tolist() yields plain Python floats, so the printed dict looks the same
    # whichever NumPy version is installed
    multi_coeffs = dict(zip(feature_names, np.asarray(model.coef_).tolist()))
    print('intercept:', multi_intercept)
    print('coefficients:', multi_coeffs)

In [51]:
# Fit each multi-feature model on the same training split and print its parameters
for fitted_multi in (lr_multi_model, rr_multi_model, lar_multi_model):
    fitted_multi.fit(multi_train_x, multi_train_y)
    multi_intercept_coeffs(fitted_multi)

intercept: -54500.123556421524
coefficients: {'horsepower': 49.482599084937476, 'engine-size': 98.79731248661044, 'length': 21.551805733548925, 'width': 699.934137568741}
intercept: -54289.58103158413
coefficients: {'horsepower': 49.47076988676316, 'engine-size': 98.87059053612107, 'length': 22.15381033797855, 'width': 695.0272499680083}
intercept: -54459.37267199075
coefficients: {'horsepower': 49.480391652737964, 'engine-size': 98.81229357721465, 'length': 21.65449755996208, 'width': 699.0192383261291}


In [52]:
# Error metrics on the training and test sets for every multi-feature model,
# in the order: linear, ridge, lasso.
for fitted_multi in (lr_multi_model, rr_multi_model, lar_multi_model):
    multi_train_mae, multi_train_mse, multi_train_r2 = predict_metrics(fitted_multi, multi_train_x, multi_train_y)
    multi_test_mae, multi_test_mse, multi_test_r2 = predict_metrics(fitted_multi, multi_test_x, multi_test_y)
    print('training mean error:', multi_train_mae, 'training mse:', multi_train_mse, 'training r2:', multi_train_r2)
    print('test mean error:', multi_test_mae, 'test mse:', multi_test_mse, 'test r2:', multi_test_r2)

training mean error: 2395.75945281987 training mse: 11303785.932158751 training r2: 0.818329769786372
test mean error: 2512.6591124524275 test mse: 13013633.64901685 test r2: 0.8044919441597634
training mean error: 2395.5632398964567 training mse: 11303808.686682615 training r2: 0.8183294040841538
test mean error: 2513.8089296930225 test mse: 13019471.485197563 test r2: 0.8044042404458902
training mean error: 2395.720770476176 training mse: 11303786.731438676 training r2: 0.8183297569406431
test mean error: 2512.8633882209247 test mse: 13014788.103859533 test r2: 0.8044746004087454
