# Linear Regression practice

Linear regression practice using the car price prediction problem as in datatalks zoomcamp, with few changes:
1. `msrp` feature viewed in terms of logarithm to base 10
2. `make` column redesigned such that each make has a number associated
3. `plotly` environment used for graphics - cleaner and visually pleasing ;)

In [219]:
import pandas as pd
import numpy as np
import wget

# Plotting options

import plotly.graph_objs as go
import plotly.offline as py


In [220]:
# Only run this piece of code for fresh save of the dataset
#url = 'https://raw.githubusercontent.com/anjanavasudevan/mlbookcamp-code/master/chapter-02-car-price/data.csv'
#file = wget.download(url)


## Data Preparation and analysis

In [221]:
#importing the data

#df = pd.read_csv(file)
df = pd.read_csv('data.csv')
df.head(10)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500
5,BMW,1 Series,2012,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,31200
6,BMW,1 Series,2012,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,26,17,3916,44100
7,BMW,1 Series,2012,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,39300
8,BMW,1 Series,2012,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,36900
9,BMW,1 Series,2013,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,27,18,3916,37200


## String Processing

Clean the string for easy labelling

In [222]:
# Checking the types of data in the file
df.dtypes

Make                  object
Model                 object
Year                   int64
Engine Fuel Type      object
Engine HP            float64
Engine Cylinders     float64
Transmission Type     object
Driven_Wheels         object
Number of Doors      float64
Market Category       object
Vehicle Size          object
Vehicle Style         object
highway MPG            int64
city mpg               int64
Popularity             int64
MSRP                   int64
dtype: object

In [223]:
# Changing names of columns to ones without space - easy access
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(' ', '_')
df.columns

Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity', 'msrp'],
      dtype='object')

In [224]:
# Replacing all spaces in string objects to underscores
columns_with_string = list(df.dtypes[df.dtypes == 'object'].index)

for column in columns_with_string:
    df[column] = df[column].str.lower().str.replace(' ', '_')

df.tail(10)



Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
11904,bmw,z8,2002,premium_unleaded_(required),394.0,8.0,manual,rear_wheel_drive,2.0,"exotic,luxury,high-performance",compact,convertible,19,12,3916,130000
11905,bmw,z8,2003,premium_unleaded_(required),394.0,8.0,manual,rear_wheel_drive,2.0,"exotic,luxury,high-performance",compact,convertible,19,12,3916,131500
11906,acura,zdx,2011,premium_unleaded_(required),300.0,6.0,automatic,all_wheel_drive,4.0,"crossover,hatchback,luxury",midsize,4dr_hatchback,23,16,204,46020
11907,acura,zdx,2011,premium_unleaded_(required),300.0,6.0,automatic,all_wheel_drive,4.0,"crossover,hatchback,luxury",midsize,4dr_hatchback,23,16,204,56570
11908,acura,zdx,2011,premium_unleaded_(required),300.0,6.0,automatic,all_wheel_drive,4.0,"crossover,hatchback,luxury",midsize,4dr_hatchback,23,16,204,50520
11909,acura,zdx,2012,premium_unleaded_(required),300.0,6.0,automatic,all_wheel_drive,4.0,"crossover,hatchback,luxury",midsize,4dr_hatchback,23,16,204,46120
11910,acura,zdx,2012,premium_unleaded_(required),300.0,6.0,automatic,all_wheel_drive,4.0,"crossover,hatchback,luxury",midsize,4dr_hatchback,23,16,204,56670
11911,acura,zdx,2012,premium_unleaded_(required),300.0,6.0,automatic,all_wheel_drive,4.0,"crossover,hatchback,luxury",midsize,4dr_hatchback,23,16,204,50620
11912,acura,zdx,2013,premium_unleaded_(recommended),300.0,6.0,automatic,all_wheel_drive,4.0,"crossover,hatchback,luxury",midsize,4dr_hatchback,23,16,204,50920
11913,lincoln,zephyr,2006,regular_unleaded,221.0,6.0,automatic,front_wheel_drive,4.0,luxury,midsize,sedan,26,17,61,28995


In [225]:
# Information on the numeric data
df.describe()

Unnamed: 0,year,engine_hp,engine_cylinders,number_of_doors,highway_mpg,city_mpg,popularity,msrp
count,11914.0,11845.0,11884.0,11908.0,11914.0,11914.0,11914.0,11914.0
mean,2010.384338,249.38607,5.628829,3.436093,26.637485,19.733255,1554.911197,40594.74
std,7.57974,109.19187,1.780559,0.881315,8.863001,8.987798,1441.855347,60109.1
min,1990.0,55.0,0.0,2.0,12.0,7.0,2.0,2000.0
25%,2007.0,170.0,4.0,2.0,22.0,16.0,549.0,21000.0
50%,2015.0,227.0,6.0,4.0,26.0,18.0,1385.0,29995.0
75%,2016.0,300.0,6.0,4.0,30.0,22.0,2009.0,42231.25
max,2017.0,1001.0,16.0,4.0,354.0,137.0,5657.0,2065902.0


## Checking the unique values and visualise data

In [226]:
for column in df.columns:
    print("{}:".format(column))
    print(df[column].unique()[:5])
    print(df[column].nunique())

make:
['bmw' 'audi' 'fiat' 'mercedes-benz' 'chrysler']
48
model:
['1_series_m' '1_series' '100' '124_spider' '190-class']
914
year:
[2011 2012 2013 1992 1993]
28
engine_fuel_type:
['premium_unleaded_(required)' 'regular_unleaded'
 'premium_unleaded_(recommended)' 'flex-fuel_(unleaded/e85)' 'diesel']
10
engine_hp:
[335. 300. 230. 320. 172.]
356
engine_cylinders:
[ 6.  4.  5.  8. 12.]
9
transmission_type:
['manual' 'automatic' 'automated_manual' 'direct_drive' 'unknown']
5
driven_wheels:
['rear_wheel_drive' 'front_wheel_drive' 'all_wheel_drive'
 'four_wheel_drive']
4
number_of_doors:
[ 2.  4.  3. nan]
3
market_category:
['factory_tuner,luxury,high-performance' 'luxury,performance'
 'luxury,high-performance' 'luxury' 'performance']
71
vehicle_size:
['compact' 'midsize' 'large']
3
vehicle_style:
['coupe' 'convertible' 'sedan' 'wagon' '4dr_hatchback']
16
highway_mpg:
[26 28 27 25 24]
59
city_mpg:
[19 20 18 17 16]
69
popularity:
[3916 3105  819  617 1013]
48
msrp:
[46135 40650 36350 29450 34

## Visualising the price

In [227]:
#plotting
pd.options.plotting.backend = "plotly"

In [228]:
# Viewing the range of prices of the cars
#ply.iplot(go.Histogram(x=df['msrp']))
fig = go.Figure(go.Histogram(x=df['msrp'],
                           name='Vehicle'))

fig.update_layout(title='Vehicle prices',
                  showlegend=True)

fig.show()


In [229]:
# Interactive Plot environment
data = [go.Histogram(x=df['msrp'],
                           name='Vehicle')]
layout = go.Layout(title = 'Vehicle Prices', xaxis_title = 'MSRP')

figure = go.Figure(data=data, layout=layout)
py.iplot(figure)


In [230]:
# Viewing things on log scale
log_val = df['msrp'].apply(np.log10)

# Plot the log values
data = [go.Histogram(x=log_val,
                    name='Vehicle')]
layout = go.Layout(title='Vehicle Prices (in log scale)', xaxis_title='log(MSRP)')

figure = go.Figure(data=data, layout=layout)
py.iplot(figure)


In [231]:
# Information on null values
df.isnull().sum()

make                    0
model                   0
year                    0
engine_fuel_type        3
engine_hp              69
engine_cylinders       30
transmission_type       0
driven_wheels           0
number_of_doors         6
market_category      3742
vehicle_size            0
vehicle_style           0
highway_mpg             0
city_mpg                0
popularity              0
msrp                    0
dtype: int64

## Splitting the dataset

The dataset is split in the following manner:
1. 60% -  training
2. 20% - Validation
3. 20% - Testing

The ratios can change according to preference, but a certain portion of the dataset must be allocated for validation and testing

In [232]:
# Setting the ratios
n = int(len(df))

n_val = int(0.2*n)
n_test = n_val
n_train = n - (n_val + n_test)

print('Length of dataset: {}\nLength of training set: {}\nLength of validation set: {}\nLength of testing set:{}'.format
                (n, n_train, n_val, n_test))

# Check to see if the split tallies
print(n == n_val+n_test+n_train)


Length of dataset: 11914
Length of training set: 7150
Length of validation set: 2382
Length of testing set:2382
True


In [233]:
# Shuffling the dataset and split accordingly - Can use scikit's train-test-split module
idx = np.arange(n)

# Fixing the seed for ensuring same randomness
np.random.seed(11)
np.random.shuffle(idx)

idx

array([ 8428, 11661,  4001, ...,  5200,  3775, 10137])

In [234]:
# Split the dataset
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:(n_train+n_val)]]
df_test = df.iloc[idx[(n_train+n_val):]]

df_train.head(10)

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
8428,acura,rdx,2016,premium_unleaded_(recommended),279.0,6.0,automatic,front_wheel_drive,4.0,"crossover,luxury",midsize,4dr_suv,29,20,204,40370
11661,suzuki,xl-7,2004,regular_unleaded,185.0,6.0,manual,four_wheel_drive,4.0,,midsize,4dr_suv,20,15,481,21999
4001,gmc,envoy_xl,2006,regular_unleaded,291.0,6.0,automatic,rear_wheel_drive,4.0,,large,4dr_suv,19,14,549,27380
2904,lincoln,continental,2017,regular_unleaded,305.0,6.0,automatic,all_wheel_drive,4.0,luxury,large,sedan,24,16,61,49515
11722,suzuki,xl7,2009,regular_unleaded,252.0,6.0,automatic,all_wheel_drive,4.0,crossover,midsize,4dr_suv,23,16,481,29079
308,nissan,370z,2015,premium_unleaded_(required),332.0,6.0,manual,rear_wheel_drive,2.0,high-performance,compact,convertible,24,17,2009,48100
3553,land_rover,defender,1997,regular_unleaded,182.0,8.0,automatic,four_wheel_drive,2.0,luxury,compact,convertible_suv,14,12,258,36931
3187,cadillac,ct6,2017,premium_unleaded_(required),404.0,6.0,automatic,all_wheel_drive,4.0,"luxury,high-performance",large,sedan,26,18,1624,87495
5197,pontiac,g6,2009,regular_unleaded,221.0,6.0,automatic,front_wheel_drive,2.0,performance,midsize,coupe,26,17,210,24080
11115,volvo,v60_cross_country,2015,regular_unleaded,250.0,5.0,automatic,all_wheel_drive,4.0,"crossover,luxury",midsize,wagon,28,20,870,41000


In [235]:
# Reset the index numbers
def reset_indexing(dataframe):
    return dataframe.reset_index(drop=True)

# Preprocessing functions under one roof
def preprocess(data):
    """
    Preprocess the data frame - Split the output columns, shuffle and reindex
    """
    # 1. Reset the indices
    data = reset_indexing(data)

    # 2. Split the output columns
    actual = np.log10(data['msrp'].values)

    # 3. Delete the output from train
    del data['msrp']

    return actual, data


In [236]:
y_train, df_train = preprocess(df_train)

df_train.head(10)


Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity
0,acura,rdx,2016,premium_unleaded_(recommended),279.0,6.0,automatic,front_wheel_drive,4.0,"crossover,luxury",midsize,4dr_suv,29,20,204
1,suzuki,xl-7,2004,regular_unleaded,185.0,6.0,manual,four_wheel_drive,4.0,,midsize,4dr_suv,20,15,481
2,gmc,envoy_xl,2006,regular_unleaded,291.0,6.0,automatic,rear_wheel_drive,4.0,,large,4dr_suv,19,14,549
3,lincoln,continental,2017,regular_unleaded,305.0,6.0,automatic,all_wheel_drive,4.0,luxury,large,sedan,24,16,61
4,suzuki,xl7,2009,regular_unleaded,252.0,6.0,automatic,all_wheel_drive,4.0,crossover,midsize,4dr_suv,23,16,481
5,nissan,370z,2015,premium_unleaded_(required),332.0,6.0,manual,rear_wheel_drive,2.0,high-performance,compact,convertible,24,17,2009
6,land_rover,defender,1997,regular_unleaded,182.0,8.0,automatic,four_wheel_drive,2.0,luxury,compact,convertible_suv,14,12,258
7,cadillac,ct6,2017,premium_unleaded_(required),404.0,6.0,automatic,all_wheel_drive,4.0,"luxury,high-performance",large,sedan,26,18,1624
8,pontiac,g6,2009,regular_unleaded,221.0,6.0,automatic,front_wheel_drive,2.0,performance,midsize,coupe,26,17,210
9,volvo,v60_cross_country,2015,regular_unleaded,250.0,5.0,automatic,all_wheel_drive,4.0,"crossover,luxury",midsize,wagon,28,20,870


In [237]:
np.shape(y_train)

(7150,)

## Linear Regression

Model the price using all the numeric columns. A sample Linear regression model is given below:

3-dimensional model for the $i$th observation, given by:
$$
\hat{y_i} = \hat{\beta_0} + \hat{\beta_1}x_{i1} + \hat{\beta_2}x_{i2} + \hat{\beta_3}x_{i3}
$$

In [238]:
# Preparing the baseline model
df_train.dtypes

make                  object
model                 object
year                   int64
engine_fuel_type      object
engine_hp            float64
engine_cylinders     float64
transmission_type     object
driven_wheels         object
number_of_doors      float64
market_category       object
vehicle_size          object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
popularity             int64
dtype: object

In [239]:
# Extracting the numeric columns only
x_train_columns = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']
# x_train = df_train[x_train_columns].values

## Clean up missing values

If there are missing values, fit the missing values using one of the following techniques:
1. Forward fill - take values from the data ahead
2. Backward fill - take values from data behind
3. Average fill - fill the values using the mean
4. Zero fill - Pad the `NaN` using zeroes

In [240]:
# Fill the missing values using zero fill
df_train.isnull().sum()

make                    0
model                   0
year                    0
engine_fuel_type        1
engine_hp              45
engine_cylinders       17
transmission_type       0
driven_wheels           0
number_of_doors         5
market_category      2255
vehicle_size            0
vehicle_style           0
highway_mpg             0
city_mpg                0
popularity              0
dtype: int64

In [241]:
# Clean up functions under one roof
def cleanup(data):
    """
    All cleanup functions in one roof
    """

    # 1. Filtering the numeric columns
    columns = ['engine_hp', 'engine_cylinders',
               'highway_mpg', 'city_mpg', 'popularity']

    # 2. Cleaning up the missing data, and converting to numpy
    data = data[columns].fillna(0).values

    # 3. Stacking the ones for extra beta
    data = np.column_stack([np.ones(data.shape[0]), data])

    return data


In [242]:
# Clean the training
x_train = cleanup(df_train)

In [243]:
# Defining the linear regression modeL:
def linear_regression(train, actual):
    """
    Linear regression function definition
    Input: training dataset (as numpy array), actual output (single dimension)
    Output: Model parameters - ß
    """
    xtx = train.T.dot(train)
    xtx_inv = np.linalg.inv(xtx)
    ß_hat = xtx_inv.dot(train.T).dot(actual)
    return ß_hat

ß = linear_regression(x_train, y_train)
ß


array([ 3.42883246e+00,  4.22037749e-03, -6.75993161e-02,  6.99635917e-03,
        5.75798631e-03, -3.50492009e-06])

In [244]:
## Comparison between actual and predicted values

y_hat = x_train.dot(ß)

## Plotting set up
trace1 = go.Histogram(x = y_train, name = 'Actual values')
trace2 = go.Histogram(x = y_hat, name='Prediction')

data = [trace1, trace2]

layout = go.Layout(title='Vehicle Prices - actual vs predicted (in log scale)',
                   xaxis_title='log(MSRP)')

figure = go.Figure(data=data, layout=layout)
py.iplot(figure)



## Errors

The error term used for tuning will be the RMSE (Root mean squared error)

In [245]:
# Defining error term
def rmse(y_actual, y_pred):
    error = (y_actual - y_pred)**2
    mse = error.mean()
    return np.sqrt(mse)



In [246]:
# Test on training data
rmse_round1 = rmse(y_train, y_hat)
rmse_round1


0.3277871497016988

## Errors on the validation data

In [247]:
# Processing on the validation set

y_val, df_val = preprocess(df_val)
x_val = cleanup(df_val)

# Linear regression
ß_val = linear_regression(x_val, y_val)

# Prediction
y_val_hat = x_val.dot(ß_val)

# RMSE
rmse_val = rmse(y_val, y_val_hat)
rmse_val


0.32777210722482475

## Feature Engineering

1. Including the `year` column - as age of a vehicle affects the price
2. Remodify the categorical variables - `make` and `number_of_doors` to a numeric type.

In [248]:
df_train.year.max()

# df_train['age'] = df_train.year.max() - df_train['year']

2017

In [249]:
# Remodify the preprocess function to add the age column

def cleanup_feature_eng(data):
    """
    Preprocess the data with some feature engineering
    """

    # 1. Filtering the numeric columns
    columns = ['engine_hp', 'engine_cylinders',
               'highway_mpg', 'city_mpg', 'popularity', 'year']
    
    data['age'] = data['year'].max() - data['year']

    # 2. Cleaning up the missing data, and converting to numpy
    data = data[columns].fillna(0).values

    # 3. Stacking the ones for extra beta
    data = np.column_stack([np.ones(data.shape[0]), data])

    return data




In [250]:
# Regression using feature engineering
x_train_feature_eng = cleanup_feature_eng(df_train)

ß_train = linear_regression(x_train_feature_eng, y_train)

# RMSE:
y_feature_hat = x_train_feature_eng.dot(ß_train)
rmse_feature = rmse(y_train, y_feature_hat)

rmse_feature

0.22777039777695438

In [251]:
# Visualising the changes:

## Plotting set up
trace_1 = go.Histogram(x=y_train, name='Actual values')
trace_2 = go.Histogram(x=y_feature_hat, name='Prediction with year of the vehicle')

data = [trace_1, trace_2]

layout = go.Layout(title='Vehicle Prices - actual vs predicted with year feature (in log scale)',
                   xaxis_title='log(MSRP)')

figure = go.Figure(data=data, layout=layout)
py.iplot(figure)


In [252]:
# No. of doors
# df_train['number_of_doors'].unique()

# Feature engineer the doors - 
#for no_doors in [2, 3, 4]:
    #df_train['{}_door'.format(no_doors)] = (df_train['number_of_doors'] == no_doors).astype(int)

# Make of the vehicle
# make_vehicle = df['make'].unique()

# Feature engineer the make - assign a no. the make - certain makes are pricier:
# len(make_vehicle)
# Enumeration:
# map_make = {make: label for (label, make) in enumerate(make_vehicle)}

# df_train['make_label'] = df_train['make'].apply(lambda make: map_make[make])
# df_train.head()



In [253]:
# Full-fledged feature engineering

def cleanup_full_fledged(data):
    """
    Preprocess the data with some feature engineering
    """

    # 1. Filtering the numeric columns
    columns = ['engine_hp', 'engine_cylinders',
               'highway_mpg', 'city_mpg', 'popularity']


    # 2. The age of the vehicle
    data['age'] = data['year'].max() - data['year']
    columns.append('age')

    # 3. No of doors
    for no_doors in [2, 3, 4]:
        data['{}_door'.format(no_doors)] = (data['number_of_doors'] == no_doors).astype(int)
        columns.append('{}_door'.format(no_doors))
    
    # 4. Make of the vehicle
    map_make = {make: label for (label, make) in enumerate(data['make'].unique())}
    data['make_label'] = data['make'].apply(lambda make: map_make[make])
    columns.append('make_label')

    # 2. Cleaning up the missing data, and converting to numpy
    data = data[columns].fillna(0).values

    # 3. Stacking the ones for extra beta
    data = np.column_stack([np.ones(data.shape[0]), data])

    return data



In [254]:
# Linear regression with full feature engineering

x_train_full = cleanup_full_fledged(df_train)

ß_train_full = linear_regression(x_train_full, y_train)

# RMSE:
y_full_hat = x_train_full.dot(ß_train_full)
rmse_feature = rmse(y_train, y_full_hat)

rmse_feature


0.22596630455769365

In [255]:
# Visualising the fit
trace_1 = go.Histogram(x=y_train, name='Actual values')
trace_2 = go.Histogram(
    x=y_full_hat, name='Prediction with year of the vehicle')

data = [trace_1, trace_2]

layout = go.Layout(title='Vehicle Prices - actual vs predicted (with feature engineering - in log scale)',
                   xaxis_title='log(MSRP)')

figure = go.Figure(data=data, layout=layout)
py.iplot(figure)
