## Automobile Miles Per Gallon (MPG) Prediction 

Given data about various cars, let's try to predict the **miles per gallon** of a given vehicle.

We will use linear regression, decision tree, and random forest models to make our predictions. 

Data Source: https://www.kaggle.com/datasets/uciml/autompg-dataset

### Getting Started

In [1]:
import numpy as np
import pandas as pd

import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
data = pd.read_csv('auto-mpg.csv')
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


### Preprocessing

In [4]:
def onehot_encode(df, column_dict):
    df = df.copy()
    for column, prefix in column_dict.items():
        dummies = pd.get_dummies(df[column], prefix=prefix, dtype=int)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    return df

In [5]:
def preprocess_inputs(df):
    df = df.copy()

    # Fill in missing horsepower values with the column mean
    df['horsepower'] = df['horsepower'].replace('?', np.NaN).astype(float) # nan values are considered usually as float
    df['horsepower'] = df['horsepower'].fillna(df['horsepower'].mean())

    # Create make feature
    df['make'] = df['car name'].apply(lambda x: re.search(r'^\w+', x).group(0))
    df = df.drop('car name', axis=1)

    # Fix typos in make names
    make_typo_correction = {
        'vw': 'volkswagen',
        'chevy': 'chevrolet',
        'maxda' : 'mazda',
        'vokswagen': 'volkswagen',
        'toyouts': 'toyota',
        'toyouta': 'toyota',
        'chevroelt': 'chevrolet'
    }

    df['make'] = df['make'].replace(make_typo_correction)

    # One hot encode nominal features
    nominal_feature_dict = {
        'cylinders': 'cyl',
        'origin': 'orig',
        'make': 'mk'
    }

    df = onehot_encode(df, nominal_feature_dict)

    # Split df into X and y
    y = df['mpg'].copy()
    X = df.drop('mpg', axis=1).copy()

    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

    # Scale X_train and X_test with a standard scaler fit only on X_train
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [6]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [7]:
X_train

Unnamed: 0,displacement,horsepower,weight,acceleration,model year,cyl_3,cyl_4,cyl_5,cyl_6,cyl_8,...,mk_peugeot,mk_plymouth,mk_pontiac,mk_renault,mk_saab,mk_subaru,mk_toyota,mk_triumph,mk_volkswagen,mk_volvo
0,1.533381,1.141598,1.742273,-0.594573,-0.311409,-0.104447,-0.992831,-0.104447,-0.546869,1.723783,...,-0.148522,-0.278423,-0.182913,-0.104447,-0.085126,-0.120824,-0.263117,-0.060084,-0.230283,-0.120824
1,-0.536489,-0.872511,-0.715662,1.398199,-0.875395,-0.104447,1.007220,-0.104447,-0.546869,-0.580119,...,-0.148522,-0.278423,-0.182913,-0.104447,-0.085126,-0.120824,-0.263117,-0.060084,-0.230283,-0.120824
2,0.054902,-0.513834,-0.491446,0.130071,-1.721374,-0.104447,-0.992831,-0.104447,1.828592,-0.580119,...,-0.148522,-0.278423,-0.182913,-0.104447,-0.085126,-0.120824,-0.263117,-0.060084,-0.230283,-0.120824
3,0.054902,-0.237928,0.193255,0.927180,0.534569,-0.104447,-0.992831,-0.104447,1.828592,-0.580119,...,-0.148522,-0.278423,-0.182913,-0.104447,-0.085126,-0.120824,-0.263117,-0.060084,-0.230283,-0.120824
4,0.045046,-0.375881,-0.417913,-0.232251,-1.721374,-0.104447,-0.992831,-0.104447,1.828592,-0.580119,...,-0.148522,-0.278423,-0.182913,-0.104447,-0.085126,-0.120824,-0.263117,-0.060084,-0.230283,-0.120824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,-1.226446,-0.182747,-0.801250,-0.775735,-1.157388,9.574271,-0.992831,-0.104447,-0.546869,-0.580119,...,-0.148522,-0.278423,-0.182913,-0.104447,-0.085126,-0.120824,-0.263117,-0.060084,-0.230283,-0.120824
274,1.533381,2.107267,1.813395,-1.138057,-0.875395,-0.104447,-0.992831,-0.104447,-0.546869,1.723783,...,-0.148522,-0.278423,-0.182913,-0.104447,-0.085126,-0.120824,-0.263117,-0.060084,-0.230283,-0.120824
275,-0.595628,-0.237928,-0.578240,-0.304716,0.534569,-0.104447,1.007220,-0.104447,-0.546869,-0.580119,...,-0.148522,-0.278423,-0.182913,-0.104447,-0.085126,-0.120824,3.800585,-0.060084,-0.230283,-0.120824
276,-1.039172,-1.148416,-1.138779,0.601090,1.380548,-0.104447,1.007220,-0.104447,-0.546869,-0.580119,...,-0.148522,-0.278423,-0.182913,-0.104447,-0.085126,-0.120824,3.800585,-0.060084,-0.230283,-0.120824


In [8]:
y_train

157    15.0
109    21.0
17     21.0
253    20.5
24     21.0
       ... 
71     19.0
106    12.0
270    21.1
348    37.7
102    26.0
Name: mpg, Length: 278, dtype: float64

In [9]:
{column: len(X_train[column].unique()) for column in X_train.columns}

{'displacement': 72,
 'horsepower': 82,
 'weight': 257,
 'acceleration': 88,
 'model year': 13,
 'cyl_3': 2,
 'cyl_4': 2,
 'cyl_5': 2,
 'cyl_6': 2,
 'cyl_8': 2,
 'orig_1': 2,
 'orig_2': 2,
 'orig_3': 2,
 'mk_amc': 2,
 'mk_audi': 2,
 'mk_bmw': 2,
 'mk_buick': 2,
 'mk_cadillac': 2,
 'mk_capri': 2,
 'mk_chevrolet': 2,
 'mk_chrysler': 2,
 'mk_datsun': 2,
 'mk_dodge': 2,
 'mk_fiat': 2,
 'mk_ford': 2,
 'mk_hi': 2,
 'mk_honda': 2,
 'mk_mazda': 2,
 'mk_mercedes': 2,
 'mk_mercury': 2,
 'mk_nissan': 1,
 'mk_oldsmobile': 2,
 'mk_opel': 2,
 'mk_peugeot': 2,
 'mk_plymouth': 2,
 'mk_pontiac': 2,
 'mk_renault': 2,
 'mk_saab': 2,
 'mk_subaru': 2,
 'mk_toyota': 2,
 'mk_triumph': 2,
 'mk_volkswagen': 2,
 'mk_volvo': 2}

### Training 

In [10]:
# Using simple linear model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

linear_r2 = linear_model.score(X_test, y_test)
print("Linear Regression R^2 : {:.5f}".format(linear_r2))

Linear Regression R^2 : -4289380516303062171648.00000


In [11]:
# Using a Decision Tree Model (tree models are good in capturing non-linearity)
tree_model = DecisionTreeRegressor()
tree_model.fit(X_train, y_train)
tree_r2 = tree_model.score(X_test, y_test)
print("Decision Tree R^2 {:.5f}".format(tree_r2))

Decision Tree R^2 0.78959


In [12]:
# Using a Random Forest Model 
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
rf_r2 = rf_model.score(X_test, y_test)
print("Random Forest R^2 {:.5f}".format(rf_r2))

Random Forest R^2 0.88936
