# Predicting Fuel Efficiency of Vehicles - Part 3


## Selecting and Training Models
1. Select and Train a few ML algorithms (Linear Regression, Decision Tree, Random Forest)
2. Evaluation using Mean Squared Error (performance metric)
3. Model Evaluation using Cross Validation
4. Hyperparameter Tuning
5. Check Feature Importance
6. Evaluate the final system on test data
7. Saving the Model

In [1]:
# Importing a few general use case libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the classes I need
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

In [2]:
# reading the .data fine using pandas

cols = ['MPG','Cylinders','Displacement','Horsepower','Weight',
        'Acceleration','Model Year','Origin']

df = pd.read_csv('auto-mpg.data',names=cols, na_values='?',
                 comment='\t',sep=' ',skipinitialspace=True)

data = df.copy()

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data,data['Cylinders']):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [3]:
# segregating the feature and target variable
data = strat_train_set.drop('MPG', axis=1)           # all features
data_labels = strat_train_set['MPG'].copy()          # the target
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,2
156,8,400.0,170.0,4668.0,11.5,75,1
395,4,135.0,84.0,2295.0,11.6,82,1
14,4,113.0,95.0,2372.0,15.0,70,3


In [4]:
# preprocess the Origin column in data
def preprocess_origin_cols(df):
    df['Origin'] = df['Origin'].map({1:'India', 2:'USA', 3:'Germany'})
    return df

In [5]:
# creating custom attribute adder class
# column indexes of Acceleration, Horsepower, Cylinders
acc_ix, hpower_ix, cyl_ix = 4, 2, 0

class CustomAttrAdder(BaseEstimator, TransformerMixin):    # inherts BaseEstimator and TransformerMixin class, they work on ndarrays
    def __init__(self, acc_on_power=True):                 # no *args or **kargs
        self.acc_on_power = acc_on_power
    def fit(self, X, y=None):
        return self                                        # nothing else to do
    def transform(self, X):                                # X is a 2D array
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]      # concatenates arrays
        return np.c_[X, acc_on_cyl]

In [6]:
def num_pipeline_transformer(data):
    '''
    Function to process numerical transformations
    Argument:
        data: original dataframe
    Returns:
        num_attrs: numerical dataframe
        num_pipeline: numerical pipeline object
    '''
    numerics = ['float64', 'int64']
    
    num_attrs = data.select_dtypes(include=numerics)
    
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),
        ])
    return num_attrs, num_pipeline


def pipeline_transformer(data):
    '''
    Complete transformation pipeline for both
    numerical and categorical data
    
    Argument:
        data: original dataframe
    Returns:
        prepared_data: transformed data, ready to use
    '''
    cat_attrs = ['Origin']
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    print(list(num_attrs))
    full_pipeline = ColumnTransformer([
        ('num', num_pipeline, list(num_attrs)),
        ('cat', OneHotEncoder(), cat_attrs),
        ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

#### From raw data to processed data in 2 steps

In [7]:
# from raw data data to processed data in 2 steps
preprocessed_df = preprocess_origin_cols(data)
prepared_data = pipeline_transformer(preprocessed_df)
prepared_data

['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year']


array([[-0.85657842, -1.07804475, -1.15192977, ...,  1.        ,
         0.        ,  0.        ],
       [-0.85657842, -1.1174582 , -0.9900351 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.85657842, -0.3587492 , -0.31547399, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.85657842, -0.56566984, -0.53133355, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.78244384, -0.23452666, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.32260746, -0.45728283,  0.44003446, ...,  1.        ,
         0.        ,  0.        ]])

In [8]:
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])

#### Selecting and Training Models
1. Linear Regression
2. Decision Tree
3. Random Forest
4. SVM regressor

In [9]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)

In [10]:
# trying out some predictions
sample_data = data.iloc[:5]
sample_labels = data_labels.iloc[:5]

sample_data_prepared = pipeline_transformer(sample_data)

print('Prediction of samples: ', lin_reg.predict(sample_data_prepared))

['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year']
Prediction of samples:  [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]


In [11]:
print('Actual Labels of samples: ', list(sample_labels))

Actual Labels of samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


**Performance Metrics**

Note that we can not keep looking at actual values to quantify the difference between actual and predicted values. Hence, we need a performance metric.

A good performance metric for linear regression is root mean squared error. It basically tells how large is the distance between actual and predicted values. It increases with the increase in difference between predicted and actual value. So, if you have a lot of outliers in the data, it is not a good idea to use RMS error, you can use absolute error instead. Here, we will use RMS Error.

#### Mean Squared Error

In [12]:
from sklearn.metrics import mean_squared_error

mpg_predictions = lin_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

2.959040222576087

#### Decision Tree

In [13]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)

Note the steps remain the same with every model you import from sklearn library.
1. import class of that particular algorithm.
2. instantiate the class, and create an object.
3. train the model using fit method by providing data and labels
4. It gives you a trained model object
5. Next predict by providing the training data only.
6. calculate the error by using module from sklearn

In [14]:
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

But no model is perfect, this means that our model has overfit the data to a great extent.

We won't be touching out test data until we finalize our model. So, how do we check for what's happening?

We can split the training data itself using train-test split or stratified split. But a great alternative is to use k-fold cross-validation method.

sklearn k-fold cross-validation method

#### Model Evaluation using Cross Validation

Scikit-Learn's K-fold cross-validation feature randomly splits the training set into k distinct subsets called folds, then it trains and evaluated the model k times, picking a different fold for evaluation every time and training on the other k-1 folds.

The result is an array containing the K evaluation scores:

In [15]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg,
                        prepared_data,
                        data_labels,
                        scoring='neg_mean_squared_error',
                        cv=10)  # 10 folds
tree_reg_rmse_scores = np.sqrt(-scores)

In [16]:
tree_reg_rmse_scores

array([2.9732663 , 3.36558873, 2.94199125, 3.19530906, 2.40721572,
       3.03865719, 3.75657756, 5.01859044, 4.20441396, 2.59962776])

In [17]:
tree_reg_rmse_scores.mean()

3.3501237975546103

In [18]:
scores = cross_val_score(lin_reg, 
                         prepared_data, 
                         data_labels,
                         scoring='neg_mean_squared_error',
                         cv = 10)
lin_reg_rmse_scores = np.sqrt(-scores)
lin_reg_rmse_scores

array([3.43254597, 3.45157629, 3.6621715 , 2.59652976, 2.48023405,
       2.74798115, 3.32524647, 2.42208917, 3.78133275, 2.8573747 ])

In [19]:
lin_reg_rmse_scores.mean()

3.0757081793709333

Linear Regression has performed better

#### Random Forest model

combination of a lot of decision trees, so it is like an ensemble model

In [20]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score(forest_reg,
                                      prepared_data,
                                      data_labels,
                                      scoring='neg_mean_squared_error',
                                      cv = 10)
forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)
forest_reg_rmse_scores

array([2.15399631, 2.49707748, 2.73533314, 2.41236576, 2.01083566,
       2.53253449, 2.75869586, 2.53454881, 4.06896255, 1.96868078])

In [21]:
forest_reg_rmse_scores.mean()

2.5673030834826447

Random Forest has turned out to be the best model yet.


#### Support Vector Machine Regressor


In [22]:
from sklearn.svm import SVR

svm_reg = SVR(kernel='linear')
svm_reg.fit(prepared_data, data_labels)
svm_cv_scores = cross_val_score(svm_reg,
                               prepared_data,
                               data_labels,
                               scoring='neg_mean_squared_error',
                               cv=10)
svm_rmse_scores = np.sqrt(-svm_cv_scores)
svm_rmse_scores.mean()

3.08659162080283

So, **Random Forest Regressor** has turned out to be the best model out of Linear Regressor, Decision Tree Regressor, SVM Regressor.

Now, we will see which parameters of Random Forest Regressor work best, meaning tuning which parameters improves performance. One way to do that is to change and check different values manually. See if we can reduce RMSE values of the model. Other way is:

#### Hyperparameter Tuning using GridSearchCV

This class takes some values that you want to try out, and it uses cross validation to evaluate the model on each of those hyperparameter value combinations you have provided. It gives you the best combination, for which Random Forest Regressor gives least RMSE value.

In [23]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
    ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, 
                          scoring='neg_mean_squared_error',
                          return_train_score=True,
                          cv=10)
grid_search.fit(prepared_data, data_labels)

In [24]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [25]:
cv_scores = grid_search.cv_results_

# printing all parameters along with their scores
for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores['params']):
    print(np.sqrt(-mean_score),params)

3.6059586880656083 {'max_features': 2, 'n_estimators': 3}
3.158095844796252 {'max_features': 2, 'n_estimators': 10}
2.911405329840498 {'max_features': 2, 'n_estimators': 30}
3.5098727403177747 {'max_features': 4, 'n_estimators': 3}
2.774839714929204 {'max_features': 4, 'n_estimators': 10}
2.7691234268431226 {'max_features': 4, 'n_estimators': 30}
3.0834467430465375 {'max_features': 6, 'n_estimators': 3}
2.9693052808369926 {'max_features': 6, 'n_estimators': 10}
2.6423773015107868 {'max_features': 6, 'n_estimators': 30}
3.124275184760063 {'max_features': 8, 'n_estimators': 3}
2.806338101210624 {'max_features': 8, 'n_estimators': 10}
2.719619076478718 {'max_features': 8, 'n_estimators': 30}
3.281936124713963 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.883543109949196 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.4043299108013994 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.923037586114607 {'bootstrap': False, 'max_features': 3, 'n_estimato

Going ahead with Random Forest Regressor with these paramters.

#### Checking Feature importance

In [26]:
# feature importances
# grid search has feature importances as well. 
# A score on how imporant a feature is.

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.1612831 , 0.22290722, 0.14439462, 0.23470535, 0.01621403,
       0.10830417, 0.03289704, 0.07087487, 0.00274887, 0.00252607,
       0.00314465])

In [27]:
# to see features and feature importances side by side
extra_attrs = ['acc_on_power','acc_on_cyl']
numerics = ['float64','int64']
num_attrs = list(data.select_dtypes(include=numerics))

attrs = num_attrs + extra_attrs
sorted(zip(attrs, feature_importances), reverse=True)

[('acc_on_power', 0.032897040314185944),
 ('acc_on_cyl', 0.07087487294571816),
 ('Weight', 0.2347053515117013),
 ('Model Year', 0.10830416507516832),
 ('Horsepower', 0.14439462011040757),
 ('Displacement', 0.2229072154910244),
 ('Cylinders', 0.16128310062762416),
 ('Acceleration', 0.01621403392106413)]

#### Evaluating the entire system on Test Data

We are going ahead with the best estimator from the GridSearchCV as our final model.

In [28]:
final_model = grid_search.best_estimator_

x_test = strat_test_set.drop('MPG', axis=1)
y_test = strat_test_set['MPG'].copy()

x_test_preprocessed = preprocess_origin_cols(x_test)
x_test_prepared = pipeline_transformer(x_test_preprocessed)

final_predictions = final_model.predict(x_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year']


In [29]:
final_rmse

2.95756579244937

We see that a lot of things are being repeated, so we create a function.


#### Creating a function to cover this entire flow

In [30]:
def predict_mpg(config, model):
    '''
    Arguments:
        config: Vehicle configuration in dictionary or dataframe
        model: The trained model used for prediction
    Returns:
        y_pred: mpg predicted values
    '''
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config
        
    preproc_df = preprocess_origin_cols(df)
    prepared_df = pipeline_transformer(preproc_df)
#     print(prepared_df)
    y_pred = model.predict(prepared_df)
    return y_pred

In [31]:
# Creating a dictionary of all features, to depict vehicle config
# thinking when I will pass data, I will pass data from JSON
## checking it on a random sample
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [15.0, 14.0, 16.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

predict_mpg(vehicle_config, final_model)

['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year']


array([30.29666667, 24.21333333, 18.        ])

#### Save the model

In [32]:
import pickle

In [33]:
# saving the model
with open('model.bin','wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [34]:
# load the model from the saved file
with open('model.bin','rb') as f_in:
    model = pickle.load(f_in)
    
predict_mpg(vehicle_config, model)

['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year']


array([30.29666667, 24.21333333, 18.        ])

Now we have saved our model file, which we can load and use to make predictions.

Next step is to deploy the model, we will use our model which is saved in the model file. And make predictions using our flask web app.

In [40]:
import requests

url  = 'http://127.0.0.1:9696/'
r = requests.post(url, json = vehicle_config)
r.text.strip()

'{\n  "mpg_predictions": [\n    30.29666666666666,\n    24.21333333333333,\n    18.0\n  ]\n}'