#### **Step 6: Predicting the fuel efficiency of Vehicles**  
Creating Pipeline Functions

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [38]:
# Reading the data
cols = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin']
df = pd.read_csv('auto-mpg.data', names= cols, na_values= '?', comment='\t', sep= " ", skipinitialspace= True)
data = df.copy()

split = StratifiedShuffleSplit(n_splits= 1, test_size= 0.2, random_state= 42)
for train_index, test_index in split.split(data, data['Cylinders']):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [39]:
# Separating feature and target variable
data = strat_train_set.drop("MPG", axis=1)
data_labels = strat_train_set['MPG'].copy()

In [40]:
# preprocessing origin column
def preprocessing_org_column(df):
    df['Origin'] = df['Origin'].map({1 : 'India', 2 : 'USA', 3 : 'Germany'})
    return df

In [41]:
# Creating Custom Attribute adder
acc_ix, hpower_ix, cyl_ix = 4, 2, 0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power = True):
        self.acc_on_power = acc_on_power
    def fit(self, X, y= None):
        return self
    def transform(self, X):
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]

In [42]:
# pipelines
def num_pipeline_transformer(data):
    numerics = ['float64', 'int64']
    num_attrs = data.select_dtypes(include= numerics)
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy= 'median')),
        ('attr_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler())
    ])
    return num_attrs, num_pipeline

def pipeline_transformer(data):
    cat_attrs = ['Origin']
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    full_pipeline = ColumnTransformer([
        ('num', num_pipeline, list(num_attrs)),
        ('cat', OneHotEncoder(), cat_attrs)
    ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data


In [43]:
# From raw to processed data in 2 steps
preprocessed_df = preprocessing_org_column(data)

In [44]:
prepared_data = pipeline_transformer(preprocessed_df)

In [45]:
prepared_data[89]

array([ 0.32260746,  0.0747988 , -0.5043511 , -0.0145391 ,  0.05835354,
        0.54607826,  0.13531964, -0.4808467 ,  0.        ,  1.        ,
        0.        ])

**Selecting and Training Model**

In [46]:
# linear Regression
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)

LinearRegression()

In [47]:
# Testing the prediction
sample_data = data.iloc[:5]
sample_labels = data_labels.iloc[:5]

sample_prepared_data = pipeline_transformer(sample_data)

print("Prediction of samples: ", lin_reg.predict(sample_prepared_data))

Prediction of samples:  [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]


In [48]:
print("Actual Values of samples: ", list(sample_labels))

Actual Values of samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


In [49]:
# for evaluation of linear regression model we will use mean squared error
from sklearn.metrics import mean_squared_error

mpg_prediction = lin_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_prediction)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

2.9590402225760863

In [50]:
# Decision tree Model
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)

DecisionTreeRegressor()

In [51]:
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [52]:
# Tree model is overfitting the data because no model is perfect and it is giving a rmse of 0.0

In [53]:
# Evaluating the tree model using Cross Validation
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, prepared_data, data_labels, scoring= 'neg_mean_squared_error', cv = 10)
tree_reg_rmse_scores = np.sqrt(-scores)


In [54]:
tree_reg_rmse_scores

array([3.17440939, 3.0538398 , 2.91981378, 3.20502535, 2.39165215,
       3.12664956, 3.53747792, 5.54940875, 4.18673019, 2.54539432])

In [55]:
tree_reg_rmse_scores.mean() #average error of decision tree regressor

3.369040121177302

In [56]:
# performing cross validation for linear regression
scores = cross_val_score(lin_reg, prepared_data, data_labels, scoring= 'neg_mean_squared_error', cv = 10)
lin_reg_rmse_scores = np.sqrt(-scores)
lin_reg_rmse_scores

array([3.43254597, 3.45157629, 3.6621715 , 2.59652976, 2.48023405,
       2.74798115, 3.32524647, 2.42208917, 3.78133275, 2.8573747 ])

In [57]:
lin_reg_rmse_scores.mean()

3.0757081793709324

In [58]:
# Random Forest Model
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score(forest_reg, prepared_data, data_labels, scoring= 'neg_mean_squared_error', cv = 10)
forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)
forest_reg_rmse_scores.mean()

2.567840130160126

In [59]:
# Support vector machine Model
from sklearn.svm import SVR 

svm_reg = SVR(kernel= 'linear')
svm_reg.fit(prepared_data, data_labels)
svm_cv_scores = cross_val_score(svm_reg, prepared_data, data_labels, scoring= 'neg_mean_squared_error', cv = 10)
svm_rmse_scores = np.sqrt(-svm_cv_scores)
svm_rmse_scores.mean()

3.086591620802784

In [60]:
# The best model among the 4 turns out to be Random Forest Regressor with a error of 2.56

#### **Step 7: Hyperparameter Tuning**
Using GridSearchCV to find out which set  of parameters works best on random forest regressor.

In [61]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators' : [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap' : [False], 'n_estimators' : [3, 10], 'max_features': [2, 3, 4]},
]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, scoring= 'neg_mean_squared_error', return_train_score= True, cv = 10)

grid_search.fit(prepared_data, data_labels)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [62]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [63]:
cv_scores = grid_search.cv_results_

for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores['params']):
    print(np.sqrt(-mean_score), params)

3.481154702111907 {'max_features': 2, 'n_estimators': 3}
2.9868111079808957 {'max_features': 2, 'n_estimators': 10}
2.9442047385225623 {'max_features': 2, 'n_estimators': 30}
3.366089691589925 {'max_features': 4, 'n_estimators': 3}
2.9270443727556716 {'max_features': 4, 'n_estimators': 10}
2.691173434493788 {'max_features': 4, 'n_estimators': 30}
3.0625732198108357 {'max_features': 6, 'n_estimators': 3}
2.911565903094848 {'max_features': 6, 'n_estimators': 10}
2.7163825625324693 {'max_features': 6, 'n_estimators': 30}
2.961816649102916 {'max_features': 8, 'n_estimators': 3}
2.7906022950887928 {'max_features': 8, 'n_estimators': 10}
2.6783455114192094 {'max_features': 8, 'n_estimators': 30}
3.3258270321874335 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.955907212616251 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.201412763577308 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.8327737141047082 {'bootstrap': False, 'max_features': 3, 'n_estima

#### **Step 8: Checking Feature Importance**

In [64]:
feature_importance = grid_search.best_estimator_.feature_importances_
feature_importance #tells the scores of the features i.e. how important each feature is

array([0.15633551, 0.31903765, 0.13853288, 0.17355579, 0.01478748,
       0.12676081, 0.02894619, 0.03494111, 0.00243775, 0.00265919,
       0.00200565])

In [65]:
# To make more sense
extra_attrs = ["acc_on_power", "acc_on_cyl"]
numerics = ['float64', 'int64']
num_attrs = list(data.select_dtypes(include= numerics))
attrs = num_attrs + extra_attrs
sorted(zip(attrs, feature_importance), reverse= True)

[('acc_on_power', 0.028946187601656112),
 ('acc_on_cyl', 0.03494110623227895),
 ('Weight', 0.1735557887633897),
 ('Model Year', 0.12676081257222316),
 ('Horsepower', 0.1385328824910448),
 ('Displacement', 0.3190376477829678),
 ('Cylinders', 0.15633550661155182),
 ('Acceleration', 0.014787477312617122)]

#### **Step 9: Evaluating the entire system on test data**

In [66]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop('MPG', axis=1)
y_test =strat_test_set['MPG'].copy()

X_test_preprocessed = preprocessing_org_column(X_test)
X_test_prepared = pipeline_transformer(X_test_preprocessed)

final_prediction = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_prediction)
final_mse = np.sqrt(final_mse)

In [67]:
final_mse

3.0305646017789436

#### **Step 10: Creating a function to cover all the things**

In [68]:
def predict_mpg(config, model):

    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config

    preproc_df = preprocessing_org_column(df)
    prepared_df = pipeline_transformer(preproc_df)
    y_pred = model.predict(prepared_df)
    return y_pred

In [77]:
# checking the model on random sample

vehicle_config ={
    'Cylinders' : [4, 6, 8],
    'Displacement' : [155.0, 160.0, 165.0],
    'Horsepower' : [93.0, 130.0, 98.0],
    'Weight' : [2500.0, 3150.0, 2600.0],
    'Acceleration' : [15.0, 14.0, 16.0],
    'Model Year' : [81, 80, 78],
    'Origin' : [1, 3, 2]
}

m = predict_mpg(vehicle_config, final_model)
m[0]

32.65333333333333

#### **Step 11: Saving the model**


In [70]:
import pickle

# Saving the model
with open('model.bin', 'wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [71]:
# loading the model
with open('model.bin', 'rb') as f_in:
    model = pickle.load(f_in)

predict_mpg(vehicle_config, model)

array([32.56666667, 17.93      , 20.86333333])