In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


In [4]:
# Download data
!wget "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"


--2021-12-21 15:12:39--  http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30286 (30K) [application/x-httpd-php]
Saving to: ‘auto-mpg.data.8’


2021-12-21 15:12:39 (103 KB/s) - ‘auto-mpg.data.8’ saved [30286/30286]



In [5]:
cols = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv('./auto-mpg.data', names=cols, na_values = "?",
                comment = '\t',
                sep= " ",
                skipinitialspace=True)

data = df.copy()

In [7]:
# Train-test split
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(data, data["Cylinders"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]
    
    
print('Train set:', strat_train_set.shape, '\n')
print('Test set:', strat_test_set.shape, '\n')

Train set: (318, 8) 

Test set: (80, 8) 



In [11]:
# Divide train set in train data and train labels
train_data = strat_train_set.drop("MPG", axis=1)
train_data_labels = strat_train_set["MPG"].copy()

print('Train set:', train_data.shape, '\n')
print('Test set:', train_data_labels.shape, '\n')
print(np.unique(train_data_labels))



Train set: (318, 7) 

Test set: (318,) 

[ 9.  10.  11.  12.  13.  14.  14.5 15.  15.5 16.  16.2 16.5 16.9 17.
 17.5 17.6 17.7 18.  18.1 18.2 18.5 18.6 19.  19.1 19.2 19.4 19.9 20.
 20.2 20.3 20.5 20.6 20.8 21.  21.1 21.5 21.6 22.  22.3 22.4 22.5 23.
 23.2 23.6 23.7 23.8 23.9 24.  24.2 24.3 24.5 25.  25.4 25.5 26.  26.4
 26.5 26.6 26.8 27.  27.2 27.4 27.5 27.9 28.  28.1 28.4 28.8 29.  29.5
 29.8 29.9 30.  30.5 30.7 30.9 31.  31.3 31.8 31.9 32.  32.1 32.2 32.3
 32.4 33.  33.5 33.7 33.8 34.  34.1 34.2 34.3 34.4 34.5 34.7 35.  35.1
 36.  36.1 36.4 37.  37.2 37.3 37.7 38.  39.1 39.4 40.8 43.1 43.4 44.3
 44.6 46.6]


In [16]:
# Preprocessing functions

# Preprocess the Origin column in data
def preprocess_origin_cols(df):
    df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})
    return df


acc_ix, hpower_ix, cyl_ix = 4,2, 0
class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power=True): # no *args or **kargs
        self.acc_on_power = acc_on_power
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]
        
        return np.c_[X, acc_on_cyl]

def num_pipeline_transformer(data):
    '''
    Function to process numerical transformations
    Argument:
        data: original dataframe 
    Returns:
        num_attrs: numerical dataframe
        num_pipeline: numerical pipeline object
        
    '''
    
    # Keep only numerical data
    numerics = ['float64', 'int64']
    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),
        ])
    return num_attrs, num_pipeline


def pipeline_transformer(data):
    '''
    Complete transformation pipeline for numerical and categorical data.    
    Argument:
        data: original dataframe 
    Returns:
        prepared_data: preprocessed data
    '''
    
    cat_attrs = ["Origin"]
    
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
        ])
    
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data


In [17]:
# Preprocess train set
preprocessed_df = preprocess_origin_cols(train_data)
pre_train_data = pipeline_transformer(preprocessed_df)
print(pre_train_data.shape)



(318, 11)


In [22]:
from sklearn.model_selection import cross_val_score

# Linear Regressor
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(pre_train_data, train_data_labels)

lin_reg_scores = cross_val_score(lin_reg, 
                                  pre_train_data,
                                  train_data_labels, 
                                  scoring="neg_mean_squared_error", 
                                  cv = 10)

lin_reg_rmse_scores = np.sqrt(-lin_reg_scores)
print('Mean of cross validation scores:', lin_reg_rmse_scores.mean())

Mean of cross validation scores: 3.0757081793709324


In [23]:
# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(pre_train_data, train_data_labels)

tree_reg_cv_scores = cross_val_score(tree_reg, 
                         pre_train_data, 
                         train_data_labels, 
                         scoring="neg_mean_squared_error", 
                         cv = 10)

tree_reg_rmse_scores = np.sqrt(-tree_reg_cv_scores)
print('Mean of cross validation scores:', tree_reg_rmse_scores.mean())


Mean of cross validation scores: 3.3171661482009314


In [24]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(pre_train_data, train_data_labels)

forest_reg_cv_scores = cross_val_score(forest_reg,
                                         pre_train_data,
                                         train_data_labels,
                                         scoring='neg_mean_squared_error',
                                         cv = 10)

forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)
print('Mean of cross validation scores:', forest_reg_rmse_scores.mean())





Mean of cross validation scores: 2.587017664726406


In [25]:
# SVM Regressor
from sklearn.svm import SVR

svm_reg = SVR(kernel='linear')
svm_reg.fit(pre_train_data, train_data_labels)

svm_cv_scores = cross_val_score(svm_reg, 
                                pre_train_data,
                                train_data_labels,
                                scoring='neg_mean_squared_error',
                                cv = 10)

svm_rmse_scores = np.sqrt(-svm_cv_scores)
print('Mean of cross validation scores:', svm_rmse_scores.mean())



Mean of cross validation scores: 3.08659162080283


In [None]:
# Random forest gives the best results (smallest error) using the cross validation method

In [28]:
# Grid search for hyperparameter tuning 

from sklearn.model_selection import GridSearchCV

# Choose the hyperparameters thath you want to tune
param_grid = [
    # explore 2 grids of parameters
    {'n_estimators': [3, 10, 30], 
     'max_features': [2, 4, 6, 8]},
    
    {'bootstrap': [False], 
     'n_estimators': [3, 10],
     'max_features': [2, 3, 4]},
  ]

# Choose model for hyperparameter tuning
estimator = RandomForestRegressor()

grid_search = GridSearchCV(estimator, param_grid,
                           scoring='neg_mean_squared_error',
                           return_train_score=True,
                           cv=10,
                          )

grid_search.fit(pre_train_data, train_data_labels)
print('Best parameters found by the Grid search method:\n', grid_search.best_params_)



Best parameters found by the Grid search method:
 {'max_features': 8, 'n_estimators': 30}




In [30]:
# Print all combinations with their scores
cv_scores = grid_search.cv_results_

for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores["params"]):
    print(params, ' --> Neg_mean_squared_error: ', np.sqrt(-mean_score) )



{'max_features': 2, 'n_estimators': 3}  --> Neg_mean_squared_error:  3.2764517802561683
{'max_features': 2, 'n_estimators': 10}  --> Neg_mean_squared_error:  3.006469962467632
{'max_features': 2, 'n_estimators': 30}  --> Neg_mean_squared_error:  2.9518091137512865
{'max_features': 4, 'n_estimators': 3}  --> Neg_mean_squared_error:  3.437932743868426
{'max_features': 4, 'n_estimators': 10}  --> Neg_mean_squared_error:  2.8758091560266643
{'max_features': 4, 'n_estimators': 30}  --> Neg_mean_squared_error:  2.711621770736692
{'max_features': 6, 'n_estimators': 3}  --> Neg_mean_squared_error:  2.9940629582397835
{'max_features': 6, 'n_estimators': 10}  --> Neg_mean_squared_error:  2.889617447640897
{'max_features': 6, 'n_estimators': 30}  --> Neg_mean_squared_error:  2.8172615823172262
{'max_features': 8, 'n_estimators': 3}  --> Neg_mean_squared_error:  3.2118877503789327
{'max_features': 8, 'n_estimators': 10}  --> Neg_mean_squared_error:  2.788361199002136
{'max_features': 8, 'n_estimat

In [32]:
# Use the best set of hyperparameters and compute the importance of the features

feature_importances = grid_search.best_estimator_.feature_importances_


extra_attrs = ["acc_on_power", "acc_on_cyl"]
numerics = ['float64', 'int64']
num_attrs = list(train_data.select_dtypes(include=numerics))

attrs = num_attrs + extra_attrs
sorted(zip(attrs, feature_importances), reverse=True)



[('acc_on_power', 0.02235251704428772),
 ('acc_on_cyl', 0.015615722344495257),
 ('Weight', 0.2010595237410475),
 ('Model Year', 0.12083560068070626),
 ('Horsepower', 0.11470534342336992),
 ('Displacement', 0.3693649248140762),
 ('Cylinders', 0.1328526478136263),
 ('Acceleration', 0.015304533342734176)]

In [35]:
from sklearn.metrics import mean_squared_error

# Use the best model with the optimal parameters
final_model = grid_search.best_estimator_

# Preprocess test data
test_data = strat_test_set.drop("MPG", axis=1)
test_data_labels = strat_test_set["MPG"].copy()

preprocessed_test_data = preprocess_origin_cols(test_data)
pre_test_data = pipeline_transformer(preprocessed_test_data)

# Make predictions on test data
predictions = final_model.predict(pre_test_data)
mse = mean_squared_error(test_data_labels, predictions)
rmse = np.sqrt(mse)

print('Prediction Results:\n MSE:', mse, '\n RMSE:', rmse)

Prediction Results:
 MSE: 8.888482361111112 
 RMSE: 2.9813557924392575


In [36]:
import pickle

# Save model
with open("final_trained_model.bin", 'wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [38]:
# Check if saved model can be imported and used to make prediction for a random example


# 3 random examples
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}


# Load saved model
with open('final_trained_model.bin', 'rb') as f_in:
    model = pickle.load(f_in)
    
    
# Make predictions   
def predict_mpg(config, model):
    
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config
    
    preproc_df = preprocess_origin_cols(df)
    prepared_df = pipeline_transformer(preproc_df)
    y_pred = model.predict(prepared_df)
    return y_pred

print('Predicted fuel consumption values: ', predict_mpg(vehicle_config, model))



    

Predicted fuel consumption values:  [33.14       18.48333333 19.26      ]
