In [1]:
import pandas as pd
import numpy as np
from sklearn import pipeline, preprocessing, metrics, model_selection, ensemble
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer

In [2]:
data = pd.read_csv('car-mpg.csv')
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,0,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,0,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,0,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,0,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,0,ford torino


In [3]:
data.tail()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
393,27.0,4,140.0,86,2790,15.6,82,1,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,1,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,1,ford ranger
397,31.0,4,119.0,82,2720,19.4,82,1,1,chevy s-10


In [4]:
data.isnull().sum()

mpg         0
cyl         0
disp        0
hp          0
wt          0
acc         0
yr          0
origin      0
car_type    0
car_name    0
dtype: int64

In [5]:
data = data.drop('car_type', axis = 1)

In [6]:
mapper = DataFrameMapper([
                        (['cyl','disp','wt','acc','yr'], preprocessing.StandardScaler()),
                        (['origin'], preprocessing.OneHotEncoder())
                        ])

In [7]:
pipeline_obj = pipeline.Pipeline([
    ('mapper', mapper),
    ('model', ensemble.RandomForestRegressor())
])

In [8]:
data.columns

Index(['mpg', 'cyl', 'disp', 'hp', 'wt', 'acc', 'yr', 'origin', 'car_name'], dtype='object')

In [9]:
X = ['cyl', 'disp', 'hp', 'wt', 'acc', 'yr', 'origin', 'car_name']
y = ['mpg']

In [10]:
pipeline_obj.fit(data[X], data[y])

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Pipeline(steps=[('mapper',
                 DataFrameMapper(drop_cols=[],
                                 features=[(['cyl', 'disp', 'wt', 'acc', 'yr'],
                                            StandardScaler()),
                                           (['origin'], OneHotEncoder())])),
                ('model', RandomForestRegressor())])

In [11]:
pipeline_obj.predict(data[X])

array([17.33 , 14.65 , 16.91 , 16.34 , 16.983, 14.46 , 13.86 , 14.19 ,
       13.3  , 14.82 , 14.84 , 14.34 , 14.99 , 14.62 , 23.89 , 21.086,
       19.193, 20.62 , 27.11 , 26.732, 24.32 , 24.429, 24.809, 25.224,
       20.573, 10.55 , 11.155, 11.73 , 10.04 , 27.13 , 26.465, 24.972,
       26.165, 19.397, 16.592, 17.32 , 18.67 , 18.26 , 13.88 , 13.38 ,
       14.16 , 14.18 , 12.11 , 12.94 , 12.56 , 18.1  , 21.78 , 18.68 ,
       18.09 , 23.764, 27.025, 29.605, 29.44 , 31.32 , 33.59 , 27.297,
       26.65 , 24.724, 25.515, 23.006, 20.58 , 22.48 , 13.36 , 13.55 ,
       14.55 , 13.995, 16.29 , 11.96 , 12.6  , 12.31 , 13.06 , 20.545,
       14.96 , 13.665, 13.24 , 14.015, 19.2  , 22.23 , 20.62 , 26.01 ,
       22.475, 26.54 , 22.605, 27.245, 26.765, 13.49 , 14.84 , 13.42 ,
       14.09 , 14.53 , 12.04 , 13.115, 13.29 , 13.985, 12.51 , 12.11 ,
       13.17 , 18.483, 16.655, 18.29 , 17.77 , 21.914, 26.36 , 11.34 ,
       12.13 , 12.79 , 12.35 , 18.37 , 21.76 , 20.82 , 22.78 , 21.921,
      

In [15]:
import joblib

In [16]:
joblib.dump(pipeline_obj, 'RF_MPG.pkl')

['RF_MPG.pkl']

In [17]:
model_reload = joblib.load('RF_MPG.pkl')

In [18]:
model_reload.predict(data[X])

array([17.33 , 14.65 , 16.91 , 16.34 , 16.983, 14.46 , 13.86 , 14.19 ,
       13.3  , 14.82 , 14.84 , 14.34 , 14.99 , 14.62 , 23.89 , 21.086,
       19.193, 20.62 , 27.11 , 26.732, 24.32 , 24.429, 24.809, 25.224,
       20.573, 10.55 , 11.155, 11.73 , 10.04 , 27.13 , 26.465, 24.972,
       26.165, 19.397, 16.592, 17.32 , 18.67 , 18.26 , 13.88 , 13.38 ,
       14.16 , 14.18 , 12.11 , 12.94 , 12.56 , 18.1  , 21.78 , 18.68 ,
       18.09 , 23.764, 27.025, 29.605, 29.44 , 31.32 , 33.59 , 27.297,
       26.65 , 24.724, 25.515, 23.006, 20.58 , 22.48 , 13.36 , 13.55 ,
       14.55 , 13.995, 16.29 , 11.96 , 12.6  , 12.31 , 13.06 , 20.545,
       14.96 , 13.665, 13.24 , 14.015, 19.2  , 22.23 , 20.62 , 26.01 ,
       22.475, 26.54 , 22.605, 27.245, 26.765, 13.49 , 14.84 , 13.42 ,
       14.09 , 14.53 , 12.04 , 13.115, 13.29 , 13.985, 12.51 , 12.11 ,
       13.17 , 18.483, 16.655, 18.29 , 17.77 , 21.914, 26.36 , 11.34 ,
       12.13 , 12.79 , 12.35 , 18.37 , 21.76 , 20.82 , 22.78 , 21.921,
      