In [1]:
import pandas as pd
import numpy as np
from sklearn import pipeline, preprocessing, metrics, model_selection, ensemble
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer

In [2]:
data = pd.read_csv('car-mpg.csv')
data.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,0,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,0,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,0,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,0,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,0,ford torino


In [3]:
data.tail()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
393,27.0,4,140.0,86,2790,15.6,82,1,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,1,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,1,ford ranger
397,31.0,4,119.0,82,2720,19.4,82,1,1,chevy s-10


In [4]:
data.isnull().sum()

mpg         0
cyl         0
disp        0
hp          0
wt          0
acc         0
yr          0
origin      0
car_type    0
car_name    0
dtype: int64

In [5]:
data = data.drop('car_type', axis = 1)

In [6]:
mapper = DataFrameMapper([
                        (['cyl','disp','wt','acc','yr'], preprocessing.StandardScaler()),
                        (['origin'], preprocessing.OneHotEncoder(handle_unknown='ignore'))
                        ])

In [7]:
pipeline_obj = pipeline.Pipeline([
    ('mapper', mapper),
    ('model', ensemble.RandomForestRegressor())
])

In [8]:
data.columns

Index(['mpg', 'cyl', 'disp', 'hp', 'wt', 'acc', 'yr', 'origin', 'car_name'], dtype='object')

In [9]:
X = ['cyl', 'disp', 'hp', 'wt', 'acc', 'yr', 'origin', 'car_name']
y = ['mpg']

In [10]:
pipeline_obj.fit(data[X], data[y])

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Pipeline(steps=[('mapper',
                 DataFrameMapper(drop_cols=[],
                                 features=[(['cyl', 'disp', 'wt', 'acc', 'yr'],
                                            StandardScaler()),
                                           (['origin'],
                                            OneHotEncoder(handle_unknown='ignore'))])),
                ('model', RandomForestRegressor())])

In [11]:
pipeline_obj.predict(data[X])

array([17.39 , 14.89 , 17.151, 16.57 , 16.86 , 14.5  , 13.83 , 14.21 ,
       13.27 , 14.91 , 14.88 , 14.36 , 15.01 , 14.723, 24.028, 21.24 ,
       19.445, 20.828, 26.99 , 26.45 , 24.545, 24.578, 25.059, 25.195,
       20.818, 10.63 , 11.11 , 11.87 , 10.176, 26.99 , 26.59 , 24.89 ,
       26.325, 19.575, 16.745, 17.491, 18.701, 18.23 , 13.895, 13.54 ,
       14.13 , 14.04 , 12.29 , 12.93 , 12.56 , 18.135, 21.721, 18.771,
       18.095, 23.85 , 26.93 , 29.595, 29.25 , 31.35 , 33.14 , 27.26 ,
       26.635, 24.38 , 25.635, 23.095, 20.59 , 22.335, 13.345, 13.55 ,
       14.54 , 14.02 , 15.98 , 11.82 , 12.72 , 12.31 , 13.18 , 20.328,
       14.915, 13.485, 13.24 , 14.   , 19.26 , 22.326, 20.638, 26.105,
       22.52 , 26.58 , 22.939, 27.05 , 26.73 , 13.41 , 14.77 , 13.42 ,
       14.135, 14.58 , 12.08 , 13.13 , 13.155, 14.07 , 12.57 , 12.03 ,
       13.6  , 18.564, 16.402, 18.39 , 17.97 , 22.002, 26.33 , 11.56 ,
       12.13 , 12.86 , 12.4  , 18.61 , 22.02 , 20.75 , 23.198, 21.545,
      

In [12]:
import joblib

In [13]:
joblib.dump(pipeline_obj, 'RF_MPG.pkl')

['RF_MPG.pkl']

In [14]:
reloadModel = joblib.load('RF_MPG.pkl')

In [15]:
reloadModel.predict(data[X])

array([17.39 , 14.89 , 17.151, 16.57 , 16.86 , 14.5  , 13.83 , 14.21 ,
       13.27 , 14.91 , 14.88 , 14.36 , 15.01 , 14.723, 24.028, 21.24 ,
       19.445, 20.828, 26.99 , 26.45 , 24.545, 24.578, 25.059, 25.195,
       20.818, 10.63 , 11.11 , 11.87 , 10.176, 26.99 , 26.59 , 24.89 ,
       26.325, 19.575, 16.745, 17.491, 18.701, 18.23 , 13.895, 13.54 ,
       14.13 , 14.04 , 12.29 , 12.93 , 12.56 , 18.135, 21.721, 18.771,
       18.095, 23.85 , 26.93 , 29.595, 29.25 , 31.35 , 33.14 , 27.26 ,
       26.635, 24.38 , 25.635, 23.095, 20.59 , 22.335, 13.345, 13.55 ,
       14.54 , 14.02 , 15.98 , 11.82 , 12.72 , 12.31 , 13.18 , 20.328,
       14.915, 13.485, 13.24 , 14.   , 19.26 , 22.326, 20.638, 26.105,
       22.52 , 26.58 , 22.939, 27.05 , 26.73 , 13.41 , 14.77 , 13.42 ,
       14.135, 14.58 , 12.08 , 13.13 , 13.155, 14.07 , 12.57 , 12.03 ,
       13.6  , 18.564, 16.402, 18.39 , 17.97 , 22.002, 26.33 , 11.56 ,
       12.13 , 12.86 , 12.4  , 18.61 , 22.02 , 20.75 , 23.198, 21.545,
      

In [16]:
temp = {}
temp['cyl'] = 1
temp['disp'] = 2
temp['hp'] = 3
temp['wt'] = 4
temp['acc'] = 5
temp['yr'] = 6
temp['origin'] = 1

In [17]:
testData = pd.DataFrame({'x':temp}).transpose()

In [18]:
testData

Unnamed: 0,acc,cyl,disp,hp,origin,wt,yr
x,5,1,2,3,1,4,6


In [19]:
reloadModel.predict(testData)[0]

29.256