In [1]:
import pandas as pd
import numpy as np
from sklearn import pipeline,preprocessing,metrics,model_selection,ensemble
from sklearn_pandas import DataFrameMapper
import sklearn.impute
from sklearn import impute
from sklearn.impute import SimpleImputer

In [2]:
data=pd.read_csv('../mpg_data_example.csv')

In [3]:
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [4]:
data.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [5]:
imp = SimpleImputer(missing_values=np.nan, copy=False, strategy="mean", )

In [6]:
#Imputing Missing Values using the SimpleImputer Class in sklearn (horsepower)

mapper = DataFrameMapper([
                        (['cylinders','displacement','weight','acceleration','model year'], preprocessing.StandardScaler()),
                        (['horsepower'],imp),
                        (['origin'], preprocessing.OneHotEncoder())
                        ])

In [7]:
pipeline_obj = pipeline.Pipeline([
    ('mapper',mapper),
    ("model", ensemble.RandomForestRegressor())
])

In [8]:
data.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name'],
      dtype='object')

In [9]:
X=['cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin']
Y=['mpg']

In [10]:
pipeline_obj.fit(data[X],data[Y].values.ravel())



Pipeline(steps=[('mapper',
                 DataFrameMapper(drop_cols=[],
                                 features=[(['cylinders', 'displacement',
                                             'weight', 'acceleration',
                                             'model year'],
                                            StandardScaler()),
                                           (['horsepower'],
                                            SimpleImputer(copy=False)),
                                           (['origin'], OneHotEncoder())])),
                ('model', RandomForestRegressor())])

In [11]:
pipeline_obj.predict(data[X])



array([17.491, 15.036, 17.21 , 16.26 , 16.721, 14.7  , 14.05 , 14.25 ,
       13.45 , 14.78 , 15.07 , 14.67 , 15.   , 14.54 , 23.84 , 21.71 ,
       19.335, 21.078, 27.05 , 26.631, 24.472, 24.301, 24.725, 24.636,
       20.922, 10.86 , 11.215, 11.715,  9.98 , 27.11 , 26.316, 24.966,
       26.   , 19.329, 16.906, 17.46 , 18.49 , 18.319, 13.945, 13.68 ,
       14.12 , 14.14 , 12.2  , 12.91 , 12.67 , 18.245, 22.105, 18.66 ,
       18.085, 23.85 , 27.075, 29.6  , 29.08 , 31.321, 33.24 , 27.29 ,
       27.05 , 24.371, 25.75 , 23.755, 20.385, 22.26 , 13.5  , 13.72 ,
       14.66 , 14.035, 15.87 , 11.54 , 12.8  , 12.43 , 13.025, 20.561,
       14.74 , 13.775, 13.47 , 14.04 , 19.19 , 22.625, 20.888, 26.01 ,
       22.39 , 26.45 , 22.766, 27.325, 26.855, 13.41 , 14.57 , 13.55 ,
       14.08 , 14.59 , 11.87 , 13.15 , 13.12 , 14.125, 12.42 , 12.11 ,
       13.45 , 18.596, 16.425, 18.36 , 18.009, 21.885, 26.481, 11.53 ,
       12.22 , 12.885, 12.255, 18.62 , 21.67 , 21.485, 23.049, 21.93 ,
      

In [12]:
import joblib

In [13]:
joblib.dump(pipeline_obj,'RFModelforMPG.pkl')

['RFModelforMPG.pkl']

In [14]:
modelReload=joblib.load('RFModelforMPG.pkl')

In [15]:
modelReload.predict(data[X])



array([17.491, 15.036, 17.21 , 16.26 , 16.721, 14.7  , 14.05 , 14.25 ,
       13.45 , 14.78 , 15.07 , 14.67 , 15.   , 14.54 , 23.84 , 21.71 ,
       19.335, 21.078, 27.05 , 26.631, 24.472, 24.301, 24.725, 24.636,
       20.922, 10.86 , 11.215, 11.715,  9.98 , 27.11 , 26.316, 24.966,
       26.   , 19.329, 16.906, 17.46 , 18.49 , 18.319, 13.945, 13.68 ,
       14.12 , 14.14 , 12.2  , 12.91 , 12.67 , 18.245, 22.105, 18.66 ,
       18.085, 23.85 , 27.075, 29.6  , 29.08 , 31.321, 33.24 , 27.29 ,
       27.05 , 24.371, 25.75 , 23.755, 20.385, 22.26 , 13.5  , 13.72 ,
       14.66 , 14.035, 15.87 , 11.54 , 12.8  , 12.43 , 13.025, 20.561,
       14.74 , 13.775, 13.47 , 14.04 , 19.19 , 22.625, 20.888, 26.01 ,
       22.39 , 26.45 , 22.766, 27.325, 26.855, 13.41 , 14.57 , 13.55 ,
       14.08 , 14.59 , 11.87 , 13.15 , 13.12 , 14.125, 12.42 , 12.11 ,
       13.45 , 18.596, 16.425, 18.36 , 18.009, 21.885, 26.481, 11.53 ,
       12.22 , 12.885, 12.255, 18.62 , 21.67 , 21.485, 23.049, 21.93 ,
      

In [16]:
temp={}
temp['cylinders']=1
temp['displacement']=2
temp['horsepower']=3
temp['weight']=4
temp['acceleration']=5
temp['model year']=6
temp['origin']=1

In [17]:
testData=pd.DataFrame({'x':temp}).transpose()

In [18]:
testData

Unnamed: 0,acceleration,cylinders,displacement,horsepower,model year,origin,weight
x,5,1,2,3,6,1,4


In [19]:
modelReload.predict(testData)[0]



28.511