In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Reload imported modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook output.
%matplotlib inline

In [4]:
from pandas.tools.plotting import scatter_matrix

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

# Class, for use in pipelines, to select certain columns from a DataFrame and convert to a numpy array
# From A. Geron: Hands-On Machine Learning with Scikit-Learn & TensorFlow, O'Reilly, 2017
# Modified by Derek Bridge to allow for casting in the same ways as pandas.DataFrame.astype
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks columns out of a DataFrame and returns them as an array.

    Parameters
    ----------
    attribute_names : column label(s) used to index the incoming DataFrame.
    dtype : optional; when truthy it is passed to ``DataFrame.astype`` before
        the values are extracted.
    """

    def __init__(self, attribute_names, dtype=None):
        self.attribute_names = attribute_names
        self.dtype = dtype

    def fit(self, X, y=None):
        # Nothing is learned from the data; selection is purely by column name.
        return self

    def transform(self, X):
        """Return ``X[attribute_names]`` as an array, cast to ``dtype`` if one was given."""
        subset = X[self.attribute_names]
        if not self.dtype:
            return subset.values
        return subset.astype(self.dtype).values

# Class, for use in pipelines, to binarize nominal-valued features (while avoiding the dummy variable trap)
# By Derek Bridge, 2017
class FeatureBinarizer(BaseEstimator, TransformerMixin):
    """One-hot encode nominal features, dropping one indicator column per feature.

    Dropping one indicator per feature avoids the dummy variable trap.

    Parameters
    ----------
    features_values : sequence of array-likes
        ``features_values[i]`` lists every value that column ``i`` of the input
        array may take. The input passed to ``fit``/``transform`` must have
        exactly ``len(features_values)`` columns.

    Notes
    -----
    The encoder is built with the ``sparse`` and ``n_values`` arguments of
    ``OneHotEncoder``, exactly as before; these belong to the older
    scikit-learn API that the rest of this notebook is written against.
    """

    def __init__(self, features_values):
        self.features_values = features_values
        self._build_encoders()

    def _build_encoders(self):
        """(Re)create every attribute that is derived from ``features_values``."""
        features_values = self.features_values
        sizes = [len(feature_values) for feature_values in features_values]
        self.num_features = len(features_values)
        self.labelencodings = [LabelEncoder().fit(feature_values) for feature_values in features_values]
        self.onehotencoder = OneHotEncoder(sparse=False, n_values=sizes)
        # Position, in the one-hot output, of the LAST indicator column of each
        # feature. Feature i occupies columns [sum(sizes[:i]), sum(sizes[:i+1])),
        # so its last column is cumsum(sizes)[i] - 1. (Summing ``size - 1`` instead
        # is only right for a single feature; with several features it can remove
        # two columns from one feature and none from another.)
        self.last_indexes = np.cumsum(sizes, dtype=int) - 1

    def _label_encode(self, X):
        """Return a new integer array holding the label code of every entry of X.

        The caller's array is left untouched, so the same raw data can be passed
        to ``fit`` and then to ``transform``.
        """
        X = np.asarray(X)
        codes = np.empty((X.shape[0], self.num_features), dtype=int)
        for i in range(self.num_features):
            codes[:, i] = self.labelencodings[i].transform(X[:, i])
        return codes

    def fit(self, X, y=None):
        """Fit the one-hot encoder on the label-encoded X and return ``self``."""
        self.onehotencoder.fit(self._label_encode(X))
        return self

    def transform(self, X, y=None):
        """Return the one-hot encoding of X without the last indicator of each feature."""
        onehotencoded = self.onehotencoder.transform(self._label_encode(X))
        return np.delete(onehotencoded, self.last_indexes, axis=1)

    def fit_transform(self, X, y=None):
        """Fit on X, then return ``transform(X)``."""
        return self.fit(X, y).transform(X)

    def get_params(self, deep=True):
        return {"features_values" : self.features_values}

    def set_params(self, **parameters):
        """Set ``features_values`` and rebuild the encoders derived from it.

        Raises
        ------
        ValueError
            If a parameter other than ``features_values`` is given.
        """
        for parameter, value in parameters.items():
            if parameter != "features_values":
                raise ValueError("Invalid parameter %s for estimator %s"
                                 % (parameter, type(self).__name__))
            setattr(self, parameter, value)
        if parameters:
            # The label encoders, one-hot encoder and dropped-column indexes all
            # depend on features_values, so they must be recreated (any previous
            # fit is discarded).
            self._build_encoders()
        return self

In [5]:
# Load the raw mpg dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('CS4618Resources/datasets/dataset_mpg.csv')

In [6]:
# (rows, columns) of the raw frame.
df.shape

(398, 8)

In [7]:
# Column labels of the raw frame; 'mpg' is later used as the regression target.
df.columns

Index(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
       'year', 'origin', 'mpg'],
      dtype='object')

The horsepower column is suspect: some of its entries are question marks rather than numbers.

In [8]:
# Per-column dtypes; a column that should be numeric but shows up as `object` needs cleaning.
df.dtypes

cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
year              int64
origin            int64
mpg             float64
dtype: object

In [9]:
# Summary statistics for every column; include='all' also reports non-numeric columns.
df.describe(include='all')

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin,mpg
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
unique,,,94.0,,,,,
top,,,150.0,,,,,
freq,,,22.0,,,,,
mean,5.454774,193.425879,,2970.424623,15.56809,76.01005,1.572864,23.514573
std,1.701004,104.269838,,846.841774,2.757689,3.697627,0.802055,7.815984
min,3.0,68.0,,1613.0,8.0,70.0,1.0,9.0
25%,4.0,104.25,,2223.75,13.825,73.0,1.0,17.5
50%,4.0,148.5,,2803.5,15.5,76.0,1.0,23.0
75%,8.0,262.0,,3608.0,17.175,79.0,2.0,29.0


There are no missing (NaN) values in the dataset: every column has a count of 398.
However, origin has been read as a numeric value when in fact it should be a nominal one.
The horsepower column contains question marks as placeholders for values that are absent from the dataset, so it was loaded as an object column.

In [10]:
# Number of rows whose horsepower entry is the '?' placeholder.
(df['horsepower'] == '?').sum()

6

In [11]:
# Keep only the rows with a real horsepower entry, as an independent copy of the
# raw frame, and renumber them from 0.
has_horsepower = df['horsepower'] != '?'
df_clean = df[has_horsepower].copy().reset_index(drop=True)

df_clean.shape

(392, 8)

In [12]:
# With the '?' rows removed, the remaining horsepower strings can be parsed as numbers.
df_clean['horsepower'] = pd.to_numeric(df_clean['horsepower'])

In [13]:
# Convert origin to a string so that it is handled as a nominal value and can be
# binarized by the nominal pipeline rather than treated as a magnitude.
df_clean['origin'] = df_clean['origin'].astype(str)

In [14]:
numeric_features = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year']
nominal_features = ['origin']

# Numeric columns are only selected; no further preprocessing is applied to them.
numeric_selector = DataFrameSelector(numeric_features)
numeric_pipeline = Pipeline([('selector', numeric_selector)])

# Nominal columns are selected and then binarized, using the values observed
# in df_clean as the set of possible values of each feature.
nominal_values = [df_clean[feature].unique() for feature in nominal_features]
nominal_pipeline = Pipeline([
    ('selector', DataFrameSelector(nominal_features)),
    ("binarizer", FeatureBinarizer(nominal_values)),
])

# Concatenate both feature groups side by side and feed them to the regression.
combined_features = FeatureUnion([
    ("numeric_pipeline", numeric_pipeline),
    ("nominal_pipeline", nominal_pipeline),
])
pipeline = Pipeline([
    ("union", combined_features),
    ('estimator', LinearRegression()),
])

In [15]:
# Baseline model: every column, origin included, is selected as-is and passed
# straight to the linear regression.
numeric_features1 = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin']
baseline_steps = [
    ('selector', DataFrameSelector(numeric_features1)),
    ('estimator', LinearRegression()),
]
numeric_pipeline1 = Pipeline(baseline_steps)

We should really shuffle the data, since the dataset was ordered by year.

In [34]:
# Shuffle the rows with a fixed seed so that a fresh Restart & Run All always
# produces the same row order, and therefore the same cross-validation folds
# and scores. The target is extracted after shuffling so it stays aligned
# with the rows.
shuffled_positions = np.random.RandomState(42).permutation(len(df_clean))
df_clean = df_clean.take(shuffled_positions)
y = df_clean['mpg'].values

In [35]:
# Baseline run: origin is turned back into an integer so the purely numeric
# pipeline can use it as an ordinary number.
df_clean['origin'] = df_clean['origin'].astype(int)
numeric_only_scores = cross_val_score(numeric_pipeline1, df_clean, y,
                                      scoring='neg_mean_absolute_error', cv=10)
print(numeric_only_scores.mean())

# Comparison run: origin becomes a string again so the full pipeline binarizes it.
df_clean['origin'] = df_clean['origin'].astype(str)
binarized_scores = cross_val_score(pipeline, df_clean, y,
                                   scoring='neg_mean_absolute_error', cv=10)
print(binarized_scores.mean())

-2.56431305808
-2.58153581453


By next week treat origin as nominal