In [136]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit

import warnings
# Silence all Python warnings from this point on.
warnings.filterwarnings('ignore')

In [137]:
# Labels applied to the columns of the data file, in file order.
columns = [
    'MPG',
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight',
    'Acceleration',
    'Model Year',
    'Origin',
]

# Space-separated file: "?" is read as a missing value, and everything after a
# tab character on a line is treated as a comment and ignored.
df = pd.read_csv(
    './auto-mpg.data',
    sep = " ",
    skipinitialspace = True,
    comment = '\t',
    na_values = "?",
    names = columns,
)

# Work on a copy so the frame that was read from disk stays untouched.
data = df.copy()

In [138]:
# Set aside test data via stratified sampling, so that the train and test sets
# contain each sub-group in the right proportion. Based on the correlation found
# in the EDA, the "Cylinders" column is used to create the strata.

split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)

# split() yields *positional* row indices, so the rows are selected with .iloc;
# .loc would only be correct while `data` still carries its default 0..n-1 index.
# With n_splits = 1 there is exactly one (train, test) pair, so take it directly.
train_index, test_index = next(split.split(data, data["Cylinders"]))
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

In [139]:
# Split the training set into the target (MPG) and the remaining feature columns.
data_labels = strat_train_set["MPG"].copy()
data = strat_train_set.drop(columns = "MPG")

In [140]:
# Training features only (the MPG column was dropped above).
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,2
156,8,400.0,170.0,4668.0,11.5,75,1
395,4,135.0,84.0,2295.0,11.6,82,1
14,4,113.0,95.0,2372.0,15.0,70,3


In [141]:
#preprocessing the origin column
def preprocess_origin_column(df):
    """Replace the numeric codes in ``df["Origin"]`` with country names.

    The column is rewritten in place and the same frame is returned, so the
    caller's DataFrame is modified as well.

    Calling the function again on an already-converted frame is a no-op:
    values that are already country names are kept instead of being turned
    into NaN (which is what a plain ``Series.map`` does to every value that is
    not a key of the mapping). Values that are neither a known code nor a
    known name still become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an "Origin" column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # NOTE(review): the labels are kept exactly as they were; the auto-mpg
    # origin codes are usually described as 1=USA, 2=Europe, 3=Japan --
    # confirm whether these names are intentional.
    origin_names = {1: "India", 2: "USA", 3: "Germany"}
    origin = df["Origin"]
    # Rows that already hold a country name (e.g. the cell was run twice).
    already_named = origin.isin(list(origin_names.values()))
    # Map the codes, but keep the existing value wherever it is already a name.
    df["Origin"] = origin.map(origin_names).where(~already_named, origin)
    return df
# The function rewrites "Origin" in place and returns the frame it was given,
# so data_tr and data refer to the same (now modified) DataFrame.
data_tr = preprocess_origin_column(data)
data_tr.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,Germany
151,4,79.0,67.0,2000.0,16.0,74,USA
388,4,156.0,92.0,2585.0,14.5,82,India
48,6,250.0,88.0,3139.0,14.5,71,India
114,4,98.0,90.0,2265.0,15.5,73,USA


In [142]:
#Inspecting the preprocessed data (column dtypes and non-null counts) before one hot encoding the origin column
data_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 7 columns):
Cylinders       318 non-null int64
Displacement    318 non-null float64
Horsepower      314 non-null float64
Weight          318 non-null float64
Acceleration    318 non-null float64
Model Year      318 non-null int64
Origin          318 non-null object
dtypes: float64(4), int64(2), object(1)
memory usage: 19.9+ KB


In [143]:
#isolating the origin column: its dtype is "object", i.e. it is a categorical column that needs to be dealt with separately
data_category = data_tr.loc[:, ["Origin"]]
data_category.head()

Unnamed: 0,Origin
145,Germany
151,USA
388,India
48,India
114,USA


In [144]:
#One hot encoding the categorical values
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the single "Origin" column and encode it in one step.
category_encoder = OneHotEncoder()
data_category_1hot = category_encoder.fit_transform(data_category)
# The result is displayed as a sparse matrix (see the repr in the cell output).
data_category_1hot

<318x3 sparse matrix of type '<class 'numpy.float64'>'
	with 318 stored elements in Compressed Sparse Row format>

In [145]:
#converting the sparse matrix into a dense numpy array; only the first five rows are shown here
data_category_1hot.toarray()[:5]

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [146]:
# Categories learned during fit; their order is the column order of the one-hot output.
category_encoder.categories_

[array(['Germany', 'India', 'USA'], dtype=object)]

In [147]:
#Handling missing values using SimpleImputer
#Segregating the numerical columns: the categorical "Origin" column is dropped
#by name rather than by position, so the selection does not depend on column order
numerical_data = data.drop("Origin", axis = 1)
numerical_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 6 columns):
Cylinders       318 non-null int64
Displacement    318 non-null float64
Horsepower      314 non-null float64
Weight          318 non-null float64
Acceleration    318 non-null float64
Model Year      318 non-null int64
dtypes: float64(4), int64(2)
memory usage: 17.4 KB


In [148]:
#handling missing values
from sklearn.impute import SimpleImputer

# Median imputation: fit() only learns each column's median, nothing is filled in yet.
imputer = SimpleImputer(strategy = "median")
imputer.fit(numerical_data)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [149]:
#per-column medians learned by the imputer, in the column order of numerical_data
imputer.statistics_

array([   4. ,  146. ,   92. , 2844. ,   15.5,   76. ])

In [150]:
#the values above are the medians of the six numerical columns

In [151]:
#cross-checking the medians with pandas; numeric_only=True skips the string-valued
#"Origin" column, which newer pandas versions refuse to aggregate instead of silently dropping it
data.median(numeric_only = True).values

array([   4. ,  146. ,   92. , 2844. ,   15.5,   76. ])

In [152]:
#impute the missing values by transforming the dataframe 
# transform() returns a plain NumPy array, so the column names and the index are not carried over.
X = imputer.transform(numerical_data)
X 

array([[   4. ,   83. ,   61. , 2003. ,   19. ,   74. ],
       [   4. ,   79. ,   67. , 2000. ,   16. ,   74. ],
       [   4. ,  156. ,   92. , 2585. ,   14.5,   82. ],
       ...,
       [   4. ,  135. ,   84. , 2295. ,   11.6,   82. ],
       [   4. ,  113. ,   95. , 2372. ,   15. ,   70. ],
       [   6. ,  146. ,  120. , 2930. ,   13.8,   81. ]])

In [153]:
#rebuild a dataframe from the imputed 2D array, reusing the labels of numerical_data
data_tr = pd.DataFrame(
    data = X,
    index = numerical_data.index,
    columns = numerical_data.columns,
)
data_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 6 columns):
Cylinders       318 non-null float64
Displacement    318 non-null float64
Horsepower      318 non-null float64
Weight          318 non-null float64
Acceleration    318 non-null float64
Model Year      318 non-null float64
dtypes: float64(6)
memory usage: 17.4 KB


In [154]:
#Horsepower no longer has any missing values

In [155]:
#Adding the attributes acceleration_on_power and acceleration_on_cylinder using a custom transformer built on BaseEstimator and TransformerMixin

In [156]:
# Reference for the column positions used below (Cylinders is column 0, Horsepower 2, Acceleration 4).
numerical_data.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year
145,4,83.0,61.0,2003.0,19.0,74
151,4,79.0,67.0,2000.0,16.0,74
388,4,156.0,92.0,2585.0,14.5,82
48,6,250.0,88.0,3139.0,14.5,71
114,4,98.0,90.0,2265.0,15.5,73


In [157]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positions of the Acceleration, Horsepower and Cylinders columns in the numerical
# feature array; read by CustomAttributeAdder.transform below.
acc_index, hpower_index, cyl_index = 4, 2, 0 #these are column indexes to be used later

class CustomAttributeAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2D numerical feature matrix.

    ``acceleration / cylinders`` is always appended as the last column. When
    ``acc_on_power`` is true, ``acceleration / horsepower`` is appended as
    well, placed just before it.

    The source columns are located through the module-level ``acc_index``,
    ``hpower_index`` and ``cyl_index`` positions, which are looked up each
    time ``transform`` is called.

    Parameters
    ----------
    acc_on_power : bool, default True
        Whether to add the acceleration-per-horsepower column.
    """

    def __init__(self, acc_on_power=True):
        self.acc_on_power = acc_on_power

    def fit(self, X, y = None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio column(s) appended on the right.

        ``X`` may be a NumPy array or anything ``np.asarray`` can turn into a
        2D array (for example a DataFrame or a list of rows).
        """
        # Positional slicing such as X[:, i] only works on arrays, so coerce
        # first; an ndarray passes through unchanged.
        X = np.asarray(X)
        acc_on_cylinder = X[:, acc_index] / X[:, cyl_index]
        if self.acc_on_power:
            acc_on_power = X[:, acc_index] / X[:, hpower_index]
            return np.c_[X, acc_on_power, acc_on_cylinder]
        return np.c_[X, acc_on_cylinder]

# Quick check outside any pipeline, on the underlying NumPy array of the imputed frame.
# NOTE(review): `attritbute_adder` is misspelled; the name is left as is in case other cells use it.
attritbute_adder = CustomAttributeAdder(acc_on_power=True)
data_tr_extra_attributes = attritbute_adder.transform(data_tr.values)
# First row: the six input columns followed by acc_on_power and acc_on_cylinder.
data_tr_extra_attributes[0] 


array([4.0000000e+00, 8.3000000e+01, 6.1000000e+01, 2.0030000e+03,
       1.9000000e+01, 7.4000000e+01, 3.1147541e-01, 4.7500000e+00])

In [158]:
#Creating a pipeline of tasks to scale all the attributes
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [159]:
#transforming numerical attributes

In [160]:
# dtypes that count as numerical columns
numerics = ['float64', 'int64']
num_data = data_tr.select_dtypes(include = numerics)

# steps run in this order: median imputation -> extra ratio attributes -> standard scaling
num_pipeline = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = "median")),
    ('attrs_adder', CustomAttributeAdder()),
    ('std_scaler', StandardScaler()),
])

num_data_transform = num_pipeline.fit_transform(num_data)

# first transformed row
num_data_transform[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517])

In [161]:
#transforming categorical attributes
from sklearn.compose import ColumnTransformer

In [163]:
# column names routed to each branch of the combined transformer
numerical_attributes = num_data.columns.tolist()
categorical_attributes = ["Origin"]

# numerical columns go through num_pipeline, "Origin" is one hot encoded
full_pipeline = ColumnTransformer(transformers = [
    ("num", num_pipeline, numerical_attributes),
    ("cat", OneHotEncoder(), categorical_attributes),
])

prepared_data = full_pipeline.fit_transform(data)
prepared_data

array([[-0.85657842, -1.07804475, -1.15192977, ...,  1.        ,
         0.        ,  0.        ],
       [-0.85657842, -1.1174582 , -0.9900351 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.85657842, -0.3587492 , -0.31547399, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.85657842, -0.56566984, -0.53133355, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.78244384, -0.23452666, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.32260746, -0.45728283,  0.44003446, ...,  1.        ,
         0.        ,  0.        ]])