In [24]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.model_selection import StratifiedShuffleSplit 

import warnings 
warnings.filterwarnings('ignore')

In [25]:
# Column names for the headerless data file, in file order.
# The 'Cyclinders' spelling is kept as-is: other cells look the column up by
# this exact name.
cols = ['MPG', 'Cyclinders', 'Displacement', 'Horsepower', 'Weight',
        'Acceleration', 'Model Year', 'Origin']

# "?" marks a missing value. comment='\t' makes the parser ignore the rest of
# a line after a tab -- presumably to skip a trailing free-text field; confirm
# against the data file.
df = pd.read_csv('./auto-mpg.data', names=cols, na_values="?",
                 comment='\t',
                 sep=" ",
                 skipinitialspace=True)
data = df.copy()  # keep the raw frame untouched

# A single, seeded 80/20 split stratified on the cylinder count.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices, so rows are selected with .iloc.
# Label-based .loc only gives the same rows while the frame still has its
# default RangeIndex.
train_index, test_index = next(split.split(data, data['Cyclinders']))
strat_train_set = data.iloc[train_index]
strat_test_data = data.iloc[test_index]

In [26]:
# Separate the target (MPG) from the predictors of the training set.
data_labels = strat_train_set["MPG"].copy()
data = strat_train_set.drop(columns=["MPG"])

In [27]:
# Display the training predictors (the "MPG" column has been removed).
data

Unnamed: 0,Cyclinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,2
156,8,400.0,170.0,4668.0,11.5,75,1
395,4,135.0,84.0,2295.0,11.6,82,1
14,4,113.0,95.0,2372.0,15.0,70,3


In [28]:
def preprocess_origin_cols(df):
    """Replace the numeric codes in ``df["Origin"]`` with text labels.

    The frame is modified in place and also returned, so the caller's
    DataFrame carries the labels afterwards.

    Codes 1, 2 and 3 are translated. Any other value -- including a label
    written by an earlier call -- is left unchanged, so running this function
    (or the notebook cell that calls it) more than once is safe instead of
    turning the whole column into NaN.

    NOTE(review): the label names are kept exactly as written; confirm that
    they match the dataset's documented origin codes.
    """
    origin_names = {1: "India", 2: "USA", 3: "Germany"}
    # dict.get with the value itself as the fallback keeps unknown or
    # already-mapped entries intact.
    df["Origin"] = df["Origin"].map(lambda code: origin_names.get(code, code))
    return df
# preprocess_origin_cols assigns into the frame it receives and returns that
# same frame, so `data_tr` and `data` are the same DataFrame after this call.
data_tr = preprocess_origin_cols(data)
data_tr.head()

Unnamed: 0,Cyclinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,Germany
151,4,79.0,67.0,2000.0,16.0,74,USA
388,4,156.0,92.0,2585.0,14.5,82,India
48,6,250.0,88.0,3139.0,14.5,71,India
114,4,98.0,90.0,2265.0,15.5,73,USA


In [29]:
# Summarise dtypes and non-null counts to spot columns with missing values.
data_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 7 columns):
Cyclinders      318 non-null int64
Displacement    318 non-null float64
Horsepower      314 non-null float64
Weight          318 non-null float64
Acceleration    318 non-null float64
Model Year      318 non-null int64
Origin          318 non-null object
dtypes: float64(4), int64(2), object(1)
memory usage: 18.6+ KB


In [32]:
# Double brackets select "Origin" as a one-column DataFrame (not a Series).
data_cat=data_tr[["Origin"]]
data_cat.head()

Unnamed: 0,Origin
145,Germany
151,USA
388,India
48,India
114,USA


In [34]:
from sklearn.preprocessing import OneHotEncoder 

# One-hot encode the Origin labels; the encoder learns its categories from
# data_cat during fit_transform. The result is displayed as the cell output.
cat_encoder=OneHotEncoder()
data_cat_1hot=cat_encoder.fit_transform(data_cat)
data_cat_1hot

<318x3 sparse matrix of type '<class 'numpy.float64'>'
	with 318 stored elements in Compressed Sparse Row format>

In [35]:
# Convert to a dense array only to peek at the first five encoded rows.
data_cat_1hot.toarray()[:5]

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [36]:
# Categories learned during fit; their order is the order of the one-hot columns.
cat_encoder.categories_

[array(['Germany', 'India', 'USA'], dtype=object)]

In [37]:
# Numeric predictors only. The categorical "Origin" column is dropped by name
# rather than by slicing off "the last column", so the selection no longer
# depends on where Origin sits in the frame (and fails loudly if it is absent).
num_data = data.drop(columns=["Origin"])
num_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 6 columns):
Cyclinders      318 non-null int64
Displacement    318 non-null float64
Horsepower      314 non-null float64
Weight          318 non-null float64
Acceleration    318 non-null float64
Model Year      318 non-null int64
dtypes: float64(4), int64(2)
memory usage: 17.4 KB


In [38]:
from sklearn.impute import SimpleImputer
# Learn the median of every numeric column; those medians are what the
# imputer later uses to fill missing entries.
imputer=SimpleImputer(strategy="median")
imputer.fit(num_data)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [39]:
# Per-column fill values learned by fit (medians, one per num_data column).
imputer.statistics_

array([   4. ,  146. ,   92. , 2844. ,   15.5,   76. ])

In [40]:
# Sanity check: medians computed directly by pandas should equal
# imputer.statistics_. The numeric-only frame the imputer was fitted on is
# used here because `data` also holds the string-valued "Origin" column;
# calling median() on it relies on pandas silently skipping non-numeric
# columns, which pandas 2.0+ no longer does (it raises instead).
num_data.median().values

array([   4. ,  146. ,   92. , 2844. ,   15.5,   76. ])

In [42]:
# Fill the missing entries with the learned medians. The result is a plain
# NumPy array: column names and the row index are not carried over.
X=imputer.transform(num_data)
X

array([[   4. ,   83. ,   61. , 2003. ,   19. ,   74. ],
       [   4. ,   79. ,   67. , 2000. ,   16. ,   74. ],
       [   4. ,  156. ,   92. , 2585. ,   14.5,   82. ],
       ...,
       [   4. ,  135. ,   84. , 2295. ,   11.6,   82. ],
       [   4. ,  113. ,   95. , 2372. ,   15. ,   70. ],
       [   6. ,  146. ,  120. , 2930. ,   13.8,   81. ]])

In [45]:
# Wrap the imputed array back into a DataFrame, restoring the column names and
# row index from num_data. Note that this rebinds `data_tr`, which previously
# referred to the frame returned by preprocess_origin_cols.
data_tr=pd.DataFrame(X,columns=num_data.columns,index=num_data.index)
data_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 6 columns):
Cyclinders      318 non-null float64
Displacement    318 non-null float64
Horsepower      318 non-null float64
Weight          318 non-null float64
Acceleration    318 non-null float64
Model Year      318 non-null float64
dtypes: float64(6)
memory usage: 17.4 KB


In [47]:
# Peek at num_data, the un-imputed numeric frame; the filled values live in
# X / data_tr.
num_data.head()

Unnamed: 0,Cyclinders,Displacement,Horsepower,Weight,Acceleration,Model Year
145,4,83.0,61.0,2003.0,19.0,74
151,4,79.0,67.0,2000.0,16.0,74
388,4,156.0,92.0,2585.0,14.5,82
48,6,250.0,88.0,3139.0,14.5,71
114,4,98.0,90.0,2265.0,15.5,73


In [56]:
from sklearn.base import BaseEstimator,TransformerMixin
# Column positions read by CustomAttrAdder from the numeric feature array.
# They follow the num_data column order: 0=Cyclinders, 2=Horsepower,
# 4=Acceleration.
acc_ix,hpower_ix,cyl_ix=4,2,0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D numeric array.

    The column at ``acc_ix`` divided by the column at ``cyl_ix`` is always
    appended. When ``acc_on_power`` is true, the column at ``acc_ix`` divided
    by the column at ``hpower_ix`` is appended as well, placed before the
    other ratio. The three indices are module-level names.
    """

    def __init__(self, acc_on_power=True):
        self.acc_on_power = acc_on_power

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio column(s) added on the right."""
        acceleration = X[:, acc_ix]
        extra_columns = [acceleration / X[:, cyl_ix]]

        if self.acc_on_power:
            # The horsepower ratio is placed ahead of the cylinder ratio.
            extra_columns.insert(0, acceleration / X[:, hpower_ix])

        return np.column_stack([X] + extra_columns)
    
# Try the transformer directly on the NumPy array behind data_tr. transform()
# is called without fit(), which is fine because fit() learns nothing.
# The first output row is shown: the input columns followed by the two ratios.
attr_adder=CustomAttrAdder(acc_on_power=True)  
data_tr_extra_attrs=attr_adder.transform(data_tr.values)
data_tr_extra_attrs[0]

array([4.0000000e+00, 8.3000000e+01, 6.1000000e+01, 2.0030000e+03,
       1.9000000e+01, 7.4000000e+01, 3.1147541e-01, 4.7500000e+00])

In [58]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Dtypes treated as numeric when picking the pipeline's input columns.
numerics = ["float64", "int64"]
num_data = data_tr.select_dtypes(include=numerics)

# Numeric preprocessing chain: median imputation -> ratio features -> scaling.
# NOTE(review): assuming the cells ran in the order shown, data_tr is the
# already-imputed frame here, so the imputer step has nothing left to fill in
# this particular call.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('attrs_adder', CustomAttrAdder()),
    ('std_scaler', StandardScaler()),
])

num_data_tr = num_pipeline.fit_transform(num_data)
num_data_tr[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517])

In [60]:
from sklearn.compose import ColumnTransformer 

# Column names routed to each branch of the combined transformer.
cat_attrs = ["Origin"]
num_attrs = num_data.columns.tolist()

# Numeric columns go through num_pipeline; "Origin" is one-hot encoded.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attrs),
    ("cat", OneHotEncoder(), cat_attrs),
])

prepared_data = full_pipeline.fit_transform(data)
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])