# Automobile Data

### Attribute Description

1 mpg:           continuous

2 cylinders:     multi-valued discrete

3 displacement:  continuous

4 horsepower:    continuous

5 weight:        continuous

6 acceleration:  continuous

7 model year:    multi-valued discrete

8 origin:        multi-valued discrete

9 car name:      string (unique for each instance)

### Data Import

In [1]:
# importing pandas library

import pandas as pd

In [2]:
# loading a dataset into a pandas dataframe using read_csv() function
# NOTE: called with default settings, so the first record of the file is taken
# as the header row (visible in the head() output below)

raw_data = pd.read_csv('Desktop/Data/auto-mpg.csv')

In [3]:
# viewing first five rows of dataset using head() function
# (the column labels shown are values of the first record, not attribute names)

raw_data.head()

Unnamed: 0,18,8,307,130,3504,12,70,1,chevrolet chevelle malibu
0,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
1,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
2,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
3,17.0,8,302.0,140,3449,10.5,70,1,ford torino
4,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500


In [4]:
# re-importing with explicit column labels through the 'names' parameter, so the
# first record is kept as data instead of being consumed as the header

column_names = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model year', 'origin', 'car name',
]
raw_data = pd.read_csv('Desktop/Data/auto-mpg.csv', names=column_names)
raw_data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [5]:
# a quick glance at the number of rows, number of attributes and their datatypes
# (note in the output that 'horsepower' is reported as object, not numeric)

raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
mpg             398 non-null float64
cylinders       398 non-null int64
displacement    398 non-null float64
horsepower      398 non-null object
weight          398 non-null int64
acceleration    398 non-null float64
model year      398 non-null int64
origin          398 non-null int64
car name        398 non-null object
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [6]:
# a short statistical summary of the dataset
# (only numeric columns are summarised; 'horsepower' and 'car name' are absent)

raw_data.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


In [7]:
# a short statistical summary of the dataset, including categorical variables
# (object columns get count/unique/top/freq instead of mean/std/quantiles)

raw_data.describe(include='all')

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398
unique,,,,94.0,,,,,305
top,,,,150.0,,,,,ford pinto
freq,,,,22.0,,,,,6
mean,23.514573,5.454774,193.425879,,2970.424623,15.56809,76.01005,1.572864,
std,7.815984,1.701004,104.269838,,846.841774,2.757689,3.697627,0.802055,
min,9.0,3.0,68.0,,1613.0,8.0,70.0,1.0,
25%,17.5,4.0,104.25,,2223.75,13.825,73.0,1.0,
50%,23.0,4.0,148.5,,2803.5,15.5,76.0,1.0,
75%,29.0,8.0,262.0,,3608.0,17.175,79.0,2.0,


In [8]:
# listing the distinct 'horsepower' values to see why the column was read as text
# (the output contains a '?' placeholder with 6 occurrences)
raw_data.horsepower.value_counts()

150    22
90     20
88     19
110    18
100    17
75     14
95     14
105    12
70     12
67     12
65     10
97      9
85      9
140     7
80      7
145     7
72      6
?       6
78      6
84      6
92      6
68      6
86      5
71      5
60      5
130     5
180     5
115     5
175     5
170     5
       ..
135     1
94      1
103     1
102     1
132     1
142     1
167     1
220     1
61      1
107     1
158     1
116     1
149     1
89      1
133     1
82      1
113     1
108     1
91      1
193     1
77      1
200     1
208     1
137     1
230     1
152     1
49      1
93      1
64      1
210     1
Name: horsepower, Length: 94, dtype: int64

In [9]:
# final import: explicit column labels via 'names', and the '?' placeholder
# mapped to NaN via 'na_values' so that 'horsepower' is parsed as numeric

column_names = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model year', 'origin', 'car name',
]
raw_data = pd.read_csv(
    'Desktop/Data/auto-mpg.csv',
    names=column_names,
    na_values='?',
)

raw_data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [10]:
# re-checking datatypes after the re-import
# ('horsepower' is now float64 with 392 non-null values)
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
mpg             398 non-null float64
cylinders       398 non-null int64
displacement    398 non-null float64
horsepower      392 non-null float64
weight          398 non-null int64
acceleration    398 non-null float64
model year      398 non-null int64
origin          398 non-null int64
car name        398 non-null object
dtypes: float64(4), int64(4), object(1)
memory usage: 28.1+ KB


In [11]:
# statistical summary again; 'horsepower' now appears among the numeric columns
raw_data.describe(include='all')

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0,398.0,398
unique,,,,,,,,,305
top,,,,,,,,,ford pinto
freq,,,,,,,,,6
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005,1.572864,
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627,0.802055,
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0,1.0,
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0,1.0,
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0,1.0,
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0,2.0,


In [12]:
# counting missing values per column

raw_data.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [22]:
# creating a savepoint
# (an independent copy, so the cleaning steps below leave raw_data untouched)

data = raw_data.copy()

In [23]:
# finding the index labels of rows whose 'horsepower' is missing
# (isna() already yields a boolean mask, so no comparison against True is needed)

missing_val_list = data.index[data['horsepower'].isna()].tolist()
missing_val_list

[32, 126, 330, 336, 354, 374]

In [24]:
# viewing rows with missing values
# missing_val_list holds index labels, so select with label-based .loc
# (positional .iloc only coincides with the labels while no rows are removed)

data.loc[missing_val_list, :]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
32,25.0,4,98.0,,2046,19.0,71,1,ford pinto
126,21.0,6,200.0,,2875,17.0,74,1,ford maverick
330,40.9,4,85.0,,1835,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,,2905,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,,2320,15.8,81,2,renault 18i
374,23.0,4,151.0,,3035,20.5,82,1,amc concord dl


In [16]:
# dropping missing values
# NOTE(review): this rebinds `data` to the reduced frame and discards the
# savepoint; any later step that needs the NaN rows must start again from raw_data

data = data.dropna()

In [17]:
# NOTE(review): .iloc is positional. After dropna() the frame keeps its original
# labels but has fewer rows, so positions 32, 126, ... now land on other rows
# (labels 33, 128, ... in the output) -- these are NOT the rows that had NaN.
data.iloc[missing_val_list, :]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
33,19.0,6,232.0,100.0,2634,13.0,71,1,amc gremlin
128,15.0,6,250.0,100.0,3336,17.0,74,1,chevrolet nova
333,32.7,6,168.0,132.0,2910,11.4,80,3,datsun 280-zx
340,25.8,4,156.0,92.0,2620,14.4,81,1,dodge aries wagon (sw)
359,28.1,4,141.0,80.0,3230,20.4,81,2,peugeot 505s turbo diesel
380,36.0,4,120.0,88.0,2160,14.5,82,3,nissan stanza xe


In [18]:
# row count of the dataframe after dropping the incomplete records

data.shape[0]

392

In [21]:
# filling missing values with mean (demonstration only)
# A fresh copy of raw_data is used: `data` had its NaN rows removed by dropna()
# above, so filling it would change nothing. Rows are selected by label (.loc)
# because missing_val_list holds index labels.

data_mean = raw_data.copy()
data_mean['horsepower'] = data_mean['horsepower'].fillna(data_mean['horsepower'].mean())
data_mean.loc[missing_val_list, :]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
32,25.0,4,98.0,104.469388,2046,19.0,71,1,ford pinto
126,21.0,6,200.0,104.469388,2875,17.0,74,1,ford maverick
330,40.9,4,85.0,104.469388,1835,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,104.469388,2905,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,104.469388,2320,15.8,81,2,renault 18i
374,23.0,4,151.0,104.469388,3035,20.5,82,1,amc concord dl


In [25]:
# filling missing values with median (the version used for the rest of the notebook)
# `data` is rebuilt from raw_data first: the dropna() demonstration above removed
# the NaN rows, which would leave nothing to fill and only 392 of the 398 records.
# Bracket assignment is used instead of attribute assignment, and rows are
# selected by label (.loc) because missing_val_list holds index labels.

data = raw_data.copy()
data['horsepower'] = data['horsepower'].fillna(data['horsepower'].median())
data.loc[missing_val_list, :]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
32,25.0,4,98.0,93.5,2046,19.0,71,1,ford pinto
126,21.0,6,200.0,93.5,2875,17.0,74,1,ford maverick
330,40.9,4,85.0,93.5,1835,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,93.5,2905,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,93.5,2320,15.8,81,2,renault 18i
374,23.0,4,151.0,93.5,3035,20.5,82,1,amc concord dl


In [26]:
# confirming that no missing values remain in any column
data.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [27]:
# creating a savepoint
# (data_filled is an independent copy; the encoding steps below modify it, not `data`)

data_filled = data.copy()

In [28]:
# casting the 'origin' codes to strings so they are handled as categories

data_filled = data_filled.astype({'origin': str})

In [29]:
# one-hot encoding 'origin' column using get_dummies() function with 'prefix' parameter
# (preview only: the result is displayed, not stored)

pd.get_dummies(data_filled['origin'], prefix='origin')

Unnamed: 0,origin_1,origin_2,origin_3
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,1,0,0
6,1,0,0
7,1,0,0
8,1,0,0
9,1,0,0


In [30]:
# concatenating the dummy-variable columns with the original data (column-wise)

origin_dummies = pd.get_dummies(data_filled['origin'], prefix='origin')
data_filled = pd.concat([data_filled, origin_dummies], axis=1)

In [31]:
# checking that the origin_1 / origin_2 / origin_3 columns were appended
data_filled.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,origin_1,origin_2,origin_3
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,1,0,0
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,1,0,0
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,1,0,0
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,1,0,0
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,1,0,0


### Feature Scaling

In [32]:
# column overview after encoding ('origin' is still present next to its dummies)
data_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 12 columns):
mpg             398 non-null float64
cylinders       398 non-null int64
displacement    398 non-null float64
horsepower      398 non-null float64
weight          398 non-null int64
acceleration    398 non-null float64
model year      398 non-null int64
origin          398 non-null object
car name        398 non-null object
origin_1        398 non-null uint8
origin_2        398 non-null uint8
origin_3        398 non-null uint8
dtypes: float64(4), int64(3), object(2), uint8(3)
memory usage: 29.2+ KB


In [33]:
# removing the original 'origin' column now that its dummy columns exist

data_filled = data_filled.drop(columns=['origin'])

In [34]:
# confirming that 'origin' is gone and 11 columns remain
data_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 11 columns):
mpg             398 non-null float64
cylinders       398 non-null int64
displacement    398 non-null float64
horsepower      398 non-null float64
weight          398 non-null int64
acceleration    398 non-null float64
model year      398 non-null int64
car name        398 non-null object
origin_1        398 non-null uint8
origin_2        398 non-null uint8
origin_3        398 non-null uint8
dtypes: float64(4), int64(3), object(1), uint8(3)
memory usage: 26.1+ KB


In [35]:
# pairwise correlation matrix of the numerical variables

numeric_columns = [
    'mpg', 'cylinders', 'displacement', 'horsepower',
    'weight', 'acceleration', 'model year',
]
data_filled[numeric_columns].corr()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year
mpg,1.0,-0.775396,-0.804203,-0.773453,-0.831741,0.420289,0.579267
cylinders,-0.775396,1.0,0.950721,0.841284,0.896017,-0.505419,-0.348746
displacement,-0.804203,0.950721,1.0,0.895778,0.932824,-0.543684,-0.370164
horsepower,-0.773453,0.841284,0.895778,1.0,0.862442,-0.68659,-0.413733
weight,-0.831741,0.896017,0.932824,0.862442,1.0,-0.417457,-0.306564
acceleration,0.420289,-0.505419,-0.543684,-0.68659,-0.417457,1.0,0.288137
model year,0.579267,-0.348746,-0.370164,-0.413733,-0.306564,0.288137,1.0


In [36]:
# dropping 'cylinders', which is almost collinear with 'displacement' (r = 0.95)
# NOTE(review): displacement, horsepower and weight are also strongly correlated
# with each other (r > 0.86) but are all kept here

data_filled = data_filled.drop(columns=['cylinders'])

In [52]:
# creating a savepoint: the scaling steps below modify data_uncorr, not data_filled
data_uncorr = data_filled.copy()

In [53]:
# importing StandardScaler class from Scikit-Learn

from sklearn.preprocessing import StandardScaler

In [54]:
# creating an object of the StandardScaler class
# (the same instance is re-fitted by each fit_transform() call below)

std_scaler = StandardScaler()

In [55]:
# standardizing the 'mpg' column on its own
# (the values are reshaped into a single-column 2-D array before scaling)

import numpy as np
mpg_column = np.array(data_uncorr['mpg']).reshape(-1, 1)
std_scaler.fit_transform(mpg_column)

array([[-7.06438701e-01],
       [-1.09075062e+00],
       [-7.06438701e-01],
       [-9.62646649e-01],
       [-8.34542675e-01],
       [-1.09075062e+00],
       [-1.21885460e+00],
       [-1.21885460e+00],
       [-1.21885460e+00],
       [-1.09075062e+00],
       [-1.09075062e+00],
       [-1.21885460e+00],
       [-1.09075062e+00],
       [-1.21885460e+00],
       [ 6.21851453e-02],
       [-1.94022803e-01],
       [-7.06438701e-01],
       [-3.22126778e-01],
       [ 4.46497068e-01],
       [ 3.18393094e-01],
       [ 1.90289120e-01],
       [ 6.21851453e-02],
       [ 1.90289120e-01],
       [ 3.18393094e-01],
       [-3.22126778e-01],
       [-1.73127050e+00],
       [-1.73127050e+00],
       [-1.60316652e+00],
       [-1.85937447e+00],
       [ 4.46497068e-01],
       [ 5.74601043e-01],
       [ 1.90289120e-01],
       [ 1.90289120e-01],
       [-5.78334726e-01],
       [-9.62646649e-01],
       [-8.34542675e-01],
       [-5.78334726e-01],
       [-7.06438701e-01],
       [-1.2

In [56]:
# the previous cell only displayed scaled values; data_uncorr itself is unchanged
data_uncorr.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model year,car name,origin_1,origin_2,origin_3
0,18.0,307.0,130.0,3504,12.0,70,chevrolet chevelle malibu,1,0,0
1,15.0,350.0,165.0,3693,11.5,70,buick skylark 320,1,0,0
2,18.0,318.0,150.0,3436,11.0,70,plymouth satellite,1,0,0
3,16.0,304.0,150.0,3433,12.0,70,amc rebel sst,1,0,0
4,17.0,302.0,140.0,3449,10.5,70,ford torino,1,0,0


In [57]:
# standardizing all numerical columns in place (the scaler is re-fitted on them)

scaled_columns = ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year']
data_uncorr[scaled_columns] = std_scaler.fit_transform(data_uncorr[scaled_columns])

In [58]:
# the numerical columns now hold standardized values
data_uncorr.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model year,car name,origin_1,origin_2,origin_3
0,-0.706439,1.090604,0.673118,0.63087,-1.295498,-1.627426,chevrolet chevelle malibu,1,0,0
1,-1.090751,1.503514,1.589958,0.854333,-1.477038,-1.627426,buick skylark 320,1,0,0
2,-0.706439,1.196232,1.197027,0.55047,-1.658577,-1.627426,plymouth satellite,1,0,0
3,-0.962647,1.061796,1.197027,0.546923,-1.295498,-1.627426,amc rebel sst,1,0,0
4,-0.834543,1.042591,0.935072,0.565841,-1.840117,-1.627426,ford torino,1,0,0


In [44]:
# recovering the original values of the first five rows from their scaled values
# with the fitted scaler's inverse_transform()

scaled_columns = ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year']
first_five_scaled = data_uncorr[scaled_columns].iloc[:5, :]
std_scaler.inverse_transform(first_five_scaled)

array([[  18. ,  307. ,  130. , 3504. ,   12. ,   70. ],
       [  15. ,  350. ,  165. , 3693. ,   11.5,   70. ],
       [  18. ,  318. ,  150. , 3436. ,   11. ,   70. ],
       [  16. ,  304. ,  150. , 3433. ,   12. ,   70. ],
       [  17. ,  302. ,  140. , 3449. ,   10.5,   70. ]])

In [46]:
# importing Normalizer class from Scikit-Learn

from sklearn.preprocessing import Normalizer

In [47]:
# creating an object of the Normalizer class
# (it rescales each row; the check at the end shows row 0 has unit squared sum)

norm = Normalizer()

In [48]:
# viewing the unscaled values before normalizing
# data_uncorr was standardized in place above, so the unscaled values are shown
# from data_filled, which has the same columns
data_filled.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model year,car name,origin_1,origin_2,origin_3
0,18.0,307.0,130.0,3504,12.0,70,chevrolet chevelle malibu,1,0,0
1,15.0,350.0,165.0,3693,11.5,70,buick skylark 320,1,0,0
2,18.0,318.0,150.0,3436,11.0,70,plymouth satellite,1,0,0
3,16.0,304.0,150.0,3433,12.0,70,amc rebel sst,1,0,0
4,17.0,302.0,140.0,3449,10.5,70,ford torino,1,0,0


In [49]:
# normalizing all numerical columns (each row is rescaled to unit length)
# data_uncorr is reset from data_filled first: it was standardized in place above,
# and normalizing those z-scores would not reproduce the values shown below.

data_uncorr = data_filled.copy()
norm_columns = ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year']
data_uncorr[norm_columns] = norm.fit_transform(data_uncorr[norm_columns])

In [50]:
# the numerical columns now hold row-normalized values
data_uncorr.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model year,car name,origin_1,origin_2,origin_3
0,0.005113,0.087201,0.036926,0.995289,0.003409,0.019883,chevrolet chevelle malibu,1,0,0
1,0.004039,0.09424,0.044427,0.994366,0.003096,0.018848,buick skylark 320,1,0,0
2,0.00521,0.092048,0.043419,0.994582,0.003184,0.020262,plymouth satellite,1,0,0
3,0.004637,0.088104,0.043472,0.994939,0.003478,0.020287,amc rebel sst,1,0,0
4,0.004905,0.087137,0.040395,0.995155,0.00303,0.020197,ford torino,1,0,0


In [51]:
# checking the norm of the first observation (row 0):
# the squares of its six normalized values should add up to 1

norm_columns = ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year']

# select the row once instead of re-slicing the dataframe on every iteration
first_row = data_uncorr[norm_columns].iloc[0].to_numpy()

s = 0

for value in first_row:
    s += value**2

s

1.0

In [60]:
# exporting processed data as csv
# (index=False leaves the row index out of the file)

data_uncorr.to_csv('Desktop/Data/automobile_processed.csv', index=False)