In [1]:
import numpy as np
import pandas as pd

from bokeh.plotting import figure
from bokeh.io import show, output_notebook
from bokeh.layouts import grid, row

import pandas_bokeh

In [2]:
# Display options: print numpy values with 2 decimals and make DataFrame.plot
# use the pandas_bokeh backend.
np.set_printoptions(precision=2)
pd.set_option('plotting.backend', 'pandas_bokeh')

# Send both plain Bokeh and pandas_bokeh output to the notebook.
output_notebook()
pandas_bokeh.output_notebook()

# Explore the data

In [3]:
# Load the raw table and preview its first five rows.
# NOTE(review): the path is absolute; adjust it if the data lives elsewhere.
raw_csv_path = "/data/auto-mpg.csv"
auto = pd.read_csv(raw_csv_path)
auto.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
# Summarise column dtypes and non-null counts of the freshly loaded frame.
auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
mpg             398 non-null float64
cylinders       398 non-null int64
displacement    398 non-null float64
horsepower      398 non-null object
weight          398 non-null int64
acceleration    398 non-null float64
model year      398 non-null int64
origin          398 non-null int64
car name        398 non-null object
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


Something is wrong: `horsepower` should be of type `int`, but it is showing as type `object`. Let's find out why.

In [5]:
# Count rows per distinct horsepower value to look for non-numeric entries.
auto.groupby("horsepower").size()

horsepower
100    17
102     1
103     1
105    12
107     1
       ..
95     14
96      3
97      9
98      2
?       6
Length: 94, dtype: int64

Found it! The column has 6 *?* entries, possibly indicating missing data. Let's just get rid of these rows and then convert the column to an int.

In [6]:
# Keep only the rows whose horsepower is not the "?" placeholder.
# The explicit .copy() makes the result an independent frame, so assigning to
# its columns later does not raise pandas' SettingWithCopyWarning.
auto = auto.loc[auto["horsepower"] != "?"].copy()
# Re-count per value to confirm the "?" group is gone.
auto.groupby("horsepower").size()

horsepower
100    17
102     1
103     1
105    12
107     1
       ..
94      1
95     14
96      3
97      9
98      2
Length: 93, dtype: int64

In [7]:
# Convert horsepower from strings to integers.
# astype with a column mapping returns a new frame rather than assigning into
# `auto`, which at this point is the result of a .loc row filter; that avoids
# pandas' SettingWithCopyWarning while producing the same columns and order.
auto = auto.astype({"horsepower": int})
# Confirm the dtype change and the reduced row count.
auto.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
mpg             392 non-null float64
cylinders       392 non-null int64
displacement    392 non-null float64
horsepower      392 non-null int64
weight          392 non-null int64
acceleration    392 non-null float64
model year      392 non-null int64
origin          392 non-null int64
car name        392 non-null object
dtypes: float64(3), int64(5), object(1)
memory usage: 30.6+ KB


Everything is as expected. Let's examine each column now.

In [8]:
# Histogram of mpg using 20 bins (one-column frame selected explicitly).
auto.loc[:, ["mpg"]].plot.hist(bins=20)

In [9]:
# Number of rows per cylinder count, as a one-column "frequency" table
# indexed by cylinders.
cylinders = auto.groupby("cylinders").size().to_frame("frequency")
cylinders

Unnamed: 0_level_0,frequency
cylinders,Unnamed: 1_level_1
3,4
4,199
5,3
6,83
8,103


In [10]:
# Bar chart of the per-cylinder-count frequencies computed above.
cylinders.plot.bar()

In [11]:
# Histogram of displacement using 7 bins.
auto.loc[:, ["displacement"]].plot.hist(bins=7)

In [12]:
# Histogram of horsepower using 7 bins.
auto.loc[:, ["horsepower"]].plot.hist(bins=7)

In [13]:
# Histogram of weight using 15 bins.
auto.loc[:, ["weight"]].plot.hist(bins=15)

In [14]:
# Histogram of acceleration with the backend's default binning.
auto.loc[:, ["acceleration"]].plot.hist()

In [15]:
# Number of rows per model year, as a one-column "frequency" table, then
# drawn as a bar chart.
yrs = auto.groupby("model year").size().to_frame("frequency")
yrs.plot.bar()

In [16]:
# Number of rows per origin code, as a one-column "frequency" table, then
# drawn as a bar chart.
origs = auto.groupby("origin").size().to_frame("frequency")
origs.plot.bar()

# Correlations against target
Given that `mpg` is the target variable, let's see how it varies with each of the other columns.

In [17]:
# Scatter plot of mpg (y) against cylinders (x).
auto.plot.scatter(x="cylinders", y="mpg")

Having more cylinders is bad for mpg.

In [18]:
# Scatter plot of mpg (y) against displacement (x).
auto.plot.scatter(x="displacement", y="mpg")

The bigger the engine, the worse the mileage.

In [19]:
# Scatter plot of mpg (y) against horsepower (x).
auto.plot.scatter(x="horsepower", y="mpg")

More powerful engines are gas guzzlers.

In [20]:
# Scatter plot of mpg (y) against weight (x).
auto.plot.scatter(x="weight", y="mpg")

Heavier cars need more gas to move.

In [21]:
# Scatter plot of mpg (y) against acceleration (x).
auto.plot.scatter(x="acceleration", y="mpg")

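Not a very strong correlation here. But if I had to say, I'd say the higher the `acceleration` value, the better the mileage. (Caveat: if `acceleration` is recorded as the 0–60 mph time in seconds, as is usual for this dataset, a higher value means a *slower* car; confirm against the data dictionary.)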

In [22]:
# Scatter plot of mpg (y) against model year (x).
auto.plot.scatter(x="model year", y="mpg")

Newer cars are more fuel efficient.

In [23]:
# Scatter plot of mpg (y) against the numeric origin code (x).
auto.plot.scatter(x="origin", y="mpg")

Overall cars made in the USA (origin=1) are the least fuel efficient and cars made in Japan (origin=3) are the most. European cars (origin=2) are in between.

Moreover, it seems that weight, horsepower, and displacement should be positively correlated with one another. Let's check.

In [24]:
# Scatter plot of horsepower (y) against weight (x).
auto.plot.scatter(x="weight", y="horsepower")

In [25]:
# Scatter plot of displacement (y) against horsepower (x).
auto.plot.scatter(x="horsepower", y="displacement")

In [26]:
# Scatter plot of weight (y) against displacement (x).
auto.plot.scatter(x="displacement", y="weight")

# Conclusion
Apart from the car names, all columns seem relevant. `horsepower` had some missing data, but those rows have been removed. Let's write the cleaned data out to a new CSV file.

In [27]:
# List the current column labels before choosing which ones to keep.
auto.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name'],
      dtype='object')

In [31]:
# Columns to retain for the cleaned dataset ("car name" is not listed, so it
# is dropped by the selection below).
cols_to_keep = [
    "mpg",
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "model year",
    "origin",
]
auto = auto.loc[:, cols_to_keep]
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1


In [33]:
# Write the cleaned table to a CSV in the current working directory;
# index=False leaves the row index out of the file.
clean_csv_path = "./clean-auto-mpg.csv"
auto.to_csv(clean_csv_path, index=False)