# Part 1

Create a function that downloads a .zip archive and extracts the data file from it

In [2]:
import os
import zipfile
import requests

def download_data(url, name, path='data'):
    """Download a zip archive from ``url`` and extract it into ``path``.

    The response body is saved as ``path/name``; that same file is then
    opened as a zip archive and all of its members are extracted into
    ``path``.

    Parameters
    ----------
    url : str
        Address of the zip archive to download.
    name : str
        File name under which the archive is stored inside ``path``.
    path : str, optional
        Target directory; it is created when it does not exist.
        Defaults to ``'data'``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    zipfile.BadZipFile
        If the downloaded file is not a valid zip archive.
    """
    # makedirs also creates missing parent directories, unlike os.mkdir.
    if not os.path.exists(path):
        os.makedirs(path)

    archive_path = os.path.join(path, name)

    response = requests.get(url)
    # Fail loudly on an HTTP error instead of saving an error page to disk.
    response.raise_for_status()
    with open(archive_path, 'wb') as f:
        f.write(response.content)

    # Open the file that was just written (the archive name used to be
    # hard-coded here, which broke any call with a different ``name``) and
    # close the archive handle once extraction is done.
    with zipfile.ZipFile(archive_path) as z:
        z.extractall(path)

# Short link to the zipped vehicles data used in this notebook.
VEHICLES = 'http://bit.ly/ddl-cars'

# Fetch the archive and unpack it into the default 'data' directory.
download_data(url=VEHICLES, name='vehicles.zip')

In [3]:
import pandas as pd

# Directory the archive was extracted into.
path = 'data'

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning.
# NOTE(review): the stray output saved under this cell looks like the tail of
# that warning -- confirm it disappears after a re-run.
vehicles = pd.read_csv(os.path.join(path, 'vehicles.csv'), low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


Clean the data: select the columns of interest, keep model years up to and including 2016, drop duplicate rows, drop rows with missing values, and sort by make, model, and year

In [4]:
# Columns kept for the analysis; every other column of the raw file is dropped.
select_columns = [
    'make', 'model', 'year', 'displ', 'cylinders', 'trany', 'drive',
    'VClass', 'fuelType', 'barrels08', 'city08', 'highway08', 'comb08',
    'co2TailpipeGpm', 'fuelCost08',
]

# One pipeline: restrict rows to model years <= 2016 and columns to the
# selection above, remove duplicate rows, remove rows containing missing
# values, then order by make, model and year.
vehicles = (
    vehicles.loc[vehicles.year <= 2016, select_columns]
    .drop_duplicates()
    .dropna()
    .sort_values(['make', 'model', 'year'])
)

In [9]:
# Preview the first rows of the cleaned frame.
# NOTE(review): the saved output shows the renamed headers (e.g.
# 'Engine Displacement'), so it was produced after the rename cell further
# down had run (In [7] vs. In [9]). On a fresh top-to-bottom run this cell
# displays the raw column names instead.
vehicles.head()

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
19316,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
19314,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
358,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.4375,2100
369,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
25797,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.4375,2550


In [8]:
# Print the frame summary: index range, per-column non-null counts, dtypes
# and memory usage.
# NOTE(review): the saved output lists the renamed columns, i.e. this cell was
# executed after the rename cell that appears later in the notebook.
vehicles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35095 entries, 19316 to 29609
Data columns (total 15 columns):
Make                       35095 non-null object
Model                      35095 non-null object
Year                       35095 non-null int64
Engine Displacement        35095 non-null float64
Cylinders                  35095 non-null float64
Transmission               35095 non-null object
Drivetrain                 35095 non-null object
Vehicle Class              35095 non-null object
Fuel Type                  35095 non-null object
Fuel Barrels/Year          35095 non-null float64
City MPG                   35095 non-null int64
Highway MPG                35095 non-null int64
Combined MPG               35095 non-null int64
CO2 Emission Grams/Mile    35095 non-null float64
Fuel Cost/Year             35095 non-null int64
dtypes: float64(4), int64(5), object(6)
memory usage: 4.3+ MB


Rename the columns to make them more readable

In [7]:
# Map each raw column name to its readable label. Renaming by name, rather
# than assigning a positional list to ``vehicles.columns``, cannot attach a
# label to the wrong column if the column selection or its order changes,
# and re-running the cell on an already-renamed frame is a no-op.
readable_names = {
    'make': 'Make',
    'model': 'Model',
    'year': 'Year',
    'displ': 'Engine Displacement',
    'cylinders': 'Cylinders',
    'trany': 'Transmission',
    'drive': 'Drivetrain',
    'VClass': 'Vehicle Class',
    'fuelType': 'Fuel Type',
    'barrels08': 'Fuel Barrels/Year',
    'city08': 'City MPG',
    'highway08': 'Highway MPG',
    'comb08': 'Combined MPG',
    'co2TailpipeGpm': 'CO2 Emission Grams/Mile',
    'fuelCost08': 'Fuel Cost/Year',
}
vehicles = vehicles.rename(columns=readable_names)

Look for ways to simplify the data: create higher-level categorical variables or otherwise remove complexity from the data

In [10]:
def unique_col_values(df):
    """Print one ``name | number of unique values | dtype`` line per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    line_template = "{} | {} | {}"
    for label in df:
        series = df[label]
        distinct_values = series.unique()
        print(line_template.format(series.name, len(distinct_values), series.dtype))

# Summarise every column of the frame: name | unique-value count | dtype.
unique_col_values(vehicles)

Make | 126 | object
Model | 3490 | object
Year | 33 | int64
Engine Displacement | 65 | float64
Cylinders | 9 | float64
Transmission | 44 | object
Drivetrain | 7 | object
Vehicle Class | 34 | object
Fuel Type | 13 | object
Fuel Barrels/Year | 116 | float64
City MPG | 48 | int64
Highway MPG | 49 | int64
Combined MPG | 46 | int64
CO2 Emission Grams/Mile | 550 | float64
Fuel Cost/Year | 55 | int64


Example of simplifying the data - Transmission: there are 44 transmission types, which can be grouped into Automatic and Manual

In [33]:
# Number of distinct transmission labels (missing values, if any, count as one).
vehicles['Transmission'].nunique(dropna=False)

44

In [35]:
# Frequency of each transmission label, most common first.
vehicles['Transmission'].value_counts()

Automatic 4-spd                     10582
Manual 5-spd                         7764
Automatic 3-spd                      2597
Automatic (S6)                       2455
Manual 6-spd                         2336
Automatic 5-spd                      2171
Automatic 6-spd                      1351
Manual 4-spd                         1306
Automatic (S5)                        822
Automatic (S8)                        774
Automatic (variable gear ratios)      643
Automatic 7-spd                       630
Automatic (S7)                        238
Automatic (S4)                        229
Auto(AM-S7)                           211
Automatic 8-spd                       207
Auto(AV-S6)                           135
Auto(AM7)                             124
Auto(AM6)                             103
Auto(AM-S6)                            82
Manual 3-spd                           74
Manual 7-spd                           56
Automatic 9-spd                        54
Auto(AV-S7)                       