In [2]:
# Read the 2008 and 2018 v1 CSV files into DataFrames.
import pandas as pd

df08, df18 = (
    pd.read_csv(csv_path)
    for csv_path in ('data_08_v1.csv', 'data_18_v1.csv')
)

In [3]:
# (rows, columns) of the 2008 dataset as loaded from data_08_v1.csv
df08.shape

(2404, 14)

In [4]:
# (rows, columns) of the 2018 dataset as loaded from data_18_v1.csv
df18.shape

(1611, 14)

## Filter by Certification Region

In [13]:
# Keep only the rows certified to California standards.
# .query() returns a selection of the original frame; the explicit .copy()
# makes each result an independent DataFrame, so the in-place clean-up
# steps that follow do not raise SettingWithCopyWarning.
df08 = df08.query('cert_region == "CA"').copy()
df18 = df18.query('cert_region == "CA"').copy()

In [14]:
# sanity check: 'CA' should be the only certification region left in 2008
df08.cert_region.unique()

array(['CA'], dtype=object)

In [15]:
# sanity check: 'CA' should be the only certification region left in 2018
df18.cert_region.unique()

array(['CA'], dtype=object)

In [17]:
# drop the certification region column from both datasets
# (it holds a single value after the California filter). Reassigning the
# result instead of using inplace=True, together with errors='ignore',
# lets this cell be re-run without raising KeyError once the column is
# already gone.
df08 = df08.drop(columns='cert_region', errors='ignore')
df18 = df18.drop(columns='cert_region', errors='ignore')

In [18]:
# (rows, columns) of the 2008 data after the California filter and column drop
df08.shape

(1084, 13)

In [19]:
# (rows, columns) of the 2018 data after the California filter and column drop
df18.shape

(798, 13)

## Drop Rows with Missing Values

In [20]:
# per-column count of missing values in the 2008 data
df08.isna().sum()

model                    0
displ                    0
cyl                     75
trans                   75
drive                   37
fuel                     0
veh_class                0
air_pollution_score      0
city_mpg                75
hwy_mpg                 75
cmb_mpg                 75
greenhouse_gas_score    75
smartway                 0
dtype: int64

In [21]:
# per-column count of missing values in the 2018 data
df18.isna().sum()

model                   0
displ                   1
cyl                     1
trans                   0
drive                   0
fuel                    0
veh_class               0
air_pollution_score     0
city_mpg                0
hwy_mpg                 0
cmb_mpg                 0
greenhouse_gas_score    0
smartway                0
dtype: int64

In [22]:
# drop rows with any null values in both datasets
# Reassign the result rather than using inplace=True: these frames come
# from an earlier row filter, and mutating such a frame in place is what
# makes pandas emit SettingWithCopyWarning.
df08 = df08.dropna()
df18 = df18.dropna()


In [24]:
# True if any 2008 column still contains a null - expected result: False
df08.isna().any().any()

False

In [25]:
# True if any 2018 column still contains a null - expected result: False
df18.isna().any().any()

False

## Dedupe Data

In [28]:
# number of fully duplicated rows: 2008 on the first line, 2018 on the second
print(df08.duplicated().sum(), df18.duplicated().sum(), sep='\n')

23
3


In [29]:
# drop duplicates in both datasets
# Reassign the result rather than using inplace=True: these frames were
# produced by earlier row selections, and mutating such a frame in place
# is what makes pandas emit SettingWithCopyWarning.
df08 = df08.drop_duplicates()
df18 = df18.drop_duplicates()


In [30]:
# re-count duplicated rows to confirm the dedupe - both lines should read 0
print(df08.duplicated().sum(), df18.duplicated().sum(), sep='\n')

0
0


In [32]:
# write the cleaned frames to the v2 CSV files used by the next section
# (index=False keeps the row index out of the files)
df08.to_csv(path_or_buf='data_08_v2.csv', index=False)
df18.to_csv(path_or_buf='data_18_v2.csv', index=False)