In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the 2008 and 2018 CSV files into one DataFrame each
all_08, all_18 = [
    pd.read_csv(csv_path)
    for csv_path in (
        './DataSet/all_alpha_08_cleaned.csv',
        './DataSet/all_alpha_18_cleaned.csv',
    )
]

In [3]:
# (rows, columns) of each dataset right after loading
tuple(frame.shape for frame in (all_08, all_18))

((2404, 14), (1611, 14))

In [4]:
# Column labels of each dataset, shown side by side for comparison
tuple(frame.columns for frame in (all_08, all_18))

(Index(['model', 'displ', 'cyl', 'trans', 'drive', 'fuel', 'cert_region',
        'veh_class', 'air_pollution_score', 'city_mpg', 'hwy_mpg', 'cmb_mpg',
        'greenhouse_gas_score', 'smartway'],
       dtype='object'),
 Index(['model', 'displ', 'cyl', 'trans', 'drive', 'fuel', 'cert_region',
        'veh_class', 'air_pollution_score', 'city_mpg', 'hwy_mpg', 'cmb_mpg',
        'greenhouse_gas_score', 'smartway'],
       dtype='object'))

## Filter, Drop Nulls, and Drop Duplicates

**Filter**
   
For consistency, only compare cars certified by California standards. Filter both datasets using `query` to select only the rows where `cert_region` is `'CA'`. Then drop the `cert_region` column from both datasets, since it will no longer provide any useful information (we'll know every value is `'CA'`).

In [5]:
# Keep only the rows whose cert_region equals 'CA' in each dataset
ca_only = "cert_region == 'CA'"
all_08 = all_08.query(ca_only)
all_18 = all_18.query(ca_only)

In [6]:
# Distinct cert_region values left in each dataset after filtering
all_08['cert_region'].unique(), all_18['cert_region'].unique()

(array(['CA'], dtype=object), array(['CA'], dtype=object))

In [7]:
# Remove the cert_region column from both frames; after the CA filter it
# holds a single value and carries no information.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError raised because the column is already gone.
all_08 = all_08.drop(columns='cert_region', errors='ignore')
all_18 = all_18.drop(columns='cert_region', errors='ignore')

In [8]:
# (rows, columns) of each dataset after filtering and dropping cert_region
tuple(frame.shape for frame in (all_08, all_18))

((1084, 13), (798, 13))

**Drop Nulls**
  
Drop any rows in both datasets that contain missing values.

In [9]:
# Count the missing values per column in each dataset;
# the two reports are separated by one blank line
print(all_08.isna().sum(), all_18.isna().sum(), sep='\n\n')

model                    0
displ                    0
cyl                     75
trans                   75
drive                   37
fuel                     0
veh_class                0
air_pollution_score      0
city_mpg                75
hwy_mpg                 75
cmb_mpg                 75
greenhouse_gas_score    75
smartway                 0
dtype: int64

model                   0
displ                   1
cyl                     1
trans                   0
drive                   0
fuel                    0
veh_class               0
air_pollution_score     0
city_mpg                0
hwy_mpg                 0
cmb_mpg                 0
greenhouse_gas_score    0
smartway                0
dtype: int64


In [10]:
# Discard every row that contains at least one missing value
all_08 = all_08.dropna()
all_18 = all_18.dropna()

In [11]:
# Confirm that no column in either dataset still contains missing values;
# the two reports are separated by one blank line
print(all_08.isna().any(), all_18.isna().any(), sep='\n\n')

model                   False
displ                   False
cyl                     False
trans                   False
drive                   False
fuel                    False
veh_class               False
air_pollution_score     False
city_mpg                False
hwy_mpg                 False
cmb_mpg                 False
greenhouse_gas_score    False
smartway                False
dtype: bool

model                   False
displ                   False
cyl                     False
trans                   False
drive                   False
fuel                    False
veh_class               False
air_pollution_score     False
city_mpg                False
hwy_mpg                 False
cmb_mpg                 False
greenhouse_gas_score    False
smartway                False
dtype: bool


**Dedupe**
  
Drop any duplicate rows in both datasets.

In [12]:
# Number of fully duplicated rows in each dataset
tuple(frame.duplicated().sum() for frame in (all_08, all_18))

(23, 3)

In [13]:
# Remove repeated rows, keeping the first occurrence of each
all_08 = all_08.drop_duplicates()
all_18 = all_18.drop_duplicates()

In [14]:
# Confirm that neither dataset still contains a duplicated row
tuple(frame.duplicated().any() for frame in (all_08, all_18))

(False, False)

In [15]:
# Final (rows, columns) of each dataset after all cleaning steps
tuple(frame.shape for frame in (all_08, all_18))

((986, 13), (794, 13))

## Save to CSV

In [16]:
# Write each filtered dataset to its own CSV file, omitting the row index
for frame, out_path in (
    (all_08, './DataSet/all_alpha_08_filtered.csv'),
    (all_18, './DataSet/all_alpha_18_filtered.csv'),
):
    frame.to_csv(out_path, index=False)