## final version of analysis of initial emissions test pass/fail results
notebook builds on previous work by refining what qualifies as an initial test

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('test_data_clean_uak211.csv').drop('Unnamed: 0', axis = 1)

In [3]:
# filter to vehicles of model year 1996 and newer (and removing some data entry errors)
# limit to test type OBD
df['model_year'] = df['model_year'].fillna(0.0).astype(int)
df = df.loc[(df['model_year'] >= 1996) & (df['model_year'] <= 2019) & (df['test_type'] == 'OBDII')]

In [4]:
# convert to datetime format and create test year variable
df['date'] = pd.to_datetime(df['date'])
# create test year/month variable
df['test_year'] = df['date'].apply(lambda x: x.year)
df['test_month'] = df['date'].apply(lambda x: x.month)

https://slco.org/health/air-quality/vehicle-emissions/ (testing requirements tab)
* vehicles of a model year more than six years old must test every year
* vehicles of a model year less than six years old are tested every other year

In [12]:
# isolate cars to be tested every year
# for 2017 data, these would be MY 2011 or older cars, MY 2012 or older for 2018 tests
yearly_2017 = df[(df['model_year'] <= 2011) & (df['test_year'] == 2017)]
yearly_2017.sort_values(by = 'date')
yearly_2017 = yearly_2017.groupby('vin').head(1)
yearly_2018 = df[(df['model_year'] <= 2012) & (df['test_year'] == 2018)]
yearly_2018.sort_values(by = 'date')
yearly_2018 = yearly_2018.groupby('vin').head(1)

In [13]:
# isolate newer cars (this is where model year/test year even and odd comes into play)
newer_2017 = df[(df['model_year'] >= 2011) & (df['test_year'] == 2017) & (df['model_year']%2 != 0)]
newer_2017.sort_values(by = 'date', inplace = True)
newer_2017 = newer_2017.groupby('vin').head(1)
newer_2018 = df[(df['model_year'] >= 2012) & (df['test_year'] == 2018) & (df['model_year']%2 == 0)]
newer_2018.sort_values(by = 'date', inplace = True)
newer_2018 = newer_2018.groupby('vin').head(1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
# combine, isolate first appearance for every year tests and every other year tests
dfs = [yearly_2017, yearly_2018, newer_2017, newer_2018]
initial_by_year = pd.concat(dfs)
len(initial_by_year)

1311876

In [15]:
# for MY < 6 years vehicles tested in 2018, isolate & remove those also tested in 2017. these are retests.
# isolate
retests = df[(df['test_year'] == 2018) & (df['model_year'] >= 2012) & (df['vin'].isin(newer_2017['vin']))]
retests = retests.iloc[:, 4].unique()
                                          
# remove
initial_by_year = initial_by_year[~initial_by_year['vin'].isin(retests)]
len(retests), len(initial_by_year)                                         

(14395, 1297481)

In [16]:
# pivot table to show counts 
pd.pivot_table(initial_by_year, index = 'test_month', columns = 'test_year', values = 'vin', aggfunc = 'count', margins = True)

test_year,2017,2018,All
test_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,46898,53566,100464
2,49921,50269,100190
3,61021,63566,124587
4,53281,56178,109459
5,58009,61219,119228
6,56276,59850,116126
7,56572,59822,116394
8,60418,63530,123948
9,53437,54393,107830
10,51312,56490,107802


In [19]:
# pivot table to show percentage of first-time test failures by test year
pct_func = lambda x: 100*x.count()/initial_by_year.shape[0]
pd.pivot_table(initial_by_year, index = 'model_year', columns = 'test_result', values = 'vin', aggfunc = pct_func, margins = True)

test_result,Fail,Pass,Reject,All
model_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1996,0.096032,1.015737,0.066051,1.177821
1997,0.141505,1.539984,0.100271,1.78176
1998,0.150137,1.897292,0.126399,2.173828
1999,0.169791,2.479728,0.149829,2.799347
2000,0.210947,3.06571,0.182585,3.459241
2001,0.250562,3.240433,0.335034,3.826029
2002,0.20586,3.881829,0.329793,4.417483
2003,0.237383,4.452859,0.312683,5.002925
2004,0.213259,5.227129,0.321854,5.762242
2005,0.211641,5.994539,0.333878,6.540057


In [18]:
# pivot table to show counts of first-time test failures by test year
pd.pivot_table(initial_by_year, index = 'test_result', columns = 'test_year', values = 'vin', aggfunc = 'count', margins = True)

test_year,2017,2018,All
test_result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fail,18950,21207,40157
Pass,584348,618690,1203038
Reject,26331,27955,54286
All,629629,667852,1297481
