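This notebook calculates missingness: for each year of traffic-stop data (2006–2017) it counts how many records are dropped during preprocessing because of missing or unusable values, and reports the total share of data lost.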

> *Import Libraries and Mount Drive*




In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import os
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Mount Google Drive (Colab only) so the data files under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [80]:
def size(year):
  """Return the number of rows in the traffic-stop parquet file of the given year."""
  parquet_path = '/content/drive/MyDrive/traffic_stop/year_data/traffic_' + str(year) + '.parquet'
  stops = pd.read_parquet(parquet_path)
  n_rows = len(stops.index)
  return n_rows

In [81]:
# Number of raw traffic-stop records in each yearly file, 2006-2017.
# size() returns len(...), which is always an int, so no None fallback is needed.
year_sizes = {year: int(size(year)) for year in range(2006, 2018)}

# Keep the individual yearNN names: the total_data cell sums them by name.
(year06, year07, year08, year09, year10, year11,
 year12, year13, year14, year15, year16, year17) = (year_sizes[year] for year in range(2006, 2018))

In [32]:
def single_vio_types(threshold, year_start, year_end):
  """
  Find the violation types that are frequent as a stand-alone violation in every year.

  data format: violation type in data end with ' (#)'; more than 1 violation types: 'A|B'

  threshold: a violation qualifies in a year when its # of single violation records
    in that year is strictly greater than this value
  year_start, year_end: first and last year to scan (both inclusive); one parquet file
    per year is read

  returns a list of violation types (with ' (#)' removed) with # of single violation
  records > threshold every year
  """
  years = list(range(year_start, year_end + 1))
  year_present_vios = {}

  # get number of years in which a specific violation having # of single violation records more than threshold

  for year in years:
    print('Processing: ',year)
    df = pd.read_parquet('/content/drive/MyDrive/traffic_stop/year_data/traffic_' + str(year) + '.parquet')

    # Filter the counts Series directly. Wrapping it in a DataFrame and selecting the
    # column by name breaks across pandas versions (the column is 'violation' before
    # pandas 2.0 and 'count' from 2.0 on).
    counts = df['violation'].value_counts(dropna=False)
    frequent = counts[counts > threshold]

    # dropna=False can put a missing value in the index; it is not a string, so it is
    # skipped instead of failing on the '|' membership test.
    single_vio_lst = [vio for vio in frequent.index if isinstance(vio, str) and '|' not in vio]

    for vio in single_vio_lst:
      year_present_vios[vio] = year_present_vios.get(vio, 0) + 1

  # keep the violations that passed the threshold in every scanned year
  vio_type_all = [
      vio.replace(' (#)', '').strip()
      for vio, cnt in year_present_vios.items()
      if cnt == len(years)
  ]

  return vio_type_all

In [None]:
# Violation types with more than 5,000 single-violation records in every year from 2006 to 2017.
vio_vars = single_vio_types(threshold = 5000, year_start = 2006, year_end = 2017)

In [34]:
# this function is called by preprocess(year, keep_col, rm_na_col, violation_types)
def get_vio_df(violation_types, vio_col):
  """
  Build 0/1 indicator columns for the violations of every record.

  violation_types: violation names (without the ' (#)' suffix) that get their own
    indicator column; repeated names are used once
  vio_col: pandas Series of violation strings; several violations in one record are
    separated by '|' and each may end with ' (#)'

  returns a DataFrame with one row per record and the columns
    <each violation type>: 1 if the record contains that violation, else 0
    'others': 1 if the record contains at least one violation that is not in
      violation_types, else 0
    'violation': the original values of vio_col
  """
  # De-duplicate while keeping the given order, so a repeated name cannot receive two
  # values for one record (which would leave the columns with unequal lengths).
  type_names = list(dict.fromkeys(violation_types))
  known_types = set(type_names)

  vio_df_dict = {name: [] for name in type_names}
  vio_df_dict['others'] = []

  for record in vio_col:
    # A single violation is simply the one-element case of the same rule.
    present = {part.replace(' (#)', '').strip() for part in record.split('|')}

    for name in type_names:
      vio_df_dict[name].append(1 if name in present else 0)

    # 'others' is set as soon as one violation of the record is not a known type
    vio_df_dict['others'].append(0 if present <= known_types else 1)

  vio_df = pd.DataFrame(vio_df_dict)
  vio_df['violation'] = vio_col.values

  return vio_df

In [35]:
def remove_duplicates(row):
  """
  Pick the violation text to keep for one record.

  A record labelled 'speeding-repeated_entries' is reduced to its first (stripped)
  entry of 'all_violation'; any other record keeps its 'violation' value unchanged.
  """
  is_repeated = row['speeding_only'] == 'speeding-repeated_entries'
  return row['all_violation'][0].strip() if is_repeated else row['violation']

def remove_white_spaces(vio_lst):
  """Return a new list in which every entry has its surrounding whitespace stripped."""
  stripped = []
  for entry in vio_lst:
    stripped.append(entry.strip())
  return stripped
  
def exclusive(vio_lst):
  """
  Classify the violations of one record by how many of them are speeding violations.

  vio_lst: list of violation strings; an entry counts as speeding when it contains 'speed'

  returns
    'speeding + others': at least one entry is not a speeding violation (this includes
      records without any speeding violation)
    'speeding-1': exactly one entry, and it is a speeding violation
    'speeding-repeated_entries': several entries, all the same speeding violation
    'speeding-multiple': several entries, all speeding, more than one distinct type
    'undefined case': the list is empty
  """
  # An empty list has nothing to classify; without this guard it would fall through
  # to 'speeding-multiple'.
  if len(vio_lst) == 0:
    return 'undefined case'

  # count number of violations containing 'speed'
  speeding_cnt = sum(1 for vio in vio_lst if 'speed' in vio)

  # if we have non-speeding violation
  if speeding_cnt < len(vio_lst):
    return 'speeding + others'

  # from here on every entry is a speeding violation
  if speeding_cnt == 1:
    return 'speeding-1'
  # more than one entry but only one type
  if len(set(vio_lst)) == 1:
    return 'speeding-repeated_entries'
  # more than one type
  return 'speeding-multiple'

In [36]:
def speeding_filter(df):
  """
  Keep only the records whose violations are all speeding violations.

  df: DataFrame with a string column 'violation' (several violations separated by '|')

  returns a new DataFrame with the same columns. In the kept rows 'violation' is
  lower-cased, has '(#)' removed and is stripped; a record that repeats one and the
  same speeding violation is reduced to a single entry. The input frame is not modified.
  """
  print('Before speed filtering: ', len(df))

  speeding_kinds = ('speeding-repeated_entries','speeding-1','speeding-multiple')

  # Decide row by row and collect the result, instead of writing helper columns into
  # the caller's frame or into a slice of it. This also works for a frame without rows.
  keep_mask = np.zeros(len(df), dtype = bool)
  kept_violations = []

  for pos, raw in enumerate(df['violation']):
    vio = raw.lower()
    # at least one violation is speeding-violated (could have other violations at the same time)
    if 'speed' not in vio:
      continue
    vio = vio.replace('(#)','').strip()

    # get a list of violations for the record, and apply self-defined func exclusive
    all_violation = remove_white_spaces(vio.split('|'))
    speeding_only = exclusive(all_violation)

    # filter out rows with violations other than speeding
    if speeding_only not in speeding_kinds:
      continue

    keep_mask[pos] = True
    kept_violations.append(remove_duplicates({
        'speeding_only': speeding_only,
        'all_violation': all_violation,
        'violation': vio,
    }))

  df = df.loc[keep_mask].copy()
  df['violation'] = kept_violations

  print('Speeding only violation has records: ', len(df))
  return df

In [37]:
# Columns the analysis needs; data_loss drops every other column.
keep_col = ['county_name', 'subject_race', 'subject_sex', 'violation', 'citation_issued']

# Rows with a missing (or 'unknown') value in any of these columns are deleted.
rm_na_col = ['county_name', 'violation', 'subject_race', 'subject_sex']

# this function is called by preprocess
def remove_empty_rows(df, colName):
  """Return the rows of df whose value in colName is neither 'unknown' nor missing."""
  column = df[colName]
  usable = (column != 'unknown') & column.notna()
  return df[usable]

In [41]:
### this function used in run_year_analysis()
def data_loss(year, keep_col, rm_na_col, with_vio = False, violation_types=None, speeding_only = False):
  """
  Count how many traffic stops of one year are removed by preprocessing.

  year: year of the parquet file to read (traffic_<year>.parquet)
  keep_col: columns needed in the analysis; all other columns are dropped
  rm_na_col: rows with a missing or 'unknown' value in any of these columns are removed
  with_vio, violation_types: accepted so existing calls keep working. They only
    selected how the design matrix (dummy and violation indicator columns) was built;
    that step never adds or removes rows, so it is not performed here and these
    arguments do not influence the result.
  speeding_only: if True, keep only records whose violations are all speeding violations

  Prints the number of rows before and after preprocessing and returns the
  difference (rows before - rows after).
  """
  # read dataframe
  filename = '/content/drive/MyDrive/traffic_stop/year_data/traffic_' + str(year) + '.parquet'
  df = pd.read_parquet(filename, engine = 'pyarrow')
  print('# of all traffic stops in year ', year, ': ', len(df))
  before = len(df)

  # invalid value (IF NOT TEXAS STATE DATA, DELETE THIS INVALID VALUE BLOCK)
  if year == 2013:
    df = df.drop(df.index[df['lat'] == 74.052879])

  # drop unrelated columns
  col_drop = [col for col in df.columns if col not in keep_col]
  df = df.drop(col_drop, axis = 1)

  # remove rows with missing values in rm_na_col
  for col in rm_na_col:
    df = remove_empty_rows(df, col)

  # filter rows if speeding_only
  if speeding_only:
    # filter: only speeding violation (no other violations)
    df = speeding_filter(df)

  ######### THIS WHOLE BLOCK NEEDS TO BE CHANGED, NEW METROPOLITAN FILES AND DIFFERENT DATA FORMAT #########

  # County names are converted to county type - metropolitan, micropolitan or non-core
  # For definitions, see US OMB website

  # read in county info csv
  county_df = pd.read_csv('/content/drive/MyDrive/traffic_stop/2014-2018.csv')
  county_df = county_df[county_df['State']=='Texas']
  county_df = county_df.filter(items=['Metropolitan Status', 'County Name'])

  # transform column: drop the last 7 characters of the county name
  # (presumably the ' County' suffix - TODO confirm) to match 'County Name' in the csv
  df = df.assign(county = [name[:-7] for name in df['county_name']])
  df['county'] = df['county'].replace('Dewitt','DeWitt')
  # The join is kept although county_type is not used afterwards: it is part of the
  # preprocessing whose row count is being measured.
  df = df.join(county_df.set_index('County Name'), on='county')
  df = df.drop('county', axis = 1)
  df = df.rename(columns={'Metropolitan Status':'county_type'})

  ######### Metropolitan block ends here #########

  # Convert 'citation issued' to integer (raises if the column cannot be converted)
  df = df.astype({'citation_issued': 'int64'})

  # if race is other/unknown, we delete the rows!
  df = df.loc[(df['subject_race'] != 'unknown') & (df['subject_race'] != 'other'),:]

  after = len(df)
  print('# of traffic stops after preprocessing in year ', year, ': ', after)
  print('Data that is lost in year', year, ':')
  return (before-after)
  

In [42]:
# Print the number of records lost to preprocessing for every year from 2006 through 2017.
for year in range(2006, 2018):
  data_lost = data_loss(
      year,
      keep_col,
      rm_na_col,
      with_vio = False,
      violation_types = None,
      speeding_only = False,
  )
  print(data_lost)

# of all traffic stops in year  2006 :  2693894
# of traffic stops after preprocessing in year  2006 :  2671708
Data that is lost in year 2006 :
22186
# of all traffic stops in year  2007 :  2427347
# of traffic stops after preprocessing in year  2007 :  2404322
Data that is lost in year 2007 :
23025
# of all traffic stops in year  2008 :  2526783
# of traffic stops after preprocessing in year  2008 :  2432074
Data that is lost in year 2008 :
94709
# of all traffic stops in year  2009 :  2441306
# of traffic stops after preprocessing in year  2009 :  2368223
Data that is lost in year 2009 :
73083
# of all traffic stops in year  2010 :  2525296
# of traffic stops after preprocessing in year  2010 :  2449377
Data that is lost in year 2010 :
75919
# of all traffic stops in year  2011 :  2588004
# of traffic stops after preprocessing in year  2011 :  2508218
Data that is lost in year 2011 :
79786
# of all traffic stops in year  2012 :  2435812
# of traffic stops after preprocessing in year

In [82]:
# Total number of raw records over 2006-2017: the sum of the per-year row counts.
total_data = sum([
    year06, year07, year08, year09, year10, year11,
    year12, year13, year14, year15, year16, year17,
])

In [83]:
# Per-year numbers of lost records, typed in by hand.
# NOTE(review): the first six values match the saved output of the per-year loop for
# 2006-2011; the rest presumably follow in year order up to 2017 - confirm, and
# re-enter all of them whenever the data or the preprocessing changes.
yearly_data_lost = (
    22186, 23025, 94709, 73083, 75919, 79786,
    71154, 63471, 62337, 66783, 1621, 1965,
)
total_data_lost = sum(yearly_data_lost)
print('Total Data Lost across all years: ', total_data_lost)

print('Total Data before any processing across all years: ', total_data)

# share of all raw records that preprocessing removed, in percent
percentage_lost = (total_data_lost / total_data) * 100
print('Percentage of lost data across all years is: ', percentage_lost)

Total Data Lost across all years:  636039
Total Data before any processing across all years:  27426840
Percentage of lost data across all years is:  2.319038576810161
