In [1]:
import os
import numpy as np
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from scipy.stats import chisquare

%matplotlib notebook

# Automating the proess of finding the filename
dataFolder = 'data'
plotsFolder = 'plots'
state = 'TX'
policeDept = 'statewide' # 'statewide' means state patrol
csvFilepath = glob.glob(os.path.join(dataFolder, state, '{}_{}*.csv'.format(state.lower(), policeDept)))[0]
csvFilepath

# Choose column names to load
colNamesList = ['raw_row_number', 
                'date', 
                'time', 
                'county_name', 
                'subject_race', 
                'violation', 
                'citation_issued',
                'warning_issued']

In [2]:
# Automating the creation of rowName:dtype dict
# This throws up a lot of stupid warnings/errors right now!
# df = pd.read_csv(csvFilepath, nrows=5, names=colNamesList)
# dtypeDict = df.dtypes.to_dict()
# dtypeDict

dtypeDict = {'raw_row_number': 'int64', 
             'date': 'object',
             'time': 'object',
             'location': 'object',
             'lat': 'float64',
             'lng': 'float64',
             'county_name': 'object',
             'district': 'object',
             'precinct': 'object',
             'region': 'float64',
             'subject_race': 'object',
             'subject_sex': 'object',
             'officer_id_hash': 'object',
             'type': 'object',
             'violation': 'object',
             'citation_issued': 'bool',
             'warning_issued': 'bool',
             'outcome': 'object',
             'contraband_found': 'object',
             'contraband_drugs': 'object',
             'contraband_weapons': 'object',
             'search_conducted': 'object',
             'search_vehicle': 'object',
             'search_basis': 'object',
             'vehicle_color': 'object',
             'vehicle_make': 'object',
             'vehicle_model': 'object',
             'vehicle_type': 'object',
             'vehicle_year': 'float64',
             'raw_HA_RACE_SEX': 'object',
             'raw_HA_SEARCH_PC_boolean': 'object',
             'raw_HA_SEARCH_CONCENT_boolean': 'object',
             'raw_HA_INCIDTO_ARREST_boolean': 'object',
             'raw_HA_VEHICLE_INVENT_boolean': 'object'}


        
# search_vehicle only has False and nan
# type is just vehicular
# violation column is of interest - reason for stop (split by |?)

In [3]:
chunk = pd.read_csv(csvFilepath,chunksize=1000000, dtype=dtypeDict, usecols=colNamesList)
df = pd.concat(chunk)

In [16]:
# Size of dataset
len(df)

27426840

In [17]:
# Delete rows with 'unknown' and 'nan' for 'subject_race'
df = df[df['subject_race'] != 'unknown']
df = df[df['subject_race'].notna()]

# Delete rows with 'nan' in violation column
df = df[df['violation'].notna()]

In [18]:
# Size of dataset after removing NA
len(df)

26826771

In [19]:
# Create a set of violations
violationSet = set()

count = 0
for individualViolation in df['violation']:
    count += 1
    
    if count % 1000000 == 0: # Print after ebery million
        print('Processing:', count)
    
    # Split violations (theyt are separated by |)
    individualViolationList = individualViolation.split('|')
    
    for i in range(len(individualViolationList)):
        # Remove '(#)' that appears often - don't know the reason for this!
        individualViolationList[i] = individualViolationList[i].replace('(#)', '')
        
        # Remove leading and/or trailing white spaces
        individualViolationList[i] = individualViolationList[i].strip()
        
        # Add violation to the set of violations
        violationSet.add(individualViolationList[i])

Processing: 1000000
Processing: 2000000
Processing: 3000000
Processing: 4000000
Processing: 5000000
Processing: 6000000
Processing: 7000000
Processing: 8000000
Processing: 9000000
Processing: 10000000
Processing: 11000000
Processing: 12000000
Processing: 13000000
Processing: 14000000
Processing: 15000000
Processing: 16000000
Processing: 17000000
Processing: 18000000
Processing: 19000000
Processing: 20000000
Processing: 21000000
Processing: 22000000
Processing: 23000000
Processing: 24000000
Processing: 25000000
Processing: 26000000


In [20]:
# Number of unique violations
len(violationSet)

1471

In [22]:
for violation in sorted(violationSet):
    print(violation)

406, 407, 412, Double Bulkhead Drain
406,407,412 Bottom Damage Protection
406,407,412 Internal Valve
406,407,412 Manhole Securement
406,407,412 Minimum Road Clearance
406,407,412 Rear End Protection
406,407,412 Ring Stiffener Drn
406,407,412 Ring Stiffener Typ
406,407,412 Single Name/specification Plate
406,407,412 Supports/anchoring
A fleet or government inspection station inspector issuing an inspection certificate to an unauthorized vehicle to include those not owned, leased, or under service contract to that entity or personal vehicles of officers and employees of the fleet o
ALL OTHER MISCELLANEOUS NON-TRAFFIC VIOLATIONS NOT 		LISTED
ALL OTHER MISCELLANEOUS TRAFFIC VIOLATIONS OTHERWISE NOT SPECIFIED IN VIOLATION CODE LISTINGS
Abuse Of Office-Misdemeanor
Acceleration Contest-racing/drag Racing
Adjustable Axle-locking Device
Affix Unauthorized Sunscreening Device To Motor Vehicle
Aggravated Assault On Law Enforc/corr Officr,probation Personnl-Felony
Aggravated Assault-Felony
Aggrava