# NYC Ticket Writing Machine!
### DS4400 Final Project
### by Benjamin Kosiborod and Victoria Staada
&nbsp;
## Data Cleanup

In [1]:
# All project imports
import pandas as pd
import numpy as np
import math

In [9]:
# Read in the data from the NYC Open Data CSV endpoint (the URL's `$limit`
# query parameter is set to 12000000).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of types
# (chunked inference is what triggers pandas' mixed-type DtypeWarning).
df = pd.read_csv(
    'https://data.cityofnewyork.us/resource/faiq-9dfq.csv?$limit=12000000',
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
# Preview the data with summary statistics.
# The frame is left as the cell's last expression so the notebook renders it as
# a table; print() would emit a plain-text dump that wraps wide frames across
# several blocks.
df.describe()

       summons_number  violation_code  street_code1  street_code2  \
count    1.146751e+07    1.146751e+07  1.146751e+07  1.146751e+07   
mean     7.474217e+09    3.377908e+01  2.562781e+04  2.119624e+04   
std      2.268085e+09    1.985783e+01  2.246802e+04  2.193509e+04   
min      1.028884e+09    0.000000e+00  0.000000e+00  0.000000e+00   
25%      8.500714e+09    2.000000e+01  9.130000e+03  0.000000e+00   
50%      8.655880e+09    3.600000e+01  1.934000e+04  1.474000e+04   
75%      8.694870e+09    4.000000e+01  3.618000e+04  3.427000e+04   
max      8.768851e+09    9.900000e+01  9.802000e+04  9.831000e+04   

       street_code3  vehicle_expiration_date  violation_location  \
count  1.146751e+07             1.146751e+07        9.589591e+06   
mean   2.126681e+04             2.691894e+07        5.636025e+01   
std    2.198500e+04             2.736702e+07        3.963513e+01   
min    0.000000e+00             0.000000e+00        1.000000e+00   
25%    0.000000e+00             2.0181

In [11]:
# Drop some columns that are empty, or do not have useful data.
COLUMNS_TO_DROP = [
    'unregistered_vehicle',
    'violation_post_code',
    'violation_description',
    'no_standing_or_stopping',
    'hydrant_violation',
    'double_parking_violation',
]
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the columns have already been removed.
df = df.drop(columns=COLUMNS_TO_DROP, errors='ignore')

In [12]:
# Check types.
# DataFrame.info() writes its report itself and returns None, so wrapping the
# call in print() only appends a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11467506 entries, 0 to 11467505
Data columns (total 37 columns):
summons_number              int64
plate_id                    object
registration_state          object
plate_type                  object
issue_date                  object
violation_code              int64
vehicle_body_type           object
vehicle_make                object
issuing_agency              object
street_code1                int64
street_code2                int64
street_code3                int64
vehicle_expiration_date     int64
violation_location          float64
violation_precinct          int64
issuer_precinct             int64
issuer_code                 int64
issuer_command              object
issuer_squad                object
violation_time              object
time_first_observed         object
violation_county            object
violation_in_front_of_or    object
house_number                object
street_name                 object
intersecting_stree

In [13]:
# Replace the recorded meter number with a 0/1 flag: 0 when the value is
# missing or is the '-' placeholder, 1 when a meter was recorded at the time of
# the violation.
# The flag is built from whole-column comparisons rather than a per-row Python
# lambda, which gives the same result without a Python call for every row.
# NOTE: this overwrites the column in place, so the cell must run exactly once
# per data load. A second run would see only 0/1 values (neither null nor '-')
# and turn every row into 1.
meter_raw = df['meter_number']
meter_recorded = pd.notnull(meter_raw) & (meter_raw != '-')
df['meter_number'] = meter_recorded.astype('int64')

In [57]:
# Keep only the rows whose vehicle color is usable. Rows with a null color or
# one of the codes below (treated as unknown) are discarded: that is only
# ~155K of the ~11.5M rows, and the column is important enough that losing
# them is acceptable.
unusable_colors = ['UNK', 'UNKNO', 'O', 'MRPK', 'EMOOL', 'IL', 'TRF']
color_is_known = df['vehicle_color'].notna() & ~df['vehicle_color'].isin(unusable_colors)
df = df[color_is_known]

In [56]:
# Collapse the many spellings and abbreviations of each vehicle color into one
# canonical name. Matching is exact and case-sensitive; any value that is not
# listed here is left unchanged.
color_variants = {
    'White': ('WH', 'W', 'w', 'white', 'White', 'WT', 'WHI', 'WH/', 'WHITE', 'Cream', 'CREAM', 'WT.', 'WHTE', 'WH YW', 'WHITW', 'HT'),
    'Black': ('Black', 'BLK', 'BK', 'black', 'BLK.', 'BK.', 'BLACK'),
    'Gray': ('GRY', 'Gray', 'GY/', 'GRAY', 'GY', 'GREY', 'grey', 'Grey', 'DKGRY', 'M.GRE', 'GY/GL', 'GRAYF', 'CHRAY'),
    'Silver': ('Silver', 'SILVER', 'SLV', 'SV', 'SL', 'SILV', 'SIL', 'SILVR', 'SL.', 'STEEL', 'MET'),
    'Beige': ('TAN', 'Beige', 'BEIGE', 'beige', 'BLD', 'ALMON'),
    'Red': ('RED', 'red', 'RD', 'rd', 'Rd', 'Red', 'RO', 'BUGA', 'MAROO', 'MAR', 'BUNGE'),
    'Blue': ('BLUE', 'BLLU', 'QBLUE', 'BUO'),
    'Green': ('GREEN', 'GYN'),
    'Yellow': ('YELLW', 'YEL', 'YELLO', 'YW', 'GOLDE', 'ORO', 'YOL', 'YLOW', 'YL'),
    'Brown': ('BR/GY', 'BON', 'BRWMN', 'BREIR', 'BEUG', 'BRWN'),
    'Purple': ('LAVEN', 'PURPL'),
}

# Invert the groups into a flat {raw value: canonical name} lookup table.
colors = {
    variant: canonical
    for canonical, variants in color_variants.items()
    for variant in variants
}

# Series.map with a dict performs the lookup without a per-row Python lambda.
# Values that are not keys of the dict come back as NaN, so fillna puts the
# original value back for them (and nulls simply stay null).
raw_colors = df['vehicle_color']
df['vehicle_color'] = raw_colors.map(colors).fillna(raw_colors)

# Compute the distinct values once, then report how many remain and what they
# are so unmapped spellings can be spotted.
unique_colors = df['vehicle_color'].unique()
print(len(unique_colors))
print(unique_colors)

1626
['Red' 'Beige' 'Gray' ... 'YELLL' 'BUNGE' 'TRF']
