# NYC Ticket Writing Machine!
### DS4400 Final Project
### by Benjamin Kosiborod and Victoria Staada
&nbsp;
## Data Cleanup

In [4]:
# All project imports
import pandas as pd
import numpy as np
import math

In [5]:
# Read in the data: CSV export served by data.cityofnewyork.us.
# NOTE(review): the `$limit` query parameter presumably caps the number of
# rows returned (12,000,000 here) -- confirm against the API documentation.
DATA_URL = 'https://data.cityofnewyork.us/resource/faiq-9dfq.csv?$limit=12000000'
df = pd.read_csv(DATA_URL)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [6]:
# Preview the data: print the summary statistics produced by describe()
summary_stats = df.describe()
print(summary_stats)

       summons_number  violation_code  street_code1  street_code2  \
count    1.146751e+07    1.146751e+07  1.146751e+07  1.146751e+07   
mean     7.474217e+09    3.377908e+01  2.562781e+04  2.119624e+04   
std      2.268085e+09    1.985783e+01  2.246802e+04  2.193509e+04   
min      1.028884e+09    0.000000e+00  0.000000e+00  0.000000e+00   
25%      8.500714e+09    2.000000e+01  9.130000e+03  0.000000e+00   
50%      8.655880e+09    3.600000e+01  1.934000e+04  1.474000e+04   
75%      8.694870e+09    4.000000e+01  3.618000e+04  3.427000e+04   
max      8.768851e+09    9.900000e+01  9.802000e+04  9.831000e+04   

       street_code3  vehicle_expiration_date  violation_location  \
count  1.146751e+07             1.146751e+07        9.589591e+06   
mean   2.126681e+04             2.691894e+07        5.636025e+01   
std    2.198500e+04             2.736702e+07        3.963513e+01   
min    0.000000e+00             0.000000e+00        1.000000e+00   
25%    0.000000e+00             2.0181

In [7]:
# Columns that are empty, or do not have useful data, are removed up front
unused_columns = [
    'unregistered_vehicle',
    'violation_post_code',
    'violation_description',
    'no_standing_or_stopping',
    'hydrant_violation',
    'double_parking_violation',
]
df = df.drop(columns=unused_columns)

In [8]:
# Check types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11467506 entries, 0 to 11467505
Data columns (total 37 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   summons_number            int64  
 1   plate_id                  object 
 2   registration_state        object 
 3   plate_type                object 
 4   issue_date                object 
 5   violation_code            int64  
 6   vehicle_body_type         object 
 7   vehicle_make              object 
 8   issuing_agency            object 
 9   street_code1              int64  
 10  street_code2              int64  
 11  street_code3              int64  
 12  vehicle_expiration_date   int64  
 13  violation_location        float64
 14  violation_precinct        int64  
 15  issuer_precinct           int64  
 16  issuer_code               int64  
 17  issuer_command            object 
 18  issuer_squad              object 
 19  violation_time            object 
 20  time_first_observed   

In [9]:
# Instead of having meter numbers, change this column into a 0/1 flag:
#   1 -> a meter number was recorded at the time of the violation
#   0 -> the value is missing or is the '-' placeholder
# The comparison is vectorised (no per-row Python lambda), which gives the
# same result as the row-wise version but scales to the full dataset.
# NOTE: the raw column is overwritten, so this cell is not idempotent --
# re-running it on the already-converted column would yield all 1s.
meter_raw = df['meter_number']
df['meter_number'] = (meter_raw.notna() & meter_raw.ne('-')).astype('int64')

In [10]:
# Drop rows where vehicle color is null or unknown, as there are only
# ~155K such rows, which should be fine to drop as this is an
# important column and we have ~11.5M rows of data
unknown_color_codes = ['UNK', 'UNKNO', 'EMOOL', 'IL', 'TRF', 'NO', 'XXX']
has_color = df['vehicle_color'].notna()
is_unknown_code = df['vehicle_color'].isin(unknown_color_codes)
df = df[has_color & ~is_unknown_code]

In [11]:
# Normalise the free-text vehicle colour codes: every known spelling or
# abbreviation is mapped onto one canonical colour name; values that are not
# listed are left untouched.
#
# Each variant must belong to exactly one canonical colour. 'BLRD' used to be
# listed under both Black and Blue, with the later (Blue) assignment silently
# winning; it is now listed once under Blue so the effective mapping is
# unchanged. Repeated entries inside a single group were also removed.
COLOR_VARIANTS = {
    'White': (
        'WH', 'W', 'w', 'white', 'White', 'WT', 'WHI', 'WH/', 'WHITE',
        'Cream', 'CREAM', 'WT.', 'WHTE', 'WH YW', 'WHITW', 'HT', 'WHT',
        'WHBL', 'WHB', 'WHO', 'WHGY', 'WHIT', 'WHG', 'WHRD', 'WHGR', 'WTE',
        'WH.', 'WHBK', 'WHTN', 'WHT.', 'WHTIE', 'WHE', 'WHGL', 'W/B', 'CRM',
        'WHBR', 'WHWH', 'WHOR', 'WG', 'WHYW', 'WHIE', 'WJ', 'WHLE',
    ),
    'Black': (
        'Black', 'BLK', 'BK', 'black', 'BLK.', 'BK.', 'BLACK', 'BLW', 'BKGY',
        'BK/', 'BKBL', 'BLCK', 'BKGR', 'BLWH', 'BLA', 'BKG', 'BKTN', 'BKW',
        'BKT', 'BKWH', 'BLAC', 'BLAK', 'BLTN', 'BKBK', 'BLGL', 'BLKWH',
    ),
    'Gray': (
        'GRY', 'Gray', 'GY/', 'GRAY', 'GY', 'GREY', 'grey', 'Grey', 'DKGRY',
        'M.GRE', 'GY/GL', 'GRAYF', 'CHRAY', 'LTGY', 'DKGY', 'GYGY', 'GYBL',
        'GYGR', 'GY.', 'GYB', 'GRA', 'GYRD', 'GYBK', 'GYTN', 'GYG', 'GYBR',
        'GYWH', 'GRY.', 'GRAY.', 'ALUMI', 'GYT', 'GREY.', 'GYGL', 'GYPR',
        'GY GR',
    ),
    'Silver': (
        'Silver', 'SILVER', 'SLV', 'SV', 'SL', 'SILV', 'SILVE', 'SIL',
        'SILVR', 'SL.', 'STEEL', 'MET', 'SLVR', 'SLR', 'SIV', 'SLVER',
        'SLIVE', 'SLVE', 'SIL.',
    ),
    'Beige': (
        'TAN', 'Beige', 'BEIGE', 'beige', 'BLD', 'ALMON', 'TN', 'LTTN',
        'TNGY', 'BE', 'BIEGE', 'TNGR', 'DKTN', 'TN/', 'BEIG',
    ),
    'Red': (
        'RED', 'red', 'RD', 'rd', 'Rd', 'Red', 'RO', 'BUGA', 'MAROO', 'MAR',
        'MR', 'BUNGE', 'RDW', 'DKR', 'DKRD', 'RD/', 'BURG', 'BURGU', 'BUR',
        'RDGY', 'RD.', 'RDT', 'RDBK', 'RDBL', 'MRPK', 'RDG', 'RDGR', 'RDWH',
        'RDRD', 'RDTN', 'DKMR', 'RD BK', 'RED.', 'BURGA', 'MRGY', 'MRN',
        'BUG', 'RE', 'RDBR', 'DKRED',
    ),
    'Blue': (
        'BLUE', 'BL', 'BLLU', 'QBLUE', 'BUO', 'BLU', 'DKBL', 'BLG', 'BL/',
        'BLGY', 'LTBL', 'BLGR', 'BLBL', 'DBL', 'BL.', 'BLB', 'LBL', 'BLBK',
        'BLUE.', 'NAVY', 'BLRD',
    ),
    'Green': (
        'GREEN', 'GR', 'GYN', 'GRN', 'LTGR', 'DKGR', 'GR/', 'GRE', 'GRGY',
        'GRG', 'GRW', 'GRBL', 'GRGR', 'DGR', 'GRB', 'GREN', 'GRT', 'LGR',
        'GREE', 'GRTN',
    ),
    'Yellow': (
        'YELLW', 'YEL', 'YELL', 'YELLO', 'YW', 'GOLDE', 'ORO', 'YOL', 'YLOW',
        'YL', 'GL', 'GOLD', 'GLD', 'YLW', 'Y', 'YE', 'YLLW', 'YELLL',
    ),
    'Brown': (
        'BROWN', 'BR', 'BR/GY', 'BON', 'BRWMN', 'BREIR', 'BEUG', 'BRWN',
        'BRN', 'BRO', 'BRW', 'BWN', 'LTBR', 'BROW', 'BRZ', 'DKBR', 'BRBL',
        'BRON', 'BRT',
    ),
    'Purple': ('LAVEN', 'PURPL', 'PR', 'PURP', 'PUR', 'DKRR'),
    'Orange': (
        'OR', 'ORANGE', 'ORANG', 'ONG', 'O', 'OG', 'ORA', 'ORAN', 'ORWH',
        'ORN',
    ),
}

# Flatten into the variant -> canonical lookup used below. A variant claimed
# by two different colours is an error rather than a silent "last one wins".
colors = {}
for canonical, variants in COLOR_VARIANTS.items():
    for variant in variants:
        claimed_by = colors.setdefault(variant, canonical)
        if claimed_by != canonical:
            raise ValueError(
                f"colour code {variant!r} is listed under both "
                f"{claimed_by!r} and {canonical!r}"
            )

# Vectorised lookup: map() yields NaN for codes missing from the table, and
# fillna() restores the original value for exactly those rows.
raw_colors = df['vehicle_color']
df['vehicle_color'] = raw_colors.map(colors).fillna(raw_colors)

# Report how many distinct colour values remain after normalisation
unique_colors = df['vehicle_color'].unique()
print(len(unique_colors))
print(unique_colors)

1457
['Red' 'Beige' 'Gray' ... 'WHLE' 'BLKWH' 'YELLL']


In [None]:
# Frequency of each vehicle colour value; display the first 25 entries
color_counts = df['vehicle_color'].value_counts()
color_counts.head(25)