# NYC Ticket Writing Machine!
### DS4400 Final Project
### by Benjamin Kosiborod and Victoria Staada
&nbsp;
## Data Cleanup

In [1]:
# All project imports
import pandas as pd
import numpy as np
import math

In [None]:
# Fetch the raw CSV export from data.cityofnewyork.us. The $limit query
# parameter in the URL is sent to the server as-is with the request.
data = pd.read_csv(
    filepath_or_buffer='https://data.cityofnewyork.us/resource/faiq-9dfq.csv?$limit=12000000',
)

In [None]:
# Leave `data` untouched and do all cleanup on an independent copy, so a
# mistake can be undone by re-running this cell instead of downloading
# the CSV again.
df = data.copy(deep=True)

In [None]:
# Preview summary statistics for the data.
# Left as the cell's final expression (instead of print) so the notebook
# renders the result as a formatted table.
df.describe()

In [None]:
# Remove columns that are either mostly empty or carry no useful
# information for this project.
df = df.drop(
    labels=[
        'feet_from_curb',
        'house_number',
        'intersecting_street',
        'date_first_observed',
        'law_section',
        'sub_division',
        'violation_legal_code',
        'violation_in_front_of_or',
        'time_first_observed',
        'issuer_code',
        'issuer_command',
        'issuer_squad',
        'summons_number',
        'plate_id',
        'days_parking_in_effect',
        'from_hours_in_effect',
        'to_hours_in_effect',
        'unregistered_vehicle',
        'violation_description',
        'no_standing_or_stopping',
        'hydrant_violation',
        'double_parking_violation',
    ],
    axis='columns',
)

In [None]:
# Check which columns are left and their types.
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would append a stray
# "None" line to the output.
df.info()

In [None]:
# Collapse the meter number into a 0/1 flag: 0 when no meter was recorded
# at the time of the violation (missing value or the placeholder '-'),
# 1 otherwise. The flag is computed with vectorized column operations
# rather than a per-row Python lambda, which is much cheaper on a frame
# with millions of rows.
df['meter_number'] = (
    df['meter_number'].notna() & df['meter_number'].ne('-')
).astype('int64')
# Rename the column to reflect that it is now a yes/no flag.
df = df.rename(columns={'meter_number': 'meter?'})

In [None]:
# Canonical color name -> raw spellings found in `vehicle_color` that
# should be folded into it. Every spelling must appear under exactly one
# canonical name: 'BLRD' is listed only under Blue, because a spelling
# listed under two names would otherwise be resolved silently by
# whichever group happens to be processed last.
color_aliases = {
    'White': (
        'WH', 'W', 'w', 'white', 'White', 'WT', 'WHI', 'WH/', 'WHITE',
        'Cream', 'CREAM', 'WT.', 'WHTE', 'WH YW', 'WHITW', 'HT', 'WHT',
        'WHBL', 'WHB', 'WHO', 'WHGY', 'WHIT', 'WHG', 'WHRD', 'WHGR', 'WTE',
        'WH.', 'WHBK', 'WHTN', 'WHT.', 'WHTIE', 'WHE', 'WHGL', 'W/B', 'CRM',
        'WHBR', 'WHWH', 'WHOR', 'WG', 'WHYW', 'WHIE', 'WJ', 'WHLE',
    ),
    'Black': (
        'Black', 'BLK', 'BK', 'black', 'BLK.', 'BK.', 'BLACK', 'BLW', 'BKGY',
        'BK/', 'BKBL', 'BLCK', 'BKGR', 'BLWH', 'BLA', 'BKG', 'BKTN', 'BKW',
        'BKT', 'BKWH', 'BLAC', 'BLAK', 'BLTN', 'BKBK', 'BLGL', 'BLKWH',
    ),
    'Gray': (
        'GRY', 'Gray', 'GY/', 'GRAY', 'GY', 'GREY', 'grey', 'Grey', 'DKGRY',
        'M.GRE', 'GY/GL', 'GRAYF', 'CHRAY', 'LTGY', 'DKGY', 'GYGY', 'GYBL',
        'GYGR', 'GY.', 'GYB', 'GRA', 'GYRD', 'GYBK', 'GYTN', 'GYG', 'GYBR',
        'GYWH', 'GRY.', 'GRAY.', 'ALUMI', 'GYT', 'GREY.', 'GYGL', 'GYPR',
        'GY GR',
    ),
    'Silver': (
        'Silver', 'SILVER', 'SLV', 'SV', 'SL', 'SILV', 'SILVE', 'SIL',
        'SILVR', 'SL.', 'STEEL', 'MET', 'SLVR', 'SLR', 'SIV', 'SLVER',
        'SLIVE', 'SLVE', 'SIL.',
    ),
    'Beige': (
        'TAN', 'Beige', 'BEIGE', 'beige', 'BLD', 'ALMON', 'TN', 'LTTN',
        'TNGY', 'BE', 'BIEGE', 'TNGR', 'DKTN', 'TN/', 'BEIG',
    ),
    'Red': (
        'RED', 'red', 'RD', 'rd', 'Rd', 'Red', 'RO', 'BUGA', 'MAROO', 'MAR',
        'MR', 'BUNGE', 'RDW', 'DKR', 'DKRD', 'RD/', 'BURG', 'BURGU', 'BUR',
        'RDGY', 'RD.', 'RDT', 'RDBK', 'RDBL', 'MRPK', 'RDG', 'RDGR', 'RDWH',
        'RDRD', 'RDTN', 'DKMR', 'RD BK', 'RED.', 'BURGA', 'MRGY', 'MRN',
        'BUG', 'RE', 'RDBR', 'DKRED',
    ),
    'Blue': (
        'BLUE', 'BL', 'BLLU', 'QBLUE', 'BUO', 'BLU', 'DKBL', 'BLG', 'BL/',
        'BLGY', 'LTBL', 'BLGR', 'BLBL', 'DBL', 'BL.', 'BLB', 'LBL', 'BLBK',
        'BLUE.', 'NAVY', 'BLRD',
    ),
    'Green': (
        'GREEN', 'GR', 'GYN', 'GRN', 'LTGR', 'DKGR', 'GR/', 'GRE', 'GRGY',
        'GRG', 'GRW', 'GRBL', 'GRGR', 'DGR', 'GRB', 'GREN', 'GRT', 'LGR',
        'GREE', 'GRTN',
    ),
    'Yellow': (
        'YELLW', 'YEL', 'YELL', 'YELLO', 'YW', 'GOLDE', 'ORO', 'YOL', 'YLOW',
        'YL', 'GL', 'GOLD', 'GLD', 'YLW', 'Y', 'YE', 'YLLW', 'YELLL',
    ),
    'Brown': (
        'BROWN', 'BR', 'BR/GY', 'BON', 'BRWMN', 'BREIR', 'BEUG', 'BRWN',
        'BRN', 'BRO', 'BRW', 'BWN', 'LTBR', 'BROW', 'BRZ', 'DKBR', 'BRBL',
        'BRON', 'BRT',
    ),
    'Purple': ('LAVEN', 'PURPL', 'PR', 'PURP', 'PUR', 'DKRR'),
    'Orange': (
        'OR', 'ORANGE', 'ORANG', 'ONG', 'O', 'OG', 'ORA', 'ORAN', 'ORWH',
        'ORN',
    ),
}

# Flatten to raw spelling -> canonical name. A spelling claimed by two
# different canonical names is an error in the table above, so fail loudly
# instead of letting the later group overwrite the earlier one.
colors = {}
for canonical, aliases in color_aliases.items():
    for alias in aliases:
        if colors.setdefault(alias, canonical) != canonical:
            raise ValueError(
                f'color code {alias!r} is listed under both '
                f'{colors[alias]!r} and {canonical!r}'
            )

# Translate known spellings to their canonical name and leave every other
# value (including missing values) as it was. Series.map with a dict is
# vectorized; values without an entry come back as NaN and are then filled
# from the original column.
df['vehicle_color'] = df['vehicle_color'].map(colors).fillna(df['vehicle_color'])

# Drop rows that are not categorized by the above,
# as there are <500K such records out of 11.5M
# and the colors begin to get more niche. We
# suspect there is not enough data to accurately
# make predictions for the remaining colors.
df = df[df['vehicle_color'].isin(list(color_aliases))]

df['vehicle_color'].value_counts().head(25)

In [None]:
# Keep only rows whose vehicle_year is at most 2020 and is not 0.
# Rows with a missing year fail the <= comparison, so they are dropped too.
df = df[df['vehicle_year'].le(2020) & df['vehicle_year'].ne(0)]
df['vehicle_year'].value_counts()

In [None]:
# Drop rows with no recorded vehicle make.
df = df[df['vehicle_make'].notna()]
# Keep only makes that occur on more than 100 rows. groupby(...).filter(...)
# returns a new DataFrame rather than modifying `df`, so the result must be
# assigned back for the rare makes to actually be removed.
df = df.groupby('vehicle_make').filter(lambda rows: len(rows) > 100)
# Show the least common remaining makes; each count should exceed 100.
df['vehicle_make'].value_counts().tail(10)