# NYC Parking Violations
This example demonstrates transforming New York City parking summons data to create maps. The original example can be found [here](https://github.com/JBlumstein/NYCParking/blob/master/NYC_Parking_Violations_Mapping_Example.ipynb).

In [1]:
import numpy as np
import pandas as pd
import time
import bodo

## Data Loading
In this section, parking ticket data is loaded from an S3 bucket, aggregated by day, county, police precinct, and violation type, and placed in a dataframe. In addition, violation codes and precinct information are loaded.

In [2]:
@bodo.jit(cache=True)
def load_parking_tickets():
    """Read the fiscal-year-2016 ticket file and count summonses per group.

    The parquet file is read from S3 and grouped by issue date, county,
    precinct and violation code.  In the result, 'Summons Number' holds the
    number of tickets in each group.  The elapsed read-and-groupby time is
    printed before returning.

    Returns:
        DataFrame with the four grouping columns and the per-group count.
    """
    t0 = time.time()
    tickets = pd.read_parquet('s3://bodo-example-data/nyc-parking-tickets/Parking_Violations_Issued_-_Fiscal_Year_2016_8.pq')
    # as_index=False keeps the grouping keys as ordinary columns
    grouped = tickets.groupby(
        ['Issue Date', 'Violation County', 'Violation Precinct', 'Violation Code'],
        as_index=False,
    )
    summons_counts = grouped['Summons Number'].count()
    print("Read and groupby time: ", time.time() - t0)
    return summons_counts

# Run the loader and preview the first rows of the grouped ticket counts.
main_df = load_parking_tickets()
print(main_df.head())

Read and groupby time:  6.225391999999829


Unnamed: 0,Issue Date,Violation County,Violation Precinct,Violation Code,Summons Number
0,2015-07-09,K,88,21,59
1,2015-07-09,K,90,20,26
2,2015-07-09,K,90,71,11
3,2015-07-09,K,90,74,5
4,2015-06-23,K,75,66,6


In [5]:
@bodo.jit(distributed=False)
def load_violation_precincts_codes():
    """Load the violation-code table and the precinct table from local CSVs.

    The violation-code columns are renamed to 'Violation Code', 'Definition',
    'manhattan_96_and_below' and 'all_other_areas'.  The precinct table uses
    its 'index' column as the index.  The elapsed load time is printed.

    Returns:
        Tuple of (violation codes DataFrame, precincts DataFrame).
    """
    t0 = time.time()
    codes_df = pd.read_csv("./data/DOF_Parking_Violation_Codes.csv")
    # replace the file's own headers with the names used by the later steps
    codes_df.columns = ['Violation Code','Definition','manhattan_96_and_below','all_other_areas']
    precincts_df = pd.read_csv("./data/nyc_precincts.csv", index_col='index')
    elapsed = time.time() - t0
    print("Violation and precincts load time: ", elapsed)
    return codes_df, precincts_df

# Load both lookup tables: violation codes and precincts.
violation_codes, nyc_precincts_df = load_violation_precincts_codes()

Violation and precincts load time:  0.006868999999824155


## Data Cleaning

1. Remove summons with undefined violations (violation code 36).
2. Delete entries whose issue dates fall outside the date range of our dataset.

In [6]:
@bodo.jit(cache=True)
def elim_code_36(main_df):
    """Drop rows with violation code 36 and sort by summons count, descending.

    Args:
        main_df: DataFrame with 'Violation Code' and 'Summons Number' columns.

    Returns:
        The filtered DataFrame ordered by 'Summons Number', largest first.
        The elapsed time is printed before returning.
    """
    t0 = time.time()
    not_code_36 = main_df['Violation Code'] != 36
    kept = main_df[not_code_36]
    ranked = kept.sort_values('Summons Number', ascending=False)
    elapsed = time.time() - t0
    print("Eliminate undefined violations time: ", elapsed)
    return ranked

# Remove violation code 36 from main_df and preview the largest counts.
main_df = elim_code_36(main_df)
print(main_df.head())

Eliminate undefined violations time:  1.0307010000001355
       Issue Date Violation County  Violation Precinct  Violation Code  \
334108 2015-11-27                Q                 114              21   
340293 2015-12-31                Q                 114              21   
32556  2015-11-27                Q                 115              21   
259150 2015-11-27               BX                  43              21   
33773  2015-12-24                Q                 114              21   

        Summons Number  
334108            1165  
340293             778  
32556              743  
259150             742  
33773              686  


In [7]:
@bodo.jit(cache=True)
def remove_outliers(main_df):
    """Keep only rows whose 'Issue Date' is within 2016-01-01 .. 2017-12-31.

    Both bounds are inclusive.  The elapsed time is printed before returning.

    Args:
        main_df: DataFrame with an 'Issue Date' column.

    Returns:
        The rows of main_df that fall inside the date window.
    """
    t0 = time.time()
    on_or_after_start = main_df['Issue Date'] >= '2016-01-01'
    on_or_before_end = main_df['Issue Date'] <= '2017-12-31'
    in_window = main_df[on_or_after_start & on_or_before_end]
    print("Remove outliers time: ", (time.time()-t0)) 
    return in_window

# Apply the date filter; the bare expression displays the first rows in the notebook.
main_df = remove_outliers(main_df)
main_df.head()

Remove outliers time:  0.006421000000045751


Unnamed: 0,Issue Date,Violation County,Violation Precinct,Violation Code,Summons Number
365458,2016-05-08,BK,0,7,684
215067,2016-04-30,BK,0,7,666
290349,2016-05-08,QN,0,7,632
215068,2016-04-30,QN,0,7,607
591138,2016-05-07,BK,0,7,567


## Collect More Information
Data on each violation type, like ticket cost and violation descriptions, are added to the dataset by joining our main_df dataset with a violation-type-level dataset.

In [8]:
@bodo.jit(cache=True)
def merge_violation_code(main_df, violation_codes):
    """Left-join the violation-code details onto the ticket counts.

    Every row of main_df is kept; the columns of violation_codes are attached
    by matching on 'Violation Code'.  'Violation Precinct' is then cast to
    int.  The elapsed time and the shape of the result are printed.

    Args:
        main_df: ticket counts with 'Violation Code' and 'Violation Precinct'.
        violation_codes: per-code details keyed by 'Violation Code'.

    Returns:
        The merged DataFrame.
    """
    t0 = time.time()
    # how='left' keeps tickets whose code has no entry in violation_codes
    enriched = main_df.merge(violation_codes, on='Violation Code', how='left')
    # the original author notes the merge can turn precincts into floats; cast back
    enriched['Violation Precinct'] = enriched['Violation Precinct'].astype(int)
    elapsed = time.time() - t0
    print("Merge time: ", elapsed)
    print(enriched.shape)
    return enriched

# Join the violation-code details onto main_df and preview the result.
main_w_violation = merge_violation_code(main_df, violation_codes)
print(main_w_violation.head())

Merge time:  0.087203000000045
(279344, 8)


Unnamed: 0,Issue Date,Violation County,Violation Precinct,Violation Code,Summons Number,Definition,manhattan_96_and_below,all_other_areas
0,2016-05-08,BK,0,7,684,Vehicles photographed going through a red ligh...,50,50
1,2016-04-30,BK,0,7,666,Vehicles photographed going through a red ligh...,50,50
2,2016-05-08,QN,0,7,632,Vehicles photographed going through a red ligh...,50,50
3,2016-04-30,QN,0,7,607,Vehicles photographed going through a red ligh...,50,50
4,2016-05-07,BK,0,7,567,Vehicles photographed going through a red ligh...,50,50


## Compute Cost of Summons For Each Precinct

1. Most violations have different ticket prices, based on whether they occur in Manhattan below 96th St. or elsewhere in New York City. The daily revenue for each violation type in each precinct is determined by multiplying the number of offenses by the average cost of the offense (based on how much of the precinct is in Manhattan below 96th St.).

In [11]:
# calculate the total summonses in dollars for a violation in a precinct on a day
@bodo.jit(cache=True)
def calculate_total_summons(main_df):
    """Add per-row dollar totals and return the rows sorted by them.

    Three columns are assigned on main_df:
      * 'portion_manhattan_96_and_below' - share of the precinct priced at
        the 'manhattan_96_and_below' rate (1.0, 0.75, 0.5 or 0),
      * 'average_summons_amount' - the two rates blended by that share,
      * 'total_summons_dollars' - 'Summons Number' times the blended rate.
    The elapsed time is printed before returning.

    Args:
        main_df: DataFrame with 'Violation Precinct', 'Summons Number',
            'manhattan_96_and_below' and 'all_other_areas' columns.

    Returns:
        The frame sorted by 'total_summons_dollars', largest first.
    """
    t0 = time.time()

    def manhattan_share(precinct):
        # The branches are mutually exclusive, so their order does not matter.
        share = 0
        if precinct == 22:
            share = 0.75
        elif precinct == 24:
            share = 0.5
        elif precinct < 22 or precinct == 23:
            share = 1.0
        return share

    main_df["portion_manhattan_96_and_below"] = main_df['Violation Precinct'].map(manhattan_share)

    # weighted average of the two ticket prices, weighted by the precinct share
    share_below = main_df['portion_manhattan_96_and_below']
    below_part = share_below * main_df['manhattan_96_and_below']
    elsewhere_part = (1 - share_below) * main_df['all_other_areas']
    main_df['average_summons_amount'] = below_part + elsewhere_part

    # dollars = number of summonses * average price per summons
    main_df['total_summons_dollars'] = main_df['Summons Number'] * main_df['average_summons_amount']
    ranked = main_df.sort_values(by=['total_summons_dollars'], ascending=False)
    elapsed = time.time() - t0
    print("Calculate Total Summons Time: ", elapsed)
    return ranked

# Compute the dollar totals for each row and preview the largest ones.
total_summons = calculate_total_summons(main_w_violation)
print(total_summons.head())

Calculate Total Summons Time:  0.9133640000000014


Unnamed: 0,Issue Date,Violation County,Violation Precinct,Violation Code,Summons Number,Definition,manhattan_96_and_below,all_other_areas,portion_manhattan_96_and_below,average_summons_amount,total_summons_dollars
16,2016-01-22,NY,18,14,452,General No Standing: Standing or parking where...,115,115,1.0,115.0,51980.0
29,2016-01-12,NY,18,14,427,General No Standing: Standing or parking where...,115,115,1.0,115.0,49105.0
30,2016-01-07,NY,18,14,426,General No Standing: Standing or parking where...,115,115,1.0,115.0,48990.0
32,2016-01-29,NY,18,14,421,General No Standing: Standing or parking where...,115,115,1.0,115.0,48415.0
36,2016-02-04,NY,18,14,420,General No Standing: Standing or parking where...,115,115,1.0,115.0,48300.0


2. The aggregate function aggregates main_df by precinct. Once the data is run through this function, it will have a single row per precinct with the precinct number, the number of summonses, and the combined dollar value of the summonses.

In [12]:
@bodo.jit(cache=True)
def aggregate(main_df):
    """Sum summons counts and dollar totals per precinct.

    Only 'Violation Precinct', 'Summons Number' and 'total_summons_dollars'
    are kept.  The rows are grouped by precinct and summed, the precinct is
    restored as a regular column, and missing values are filled with 0.

    Args:
        main_df: DataFrame containing the three columns named above.

    Returns:
        One row per precinct, sorted by 'total_summons_dollars', largest
        first.  The printed time covers the aggregation but not that sort.
    """
    t0 = time.time()
    slim = main_df[['Violation Precinct','Summons Number', 'total_summons_dollars']]
    summed = slim.groupby(by=['Violation Precinct']).sum()
    per_precinct = summed.reset_index().fillna(0)
    # the timer stops here, before the sort below
    t1 = time.time()
    ranked = per_precinct.sort_values("total_summons_dollars", ascending=False)
    print("Aggregate code time: ", (t1-t0))
    return ranked

# Aggregate to one row per precinct and preview the highest-dollar precincts.
precinct_offenses_df = aggregate(total_summons)
print(precinct_offenses_df.head())

Aggregate code time:  0.4767029999998158


Unnamed: 0,Violation Precinct,Summons Number,total_summons_dollars
113,19,262711,22775170.0
199,18,148126,14207635.0
138,1,150524,14156050.0
25,14,149838,14040050.0
200,13,155784,13535825.0
