In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

from collections import defaultdict

import csv
import os
import datetime as dt
import re

In [18]:
def construct_lookup(data):
    # data = [(PHYSICALID, L_LOW_HN, L_HIGH_HN, R_LOW_HN, R_HIGH_HN, ST_LABEL, BOROCODE_IDX, FULL_STREE)]
    lookup = defaultdict(list)
    for row in data:
        # format outputs
        id = int(row[0])
        l_low = 0 if len(row[1]) == 0 else int(re.sub('-0|-', '', row[1]))
        l_high = 0 if len(row[2]) == 0 else int(re.sub('-0|-', '', row[2]))
        r_low = 0 if len(row[3]) == 0 else int(re.sub('-0|-', '', row[3]))
        r_high = 0 if len(row[4]) == 0 else int(re.sub('-0|-', '', row[4]))
        st_label = row[5].lower()
        borocode = int(row[6])
        full_stree = row[7].lower()
        # add formatted elements to table
        lookup[(st_label, borocode)].append(((r_low, r_high), (l_low, l_high), id))
        lookup[(full_stree, borocode)].append(((r_low, r_high), (l_low, l_high), id))
    return lookup

In [19]:
NYC_CSCL_PATH = 'data\\nyc_cscl.csv'
root = 'test'
violation_records = [os.path.join(root, 'violation_small1.csv'),
                     os.path.join(root, 'violation_small2.csv')]
VIOLATION_PATH = ','.join(violation_records)

In [24]:
# sc = SparkContext()

In [21]:
table = sc.textFile(NYC_CSCL_PATH)
header_table = table.first()
# start testing
res = sc.textFile(NYC_CSCL_PATH) \
        .filter(lambda x: x != header_table) \
        .mapPartitions(lambda x: csv.reader(x)) \
        .filter(lambda x: len(x) >= 30) \
        .map(lambda x: (x[0], (x[2], x[3], x[4], x[5], x[10], x[13], x[28]))) \
        .reduceByKey(lambda x, y: x) \
        .map(lambda x: (x[0], x[1][0], x[1][1], x[1][2], x[1][3], x[1][4], x[1][5], x[1][6])) \
        .collect()

In [25]:
len(res)

119801

In [13]:
for r in res:
    assert len(r) == 8
res[:5]

[('95944', '15', '15', '0', '0', 'TIDES LA', '5', 'TIDES LN'),
 ('43450', '601', '699', '600', '698', 'AV I', '3', 'I AVE'),
 ('96186', '1', '57', '2', '60', 'SILVER CT', '5', 'SILVER CT'),
 ('146179',
  '',
  '',
  '',
  '',
  'ROOSEVELT IS BR PED & BIKE PATH',
  '4',
  'ROOSEVELT IS BR PED & BIKE PATH'),
 ('42533', '2301', '2329', '0', '0', 'CROPSEY AVE', '3', 'CROPSEY AVE')]

In [22]:
lookup = construct_lookup(res)

In [42]:
print('Approximate number of boro and street pair:', len(lookup))

num_total = 0
for v in lookup.values():
    num_total += len(v)
print('Approximate number of num_total street segments:', num_total)

Approximate number of boro and street pair: 18069
Approximate number of num_total street segments: 239602


In [43]:
print(lookup[('tides la', 5)])
print(lookup[('tides ln', 5)])
print(lookup[('roosevelt is br ped & bike path', 4)])
print(lookup[('silver ct', 5)])
print(lookup[('bluh, bluh, dummy', 1)])

[((0, 0), (15, 15), 95944), ((20, 28), (0, 0), 95943), ((32, 46), (0, 0), 95945)]
[((0, 0), (15, 15), 95944), ((20, 28), (0, 0), 95943), ((32, 46), (0, 0), 95945)]
[((0, 0), (0, 0), 146179), ((0, 0), (0, 0), 146179), ((0, 0), (0, 0), 146181), ((0, 0), (0, 0), 146181)]
[((2, 60), (1, 57), 96186), ((2, 60), (1, 57), 96186)]
[]


In [36]:
LOOKUP_BCAST = sc.broadcast(lookup)

In [39]:
len(LOOKUP_BCAST.value)

18068

In [54]:
def format_hn(hn_record):
    # if a record is empty, assigns 0
    if len(hn_record) == 0:
        return 0
    # otherwise concatenate two values together
    # example: '187-09' = 18709 <int>
    # example: '187' = 187 <int>
    else:
        # format cases like `70 23` -> `70-23`
        hn_record = re.sub('\s', '-', hn_record)
        # exclude cases like `123A`, 'W', 'S', etc.
        try:
            return int(hn_record.replace('-', ''))
        except ValueError:
            return -1

In [55]:
# violation_record = [year, borocode, house_number, street_name]
def lookup_street_segment(v_record, lookup_table):
    street_name = v_record[3].lower() # lower string
    house_number = v_record[2] # <str>
    borocode = v_record[1] # <int>
    # lookup table to get candidates
    hn_ranges = lookup_table[(street_name, borocode)]
    # if key doesn't exist, it returns empty list
    if len(hn_ranges) == 0:
        return -1
    # format house number, if output is -1, returns -1
    formatted_hn = format_hn(house_number)
    if formatted_hn == -1:
        return -1
    # check candidate ranges, if there is a match, returns physicalID
    for hn_range in hn_ranges:
        # hn_range = ((r_low, r_high), (l_low, l_high), physicalID)
        ran = hn_range[formatted_hn%2]
        # ran = (low, high)
        if (ran[0] <= formatted_hn) and (formatted_hn <= ran[1]):
            return hn_range[2]
    # if there is no match, returns -1
    return -1

In [58]:
print(lookup_street_segment([1, 5, '20', 'tides la'], lookup))
print(lookup_street_segment([1, 5, '28', 'tides la'], lookup))
print(lookup_street_segment([1, 5, '26', 'tides la'], lookup))
print(lookup_street_segment([1, 5, '25', 'tides la'], lookup))

95943
95943
95943
-1


In [61]:
print(lookup_street_segment([1, 5, '15', 'tides la'], lookup))
print(lookup_street_segment([1, 5, '1', 'tides la'], lookup))
print(lookup_street_segment([1, 5, '17', 'tides la'], lookup))
print(lookup_street_segment([1, 1, '17', 'tides la'], lookup))

95944
-1
-1
-1


In [62]:
print(lookup_street_segment([1, 5, '17', 'silver ct'], lookup))
print(lookup_street_segment([1, 1, '17', 'silver ct'], lookup))
print(lookup_street_segment([1, 5, '', 'silver ct'], lookup))
print(lookup_street_segment([1, 5, '59', 'silver ct'], lookup))
print(lookup_street_segment([1, 5, '18', 'silver ct'], lookup))
print(lookup_street_segment([1, 5, '21', 'silver ct'], lookup))

96186
-1
-1
-1
96186
96186


In [64]:
print(lookup_street_segment([1, 5, '', 'tides ln'], lookup))
print(lookup_street_segment([1, 5, '', 'tides la'], lookup))

95944
95944


In [65]:
print(lookup_street_segment([1, 5, '124!', 'tides ln'], lookup))
print(lookup_street_segment([1, 5, '70 23', 'tides ln'], lookup))
print(lookup_street_segment([1, 5, 'Q', 'tides ln'], lookup))

-1
-1
-1


In [66]:
print(lookup_street_segment([1, 4, '107-31', '105 st'], lookup))
print(lookup_street_segment([1, 4, '108-00', '105 st'], lookup))

93463
93463


In [71]:
for l in lookup.values():
    print(l)

[((0, 0), (15, 15), 95944), ((20, 28), (0, 0), 95943), ((32, 46), (0, 0), 95945)]
[((0, 0), (15, 15), 95944), ((20, 28), (0, 0), 95943), ((32, 46), (0, 0), 95945)]
[((600, 698), (601, 699), 43450), ((1300, 1398), (1301, 1399), 43460), ((800, 898), (801, 899), 43448), ((1600, 1698), (1601, 1699), 46009), ((4400, 4498), (4401, 4499), 46029), ((2500, 2598), (2501, 2599), 46000), ((1100, 1198), (1101, 1199), 43445), ((5700, 5798), (5701, 5799), 46045), ((5400, 5498), (5401, 5499), 46042), ((3302, 3302), (0, 0), 46017), ((0, 0), (4501, 4501), 46030), ((3200, 3298), (3201, 3299), 46016), ((4600, 4698), (4601, 4699), 46034), ((4300, 4398), (4301, 4399), 46028), ((400, 498), (401, 499), 43454), ((0, 0), (0, 0), 46032), ((3700, 3798), (3701, 3799), 46022), ((1900, 1998), (1901, 1999), 46006), ((5000, 5098), (5001, 5099), 46038), ((4000, 4098), (4001, 4099), 46025), ((1000, 1098), (1001, 1023), 43446), ((1700, 1798), (1701, 1799), 46008), ((4200, 4298), (4201, 4299), 46027), ((3900, 3998), (3901

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[((2, 98), (1, 99), 85634), ((2, 98), (1, 99), 85634)]
[((2, 58), (1, 57), 102033), ((2, 58), (1, 57), 102033)]
[((2, 60), (1, 59), 101854), ((2, 60), (1, 59), 101854)]
[((0, 0), (0, 0), 83924)]
[((0, 0), (0, 0), 83924)]
[((1800, 1898), (1801, 1899), 103260)]
[((1800, 1898), (1801, 1899), 103260)]
[((0, 0), (0, 0), 91049)]
[((0, 0), (0, 0), 91049)]
[((2, 28), (1, 27), 104057)]
[((2, 28), (1, 27), 104057)]
[((2, 18), (1, 19), 100546), ((2, 18), (1, 19), 100546)]
[((10500, 10598), (10501, 10599), 100963), ((10500, 10598), (10501, 10599), 100963)]
[((2, 8), (1, 9), 91607), ((2, 8), (1, 9), 91607)]
[((1030, 1062), (1027, 1063), 94046), ((1030, 1062), (1027, 1063), 94046)]
[((0, 0), (0, 0), 176987), ((0, 0), (0, 0), 176987)]
[((2, 98), (1, 99), 166645)]
[((2, 98), (1, 99), 166645)]
[((2, 16), (1, 35), 104693), ((2, 16), (1, 35), 104693)]
[((2, 98), (1, 99), 103616), ((2, 98), (1, 99), 103616)]
[((0, 0), (0, 0), 99472)]
[((0, 0), (0, 0), 99472)]
[((7100, 7140), (7101, 7141), 103894)]
[((7100

In [74]:
physicalIDs = defaultdict(list)
for row in lookup.values():
    # row = [(.., .., physical ID), (.., .., physical ID), ..., (.., .., physical ID)]
    for element in row:
        try:
            id = element[2]
            physicalIDs[id] = [0, 0, 0, 0, 0, 0]
        except ValueError:
            pass

In [75]:
len(physicalIDs)

119801