##### Pull NavLA Data
The script below scrapes the NavLA site, scraping the following pieces of information

* intersection: The name of the intersection / location
* node_id: The Bureau of Engineering CL_NODE_ID that corresponds to that location
* automatic: A list of filenames for automatic count PDF documents 
* manual: A list of filenames for manual count PDF documents

The final result is a json dump, which will will then import later and use for joining to the BOE intersection and centerline file later.

In [None]:
from lxml import html, etree
import requests
import time
import json

Count_data = []

# Base URL to use for the web scraping
base_url = "http://boemaps.eng.ci.la.ca.us/reports/dot_traffic_data_report.cfm?trafficid="

# Loop through integers, grab all data in range
for i in range(1,9000):
    
    # Metering the requests
    time.sleep(.1)
    
    # The dictionary to hold the final data
    d = {}
    
    # Make the request
    url = base_url + str(i)
    page = requests.get(url)
    tree = html.fromstring(page.content)
    
    # Parse the data
    table_info = tree.xpath('//td[@class="tablerecord"]/text()')
    if len(table_info) > 0:

        # Parse the Node ID
        node_id = tree.xpath('//td[@class="tablerecord"]/text()')[0]
        node_id = node_id.replace(u'\xa0', u'')

        # Parse the Intersection Name
        intersection = tree.xpath('//td[@class="tablerecord"]/text()')[1]
        intersection = intersection.replace(u'\r\n\t\t\t\t\r\n\t\t\t\t\t', u'')
        intersection = intersection.replace(u'\xa0\r\n\t\t\t\t\r\n\t\t\t',u'')

        ##### Automatic Counts
        # Select the table that contains the automatic counts
        automatic_count_table = tree.xpath('//b[text()="Automatic Count"]/../../..')

        # If there is at least automatic count PDF, grab filenames
        if len(automatic_count_table) > 0:
            automatic_count_table = etree.XML(etree.tostring(automatic_count_table[0]))
            automatic_count_pdfs = automatic_count_table.xpath('//a/text()')
            d['automatic'] = automatic_count_pdfs
        else:
            d['automatic'] = []

        ##### Manual Counts
        # Select the table that contains manual counts
        manual_count_table = tree.xpath('//b[text()="Manual Count"]/../../..')

        # If there is at least manual count PDF, grab filenames
        if len(manual_count_table) > 0:
            manual_count_table = etree.XML(etree.tostring(manual_count_table[0]))
            manual_count_pdfs = manual_count_table.xpath('//a/text()')
            d['manual'] = manual_count_pdfs
        else:
            d['manual'] = []

        d['node_id'] = node_id
        d['intersection'] = intersection
        Count_data.append(d)

with open('data/TrafficCountFileStructure/navLAdump.txt', 'w') as outfile:
    json.dump(Count_data, outfile)

    # Verify
print(Count_data)


##### Load NavLA Data
Now that we have the data pulled from Navigate LA, let's load it. I'll make some slight modificatons to the json data structure before we can bring it into a pandas data frame and join to the rest of the data.

In [15]:
import json
import pandas as pd

# Lcoation of the json file
traffic_count_file = 'data/TrafficCountFileStructure/navLAdump.txt'

# Load json
with open(traffic_count_file) as f:
    traffic_count_files = json.load(f)

# New list data structure
count_files = []
count_files_labels = ['cl_node_id','location','type','file']

for location in traffic_count_files:
    
    # New list for each count, add intersection infomration & node id
    count = []
    node_id = location['node_id']
    loc = location['intersection']
    
    # Add a row for each manual count
    if len(location['manual']) > 0:
        for man in location['manual']:
            man2 = man + '.pdf'
            count = [node_id, loc, 'manual', man2]
            count_files.append(count)
    
    # Add a row for each automatic count
    if len(location['automatic']) > 0:
        for auto in location['automatic']:
            auto2 = auto + '.pdf'
            count = [node_id, loc, 'automatic', auto]
            count_files.append(count)
    
count_files_df = pd.DataFrame.from_records(count_files, columns=count_files_labels)
print("There are " + str(len(count_files_df)) + " files.")
count_files_df.head()

There are 26210 files.


Unnamed: 0,cl_node_id,location,type,file
0,3667,ISLAND AVE at L ST,manual,3667_ISLLST94.pdf
1,3680,FIGUEROA ST at L ST,manual,3680_FIGLST94.pdf
2,3727,FRIES AVE at HARRY BRIDGES BLVD,manual,3727_FRIHAR95.pdf
3,3787,ANAHEIM ST AT FARRAGUT AVE,manual,3787_ANAFAR100817.pdf
4,3787,ANAHEIM ST AT FARRAGUT AVE,manual,FARRAGUT.ANAHEIM.110119-MAN.pdf


Now that we have the count file table built, I am going to use the filename to join to the traffic data files to get the count id. I'll start by loading in the traffic data files.

In [17]:
# Load traffic data files table
traffic_data_files_path = 'data/TrafficCountFileStructure/boe_tables/dbo_dot_traffic_data_files.csv'
dbo_dot_traffic_data_files = pd.read_csv(traffic_data_files_path,
                                         parse_dates=['UploadDT'],
                                         encoding="ISO-8859-1")

# Drop rows where TrafficID is NaN, convert TrafficID to int type
dbo_dot_traffic_data_files = dbo_dot_traffic_data_files.dropna(axis=0, how='any', subset=['TrafficID'])
dbo_dot_traffic_data_files['TrafficID'] = dbo_dot_traffic_data_files['TrafficID'].astype(int)

# Subset out Survey Data and Automatic Counts
dbo_dot_traffic_data_files = dbo_dot_traffic_data_files[(dbo_dot_traffic_data_files['TrafficType'] == 'manual_count')]

# Preview the table
print("There are " + str(len(dbo_dot_traffic_data_files)) + " records in the table.")
dbo_dot_traffic_data_files.head()

There are 9034 records in the table.


Unnamed: 0,ID,TrafficID,TrafficType,DocName,UniqueDocName,UploadDT
0,1,1435,manual_count,2_GRAVDM93.pdf,2_GRAVDM93.pdf,2007-04-02 08:38:30
1,2,1436,manual_count,4_CULVIS95.pdf,4_CULVIS95.pdf,2008-02-20 09:15:12
2,3,1436,manual_count,4_MONCUL100928.pdf,4_MONCUL100928.pdf,2011-08-09 13:58:55
3,4,1437,manual_count,16_VISTA DEL MAR.WATERVIEW07.pdf,16_VISTA DEL MAR.WATERVIEW07.pdf,2007-11-28 13:01:46
4,5,1437,manual_count,16_visvis01.pdf,16_visvis01.pdf,2007-12-03 16:30:42


Now I want to join count_files_df to dbo_dot_traffic_data_files. The result will be that each count record will have an intersection that it can be assigned to from the BOE City of Los Angeles file. 

In [19]:
# Combine both of the traffic data files
file_table = dbo_dot_traffic_data_files.merge(count_files_df,
                                              how='left',
                                              left_on='DocName',
                                              right_on='file')

print("There are " + str(len(file_table)) + " records in the table.")
file_table.head()

There are 9034 records in the table.


Unnamed: 0,ID,TrafficID,TrafficType,DocName,UniqueDocName,UploadDT,cl_node_id,location,type,file
0,1,1435,manual_count,2_GRAVDM93.pdf,2_GRAVDM93.pdf,2007-04-02 08:38:30,2.0,GRAND AVE at VISTA DEL MAR,manual,2_GRAVDM93.pdf
1,2,1436,manual_count,4_CULVIS95.pdf,4_CULVIS95.pdf,2008-02-20 09:15:12,4.0,VISTA DEL MAR AT VISTA DEL MAR LANE,manual,4_CULVIS95.pdf
2,3,1436,manual_count,4_MONCUL100928.pdf,4_MONCUL100928.pdf,2011-08-09 13:58:55,4.0,VISTA DEL MAR AT VISTA DEL MAR LANE,manual,4_MONCUL100928.pdf
3,4,1437,manual_count,16_VISTA DEL MAR.WATERVIEW07.pdf,16_VISTA DEL MAR.WATERVIEW07.pdf,2007-11-28 13:01:46,16.0,WATERVIEW ST AT VISTA DEL MAR LANE,manual,16_VISTA DEL MAR.WATERVIEW07.pdf
4,5,1437,manual_count,16_visvis01.pdf,16_visvis01.pdf,2007-12-03 16:30:42,,,,


In [24]:
import geopandas as gp
import folium

# Import BOE Intersection spatial data
boe_int = gp.GeoDataFrame.from_file('data/intersections/Intersections.shp')

# Join BOE table to info table
file_tbl_boe = file_table.merge(boe_int,
                                how='left',
                                left_on = 'cl_node_id',
                                right_on = 'CL_NODE_ID')

# Create new LA Basemap specifying map center, zoom level, and using Stamen Toner Tiles
count_map = folium.Map([34.109279, -118.266087],
                       tiles='Stamen Toner',
                       zoom_start=11)

# Subset out points with a NaN Coodinate values
file_tbl_boe = file_tbl_boe[pd.notnull(file_tbl_boe['LAT'])]
file_tbl_boe = file_tbl_boe[pd.notnull(file_tbl_boe['LON'])]

locationlist = file_tbl_boe[['LAT','LON']].values.tolist()
labels = file_tbl_boe["DocName"].values.tolist()

for point in range(10):
    popup = folium.Popup(labels[point], parse_html=True)
    #popup = folium.Popup('hello')
    folium.Marker(locationlist[point], popup = popup).add_to(count_map)

# Show the map
count_map

IndexError: list index out of range