# Scraping LADOT Volume Data from PDFs, Part 2

##### Where I Left Off
In the first Python notebook, I described how I extracted volume data from PDFs. The resulting data has since been converted to .csv files and formatted for data analysis. 


In [139]:
### Setup
import csv
import glob
from datetime import datetime, date, time
import pdfquery
import pandas as pd
import numpy as np
import folium
from folium.plugins import MarkerCluster
import os
import geopandas as gp

I'm going to start by loading and cleaning the tables provided by BOE. First there is the files table.

In [140]:
# Read the BOE "traffic data files" table and tidy it in a single pipeline
traffic_data_files_path = 'boe_tables/dbo_dot_traffic_data_files.csv'
dbo_dot_traffic_data_files = (
    pd.read_csv(traffic_data_files_path,
                parse_dates=['UploadDT'],
                encoding="ISO-8859-1")
    # Discard rows that have no TrafficID ...
    .dropna(axis=0, how='any', subset=['TrafficID'])
    # ... so that the column can be cast to an integer type
    .astype({'TrafficID': int})
    # Keep only the rows whose TrafficType is 'manual_count'
    .loc[lambda files: files['TrafficType'] == 'manual_count']
)

# Report the number of remaining records and preview the first rows
print("There are {} records in the table.".format(len(dbo_dot_traffic_data_files)))
dbo_dot_traffic_data_files.head()

There are 9034 records in the table.


Unnamed: 0,ID,TrafficID,TrafficType,DocName,UniqueDocName,UploadDT
0,1,1435,manual_count,2_GRAVDM93.pdf,2_GRAVDM93.pdf,2007-04-02 08:38:30
1,2,1436,manual_count,4_CULVIS95.pdf,4_CULVIS95.pdf,2008-02-20 09:15:12
2,3,1436,manual_count,4_MONCUL100928.pdf,4_MONCUL100928.pdf,2011-08-09 13:58:55
3,4,1437,manual_count,16_VISTA DEL MAR.WATERVIEW07.pdf,16_VISTA DEL MAR.WATERVIEW07.pdf,2007-11-28 13:01:46
4,5,1437,manual_count,16_visvis01.pdf,16_visvis01.pdf,2007-12-03 16:30:42


In [141]:
# Read the BOE traffic data table and discard the "ext" and "Shape" columns
traffic_data_path = 'boe_tables/dot_traffic_data.csv'
dot_traffic_data = pd.read_csv(traffic_data_path).drop(labels=['ext', 'Shape'], axis='columns')

# The integer cast of IntersectionID below is intentionally left disabled.
# NOTE(review): presumably the column contains NaN values, which cannot be cast to int — confirm.
#dot_traffic_data['IntersectionID'] = dot_traffic_data['IntersectionID'].astype(int)

# Preview the first rows
dot_traffic_data.head()

Unnamed: 0,TrafficID,IntersectionID,lat,lon,intersection
0,1,3667.0,33.78,-118.26,ISLAND AVE at L ST
1,2,3680.0,33.78,-118.28,FIGUEROA ST at L ST
2,3,3727.0,33.77,-118.26,FRIES AVE at HARRY BRIDGES BLVD
3,4,3787.0,33.78,-118.22,ANAHEIM ST AT FARRAGUT AVE
4,5,3839.0,33.79,-118.27,DON ST at FRIGATE AVE


In [142]:
# Load the five result tables written to TrafficCountData/Results.
# The files are read in the order listed and unpacked into one frame each.
manualcount_df, pedestrian_df, peakvol_df, specveh_df, info_df = (
    pd.read_csv('TrafficCountData/Results/' + csv_name)
    for csv_name in ('manualcount.csv',
                     'pedestrian.csv',
                     'peakvol.csv',
                     'SpecialVehicle.csv',
                     'info.csv')
)

# Preview the count info table
info_df.head()

Unnamed: 0.1,Unnamed: 0,count_id,date,dayofweek,district,hours,school_day,street_ew,street_ns,weather
0,0,68,2011-04-25,MONDAY,YES,7-10AM 2-5PM,YES,65th PL,VAN NESS AV,SUNNY
1,1,70,2011-09-08,THURSDAY,YES,7-10AM 2-5PM,YES,FLORENCE AV.,KANSAS AV.,SUNNY
2,2,89,2007-07-23,MONDAY,YES,7-10AM 3-6PM,YES,FLORENCE AV,WEST ST,SUNNY
3,3,112,2010-03-08,MONDAY,YES,7-10AM 2-5PM,YES,78TH ST,SAN PEDRO ST,SUNNY
4,4,142,2010-04-23,FRIDAY,YES,7-10AM 3-6PM,YES,92ND ST,GRAHAM AV,SUNNY


In [143]:
# Attach the intersection record (IntersectionID, lat, lon, intersection) to every
# document row; both tables share the TrafficID key and all document rows are kept.
# NOTE(review): in the saved output the numeric DocName prefix does not always equal the
# joined IntersectionID (e.g. 2_GRAVDM93.pdf is joined to 147) — confirm that TrafficID
# is the correct key between these two tables.
file_table = pd.merge(dbo_dot_traffic_data_files, dot_traffic_data,
                      how='left', on='TrafficID')
file_table.head()

Unnamed: 0,ID,TrafficID,TrafficType,DocName,UniqueDocName,UploadDT,IntersectionID,lat,lon,intersection
0,1,1435,manual_count,2_GRAVDM93.pdf,2_GRAVDM93.pdf,2007-04-02 08:38:30,147.0,33.95,-118.39,LA TIJERA BLVD AT SEPULVEDA EASTWAY
1,2,1436,manual_count,4_CULVIS95.pdf,4_CULVIS95.pdf,2008-02-20 09:15:12,168.0,33.95,-118.39,WESTCHESTER PKWY AT SEPULVEDA EASTWAY
2,3,1436,manual_count,4_MONCUL100928.pdf,4_MONCUL100928.pdf,2011-08-09 13:58:55,168.0,33.95,-118.39,WESTCHESTER PKWY AT SEPULVEDA EASTWAY
3,4,1437,manual_count,16_VISTA DEL MAR.WATERVIEW07.pdf,16_VISTA DEL MAR.WATERVIEW07.pdf,2007-11-28 13:01:46,169.0,33.95,-118.44,PERSHING DR AT MANCHESTER AVE
4,5,1437,manual_count,16_visvis01.pdf,16_visvis01.pdf,2007-12-03 16:30:42,169.0,33.95,-118.44,PERSHING DR AT MANCHESTER AVE


In [144]:
# Attach the file/intersection record to every count: count_id in the info table is
# matched against ID in the combined file table; counts without a match are kept.
info_df_merge = pd.merge(info_df, file_table,
                         how='left',
                         left_on='count_id',
                         right_on='ID')

# Row count of the joined table, followed by the table itself
print(len(info_df_merge))
info_df_merge

2119


Unnamed: 0.1,Unnamed: 0,count_id,date,dayofweek,district,hours,school_day,street_ew,street_ns,weather,ID,TrafficID,TrafficType,DocName,UniqueDocName,UploadDT,IntersectionID,lat,lon,intersection
0,0,68,2011-04-25,MONDAY,YES,7-10AM 2-5PM,YES,65th PL,VAN NESS AV,SUNNY,68,1477,manual_count,739_VANNESS.65PL110425.pdf,739_VANNESS.65PL110425.pdf,2013-05-08 17:31:58,832.0,33.97,-118.31,FLORENCE AVE at ST ANDREWS PL
1,1,70,2011-09-08,THURSDAY,YES,7-10AM 2-5PM,YES,FLORENCE AV.,KANSAS AV.,SUNNY,70,1478,manual_count,802_FLORENCE.KANSAS110908.pdf,802_FLORENCE.KANSAS110908.pdf,2013-05-08 17:37:55,918.0,33.96,-118.30,WESTERN AVE AT 78TH PL
2,2,89,2007-07-23,MONDAY,YES,7-10AM 3-6PM,YES,FLORENCE AV,WEST ST,SUNNY,89,1493,manual_count,1067_FLORENCE.WEST07.pdf,1067_FLORENCE.WEST07.pdf,2007-07-27 21:55:54,1160.0,33.99,-118.28,55TH ST at FIGUEROA ST
3,3,112,2010-03-08,MONDAY,YES,7-10AM 2-5PM,YES,78TH ST,SAN PEDRO ST,SUNNY,112,1510,manual_count,1461_SANPEDRO.78ST10.pdf,1461_SANPEDRO.78ST10.pdf,2010-03-15 15:48:02,1543.0,33.96,-118.28,79TH ST at FIGUEROA ST
4,4,142,2010-04-23,FRIDAY,YES,7-10AM 3-6PM,YES,92ND ST,GRAHAM AV,SUNNY,142,1530,manual_count,1986_GRAHAM.92ST10.pdf,1986_GRAHAM.92ST10.pdf,2010-05-04 12:36:26,2318.0,33.95,-118.28,92ND ST at FIGUEROA ST
5,5,151,2010-03-10,WEDNESDAY,YES,7-10AM 2-5PM,YES,110TH ST,FIGUEROA ST,SUNNY,151,1537,manual_count,2476_FIGUEROA.110ST10.pdf,2476_FIGUEROA.110ST10.pdf,2010-03-18 15:13:28,2675.0,33.90,-118.28,FIGUEROA ST AT ROSECRANS AVE
6,6,168,2009-06-08,MONDAY,YES,7-10AM 2-5PM,YES,182ND ST,EVELYN AV,SUNNY,168,1549,manual_count,2996_EVELYN.182ST09.pdf,2996_EVELYN.182ST09.pdf,2009-06-17 15:09:32,3203.0,33.79,-118.27,DOLORES ST at WILMINGTON BLVD
7,7,271,2009-11-02,MONDAY,YES,7-10AM 3-6PM,YES,JEFFERSON BL,SOMERSET DR,SUNNY,271,47,manual_count,4922_JEFSOM091102.pdf,4922_JEFSOM091102.pdf,2011-04-28 14:08:35,4922.0,34.02,-118.33,JEFFERSON BLVD AT SOMERSET DR
8,8,282,2013-04-30,TUESDAY,YES,7-10AM 3-6PM,YES,COLISEUM ST.,BRONSON AV. / MALL DRIVEWAY,SUNNY,282,54,manual_count,5009_BRONSON.COLISEUM.130430.MAN.pdf,5009_BRONSON.COLISEUM.130430.MAN.pdf,2013-05-10 08:06:58,5009.0,34.01,-118.33,COLISEUM ST AT BRONSON AVE
9,9,321,2012-08-29,WEDNESDAY,YES,7-10AM 3-6PM,YES,JEFFERSON BL.,FIGUEROA ST.,SUNNY,321,89,manual_count,5772_FIGUEROA.JEFFERSON120829.pdf,5772_FIGUEROA.JEFFERSON120829.pdf,2013-04-16 14:40:59,5772.0,34.02,-118.27,FIGUEROA ST AT JEFFERSON BLVD


In [145]:
# Read the BOE intersections shapefile into a GeoDataFrame
boe_int = gp.read_file('shp/Intersections.shp')
# Preview the first rows
boe_int.head()

Unnamed: 0,OBJECTID,ASSETID,CL_NODE_ID,X,Y,LAT,LON,TYPE,CRTN_DT,LST_MODF_D,USER_ID,FROM_ST,TO_ST,TOOLTIP,ZIP_CODE,NLA_URL,geometry
0,1001,96967,13589,6392995.0,1839536.0,34.045852,-118.556794,,,,,PASEO MIRAMAR,ESPERA AVE,PASEO MIRAMAR at ESPERA AVE,90272.0,navigatela/reports/intersection_report.cfm?pk=...,POINT (-118.5568057440078 34.04585714775025)
1,1002,96968,13634,6406002.0,1841805.0,34.052279,-118.513895,,,,,VILLA WOODS PL,D/E,VILLA WOODS PL at D/E,90272.0,navigatela/reports/intersection_report.cfm?pk=...,POINT (-118.5139074746574 34.0522839749969)
2,1003,96969,13660,6398905.0,1841108.0,34.050261,-118.537311,,,,,ALCIMA AVE,LAS LOMAS AVE,ALCIMA AVE at LAS LOMAS AVE,90272.0,navigatela/reports/intersection_report.cfm?pk=...,POINT (-118.5373235321818 34.0502655043602)
3,1004,96970,15007,6480819.0,1861803.0,34.108027,-118.26708,,,,,WEST SILVER LAKE DR,ROWENA AVE,WEST SILVER LAKE DR at ROWENA AVE,90039.0,navigatela/reports/intersection_report.cfm?pk=...,POINT (-118.267092077783 34.10803167807186)
4,1005,96971,15024,6479671.0,1861156.0,34.106243,-118.270866,,,,,ANGUS ST,PANORAMA TER,ANGUS ST at PANORAMA TER,90039.0,navigatela/reports/intersection_report.cfm?pk=...,POINT (-118.2708777616466 34.10624778706446)


In [146]:
# Join the BOE intersection attributes (LAT, LON, geometry, ...) onto the count info
# table by matching IntersectionID against the BOE CL_NODE_ID; unmatched counts are kept.
#
# The result is stored back into info_df_merge, so the join is guarded: running this
# cell a second time would otherwise merge the BOE columns again, pandas would suffix
# the duplicated columns with _x/_y, and the later 'LAT'/'LON' lookups would fail.
if 'CL_NODE_ID' not in info_df_merge.columns:
    info_df_merge = info_df_merge.merge(boe_int,
                                        how='left',
                                        left_on='IntersectionID',
                                        right_on='CL_NODE_ID')

# Row count after the join, then a bounded preview instead of the whole frame
print(len(info_df_merge))
info_df_merge.head(10)

2119


Unnamed: 0.1,Unnamed: 0,count_id,date,dayofweek,district,hours,school_day,street_ew,street_ns,weather,...,TYPE,CRTN_DT,LST_MODF_D,USER_ID,FROM_ST,TO_ST,TOOLTIP,ZIP_CODE,NLA_URL,geometry
0,0,68,2011-04-25,MONDAY,YES,7-10AM 2-5PM,YES,65th PL,VAN NESS AV,SUNNY,...,,,,,FLORENCE AVE,ST ANDREWS PL,FLORENCE AVE at ST ANDREWS PL,90047.0,navigatela/reports/intersection_report.cfm?pk=832,POINT (-118.3111801675238 33.97457344728143)
1,1,70,2011-09-08,THURSDAY,YES,7-10AM 2-5PM,YES,FLORENCE AV.,KANSAS AV.,SUNNY,...,,,,,WESTERN AVE,78TH PL,WESTERN AVE at 78TH PL,90047.0,navigatela/reports/intersection_report.cfm?pk=918,POINT (-118.3090085639863 33.96816881798583)
2,2,89,2007-07-23,MONDAY,YES,7-10AM 3-6PM,YES,FLORENCE AV,WEST ST,SUNNY,...,,,,,FIGUEROA ST,55TH ST,FIGUEROA ST at 55TH ST,90037.0,navigatela/reports/intersection_report.cfm?pk=...,POINT (-118.282681356782 33.99238429234652)
3,3,112,2010-03-08,MONDAY,YES,7-10AM 2-5PM,YES,78TH ST,SAN PEDRO ST,SUNNY,...,,,,,FIGUEROA ST,79TH ST,FIGUEROA ST at 79TH ST,90044.0,navigatela/reports/intersection_report.cfm?pk=...,POINT (-118.2827015482146 33.96787104496666)
4,4,142,2010-04-23,FRIDAY,YES,7-10AM 3-6PM,YES,92ND ST,GRAHAM AV,SUNNY,...,,,,,FIGUEROA ST,92ND ST,FIGUEROA ST at 92ND ST,90003.0,navigatela/reports/intersection_report.cfm?pk=...,POINT (-118.2827013155509 33.95278864473218)
5,5,151,2010-03-10,WEDNESDAY,YES,7-10AM 2-5PM,YES,110TH ST,FIGUEROA ST,SUNNY,...,,,,,FIGUEROA ST,ROSECRANS AVE,FIGUEROA ST at ROSECRANS AVE,90248.0,navigatela/reports/intersection_report.cfm?pk=...,POINT (-118.2827605468675 33.90189835602986)
6,6,168,2009-06-08,MONDAY,YES,7-10AM 2-5PM,YES,182ND ST,EVELYN AV,SUNNY,...,,,,,WILMINGTON BLVD,DOLORES ST,WILMINGTON BLVD at DOLORES ST,90744.0,navigatela/reports/intersection_report.cfm?pk=...,POINT (-118.2751224082636 33.79337025919423)
7,7,271,2009-11-02,MONDAY,YES,7-10AM 3-6PM,YES,JEFFERSON BL,SOMERSET DR,SUNNY,...,,,,,SOMERSET DR,JEFFERSON BLVD,SOMERSET DR at JEFFERSON BLVD,90016.0,navigatela/reports/intersection_report.cfm?pk=...,POINT (-118.337377284302 34.02557825898255)
8,8,282,2013-04-30,TUESDAY,YES,7-10AM 3-6PM,YES,COLISEUM ST.,BRONSON AV. / MALL DRIVEWAY,SUNNY,...,,,,,COLISEUM ST,BRONSON AVE,COLISEUM ST at BRONSON AVE,90008.0,navigatela/reports/intersection_report.cfm?pk=...,POINT (-118.3339546752218 34.01823342519442)
9,9,321,2012-08-29,WEDNESDAY,YES,7-10AM 3-6PM,YES,JEFFERSON BL.,FIGUEROA ST.,SUNNY,...,,,,,JEFFERSON BLVD,FIGUEROA ST,JEFFERSON BLVD at FIGUEROA ST,90007.0,navigatela/reports/intersection_report.cfm?pk=...,POINT (-118.2799898748333 34.02192125171941)


In [147]:
# Upper bound on the number of markers drawn on the map.
# NOTE(review): the original cell hard-coded 400 — presumably to keep the rendered
# map light; confirm before raising it.
MAX_MARKERS = 400

# Create new LA basemap specifying map center, zoom level, and using Stamen Toner tiles
count_map = folium.Map([34.109279, -118.266087],
                       tiles='Stamen Toner',
                       zoom_start=11)

# Keep only the rows that have both coordinate values
info_df_merge = info_df_merge.dropna(subset=['LAT', 'LON'])

locationlist = info_df_merge[['LAT', 'LON']].values.tolist()
labels = info_df_merge["DocName"].values.tolist()

print(len(labels))
print(len(locationlist))

# Add one marker per count location, using the source document name as the popup text.
# Slicing caps the loop at MAX_MARKERS and also keeps it from indexing past the end
# of the lists when fewer rows are available.
# Popup rendering issues: see https://github.com/python-visualization/folium/issues/726
for coords, label in zip(locationlist[:MAX_MARKERS], labels[:MAX_MARKERS]):
    # str() so that a non-string label cannot reach folium.Popup
    popup = folium.Popup(str(label), parse_html=True)
    folium.Marker(coords, popup=popup).add_to(count_map)

# Show the map
count_map


1987
1987


In [148]:
import folium
from folium.plugins import MarkerCluster

# Diagnostic output: which folium installation and version the kernel has loaded
print(folium.__file__, folium.__version__, sep='\n')

C:\Users\Tim\Anaconda3\lib\site-packages\folium\__init__.py
0.5.0


In [149]:
# Disabled experiment: draw the points through a MarkerCluster on a dark basemap.
# NOTE(review): the snippet references `df_counters`, which is not defined anywhere in
# the visible notebook, and its map center (38.9, -77.05) differs from the center used
# for count_map — it needs adapting before it can be re-enabled.
#map2 = folium.Map(location=[38.9, -77.05], tiles='CartoDB dark_matter', zoom_start=11)

#marker_cluster = MarkerCluster().add_to(map2)

#for point in range(0, len(locationlist)):
#    folium.Marker(locationlist[point], popup=df_counters['Name'][point]).add_to(marker_cluster)
#map2

In [150]:
import pandas as pd

from io import StringIO

# Sample data set of cafes (id, coordinates, name, street) encoded as a JSON document,
# one top-level key per column. The literal is assembled from adjacent string pieces so
# that no piece spans a physical line break (the "Delessio Market & Bakery" entry was
# previously split across two lines inside the quoted string).
data = (
    '{"id":{"0":411083201,"1":418513660,"2":528057543,"3":586713622,"4":647656728,"5":647656785,"6":1493456455,"7":1493456487,"8":2005894602,"9":2095344770,"10":2187987262,"11":2411692096,"12":2411698457,"13":2474058013,"14":2474058022,"15":2881830804,"16":2895323815,"17":2895323816,"18":2983919102,"19":3321734312,"20":3641568148,"21":4010355532,"22":4030622426,"23":4037746568,"24":4055719117,"25":4259001279,"26":4340535594,"27":4625994189,"28":4666687025},'
    '"Latitude":{"0":37.7673162,"1":37.7645003,"2":37.7682118,"3":37.7648492,"4":37.771672,"5":37.7721693,"6":37.7763591,"7":37.777046,"8":37.7690716,"9":37.766319,"10":37.7664253,"11":37.7770377,"12":37.7763106,"13":37.7739857,"14":37.774748,"15":37.775488,"16":37.7752166,"17":37.7759142,"18":37.7744588,"19":37.7760172,"20":37.7762395,"21":37.765011,"22":37.769195,"23":37.7750452,"24":37.7726713,"25":37.7717782,"26":37.7745253,"27":37.7768943,"28":37.7752822},'
    '"Longitude":{"0":-122.4219479,"1":-122.4216812,"2":-122.4223857,"3":-122.4320119,"4":-122.4331366,"5":-122.4307254,"6":-122.4180877,"7":-122.4172737,"8":-122.4277243,"9":-122.417422,"10":-122.4290387,"11":-122.4175698,"12":-122.4232558,"13":-122.424226,"14":-122.4226877,"15":-122.4159,"16":-122.4195185,"17":-122.4191912,"18":-122.4205881,"19":-122.4314951,"20":-122.4168763,"21":-122.4226685,"22":-122.4315398,"23":-122.4210561,"24":-122.4220741,"25":-122.4167145,"26":-122.4306476,"27":-122.4245402,"28":-122.4161381},'
    '"Cafe Name":{"0":"Four Barrel Coffee","1":"Muddy Waters","2":"Carlin\'s Cafe","3":"Peet\'s Coffee & Tea","4":"Nectar","5":"Cafe International","6":"Ma\'velous","7":"Starbucks","8":"Starbucks","9":"Flying Pig Bistro","10":"Church Street Cafe","11":"Anderson Bakery","12":"Blue Bottle Coffee","13":"mercury cafe","14":"gourmet and more","15":"Cumaica","16":"All Star Cafe","17":"Boston Cafe","18":"Javalencia Cafe","19":"Alamo Square Cafe","20":"Blue Bottle Coffee","21":"Stanza","22":"Duboce Park Cafe","23":"Eden Cafe","24":"Delessio Market & Bakery","25":"Gaslamp Cafe","26":"The Center SF Tea House","27":"Cafe la Vie","28":"Peet\'s Coffee"},'
    '"Street":{"0":"Valencia Street","1":"Valencia Street","2":"Valencia Street","3":"Market Street","4":"Haight Street","5":"Haight Street","6":"Market Street","7":"Market Street","8":"Market Street","9":"South Van Ness Avenue","10":null,"11":null,"12":null,"13":"Octavia Street","14":null,"15":"Mission Street","16":"Market Street","17":"Van Ness Avenue","18":"Market Street","19":null,"20":"Market Street","21":null,"22":"Sanchez Street","23":"Franklin Street","24":"Market Street","25":"Howard Street","26":"Fillmore Street","27":"Octavia Street","28":null}}'
)

# Parse the JSON through a file-like object: passing the literal string straight to
# read_json is deprecated in recent pandas releases, while StringIO works everywhere.
df = pd.read_json(StringIO(data))

# (mean latitude, mean longitude) of the sample points
location = df['Latitude'].mean(), df['Longitude'].mean()

import folium

# Popup test: reuse the sample's marker/popup code, but plot the count locations from
# info_df_merge (LAT/LON, labelled with DocName) instead of the cafe sample.
locationlist = info_df_merge[['LAT','LON']].values.tolist()
labels = info_df_merge["DocName"].values.tolist()

# Only the first 20 points are drawn; slicing also copes with fewer than 20 rows
plotted_locations = locationlist[:20]
plotted_labels = labels[:20]

# Center the map on the points that are actually drawn. Centering on the cafe sample
# would open the map far away from these markers, so none of them would be visible.
if plotted_locations:
    point_lats, point_lons = zip(*plotted_locations)
    map_center = [sum(point_lats) / len(point_lats), sum(point_lons) / len(point_lons)]
else:
    # Nothing to plot: fall back to the LA center used for count_map
    map_center = [34.109279, -118.266087]

m = folium.Map(location=map_center, zoom_start=14)
for coords, label in zip(plotted_locations, plotted_labels):
    # str() so that a non-string label cannot reach folium.Popup
    popup = folium.Popup(str(label), parse_html=True)
    folium.Marker(coords, popup=popup).add_to(m)

# Adjust the view to the south-west / north-east corners of the plotted points
# (assumes rows with missing coordinates were already removed from info_df_merge)
if plotted_locations:
    m.fit_bounds([[min(point_lats), min(point_lons)],
                  [max(point_lats), max(point_lons)]])

m

