# Crowding on NJTransit's 119

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
# Set up Chrome to run headless (no visible browser window) and start the
# WebDriver session that the following cells use to load the page.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(
    options=chrome_options,
)

In [3]:
# Request the page and scrape the data
url="https://www.njtransit.com/my-bus-to?stopID=30189&form=stopID"
try:
    driver.get(url)
except Exception as exc:
    # Best effort: report the failure together with its cause and carry on.
    # Catching Exception (rather than a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate, so the cell can still be interrupted.
    print(f"error getting web page: {exc}")

# Collect every element whose class attribute is exactly 'media-body';
# the text of these elements is split and filtered in the following cells.
raw_rows = driver.find_elements(By.XPATH, "//div[@class='media-body']")

In [4]:
# raw data: the visible text of each scraped element, broken into its lines
split_rows = []
for element in raw_rows:
    text_lines = element.text.split('\n')
    split_rows.append(text_lines)
split_rows

[['HOBOKEN TERMINAL',
  'Bus #5865',
  'Arriving in 8 minutes',
  '9:27 PM',
  'LIGHT'],
 ['HOBOKEN TERMINAL', 'Bus #5865', 'Arriving in 8 minutes'],
 ['HOBOKEN TERMINAL',
  'Bus #5905',
  'Arriving in 34 minutes',
  '9:53 PM',
  'LIGHT'],
 ['HOBOKEN TERMINAL', 'Bus #5905', 'Arriving in 34 minutes'],
 ['Bus', 'Check All', '190 401 412 507 509 553 559'],
 ['Bus']]

In [5]:
# filtered data: keep only the entries that have exactly five text lines
# (destination, bus, ETA text, clock time, occupancy in the sample output)
filtered_rows = list(filter(lambda fields: len(fields) == 5, split_rows))
filtered_rows

[['HOBOKEN TERMINAL',
  'Bus #5865',
  'Arriving in 8 minutes',
  '9:27 PM',
  'LIGHT'],
 ['HOBOKEN TERMINAL',
  'Bus #5905',
  'Arriving in 34 minutes',
  '9:53 PM',
  'LIGHT']]

In [6]:
# dump to CSV
# columns: timestamp, stopid, destination, bus_id, eta in minutes, eta time, occupancy
# can infer route from destination
import datetime as dt
from csv import writer

# Stop ID written into every record; it is the stopID requested in the scrape URL.
stop_id = '30189'

# Append mode so repeated runs accumulate in the same file; newline='' is what
# the csv module expects so it can control line endings itself.
with open('NJT119_crowding.csv', 'a', newline='') as f_object:
    writer_object = writer(f_object)
    for destination, bus_label, eta_text, eta_time, occupancy in filtered_rows:
        # Build a fresh record instead of mutating the lists in filtered_rows
        # in place, so re-running this cell does not re-process rows that were
        # already transformed (which used to fail on the '#' split).
        record = [
            str(dt.datetime.now()),
            stop_id,
            destination,
            bus_label.split('#')[1],  # 'Bus #5865' -> '5865'
            eta_text.split(' ')[2],   # 'Arriving in 8 minutes' -> '8'
            eta_time,
            occupancy,
        ]
        writer_object.writerow(record)
        print (f"Added row: {record}")
# The with-statement closes the file on exit; no explicit close() is needed.

Added row: ['2022-05-19 21:19:24.748896', '30189', 'HOBOKEN TERMINAL', '5865', '8', '9:27 PM', 'LIGHT']
Added row: ['2022-05-19 21:19:24.749126', '30189', 'HOBOKEN TERMINAL', '5905', '34', '9:53 PM', 'LIGHT']


# TODO
- run once per minute for a day and visualize the results
- run once per minute for a list of stops in the area (jsq, central, here, palisade, weehawken)
- use NJTransitAPI.py get_route_points to make a list of every stop on the route and then fetch/parse them all once per minute
- deployment
    - lambdaize it: selenium lambda https://levelup.gitconnected.com/chromium-and-selenium-in-aws-lambda-6e7476a03d80
    - write to csv in S3?
    
- dockerize and deploy to ECS https://docs.docker.com/cloud/ecs-integration/