# Scrape MTA status archive using Selenium

Scrapes the historical MTA alert archive using Selenium. Inspired by:

https://medium.com/the-andela-way/introduction-to-web-scraping-using-selenium-7ec377a8cf72

In [None]:
# Make the MTADelayPredict project package importable from this notebook
import sys
import os

# 'AlertScraper.ipynb' is resolved against the current working directory, so
# two dirname() calls yield the parent of the working directory.
# NOTE(review): presumably that parent is the project root -- confirm layout.
_notebook_path = os.path.realpath('AlertScraper.ipynb')
_project_root = os.path.dirname(os.path.dirname(_notebook_path))
sys.path.append(_project_root)
from MTADelayPredict.utils import scrape_helpers

In [None]:
# selenium imports
from selenium import webdriver 
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException
import chromedriver_binary  # Adds chromedriver binary to path


In [None]:
import time
import pandas as pd
import re
import os
from importlib import reload
import progressbar

In [None]:
# The headless and dev-shm flags below were suggested here:
# https://github.com/heroku/heroku-buildpack-google-chrome/issues/46

# Build the Chrome options object handed to the selenium browser
option = webdriver.ChromeOptions()
for _chrome_flag in (
    "--incognito",
    "--disable-dev-shm-usage",
    "--no-sandbox",
    "--headless",
):
    option.add_argument(_chrome_flag)


### Iterate through all the data entries by clicking "next" on the MTA webform

In [None]:
# Date range to download monthly alerts over
download_start = '2019-01-01'
download_end = '2019-01-31'

# freq='M' produces one timestamp per month END that falls inside
# [download_start, download_end]; if download_end is before a month's last day
# that month is not included and the range may be empty.
# NOTE(review): newer pandas releases prefer the 'ME' alias for month-end --
# confirm against the pinned pandas version before changing it.
drange = pd.date_range(start=download_start, end=download_end, freq='M')
data_dir = '../data/raw/alerts'

# exist_ok makes this a no-op when the output directory is already there
os.makedirs(data_dir, exist_ok=True)

# Fetch alert data in monthly blocks
for m in drange:
    # First and last day of the month, formatted MM/DD/YYYY for submit_dates
    start_date = m.replace(day=1).strftime('%m/%d/%Y')
    end_date = m.strftime('%m/%d/%Y')

    # Set up a new chrome browser for this month
    # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of
    # a Service object -- confirm the installed selenium version.
    browser = webdriver.Chrome(executable_path='/usr/bin/chromedriver',
                               options=option)
    try:
        browser.get('https://m.mymtaalerts.com/archive')

        # Use helpers to submit date range and then collect data
        scrape_helpers.submit_dates(browser, start_date, end_date)
        data_rows = scrape_helpers.scrape_data(browser)

        # Write the raw monthly alerts to disk as CSV, indexed by parsed date
        data_df = pd.DataFrame(data_rows,
                               columns=['Date', 'Agency', 'Subject', 'Message'])
        data_df.index = data_df.Date.map(pd.to_datetime)
        data_df.drop(columns=['Date'], inplace=True)

        # Dates contain '/', which is not usable in a file name
        filename = 'raw_alerts_{}_{}.csv'.format(start_date, end_date)
        filename = filename.replace('/', '.')
        data_df.to_csv(os.path.join(data_dir, filename))
    finally:
        # Always release the browser, even if scraping or writing failed.
        # This is the only quit(): a second quit() after the loop would hit an
        # already-closed browser, or an undefined name when drange is empty.
        browser.quit()

    # Wait a bit to be friendly
    time.sleep(10)