# Crawl & Scrape
Useful tutorial: https://www.youtube.com/watch?v=XjNm9bazxn8&index=5&list=WL  

Target site(s):
http://www.fixmystreet.org.au/reports

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import datetime
import time
import pandas as pd
import re

In [2]:
# Starting url.
base_url = 'http://www.fixmystreet.org.au/reports'
start_url = 'http://www.fixmystreet.org.au/reports'

In [6]:
def links(url, target_string, exclude_string=None):
    """Return a list of the top level links.
    
    Args:
        url (str): URL to pull html links from.
        target_string (str): String to look for.

    Returns:
        list: List containing html links.
    
    """
    
    url = url
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'lxml') # Pull the raw html and store it as text in soup

    # Parse soup and look for links that contain '/NVRSHIPS/HULL_'.
    links_list = []
    
    for link in soup.find_all('a'):
        try:
            if exclude_string == None:
                if target_string in link.get('href'):
                    links_list.append(link.get('href'))
            elif exclude_string != None:
                if (target_string in link.get('href')) & (exclude_string not in link.get('href')):
                    links_list.append(link.get('href'))
        except:
            pass
        
    return links_list

In [4]:
top_level_links = links(start_url, 'reports/', '?')

In [5]:
top_level_links

['http://www.fixmystreet.org.au/reports/ACT',
 'http://www.fixmystreet.org.au/reports/Adelaide+City+Council',
 'http://www.fixmystreet.org.au/reports/Adelaide+Hills+Council',
 'http://www.fixmystreet.org.au/reports/Albany+City+Council',
 'http://www.fixmystreet.org.au/reports/Albury+City+Council',
 'http://www.fixmystreet.org.au/reports/Alexandrina+Council',
 'http://www.fixmystreet.org.au/reports/Alice+Springs+Town+Council',
 'http://www.fixmystreet.org.au/reports/Alpine+Shire+Council',
 'http://www.fixmystreet.org.au/reports/Anangu+Pitjantjatjara+Yankunytjatjara',
 'http://www.fixmystreet.org.au/reports/Ararat+Rural+City+Council',
 'http://www.fixmystreet.org.au/reports/Armadale+City+Council',
 'http://www.fixmystreet.org.au/reports/Armidale+Dumaresq+Council',
 'http://www.fixmystreet.org.au/reports/Auburn+City+Council',
 'http://www.fixmystreet.org.au/reports/Aurukun+Shire+Council',
 'http://www.fixmystreet.org.au/reports/Ballarat+City+Council',
 'http://www.fixmystreet.org.au/repor

In [5]:
def scraper(url):
    """Return a dictionary of info for the requested URL.
    
    Args:
        url (str): URL to scrape.

    Returns:
        dict: Contains scraped ship info with key = ship name, and values as ship info.
    
    """
    info = {}
    
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'lxml')
    
    id_num = re.search('(\d+$)', url).group(1)
    try:
        title = soup.find('h1', attrs={'class':'moderate-display'}).get_text()
    except:
        title = None
    
    try:
        category_raw = soup.find('p', attrs={'class':'report_meta_info'}).get_text()
        category = re.search('in the (.+) category', category_raw).group(1)
    except:
        category = None
    
    try:
        comment_raw = soup.find('div', attrs={"class":"moderate-display"}).get_text()
        comment = re.search('\\n(.+)\\n', comment_raw).group(1)
    except:
        comment = None
        
    info[id_num] = {'id': id_num,
                    'title': title,
                    'category': category,
                    'comment': comment
                   }

    return info

# Test Single Report

In [None]:
pd.DataFrame.from_dict(scraper('http://www.fixmystreet.org.au/report/1399'), orient='index').reset_index(drop=True)

In [18]:
def links_li(url, target_string, exclude_string=None):
    """Return a list of the second level links.
    Recursively run if pagination found.
    
    Args:
        url (str): URL to pull html links from.
        target_string (str): String to look for.

    Returns:
        list: List containing html links.
    
    """
    
    url = url
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'lxml') # Pull the raw html and store it as text in soup

    # Parse soup and look for links that contain '/NVRSHIPS/HULL_'.
    links_list = []
    
    # Look for additional pages and recursively call.
    if soup.find('a', {'class': 'next'}):
        links_list += links_li(soup.find('a', {'class': 'next'}).get('href'), target_string, exclude_string)
    
    for li_item in soup.find_all('li', {'class': 'item-list__item item-list--reports__item '}):
        for link in li_item.find_all('a'):
            try:
                if exclude_string == None:
                    if target_string in link.get('href'):
                        links_list.append(link.get('href'))
                elif exclude_string != None:
                    if (target_string in link.get('href')) & (exclude_string not in link.get('href')):
                        links_list.append(link.get('href'))
            except:
                pass
        
    return links_list


# Main Loop

In [23]:
# Main scraping loop.
# Requires top_level_links above.

info = {}
count = 0
start_time = time.time()

for top_link in top_level_links:
    # Grab next level links.
    second_level_links = links_li(top_link, 'report/')
    
#     # TEST
#     print(second_level_links)
#     break
    
    # Go to each link.
    for second_link in second_level_links:
        scraped_info = scraper(second_link) # dict
        info.update(scraped_info) # Merges dict
        
        # Take a break to not hammer the site.
        count += 1
        if count % 100 == 0:
            print(count)
            print('{:.2f} min elapsed'.format((time.time() - start_time)/ 60))
        time.sleep(1)


print('Completed download of {} records in {:.2f} minutes!'.format(count, (time.time() - start_time)/60))

100
4.88 min elapsed
Completed download of 141 records in 6.93 minutes!
Completed download of 143 records in 7.05 minutes!
Completed download of 143 records in 7.07 minutes!
Completed download of 143 records in 7.10 minutes!
Completed download of 145 records in 7.21 minutes!
Completed download of 145 records in 7.23 minutes!
Completed download of 145 records in 7.26 minutes!
Completed download of 145 records in 7.28 minutes!
Completed download of 145 records in 7.31 minutes!
Completed download of 145 records in 7.33 minutes!
Completed download of 145 records in 7.44 minutes!
Completed download of 145 records in 7.46 minutes!
Completed download of 156 records in 8.03 minutes!
Completed download of 156 records in 8.06 minutes!
Completed download of 157 records in 8.12 minutes!
Completed download of 158 records in 8.20 minutes!
Completed download of 158 records in 8.22 minutes!
Completed download of 158 records in 8.24 minutes!
Completed download of 158 records in 8.27 minutes!
Completed 

Completed download of 452 records in 25.97 minutes!
Completed download of 452 records in 25.99 minutes!
Completed download of 452 records in 26.01 minutes!
Completed download of 452 records in 26.04 minutes!
Completed download of 452 records in 26.06 minutes!
Completed download of 452 records in 26.09 minutes!
Completed download of 452 records in 26.11 minutes!
Completed download of 452 records in 26.13 minutes!
Completed download of 452 records in 26.16 minutes!
Completed download of 452 records in 26.18 minutes!
Completed download of 452 records in 26.20 minutes!
Completed download of 452 records in 26.23 minutes!
Completed download of 452 records in 26.25 minutes!
Completed download of 452 records in 26.27 minutes!
Completed download of 452 records in 26.30 minutes!
Completed download of 452 records in 26.32 minutes!
Completed download of 452 records in 26.34 minutes!
Completed download of 452 records in 26.37 minutes!
Completed download of 452 records in 26.39 minutes!
Completed do

Completed download of 735 records in 43.70 minutes!
Completed download of 735 records in 43.73 minutes!
Completed download of 756 records in 44.78 minutes!
Completed download of 756 records in 44.80 minutes!
Completed download of 756 records in 44.82 minutes!
Completed download of 756 records in 44.85 minutes!
Completed download of 756 records in 44.87 minutes!
Completed download of 756 records in 44.89 minutes!
Completed download of 756 records in 44.92 minutes!
Completed download of 763 records in 45.26 minutes!
Completed download of 763 records in 45.28 minutes!
Completed download of 763 records in 45.31 minutes!
Completed download of 766 records in 45.47 minutes!
Completed download of 768 records in 45.59 minutes!
Completed download of 768 records in 45.61 minutes!
Completed download of 768 records in 45.63 minutes!
Completed download of 768 records in 45.66 minutes!
800
47.27 min elapsed


AttributeError: 'NoneType' object has no attribute 'group'

In [24]:
info

{'453': {'id': '453',
  'title': 'Leaking water meter',
  'category': 'General',
  'comment': "My mother in law's water meter is leaking. Can you please send someone out to fix it? Thanks"},
 '429': {'id': '429',
  'title': 'Overgrown grass',
  'category': 'General',
  'comment': 'Over grown grass around footpath starting at Goyder down Leahy Close Narrabundah.'},
 '284': {'id': '284',
  'title': 'Unsafe and unsuitable parking area',
  'category': 'General',
  'comment': 'The ACT Government is responsible for the alternate car parking area, at Bonython Primary School, and needs to be made aware of the unsafe and unsuitable condition that this area is currently in. This includes:'},
 '320': {'id': '320',
  'title': 'Crossing',
  'category': 'General',
  'comment': 'Many school kids either alone or with their parents cross this every morning and afternoon. Cars continue to drive faster than the speed limit and dangerous for those crossing the street. A crossing would benefit many pedestr

In [25]:
# Save to csv.
current_datetime = datetime.datetime.now()
output_name = 'fixmystreet_list_au_' + current_datetime.strftime("%Y-%m-%d_%H-%M") + '.csv'

pd.DataFrame.from_dict(info, orient='index').reset_index(drop=True).to_csv(output_name, index_label='index')
# pd.DataFrame.from_dict(scraper('http://www.fixmystreet.org.au/report/1399'), orient='index').reset_index(drop=True)