# Crawl & Scrape
Useful tutorial: https://www.youtube.com/watch?v=XjNm9bazxn8&index=5&list=WL  

Target site(s):
http://www.fixmystreet.org.au/reports

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import datetime
import time
import pandas as pd
import re

In [2]:
# Starting url.
# The crawl begins at the reports index page; both names refer to that page.
base_url = 'http://www.fixmystreet.org.au/reports'
start_url = base_url

In [3]:
def links(url, target_string, exclude_string=None):
    """Return the hrefs on a page that contain a target substring.

    Fetches ``url``, parses the html and collects the ``href`` of every
    ``<a>`` tag whose href contains ``target_string`` and, when
    ``exclude_string`` is given, does not contain ``exclude_string``.

    Args:
        url (str): URL to pull html links from.
        target_string (str): Substring an href must contain to be kept.
        exclude_string (str, optional): Substring that disqualifies an
            href. Defaults to None, meaning nothing is excluded.

    Returns:
        list: Matching href values in document order, exactly as they
            appear in the page (duplicates are kept).

    """
    response = requests.get(url)
    # Parse the raw html text with the lxml parser.
    soup = BeautifulSoup(response.text, 'lxml')

    links_list = []

    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href cannot match; skip them explicitly rather
        # than relying on a swallowed TypeError from `in None`.
        if href is None:
            continue
        if target_string not in href:
            continue
        if exclude_string is not None and exclude_string in href:
            continue
        links_list.append(href)

    return links_list

In [4]:
def _first_group(pattern, tag):
    """Return group 1 of ``pattern`` searched in ``tag``'s text, or None.

    None is returned when ``tag`` is None or the pattern does not match.
    """
    if tag is None:
        return None
    match = re.search(pattern, tag.get_text())
    return match.group(1) if match is not None else None


def scraper(url):
    """Return a dictionary of info for the requested report URL.

    Args:
        url (str): URL of a single report page. It must end in the numeric
            report id, e.g. 'http://www.fixmystreet.org.au/report/1399'.

    Returns:
        dict: A single entry keyed by the report id (str). Its value is a
            dict with the keys 'id', 'title', 'category' and 'comment'.
            Any field that cannot be found on the page is None.

    Raises:
        AttributeError: If ``url`` does not end in digits.

    """
    # Extract the id first so a URL without one fails before any request
    # is made.
    id_num = re.search(r'(\d+$)', url).group(1)

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')

    # The title is the full text of the <h1 class="moderate-display"> tag.
    title_tag = soup.find('h1', attrs={'class': 'moderate-display'})
    title = title_tag.get_text() if title_tag is not None else None

    # The category is embedded in the meta sentence: "... in the X category".
    category = _first_group(
        r'in the (.+) category',
        soup.find('p', attrs={'class': 'report_meta_info'}),
    )

    # The comment is the first non-empty line enclosed by newlines in the
    # <div class="moderate-display"> text.
    comment = _first_group(
        r'\n(.+)\n',
        soup.find('div', attrs={'class': 'moderate-display'}),
    )

    return {
        id_num: {
            'id': id_num,
            'title': title,
            'category': category,
            'comment': comment,
        }
    }

# Test Single Report

In [5]:
# Scrape one known report and show it as a one-row table.
single_report = scraper('http://www.fixmystreet.org.au/report/1399')
pd.DataFrame.from_dict(single_report, orient='index').reset_index(drop=True)

Unnamed: 0,id,title,category,comment
0,1399,Street Light Out,General,The street light is out... again.


In [6]:
def links_li(url, target_string, exclude_string=None):
    """Return the second level links found inside a page's report list items.

    Only ``<a>`` tags nested in the report ``<li>`` items are considered.
    An href is kept when it contains ``target_string`` and, when
    ``exclude_string`` is given, does not contain ``exclude_string``.
    Recursively run if pagination found: links gathered from the following
    pages are placed ahead of the links from this page.

    Args:
        url (str): URL to pull html links from.
        target_string (str): Substring an href must contain to be kept.
        exclude_string (str, optional): Substring that disqualifies an
            href. Defaults to None, meaning nothing is excluded.

    Returns:
        list: Matching href values exactly as they appear in the pages
            (duplicates are kept).

    """
    response = requests.get(url)
    # Parse the raw html text with the lxml parser.
    soup = BeautifulSoup(response.text, 'lxml')

    links_list = []

    # Look for additional pages and recursively call.
    next_anchor = soup.find('a', {'class': 'next'})
    if next_anchor is not None:
        next_href = next_anchor.get('href')
        # A "next" anchor without an href gives nothing to follow.
        if next_href:
            links_list += links_li(next_href, target_string, exclude_string)

    # NOTE(review): the class string, including its trailing space, is kept
    # verbatim; presumably it mirrors the site's markup exactly -- confirm
    # before changing it.
    for li_item in soup.find_all('li', {'class': 'item-list__item item-list--reports__item '}):
        for link in li_item.find_all('a'):
            href = link.get('href')
            # Anchors without an href cannot match; skip them explicitly
            # rather than relying on a swallowed TypeError from `in None`.
            if href is None:
                continue
            if target_string not in href:
                continue
            if exclude_string is not None and exclude_string in href:
                continue
            links_list.append(href)

    return links_list


# Main Loop

In [7]:
# Main scraping loop.
# Walks every top level listing page, collects the report links on it
# (following pagination) and scrapes each report into `info`.

info = {}
count = 0
start_time = time.time()

# Listing pages: hrefs containing 'reports/' but no query string.
top_level_links = links(start_url, 'reports/', '?')

for top_link in top_level_links:
    # Grab next level links.
    second_level_links = links_li(top_link, 'report/')

    # Go to each link.
    for second_link in second_level_links:
        scraped_info = scraper(second_link)  # dict keyed by report id
        info.update(scraped_info)  # Merges dict

        count += 1

        # Report progress every 100 records.
        if not count % 100:
            print(count)
            minutes_elapsed = (time.time() - start_time) / 60
            print('{:.2f} min elapsed'.format(minutes_elapsed))

        # Take a break to not hammer the site.
        time.sleep(1)


total_minutes = (time.time() - start_time) / 60
print('Completed download of {} records in {:.2f} minutes!'.format(count, total_minutes))

100
4.71 min elapsed
200
10.51 min elapsed
300
15.65 min elapsed
400
22.58 min elapsed
500
29.23 min elapsed
600
35.36 min elapsed
700
40.69 min elapsed
800
47.10 min elapsed
900
52.02 min elapsed
1000
60.25 min elapsed
1100
65.94 min elapsed
1200
72.41 min elapsed
Completed download of 1247 records in 75.12 minutes!


In [None]:
# Save to csv.
# The file name carries the current local date and time, to the minute.
current_datetime = datetime.datetime.now()
timestamp = current_datetime.strftime("%Y-%m-%d_%H-%M")
output_name = ''.join(['fixmystreet_list_au_', timestamp, '.csv'])

# One row per scraped report; the report-id keys are replaced by a fresh
# 0..n-1 index, which is written under the column label 'index'.
report_table = pd.DataFrame.from_dict(info, orient='index')
report_table = report_table.reset_index(drop=True)
report_table.to_csv(output_name, index_label='index')
# pd.DataFrame.from_dict(scraper('http://www.fixmystreet.org.au/report/1399'), orient='index').reset_index(drop=True)