# Crawl & Scrape
Useful tutorial: https://www.youtube.com/watch?v=XjNm9bazxn8&index=5&list=WL  

Target site(s):
http://fixmystreet.ie/reports

In [12]:
import datetime
import json
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [13]:
# Entry point of the crawl: the page that lists every report area.
base_url = 'http://fixmystreet.ie/reports'
start_url = base_url

In [14]:
def links(url, target_string, exclude_string=None, timeout=30):
    """Return a list of the top level links.

    Fetches ``url`` and inspects every ``<a>`` tag found inside
    ``<td class="title" style="text-align: left">`` cells. An href is kept
    when it contains ``target_string`` and, if ``exclude_string`` is given,
    does not contain ``exclude_string``.

    Args:
        url (str): URL to pull html links from.
        target_string (str): Substring an href must contain to be kept.
        exclude_string (str, optional): Substring that disqualifies an href.
            Defaults to None, meaning nothing is excluded.
        timeout (float, optional): Seconds to wait for the server before
            giving up. Defaults to 30.

    Returns:
        list: Matching hrefs (str), in document order.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including when the timeout expires).

    """
    # Without a timeout a stalled server would block the whole crawl.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')

    links_list = []

    for cell in soup.find_all('td', {'class': 'title', 'style': 'text-align: left'}):
        for link in cell.find_all('a'):
            href = link.get('href')
            # Anchors without an href cannot match anything.
            if href is None:
                continue
            if target_string not in href:
                continue
            if exclude_string is not None and exclude_string in href:
                continue
            links_list.append(href)

    return links_list

In [15]:
# Preview the hrefs containing '/reports/' that links() finds on the start page.
links(start_url, '/reports/')

['http://fixmystreet.ie/reports/Carlow',
 'http://fixmystreet.ie/reports/Cavan',
 'http://fixmystreet.ie/reports/Clare',
 'http://fixmystreet.ie/reports/Cork+City',
 'http://fixmystreet.ie/reports/Cork+County',
 'http://fixmystreet.ie/reports/Donegal',
 'http://fixmystreet.ie/reports/Dublin+City',
 'http://fixmystreet.ie/reports/D%C3%BAn+Laoghaire-Rathdown',
 'http://fixmystreet.ie/reports/Fingal',
 'http://fixmystreet.ie/reports/Galway+City',
 'http://fixmystreet.ie/reports/Galway+County',
 'http://fixmystreet.ie/reports/Kerry',
 'http://fixmystreet.ie/reports/Kildare',
 'http://fixmystreet.ie/reports/Kilkenny',
 'http://fixmystreet.ie/reports/Laois',
 'http://fixmystreet.ie/reports/Leitrim',
 'http://fixmystreet.ie/reports/Limerick+City',
 'http://fixmystreet.ie/reports/Limerick+County',
 'http://fixmystreet.ie/reports/Longford',
 'http://fixmystreet.ie/reports/Louth',
 'http://fixmystreet.ie/reports/Mayo',
 'http://fixmystreet.ie/reports/Meath',
 'http://fixmystreet.ie/reports/Monag

In [34]:
def scraper(url, timeout=30):
    """Return a dictionary of info for the requested report URL.

    The report id is taken from the trailing digits of ``url``. Title,
    category and comment are read from the page's
    ``<div class="problem-header cf">`` element; any field that cannot be
    found on the page is returned as None.

    Args:
        url (str): Report URL to scrape. Must end in the numeric report id,
            e.g. ``http://fixmystreet.ie/report/27192``.
        timeout (float, optional): Seconds to wait for the server before
            giving up. Defaults to 30.

    Returns:
        dict: Single-entry dict keyed by the report id (str). The value is a
            dict with keys 'id', 'title', 'category' and 'comment'.

    Raises:
        ValueError: If ``url`` does not end in digits.
        requests.exceptions.RequestException: If the page cannot be fetched
            (including when the timeout expires).

    """
    # Validate the URL before spending a network round trip on it.
    id_match = re.search(r'(\d+$)', url)
    if id_match is None:
        raise ValueError('URL does not end in a numeric report id: {!r}'.format(url))
    id_num = id_match.group(1)

    # Without a timeout a stalled server would block the whole crawl.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')

    title = None
    category = None
    comment = None

    # All three fields live in the same header element; look it up once.
    header = soup.find('div', attrs={'class': 'problem-header cf'})
    if header is not None:
        heading = header.find('h1')
        if heading is not None:
            title = heading.get_text()

        emphasis = header.find('em')
        if emphasis is not None:
            category_match = re.search(r'in the (.+) category', emphasis.get_text())
            if category_match is not None:
                category = category_match.group(1)

        # The comment is the text following the first newline of the third
        # <p> in the header.
        paragraphs = header.find_all('p')
        if len(paragraphs) > 2:
            comment_match = re.search(r'\n(.+)', paragraphs[2].get_text())
            if comment_match is not None:
                comment = comment_match.group(1)

    info = {}
    info[id_num] = {'id': id_num,
                    'title': title,
                    'category': category,
                    'comment': comment
                   }

    return info

# Test Single Report

In [35]:
# Scrape one report and display the result as a DataFrame with a fresh 0-based index.
pd.DataFrame.from_dict(scraper('http://fixmystreet.ie/report/27192'), orient='index').reset_index(drop=True)

Unnamed: 0,id,title,category,comment
0,27192,Trees and number of heavy branches down,Tree and Grass Maintenance,Due to the high winds last week there are a nu...


In [38]:
def links_ul(url, target_string, exclude_string=None, timeout=30):
    """Return a list of the second level links.

    Fetches ``url`` and collects hrefs from every ``<a>`` inside
    ``<ul class="issue-list-a">`` elements. While the page contains
    ``<p class="pagination">`` with an ``<a class="next">`` link, that link
    is followed and the next page is processed the same way.

    An href is kept when it contains ``target_string`` and, if
    ``exclude_string`` is given, does not contain ``exclude_string``.

    Args:
        url (str): URL to pull html links from.
        target_string (str): Substring an href must contain to be kept.
        exclude_string (str, optional): Substring that disqualifies an href.
            Defaults to None, meaning nothing is excluded.
        timeout (float, optional): Seconds to wait for each page before
            giving up. Defaults to 30.

    Returns:
        list: Matching hrefs (str). Links from later pages come before
            links from earlier pages; within a page they are in document
            order.

    Raises:
        requests.exceptions.RequestException: If any page cannot be fetched
            (including when the timeout expires). Failures on follow-up
            pages are raised rather than silently dropping those pages.

    """
    pages = []       # One list of hrefs per fetched page, in fetch order.
    visited = set()  # Guards against a "next" link that loops back.
    page_url = url

    while True:
        visited.add(page_url)

        # Without a timeout a stalled server would block the whole crawl.
        response = requests.get(page_url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'lxml')

        page_links = []
        for ul_item in soup.find_all('ul', {'class': 'issue-list-a'}):
            for link in ul_item.find_all('a'):
                href = link.get('href')
                # Anchors without an href cannot match anything.
                if href is None:
                    continue
                if target_string not in href:
                    continue
                if exclude_string is not None and exclude_string in href:
                    continue
                page_links.append(href)
        pages.append(page_links)

        # Look for a further page; stop when there is none.
        pagination = soup.find('p', {'class': 'pagination'})
        next_anchor = None
        if pagination is not None:
            next_anchor = pagination.find('a', {'class': 'next'})
        next_href = next_anchor.get('href') if next_anchor is not None else None
        if next_href is None:
            break

        # Resolve against the current page so relative hrefs also work.
        next_url = urljoin(page_url, next_href)
        if next_url in visited:
            break
        page_url = next_url

    # Keep the ordering callers have always received: last page first.
    links_list = []
    for page_links in reversed(pages):
        links_list.extend(page_links)

    return links_list


# Main Loop

In [37]:
# Main scraping loop.
# Walks every area page listed on start_url, then every report linked from
# each area page, and accumulates the scraped records in `info`.

info = {}
count = 0
start_time = time.time()

# Area pages: hrefs containing 'reports/' but no query string.
top_level_links = links(start_url, 'reports/', '?')

for area_url in top_level_links:
    # Individual report pages linked from this area.
    for report_url in links_ul(area_url, 'report/'):
        # scraper() returns a one-entry dict keyed by report id; merge it in.
        info.update(scraper(report_url))
        count += 1

        # Progress report every 100 records.
        if count % 100 == 0:
            print(count)
            print('{:.2f} min elapsed'.format((time.time() - start_time) / 60))

        # Take a break to not hammer the site.
        time.sleep(1)


print('Completed download of {} records in {:.2f} minutes!'.format(count, (time.time() - start_time) / 60))

AttributeError: 'NoneType' object has no attribute 'find'

In [None]:
# Save to csv, with a timestamp in the file name.
# NOTE(review): the 'au' in the file name prefix looks carried over from a
# scraper for a different site; the target here is fixmystreet.ie. Confirm
# before renaming, since other tooling may rely on the name.
current_datetime = datetime.datetime.now()
timestamp = current_datetime.strftime("%Y-%m-%d_%H-%M")
output_name = ''.join(['fixmystreet_list_au_', timestamp, '.csv'])

# One row per scraped report, renumbered from 0.
records = pd.DataFrame.from_dict(info, orient='index').reset_index(drop=True)
records.to_csv(output_name, index_label='index')
# pd.DataFrame.from_dict(scraper('http://www.fixmystreet.org.au/report/1399'), orient='index').reset_index(drop=True)