Skip to content

Commit

Permalink
Add initial helpers for scraping sugarloaf report pages
Browse files Browse the repository at this point in the history
Works on #3 and #4
  • Loading branch information
abkfenris committed Nov 30, 2016
1 parent 469cb25 commit c4c789f
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 0 deletions.
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ Flask-Migrate==2.0.0
itsdangerous==0.24
cssmin==0.2.0
jsmin==2.2.1
requests==2.11.1
beautifulsoup4==4.5.1
dateparser==0.5.0
lxml==3.6.4

# Testing
pytest==3.0.4
Expand Down
Empty file added sugarloaf/helpers/__init__.py
Empty file.
131 changes: 131 additions & 0 deletions sugarloaf/helpers/scrape_sugarloaf_lifts_trails.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import requests
from bs4 import BeautifulSoup
import dateparser

# Page fetched by the script entry point at the bottom of this module; the
# lift and trail parsers in this file are run against its HTML.
URL = 'http://sugarloaf.com/the-mountain/trails-and-lifts'


def trail_name(trail):
    """Return the first child of the trail element, used as the trail's name.

    Raises AttributeError, carrying the offending element as its argument,
    when the lookup fails with an AttributeError (for example the object has
    no ``contents``).
    """
    try:
        first_child = trail.contents[0]
    except AttributeError:
        # Re-raise with the element attached so the bad markup can be inspected
        raise AttributeError(trail)
    return first_child


def trail_status(trail):
    """Return True when the trail is open.

    A trail counts as closed when its class list contains either 'closed'
    or 'snowmaking-closed'; anything else is reported as open.
    """
    css_classes = trail.attrs['class']
    return 'closed' not in css_classes and 'snowmaking-closed' not in css_classes

def trail_snowmaking(trail):
    """Return True when snowmaking is in progress on the trail.

    Snowmaking is signalled by either the 'snowmaking-closed' or the
    'snowmaking-open' class on the trail element.
    """
    css_classes = trail.attrs['class']
    snowmaking_flags = ('snowmaking-closed', 'snowmaking-open')
    return any(flag in css_classes for flag in snowmaking_flags)

# CSS classes that encode a trail's difficulty rating.
# NOTE: the (misspelled) name is kept as-is because other code may import it.
dificulty = {'beginner', 'intermediate', 'double-black', 'black', 'terrain-park'}


def trail_difficulty(trail):
    """Return the difficulty class of the given trail.

    The trail's class list is scanned in order and the first class that is a
    known difficulty is returned. Scanning in order (rather than indexing
    into a set intersection) keeps the result deterministic when an element
    carries more than one difficulty class, e.g. both 'black' and
    'terrain-park'.

    Raises IndexError, carrying the offending element as its argument, when
    none of the trail's classes is a known difficulty.
    """
    for css_class in trail.attrs['class']:
        if css_class in dificulty:
            return css_class
    raise IndexError(trail)


def trail_groomed(trail):
    """Return True when the trail element carries the 'groomed' class."""
    css_classes = trail.attrs['class']
    return 'groomed' in css_classes


def trail_terrain_park(trail):
    """Return True when the trail element carries the 'terrain-park' class."""
    css_classes = trail.attrs['class']
    return 'terrain-park' in css_classes


def trail_area(trail):
    """Return the area of the mountain that the trail belongs to.

    The area is taken from the first child of the closest preceding sibling
    ``h3`` element.

    Raises AttributeError, carrying the trail element as its argument, when
    the lookup fails with an AttributeError (for example no preceding ``h3``
    was found).
    """
    try:
        heading = trail.find_previous_sibling('h3')
        area = heading.contents[0]
    except AttributeError:
        # Re-raise with the element attached so the bad markup can be inspected
        raise AttributeError(trail)
    return area


def update_trails(soup):
    """Yield one dict per Sugarloaf trail found in the parsed page.

    ``soup`` is the parsed trails-and-lifts page. Trails are the ``div``
    elements with class 'trail' inside the ``div`` with class 'trail-status'.

    Each yielded dict has the keys 'name', 'open', 'difficulty', 'groomed',
    'terrain-park', 'area' and 'snowmaking', filled in by the corresponding
    ``trail_*`` helpers.

    Because this is a generator, lookup errors (including a missing
    'trail-status' container) surface when it is iterated, not when it is
    called.
    """
    trail_status_div = soup.find('div', {'class': 'trail-status'})

    # attrs must be a dict: the previous set literal {'class', 'trail'} was
    # interpreted as "class is 'class' OR 'trail'" and only worked by accident.
    all_trail_divs = trail_status_div.find_all('div', {'class': 'trail'})

    for trail_div in all_trail_divs:
        yield {
            'name': trail_name(trail_div),
            'open': trail_status(trail_div),
            'difficulty': trail_difficulty(trail_div),
            'groomed': trail_groomed(trail_div),
            'terrain-park': trail_terrain_park(trail_div),
            'area': trail_area(trail_div),
            'snowmaking': trail_snowmaking(trail_div),
        }


def lift_name(lift):
    """Return the first child of the lift element, used as the lift's name."""
    children = lift.contents
    return children[0]


# CSS classes that encode a lift's operating status.
statuses = {'open', 'closed'}


def lift_status(lift):
    """Return the lift's status class ('open' or 'closed').

    Raises IndexError, carrying the lift element as its argument, when the
    element has neither status class.
    """
    matched = set(lift.attrs['class']).intersection(statuses)
    if not matched:
        raise IndexError(lift)
    return list(matched)[0]

def update_lifts(soup):
    """Yield one dict per Sugarloaf lift found in the parsed page.

    ``soup`` is the parsed trails-and-lifts page. Lifts are the ``div``
    elements with class 'lift' inside the ``div`` with class 'lift-status'.

    Each yielded dict has the keys 'name' and 'status', filled in by
    ``lift_name`` and ``lift_status``.

    Because this is a generator, lookup errors (including a missing
    'lift-status' container) surface when it is iterated, not when it is
    called.
    """
    div_lift_status = soup.find('div', {'class': 'lift-status'})
    lifts_divs = div_lift_status.find_all('div', {'class': 'lift'})

    for lift_div in lifts_divs:
        yield {
            'name': lift_name(lift_div),
            'status': lift_status(lift_div),
        }


def update_time(soup):
    """Return when the lift and trail report was last updated.

    The timestamp text is read from the first ``small`` element inside the
    ``div`` with class 'content--right'; the piece between the first and
    second occurrence of 'of' (or the end of the text) is handed to
    ``dateparser.parse``, whose result is returned unchanged.
    """
    sidebar = soup.find('div', {'class': 'content--right'})
    small_tag = sidebar.find('small')
    raw_text = small_tag.contents[0]
    # Index 1 is the piece that follows the first 'of' in the text
    pieces = raw_text.strip().split('of')
    return dateparser.parse(pieces[1])


if __name__ == '__main__':
    # Script entry point: fetch the trails-and-lifts page, parse it, and dump
    # the trail/lift statuses plus the report timestamp to sugarloaf.json.
    import json

    # A timeout keeps the script from hanging forever on an unresponsive server
    r = requests.get(URL, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'lxml')

    trails = list(update_trails(soup))

    lifts = list(update_lifts(soup))

    # dateparser.parse returns None when it cannot interpret the text, so
    # guard the isoformat() call and store null in that case
    updated = update_time(soup)

    all_statuses = {
        'trails': trails,
        'lifts': lifts,
        'update datetime': updated.isoformat() if updated is not None else None,
    }

    with open('sugarloaf.json', 'w') as f:
        json.dump(all_statuses, f)
32 changes: 32 additions & 0 deletions sugarloaf/helpers/scrape_sugarloaf_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import requests
from bs4 import BeautifulSoup
import dateparser

# Address of the Sugarloaf daily report page.
# NOTE(review): nothing in the visible part of this module fetches it yet;
# presumably the helpers below are meant to parse this page - confirm.
URL = 'http://sugarloaf.com/the-mountain/daily-report'

def update_time(soup):
    """Return when the report was last updated.

    The timestamp text is read from the first ``small`` element inside the
    ``div`` with class 'content--right'; the piece between the first and
    second occurrence of 'of' (or the end of the text) is handed to
    ``dateparser.parse``, whose result is returned unchanged.
    """
    sidebar = soup.find('div', {'class': 'content--right'})
    small_tag = sidebar.find('small')
    raw_text = small_tag.contents[0]
    # Index 1 is the piece that follows the first 'of' in the text
    pieces = raw_text.strip().split('of')
    return dateparser.parse(pieces[1])


def report_text(soup):
    """Return the daily report's paragraphs as one HTML string.

    Every ``p`` element inside the ``div`` with class 'daily-report' is
    rendered with ``decode()`` and the results are concatenated in order,
    with no separator. An empty string is returned when there are no
    paragraphs.
    """
    report_div = soup.find('div', {'class': 'daily-report'})
    return ''.join(p.decode() for p in report_div.find_all('p'))


def report_reporter(soup):
    """Return the current Snow Reporter's name.

    The name is the first child of the ``strong`` element inside the
    ``div`` with class 'signature', itself inside the ``div`` with class
    'daily-report'.
    """
    report_div = soup.find('div', {'class': 'daily-report'})
    signature = report_div.find('div', {'class': 'signature'})
    name_tag = signature.find('strong')
    return name_tag.contents[0]

0 comments on commit c4c789f

Please sign in to comment.