In [139]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import sqlite3
import time
import re

In [140]:
#return the area id from an area link
def get_area_id(link):
    return link.split("/area/")[1].split("/")[0]

In [141]:
#in case we trigger something https://softwareengineering.stackexchange.com/questions/91760/how-to-be-a-good-citizen-when-crawling-web-sites
def is_good_link(link):
    #catch nulls
    if not link:
        return False
    #catch problem words
    for word in ['delete', 'remove', 'edit', 'update', 'modify']:
        if word in link.lower():
            return False
    return True

In [142]:
def update_area(area_id, status, connector):
    '''
    Set the given area to the given status in the areas SQL table
    '''
    cursor = connector.cursor()
    try:
        cursor.execute('''INSERT INTO areas VALUES (?,?)''', (area_id, status))
    except sqlite3.IntegrityError:
        cursor.execute('''UPDATE areas SET finished=? WHERE area_id=?;''', (status, area_id))
    connector.commit()
    cursor.close()

In [161]:
def mp_scrape(link, connector):
    '''
    Takes in an area page and recursively scrapes area pages listed on that page
    If the area page contains a route table, scrapes the routes in the table
    '''
    #track progress on the area
    update_area(get_area_id(link), 0, connector)
    
    #get page
    res = requests.get(link)
    try:
        res.raise_for_status()
    except:
        print(f"{link} errored out")
        return None
    
    #explore page
    area_text = BeautifulSoup(res.text, "lxml")
    sub_area_table = area_text.find_all("div", {"class":"lef-nav-row"})
    route_table = area_text.find("table", {"id":"left-nav-route-table"})

    #if there are more sub_areas, call the function recursively
    if sub_area_table:
        for div in sub_area_table:
            sub_area_link = div.find("a")["href"]
            if is_good_link(sub_area_link):
                time.sleep(30)
                mp_scrape(sub_area_link, connector)
    
    #if there is a list of routes, add each route to the database
    elif route_table:
        #create cursor to do actions on db
        cursor = connector.cursor()
        
        #go through routes in the table
        routes = [x.find_all("td")[1] for x in route_table.find_all("tr") if not x.get("id") == "left-nav-unsorted-label"]
        for route in routes:
            #make sure they're not bouldering
            if not "Boulder" in route.find("span").get("class") and is_good_link(route.find("a").get("href")):
                add_route(route.find("a")["href"], cursor)
                connector.commit()
                time.sleep(60)
            else:
                print("shoot that's a Bouldering problem gross")
        cursor.close()
    #if there are neither, this page is empty
    else:
        print(f"{link} is an empty area page")
    
    #set the area as complete
    update_area(get_area_id(link), 1, connector)

In [144]:
def add_route(routelink, cursor):
    '''
    Takes in a routelink and adds the route information to the db
    Overwrites existing info if the route is already stored
    '''
    print(f"Adding route {routelink}")
    #make some soup
    res = requests.get(routelink)
    try:
        res.raise_for_status()
    except:
        print(f"{routelink} page did not work")
        return None
    page_text = BeautifulSoup(res.text, "lxml")
    
    
    
    #all of the variables for the route
    route_id = routelink.split("/route/")[1].split("/")[0]
    areas =  "\t".join([get_area_id(a["href"]) for a in page_text.find("div", {"class": "mb-half small text-warm"}).find_all("a")[1:]])
    name = page_text.find("h1").text.strip()
    yds = page_text.find("span", {"class": "rateYDS"}).text.strip()
    
    #this pulls all of the ratings, have to pare down to risk rating
    risk_rating = page_text.find("h2", {"class": "inline-block mr-2"}).text.strip().split()[-1]
    if risk_rating == "British":
        risk_rating = 'Null'
    
    #route details
    description_table = page_text.find("table", {"class": "description-details"})
    tds = [tr.find_all("td")[1].text.strip() for tr in description_table.find_all("tr")]
    route_type = tds[0]
    fa = tds[1]
    page_views = tds[2]
    
    #put all description paragraphs into TSV form
    description = None
    if page_text.find("div", {"class":"fr-view"}):
        description = "\t".join([x.text.strip() for x in page_text.find_all('div', {'class':'fr-view'})])
        
    link = routelink
    
    try:
        cursor.execute('''INSERT INTO routes VALUES (?,?,?,?,?,?,?,?,?,?);''', (route_id, areas, name, yds, route_type, fa, page_views, description, link, risk_rating))
    except sqlite3.IntegrityError:
        print(f'{routelink} already in db')
        cursor.execute('''UPDATE routes SET areas=?, name=?, yds=?, route_type=?, fa=?, page_views=?, description=?, link=?, risk_rating=? WHERE route_id=?''', (areas, name, yds, route_type, fa, page_views, description, link, risk_rating, route_id))
    #add ratings from opinions page
    you_and_route = page_text.find("div", {"id": "you-and-route"}).find("a")
    if you_and_route.text[0] != 0 and is_good_link(you_and_route["href"]):
        time.sleep(30)
        add_suggested_ratings(you_and_route["href"], cursor, route_id)
    
    #add comments - ran into roadblock https://www.mountainproject.com/img/waitWhite.gif
    #comments are non-static elements, use selenium for static access
    """comments = page_text.find_all("table", {"class": "main-comment width100"})
    print(comments)
    if comments:
        for comment in comments:
            comment_id = comment["id"].split("-")[1]
            user_id = comment.find("div", {"class": "bio"}).a["href"].split("user/")[1].split("/")[0]
            comment_text = comment.find("div", {"class": "comment-body"}).span.text.strip()
            cursor.execute(f"INSERT INTO comments VALUES ({comment_id}, {route_id}, {user_id}, {comment_text});")"""

In [145]:
def add_suggested_ratings(ticklink, cursor, route_id):
    """
    Takes in a route stats link and adds all suggested ratings to the database
    """
    #make some soup
    res = requests.get(ticklink)
    try:
        res.raise_for_status()
    except:
        print(f"{ticklink} page did not work")
        return None
    page_text = BeautifulSoup(res.text, "lxml")
    
    tables = page_text.find_all("div", {"class": re.compile("^col-lg-2")})
    if tables:
        ratings_table = [x.find("table", {"class": "table table-striped"}) for x in tables if x.find("h3").text.strip().lower()[:17] == ('suggested ratings')]
        if ratings_table:
            rows = ratings_table[0].find_all("tr")
            if rows:
                for row in rows:
                    tds = row.find_all("td")
                    user_id = tds[0].a["href"].split("user/")[1].split("/")[0]
                    rating = tds[1].text
                    try:
                        cursor.execute('''INSERT INTO suggested_ratings VALUES (?,?,?);''', (user_id, route_id, rating))
                    except sqlite3.IntegrityError:
                        #assumes that the values to be overwritten are from updates to code
                        cursor.execute('''UPDATE suggested_ratings SET route_id=?, suggested_rating=? WHERE user_id=?''', (route_id, rating, user_id))

#### Using the Helper Functions

In [158]:
#find starting areas
response = requests.get("https://www.mountainproject.com/route-guide")
response.raise_for_status()

main_guide = BeautifulSoup(response.text, "lxml")

#thank you: https://gist.github.com/norcal82/e4c7e8113f377db184bb
state_names = ["Alaska", "Alabama", "Arkansas", "Arizona", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey", "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming"]

starting_areas = [a["href"] for a in main_guide.find_all("a", {"class":"text-truncate float-xs-left"}) if a.text in state_names]

#open database
HOME_DB = "./sql-db/mp.sqlite"
connector = sqlite3.connect(HOME_DB)

In [162]:
scraped_areas = []
for area in starting_areas:
    if area not in scraped_areas:
        mp_scrape(area, connector)
        scraped_areas.append(area)
        proceed = input("proceed to next area? y/n")
        if not proceed.lower() == 'y':
            break
        

Adding route https://www.mountainproject.com/route/107605174/fiery-ginger
https://www.mountainproject.com/route/107605174/fiery-ginger already in db
Adding route https://www.mountainproject.com/route/107717687/bombs-away
https://www.mountainproject.com/route/107717687/bombs-away already in db
Adding route https://www.mountainproject.com/route/106248472/stonyman
shoot that's a Bouldering problem gross
shoot that's a Bouldering problem gross
Adding route https://www.mountainproject.com/route/106651430/cutting-teeth
Adding route https://www.mountainproject.com/route/106248462/father-and-son
Adding route https://www.mountainproject.com/route/106249484/fuzzy-crack
Adding route https://www.mountainproject.com/route/106651418/live-to-climb-another-day
Adding route https://www.mountainproject.com/route/107484556/ride-the-lighting
shoot that's a Bouldering problem gross
Adding route https://www.mountainproject.com/route/107484544/way-of-the-hand
shoot that's a Bouldering problem gross
shoot tha

ConnectionError: HTTPSConnectionPool(host='www.mountainproject.com', port=443): Max retries exceeded with url: /area/121726720/c-tomb-stone (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x11b3cef10>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [167]:
sql = """
SELECT *
FROM suggested_ratings
"""

pd.read_sql(sql, connector)

Unnamed: 0,user_id,route_id,suggested_rating
0,106800942,107605174,5.11-
1,200015822,107717687,5.8- PG13
2,106800942,107717687,5.8-
3,106800942,107605174,5.11-
4,200015822,107717687,5.8- PG13
...,...,...,...
158,200231209,108049350,5.13c
159,105886712,105982497,5.7 PG13
160,111656527,105982497,5.7 PG13
161,111442903,105982497,5.7
