In [79]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import sqlite3
import time

In [33]:
# Open the SQLite database that the scraper writes to.
# `connector` is used by later cells for commits and pandas reads;
# `cursor` is handed to the scraping functions for their INSERT/UPDATE statements.
db = "./sql-db/mp.sqlite"
connector = sqlite3.connect(db)
cursor = connector.cursor()


In [3]:
# Fetch the Mountain Project route guide landing page.
# raise_for_status() raises on an HTTP error status so later cells never parse an error page.
response = requests.get("https://www.mountainproject.com/route-guide")
response.raise_for_status()

In [5]:
# Parse the route guide HTML with the lxml parser; the state links are pulled from this soup below.
main_guide = BeautifulSoup(response.text, "lxml")

In [172]:
def get_area_id(href):
    """Return the area id embedded in a Mountain Project area URL.

    The id is the path segment directly after ``/area/``, e.g.
    ``https://www.mountainproject.com/area/105905173/alabama`` -> ``"105905173"``.

    Raises
    ------
    IndexError
        If ``href`` does not contain ``/area/``.
    """
    # Everything after the first "/area/" marker (IndexError if the marker is absent).
    after_marker = href.split("/area/", 1)[1]
    # The id ends at the next slash, or at the end of the string when there is none.
    area_id, _, _ = after_marker.partition("/")
    return area_id

In [194]:
# US state names, used to pick the state-level links out of the route guide page.
# List taken from (thank you): https://gist.github.com/norcal82/e4c7e8113f377db184bb
state_names = [
    "Alaska", "Alabama", "Arkansas", "Arizona", "California",
    "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
    "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana",
    "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland",
    "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi",
    "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire",
    "New Jersey", "New Mexico", "Nevada", "New York", "Ohio",
    "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
    "South Dakota", "Tennessee", "Texas", "Utah", "Virginia",
    "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming",
]

In [195]:
# Map each state's area id -> [state name, state page URL], keeping only the
# route-guide anchors whose text is one of the names in `state_names`.
starting_areas = {
    get_area_id(anchor["href"]): [anchor.text, anchor["href"]]
    for anchor in main_guide.find_all("a", {"class":"text-truncate float-xs-left"})
    if anchor.text in state_names
}
starting_areas

{'105905173': ['Alabama',
  'https://www.mountainproject.com/area/105905173/alabama'],
 '105909311': ['Alaska',
  'https://www.mountainproject.com/area/105909311/alaska'],
 '105708962': ['Arizona',
  'https://www.mountainproject.com/area/105708962/arizona'],
 '105901027': ['Arkansas',
  'https://www.mountainproject.com/area/105901027/arkansas'],
 '105708959': ['California',
  'https://www.mountainproject.com/area/105708959/california'],
 '105708956': ['Colorado',
  'https://www.mountainproject.com/area/105708956/colorado'],
 '105806977': ['Connecticut',
  'https://www.mountainproject.com/area/105806977/connecticut'],
 '106861605': ['Delaware',
  'https://www.mountainproject.com/area/106861605/delaware'],
 '111721391': ['Florida',
  'https://www.mountainproject.com/area/111721391/florida'],
 '105897947': ['Georgia',
  'https://www.mountainproject.com/area/105897947/georgia'],
 '106316122': ['Hawaii',
  'https://www.mountainproject.com/area/106316122/hawaii'],
 '105708958': ['Idaho',
  '

In [196]:
# Placeholder dict; nothing in the visible cells fills or reads it yet.
scraped_areas = {}
# Print every (area_id, [state name, url]) pair to eyeball the starting points of the crawl.
for area in starting_areas.items():
    print(area)

('105905173', ['Alabama', 'https://www.mountainproject.com/area/105905173/alabama'])
('105909311', ['Alaska', 'https://www.mountainproject.com/area/105909311/alaska'])
('105708962', ['Arizona', 'https://www.mountainproject.com/area/105708962/arizona'])
('105901027', ['Arkansas', 'https://www.mountainproject.com/area/105901027/arkansas'])
('105708959', ['California', 'https://www.mountainproject.com/area/105708959/california'])
('105708956', ['Colorado', 'https://www.mountainproject.com/area/105708956/colorado'])
('105806977', ['Connecticut', 'https://www.mountainproject.com/area/105806977/connecticut'])
('106861605', ['Delaware', 'https://www.mountainproject.com/area/106861605/delaware'])
('111721391', ['Florida', 'https://www.mountainproject.com/area/111721391/florida'])
('105897947', ['Georgia', 'https://www.mountainproject.com/area/105897947/georgia'])
('106316122', ['Hawaii', 'https://www.mountainproject.com/area/106316122/hawaii'])
('105708958', ['Idaho', 'https://www.mountainproj

In [212]:
#rewrite this to scrape one page at a time
def mp_scrape(link, cursor, timeout=30):
    """Recursively crawl a Mountain Project area page and store its routes.

    An area page either lists sub-areas (``div.lef-nav-row`` elements) or
    routes (the ``#left-nav-route-table`` table). Sub-areas are crawled
    recursively; every route link is passed to ``add_route``.

    Parameters
    ----------
    link : str
        URL of the area page to scrape.
    cursor : sqlite3.Cursor
        Cursor forwarded to ``add_route``. Nothing is committed here; the
        caller is responsible for committing the connection.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up, so a stalled
        connection cannot hang the crawl indefinitely.

    Returns
    -------
    None
    """
    try:
        res = requests.get(link, timeout=timeout)
        res.raise_for_status()
    except requests.RequestException:
        # Report and stop: there is no point parsing an error page or a missing response.
        print(f"{link} errored out")
        return None

    page = BeautifulSoup(res.text, "lxml")
    sub_area_rows = page.find_all("div", {"class":"lef-nav-row"})
    route_table = page.find("table", {"id":"left-nav-route-table"})

    # if there are more sub_areas, call the function recursively
    if sub_area_rows:
        for row in sub_area_rows:
            anchor = row.find("a")
            # A row without a usable anchor cannot be followed; skip it instead of crashing.
            if anchor is None or not anchor.get("href"):
                continue
            # Separate name so the `link` parameter still refers to the page being scraped.
            sub_link = anchor["href"]
            if is_good_link(sub_link):
                mp_scrape(sub_link, cursor, timeout)

    # if there is a list of routes, add each route to the database
    elif route_table is not None:
        for anchor in route_table.find_all("a", href=True):
            route_link = anchor["href"]
            if is_good_link(route_link):
                add_route(route_link, cursor)

    # if there are neither, this page is empty
    else:
        print(f"{link} is an empty area page")


#mp_scrape outline:
#make some soup
#if find_all("div", {"class": "lef-nav-row"}) -> recurse into the sub-areas
#if find("table", {"id": "left-nav-route-table"}) -> open route links, add route and comments to database


In [203]:
#skip links that could trigger an action on the site (delete, edit, ...) when crawled; see https://softwareengineering.stackexchange.com/questions/91760/how-to-be-a-good-citizen-when-crawling-web-sites
def is_good_link(link):
    """Return True when ``link`` looks safe to crawl.

    A link is rejected (False) if, ignoring case, it contains any of the
    words 'delete', 'remove', 'edit', 'update' or 'modify' anywhere in the
    URL. This is a plain substring check, so the word may appear in the host
    name or inside a longer word.
    """
    lowered = link.lower()
    risky_words = ('delete', 'remove', 'edit', 'update', 'modify')
    return not any(word in lowered for word in risky_words)

In [206]:
# Sanity check: 'remove' appears in the host name, so the substring filter rejects this link.
is_good_link("https://remove.stackexchange.com/questions/91760/how-to-be-a-good-citizen-when-crawling-web-sites")

False

In [191]:
def add_route(routelink, cursor, timeout=30):
    """Scrape one route page and insert it into the ``routes`` table.

    After the insert attempt, the route's opinions page (linked from the
    "you-and-route" block) is handed to ``add_suggested_ratings`` unless the
    link text reports zero opinions.

    Parameters
    ----------
    routelink : str
        URL of the route page; the route id is the path segment after ``/route/``.
    cursor : sqlite3.Cursor
        Cursor used for the INSERT. Nothing is committed here.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    None

    Notes
    -----
    The parsing assumes the page layout seen while writing this notebook; a
    page missing one of the expected elements raises ``AttributeError`` or
    ``IndexError``.
    """
    print(f"Adding route {routelink}")
    #make some soup
    try:
        res = requests.get(routelink, timeout=timeout)
        res.raise_for_status()
    except requests.RequestException:
        print(f"{routelink} page did not work")
        return None
    page_text = BeautifulSoup(res.text, "lxml")

    #all of the variables for the route
    route_id = routelink.split("/route/")[1].split("/")[0]
    # Links in the "mb-half small text-warm" div, skipping the first anchor,
    # give the ids of the areas this route belongs to; stored as the str() of a list.
    breadcrumb = page_text.find("div", {"class": "mb-half small text-warm"})
    areas = str([get_area_id(a["href"]) for a in breadcrumb.find_all("a")[1:]])
    name = page_text.find("h1").text.strip()
    yds = page_text.find("span", {"class": "rateYDS"}).text.strip()

    #this pulls all of the ratings, have to pare down to risk rating
    # The h2 lists the grade in every system; its last whitespace-separated
    # token is taken as the risk rating. When that token is the "British"
    # label there is no risk rating, which is recorded as the string 'Null'.
    risk_rating = page_text.find("h2", {"class": "inline-block mr-2"}).text.strip().split()[-1]
    if risk_rating == "British":
        risk_rating = 'Null'

    #for NLP features
    description_table = page_text.find("table", {"class": "description-details"})
    # Second cell of each row holds the value; the first three rows are used, in order.
    tds = [tr.find_all("td")[1].text.strip() for tr in description_table.find_all("tr")]
    route_type = tds[0]
    fa = tds[1]
    page_views = tds[2]

    # The first three "fr-view" divs are read as description, location and protection.
    paragraphs = page_text.find_all("div", {"class": "fr-view"})
    description = paragraphs[0].text.strip()
    location = paragraphs[1].text.strip()
    protection = paragraphs[2].text.strip()

    try:
        cursor.execute('''INSERT INTO routes VALUES (?,?,?,?,?,?,?,?,?,?,?,?);''', (route_id, areas, name, yds, route_type, fa, page_views, description, location, protection, routelink, risk_rating))
    except sqlite3.IntegrityError:
        print('already in db')

    #add ratings from opinions page
    you_and_route_block = page_text.find("div", {"id": "you-and-route"})
    you_and_route = you_and_route_block.find("a") if you_and_route_block is not None else None
    if you_and_route is not None and you_and_route.get("href"):
        # Link text looks like "2 Opinions"; compare the leading count as a
        # string (comparing a character to the integer 0 is always unequal).
        count_tokens = you_and_route.text.split()
        if count_tokens and count_tokens[0] != "0":
            add_suggested_ratings(you_and_route["href"], cursor, route_id)

    #add comments - ran into roadblock https://www.mountainproject.com/img/waitWhite.gif
    # Draft kept for reference (disabled). It uses ? placeholders rather than
    # string formatting so comment text cannot break the SQL statement.
    # comments = page_text.find_all("table", {"class": "main-comment width100"})
    # if comments:
    #     for comment in comments:
    #         comment_id = comment["id"].split("-")[1]
    #         user_id = comment.find("div", {"class": "bio"}).a["href"].split("user/")[1].split("/")[0]
    #         comment_text = comment.find("div", {"class": "comment-body"}).span.text.strip()
    #         cursor.execute("INSERT INTO comments VALUES (?,?,?,?);", (comment_id, route_id, user_id, comment_text))

In [190]:
# Probe a single route page to see what the "you-and-route" anchor looks like
# (its text carries the opinion count and its href points at the stats page).
res = requests.get('https://www.mountainproject.com/route/111972483/hero-route')
rando = BeautifulSoup(res.text, 'lxml').find("div", {"id": "you-and-route"}).find("a")
rando

<a href="https://www.mountainproject.com/route/stats/111972483/hero-route">2 Opinions</a>

In [187]:
def add_suggested_ratings(ticklink, cursor, route_id, timeout=30):
    """Scrape a route's stats page and store each user's suggested rating.

    Parameters
    ----------
    ticklink : str
        URL of the route's stats/opinions page.
    cursor : sqlite3.Cursor
        Cursor used for the INSERT/UPDATE statements. Nothing is committed here.
    route_id : str
        Id of the route the ratings belong to.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    None
    """
    #make some soup
    try:
        res = requests.get(ticklink, timeout=timeout)
        res.raise_for_status()
    except requests.RequestException:
        print(f"{ticklink} page did not work")
        return None
    page_text = BeautifulSoup(res.text, "lxml")

    # The suggested ratings are read from the second "table table-striped" table.
    tables = page_text.find_all("table", {"class":"table table-striped"})
    if len(tables) < 2:
        # No ratings table on this page; nothing to store.
        return None

    for row in tables[1].find_all("tr"):
        tds = row.find_all("td")
        # Skip header/spacer rows and rows without a user link.
        if len(tds) < 2 or tds[0].a is None:
            continue
        user_href = tds[0].a.get("href", "")
        if "user/" not in user_href:
            continue
        user_id = user_href.split("user/")[1].split("/")[0]
        rating = tds[1].text.strip()
        try:
            cursor.execute('''INSERT INTO suggested_ratings VALUES (?,?,?);''', (user_id, route_id, rating))
        except sqlite3.IntegrityError:
            #assumes that the values to be overwritten are from updates to code
            # Restrict the update to this user's rating of this route; without
            # a WHERE clause every row in the table would be overwritten.
            cursor.execute('''UPDATE suggested_ratings SET suggested_rating=? WHERE user_id=? AND route_id=?''', (rating, user_id, route_id))

In [174]:
# Smoke test on a single small area. mp_scrape takes one area URL, so pass it
# the URL stored in each entry of the test mapping rather than the dict itself.
test_dict= {"106305856": ["Yellow Bluff",
  "https://www.mountainproject.com/area/106305856/yellow-bluff"]}
for _area_name, area_url in test_dict.values():
    mp_scrape(area_url, cursor)
# Persist whatever the scrape inserted.
connector.commit()

Adding route https://www.mountainproject.com/route/111972455/caught-in-the-hen-house
already in db
[]
Adding route https://www.mountainproject.com/route/111972483/hero-route
already in db
[]
Adding route https://www.mountainproject.com/route/111972427/out-on-a-limb
already in db
[]
Adding route https://www.mountainproject.com/route/106413411/hang-down
already in db
[]
Adding route https://www.mountainproject.com/route/106413405/jelly-roll
already in db
[]
Adding route https://www.mountainproject.com/route/106413420/yella-fever


KeyboardInterrupt: 

In [149]:
# Load the whole routes table into a DataFrame to inspect what the scraper stored.
sql = """
SELECT *
FROM routes;
"""
test_df = pd.read_sql(sql, connector)
test_df

In [162]:
# Show link and risk_rating for the rows whose stored risk_rating ends with 'British'.
test_df[test_df['risk_rating'].str.endswith('British')][['link', 'risk_rating']]

Unnamed: 0,link,risk_rating
0,https://www.mountainproject.com/route/10640886...,5.7 YDS 5a French 15 Ewbanks V+ UIAA 13 ZA MVS...
1,https://www.mountainproject.com/route/10641120...,5.10- YDS 6a French 18 Ewbanks VI+ UIAA 18 ZA ...
10,https://www.mountainproject.com/route/10642588...,5.11a YDS 6c French 22 Ewbanks VII+ UIAA 22 ZA...
11,https://www.mountainproject.com/route/10643094...,5.11a YDS 6c French 22 Ewbanks VII+ UIAA 22 ZA...
13,https://www.mountainproject.com/route/10666464...,5.5 YDS 4b French 13 Ewbanks IV+ UIAA 11 ZA MS...
...,...,...
71,https://www.mountainproject.com/route/11197248...,5.10b/c YDS 6b French 20 Ewbanks VII UIAA 20 Z...
72,https://www.mountainproject.com/route/11211461...,4th YDS 1 French 2 Ewbanks I UIAA 2 ZA M 1b Br...
73,https://www.mountainproject.com/route/11531867...,5.10b YDS 6a+ French 19 Ewbanks VII- UIAA 19 Z...
75,https://www.mountainproject.com/route/11883877...,5.4 YDS 4a French 12 Ewbanks IV UIAA 10 ZA VD ...


In [161]:
# Look at the full risk_rating string stored for the row labelled 76.
test_df.loc[76, 'risk_rating']

'5.4 YDS 4a French 12 Ewbanks IV UIAA 10 ZA VD 3c British'

In [163]:
# (rows, columns) of the routes snapshot loaded above.
test_df.shape

(77, 12)

In [166]:
# Shape of the subset of rows whose risk_rating contains 'Font'.
test_df[test_df['risk_rating'].str.contains('Font')].shape

(11, 12)

In [169]:
# Read the comments table; the comment-scraping code in add_route is disabled.
comments = pd.read_sql("SELECT * FROM comments;", connector)
comments

Unnamed: 0,comment_id,route_id,user_id,comment


In [186]:
# Read back every row of suggested_ratings to check what add_suggested_ratings wrote.
sql = """
SELECT * 
FROM suggested_ratings
"""
pd.read_sql(sql, connector)

Unnamed: 0,user_id,route_id,suggested_rating
0,106098470,5.8,111972455
1,106098470,5.10b/c,111972483
2,106098470,5.10a,111972427
3,200634965,V2+,106413411
4,106092355,V2-,106413411
...,...,...,...
359,106098470,5.10b/c,111972483
360,106098470,5.10a,111972427
361,200634965,V2+,106413411
362,106092355,V2-,106413411


In [211]:
# Scrape one area end to end, then commit so the inserted rows are persisted
# (the scraping functions only execute statements; they never commit).
mp_scrape('https://www.mountainproject.com/area/106222485/colbert-heights', cursor)
connector.commit()

https://www.mountainproject.com/area/106305856/yellow-bluff is an empty area page
Adding route https://www.mountainproject.com/route/106325615/mantle-problem
Adding route https://www.mountainproject.com/route/106322478/dirty-laundry
Adding route https://www.mountainproject.com/route/106327218/unknown
Adding route https://www.mountainproject.com/route/106329925/vulcanunknown
Adding route https://www.mountainproject.com/route/106322459/unknown
