In [1]:
import pandas as pd
import numpy as np
import seaborn
import re
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select 
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

In [2]:
def scrapeComments(route_id):
    """
    If comments exist on a given route, returns comments in list 
    Keyword arguments: 
    route_id -- The id parsed from the URL 
    """
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='comments-Climb-Route-" + route_id + "']/div[2]/div[2]")))
    route_com = driver.find_element_by_xpath("//*[@id='comments-Climb-Route-" + route_id + "']/div[2]/div[2]")
    has_comments = None 
    try:
        wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "comment-body")))
        has_comments = True 
    except TimeoutException:  
            has_comments = False 
            print(route_id, 'has no comments')
    if has_comments == True:
        comment_body = route_com.find_elements_by_class_name('comment-body')
        func_comments = []
        for i in comment_body: 
            func_comments.append(i.text)
        return(func_comments)


In [3]:
def getRouteDetails():
    """
    Returns route details based on current webdriver URL 

    """
    url_split = driver.current_url.split('/')
    route_id = url_split[4]
    route_name = url_split[5]
    route_type = driver.find_element_by_xpath("//*[@id='route-page']/div/div[3]/div[1]/div[1]/div[1]/table/tbody/tr[1]/td[2]").text
    route_rating_list = driver.find_element_by_xpath("//*[@id='route-star-avg']/span").text.split(" ")
    route_diff = driver.find_element_by_xpath("//*[@id='route-page']/div/div[1]/h2/span[1]").text
    route_location = driver.find_element_by_xpath("//*[@id='route-page']/div/div[1]/div[2]").text
    route_num_votes = route_rating_list [4]
    route_avg_rating = route_rating_list[2]
    return ([route_id, route_name, route_type, route_diff, route_location, route_num_votes, route_avg_rating])


In [52]:
#gets route details and comments from each route in search
def getResultSet(route_df):
    """
    Loops through all routes in a result set and returns an appended dataframe 
    Keyword arguments: 
    cur_route_df -- A dataframe accomodating scraped route details and comments
    """
    cur_route_df = pd.DataFrame(columns = ['id','name','type','diff','location','num_ratings','ratings','comments'])
    num_routes = (driver.find_element_by_xpath("//*[@id='body-climb']/div[6]/div/div[2]/div/div[1]").text).split('Results 1 to ')[1].split(' ')[0]
    for route in range(2, int(num_routes)+2): 
        driver.find_element_by_xpath("//*[@id='body-climb']/div[6]/div/div[3]/div/div/table[2]/tbody/tr["+str(route)+"]/td[1]/a").click() 
        route_row = getRouteDetails()
        route_row.append(scrapeComments(route_row[0]))
        cur_route_df.loc[route-2] = route_row
        driver.back()
    route_df = pd.concat([route_df, cur_route_df], ignore_index = True)
    return(route_df)

In [35]:
def ParseLocHierarchy(): 
    """
    Parses sublocations of current location, returns names in a list 
    
    """
    loc_areas = driver.find_elements_by_xpath('//*[@id="climb-area-page"]/div/div[2]/div/div[3]')
    subloc_lst = []
    for i in loc_areas:
        subloc_lst.append(i.text)
    location_lst = subloc_lst[0].split('\n')
    locs_cal = []
    for location in location_lst :
        m = re.split('((\d+))', location)
        locs_cal.append(m[0].strip())

    return(locs_cal)

In [36]:
#get location heirarchy to automate searching route results <1k 

driver.get("https://www.mountainproject.com/route-guide")
driver.find_element_by_link_text(state).click()
state_locs = ParseLocHierarchy() 

driver.find_element_by_link_text(locs_cal[0]).click()
    #test.append(re.split('[^a-zA-Z]', i.text))

#//*[@id="climb-area-page"]/div/div[2]/div/div[3]/div[1]
#//*[@id="climb-area-page"]/div/div[2]/div/div[3]/div[2]
#//*[@id="climb-area-page"]/div/div[2]/div/div[3]


['Central Coast',
 'High Desert',
 'High Sierra',
 'Inland Empire',
 'Joshua Tree National Park',
 'Lake Tahoe',
 'Los Angeles Basin',
 'Northeast California',
 'Redwood Coast',
 'San Bernardino Mountains',
 'San Diego County',
 'San Francisco Bay Area',
 'San Jacinto Mountains',
 'Sequoia & Kings Canyon NP',
 'Sierra Eastside',
 'Sonora Pass Highway (',
 'Tahquitz & Suicide Rocks',
 'Western Sierra',
 'Yosemite National Park']

In [32]:
state_areas = driver.find_elements_by_xpath('//*[@id="climb-area-page"]/div/div[2]/div/div[3]')
state_lst = []
for i in state_areas:
    state_lst.append(i.text)
state_lst

['Big Sur 28\nCamarillo Springs Boulder (Lonesome Cube) 0\nCarmel Bay 15\nConejo Mountain 40\nHwy 166/Silly Rock 59\nHwy 33/Ojai 368\nPine Mountain Club 18\nPinnacles National Park 274\nPoint Mugu 15\nSan Luis Obispo 262\nSanta Barbara 765\nSanta Cruz 38\nTar Creek (the Swimming Hole) 0\nWagon Caves 22']

In [25]:
#first hierarchy level names 
location_lst = test[0].split('\n')
locs_cal = []
for location in location_lst :
    m = re.split('((\d+))', location)
    locs_cal.append(m[0].strip())
locs_cal

['Central Coast',
 'High Desert',
 'High Sierra',
 'Inland Empire',
 'Joshua Tree National Park',
 'Lake Tahoe',
 'Los Angeles Basin',
 'Northeast California',
 'Redwood Coast',
 'San Bernardino Mountains',
 'San Diego County',
 'San Francisco Bay Area',
 'San Jacinto Mountains',
 'Sequoia & Kings Canyon NP',
 'Sierra Eastside',
 'Sonora Pass Highway (',
 'Tahquitz & Suicide Rocks',
 'Western Sierra',
 'Yosemite National Park']

# Apply Search Settings 

In [70]:
#Read in Config txt file as dict
with open(r"C:\Users\drpow\Documents\Personal Projects\positively_climbable\config.txt", "r") as config_txt:
    config_txt = eval(config_txt.read())
config = config_txt

#location 
state = str(config_txt['state_location'])
sub_location = str(config_txt['sub_location'])

#mapping rock route type to HTML id + map other settings to vars 
route_type = config["route_type"].lower()
diff_rating_range = config["diff_rating_range"]

rock_type_dict = {"sport":"check_is_sport_climb", "trad":"check_is_trad_climb", "toprope" : "check_is_top_rope"}
rock_type = config["rock_route_type_exclude"]

In [74]:
#initialize webdriver and waits
driver = webdriver.Chrome(r"C:\Users\drpow\Documents\Personal Projects\positively_climbable\chromedriver.exe")
driver.get("http://www.mountainproject.com")
wait = WebDriverWait(driver, 2)

#apply config settings to "route finder"
select_type = Select(driver.find_element_by_id("type")) 
select_type.select_by_value(route_type)

#lb = lower bound, ub = upper bound
if route_type =='rock': 
    Select(driver.find_element_by_id("diffMinrock")).select_by_visible_text(diff_rating_range[0])
    Select(driver.find_element_by_id("diffMaxrock")).select_by_visible_text(diff_rating_range[1])
    for rock_toggle in rock_type: 
        if rock_type[rock_toggle] == True:
            driver.find_element_by_id(rock_type_dict[rock_toggle]).click()
  

In [None]:
          
#if state != '': 
#    driver.find_element_by_xpath('//*[@id="routeFinderForm"]/table/tbody/tr[4]/td[2]/a').click()
#    wait.until(EC.element_to_be_clickable((By.LINK_TEXT, state))).click()
#    if sub_location != '': 
#        wait.until(EC.element_to_be_clickable((By.LINK_TEXT, sub_location))).click()
#    driver.find_element_by_id("select-area").click()
        
        
driver.find_element_by_xpath("//*[@id='routeFinderForm']/table/tbody/tr[5]/td/input").click()
wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "View All"))).click()

route_df = pd.DataFrame(columns = ['id','name','type','diff','location','num_ratings','ratings','comments'])
