In [29]:
import os
import re
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [39]:
def scrape_urls(searches):
    """Extract URLs for route pages from a list of MP search result pages"""
    
    # list to contain all URLs
    full = []
    
    for search in searches:
        # parse the page with BS4 and find all links to routes
        response = requests.get(search)
        soup = BeautifulSoup(response.text, "html5lib")
        routes = soup.find_all("a" , {"class", "text-black route-row"})
        
        # get all URLs as a list and append to full
        urls = [route.get('href') for route in routes]
        full += urls

    return(full)

In [83]:
yos_urls = ["https://www.mountainproject.com/route-finder?selectedIds=105833388&type=rock&diffMinrock=800&diffMinboulder=20000&diffMinaid=70000&diffMinice=30000&diffMinmixed=50000&diffMaxrock=12400&diffMaxboulder=21400&diffMaxaid=75260&diffMaxice=38500&diffMaxmixed=60000&is_trad_climb=1&is_sport_climb=1&is_top_rope=1&stars=0&pitches=2&sort1=area&sort2=rating",
            "https://www.mountainproject.com/route-finder?diffMaxaid=75260&diffMaxboulder=21400&diffMaxice=38500&diffMaxmixed=60000&diffMaxrock=12400&diffMinaid=70000&diffMinboulder=20000&diffMinice=30000&diffMinmixed=50000&diffMinrock=800&is_sport_climb=1&is_top_rope=1&is_trad_climb=1&pitches=2&selectedIds=105833388&sort1=area&sort2=rating&stars=0&type=rock&page=2",
            "https://www.mountainproject.com/route-finder?diffMaxaid=75260&diffMaxboulder=21400&diffMaxice=38500&diffMaxmixed=60000&diffMaxrock=12400&diffMinaid=70000&diffMinboulder=20000&diffMinice=30000&diffMinmixed=50000&diffMinrock=800&is_sport_climb=1&is_top_rope=1&is_trad_climb=1&pitches=2&selectedIds=105833388&sort1=area&sort2=rating&stars=0&type=rock&page=3",
            "https://www.mountainproject.com/route-finder?diffMaxaid=75260&diffMaxboulder=21400&diffMaxice=38500&diffMaxmixed=60000&diffMaxrock=12400&diffMinaid=70000&diffMinboulder=20000&diffMinice=30000&diffMinmixed=50000&diffMinrock=800&is_sport_climb=1&is_top_rope=1&is_trad_climb=1&pitches=2&selectedIds=105833388&sort1=area&sort2=rating&stars=0&type=rock&page=4",
            "https://www.mountainproject.com/route-finder?diffMaxaid=75260&diffMaxboulder=21400&diffMaxice=38500&diffMaxmixed=60000&diffMaxrock=12400&diffMinaid=70000&diffMinboulder=20000&diffMinice=30000&diffMinmixed=50000&diffMinrock=800&is_sport_climb=1&is_top_rope=1&is_trad_climb=1&pitches=2&selectedIds=105833388&sort1=area&sort2=rating&stars=0&type=rock&page=5",
            "https://www.mountainproject.com/route-finder?diffMaxaid=75260&diffMaxboulder=21400&diffMaxice=38500&diffMaxmixed=60000&diffMaxrock=12400&diffMinaid=70000&diffMinboulder=20000&diffMinice=30000&diffMinmixed=50000&diffMinrock=800&is_sport_climb=1&is_top_rope=1&is_trad_climb=1&pitches=2&selectedIds=105833388&sort1=area&sort2=rating&stars=0&type=rock&page=6"]


yos_urls = scrape_urls(yos_urls)

In [84]:
def FA_split(string):
    split = re.split(', |: | and |, and | & |-| - |- |\(', string)
    
    return(split)

def FA_scrape(urls):
    """scrape the first ascent field from climbs located at a list of MP urls"""
    
    # data frame to hold all climbs and first ascentionists
    zero_data = np.zeros(shape=(len(urls), 2))
    firsts = pd.DataFrame(zero_data, columns=['climb', 'FA'])

    # iterate through climbs and record FAs
    for i in range(len(firsts)):
        # get unique page for climb and parse
        response = requests.get(urls[i])
        soup = BeautifulSoup(response.text, "html5lib")

        # get the name of the climb from the tile
        title = soup.find("title").contents[0]
        climb = ' '.join(','.join(title.split(',')[:-1]).split(' ')[2:]).strip()

        # get the contents of the FA field
        details = soup.find_all("table", {"class", "description-details"})    

        for detail in details:
            tdtag = detail.find('tbody')

        FAs = tdtag.contents[2].contents[3].contents[0].strip()

        # set name and FA in firsts df
        firsts.climb.loc[i] = climb
        firsts.FA.loc[i] = FA_split(FAs)

    return(firsts)


yosemite = FA_scrape(yos_urls)
yosemite

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,climb,FA
0,"Delectable Pinnacle, Right","[Warren Harding, Brian Small]"
1,El Cap Tree,"[Allen Steck, Will Siri, Bill Dunmire, Bob Swi..."
2,"Little John, Right","[Jack Turner, Royal Robbins, April, 1962.]"
3,Peter Pan,"[Bob Kamps, Jim Sims , July '62)]"
4,"Moby Dick, Left","[Bob Kamps, Frank Sacherer, 10/63.]"
5,West Buttress (pitch 1&2),"[Layton Kor, Steve Roper, 1963]"
6,Peter Left,"[Mead Hargis, Kim Schmitz , 1971)]"
7,"The Slack, Left","[Chuck Pratt, Royal Robbins , May '65)]"
8,The Slack ( center),"[Charlie Raymond, Wally Reed, 1958 FFA Pat Am..."
9,La Escuela,"[Yvon Chouinard, TM Herbert , May, '62) , , F..."


In [78]:
filename = "yosemite.csv"

# get filepath to save response json
curdir = os.path.dirname(os.path.abspath('.'))
filepath = os.path.join(curdir, "climber-net", filename)

with open(filepath, 'w') as outfile:
    np.savetxt(outfile, yosemite.as_matrix(), fmt='%s', delimiter=",")
