In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from requests import get
import re
import json
import random
import time

# Get Soup

In [6]:
# get trail html
def get_soup(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` as its ``timeout`` argument so that a
        stalled connection raises instead of blocking the notebook forever.
        Pass ``None`` to wait indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``'html.parser'``.
        The HTTP status code is not checked here.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
    page = get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(page.content, 'html.parser')

# Main Trail Stats

In [7]:
# get basic trail stats

def get_stats(soup):
    """Collect the statistics of one trail page as parallel label/value lists.

    The page is read in this order:

    1. the first four ``div.large`` cells of ``div#basicTrailStats``
       (stored as distance, climb, descent, avg_time),
    2. the rating and vote-count spans of ``div.star-rating.readonly``,
    3. every ``<li>`` of ``ul#traildetails_display`` and then
       ``ul#trailstats_display`` that contains exactly two ``<div>`` cells
       (first cell is the label, second the value),
    4. the text of ``p#trail_description``, only when that element exists,
    5. latitude, longitude, city and state from the first
       ``application/ld+json`` script.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed trail page, e.g. the result of ``get_soup``.

    Returns
    -------
    tuple of (list, list)
        ``(labels, values)``; ``values[i]`` belongs to ``labels[i]``.
        Scraped values are kept as the raw page strings; the geo values are
        whatever types the JSON decoder produced.

    Notes
    -----
    Only the description is optional. Any other missing element is not
    guarded and surfaces as the exception raised by the failing lookup.
    """
    labels = []
    values = []

    def record(label, value):
        # Keep the two output lists in lock-step.
        labels.append(label)
        values.append(value)

    def record_list_rows(list_id):
        # A row is usable only when it is exactly a (label, value) pair of divs.
        for row in soup.find('ul', id=list_id).find_all('li'):
            cells = row.find_all('div')
            if len(cells) == 2:
                record(cells[0].get_text(strip=True), cells[1].get_text(strip=True))

    # headline numbers: the first four "large" cells, in page order
    basic_cols = soup.find('div', id="basicTrailStats").find_all('div', class_='large')
    for position, label in enumerate(('distance', 'climb', 'descent', 'avg_time')):
        record(label, basic_cols[position].text)

    # rating and number of votes
    rating_box = soup.find('div', class_='star-rating readonly')
    record('rating', rating_box.find('span', class_='result').text)
    record('votes', rating_box.find('span', class_='votesLink underline grey clickable').text)

    # detailed trail attributes, then the numeric stats table
    record_list_rows('traildetails_display')
    record_list_rows('trailstats_display')

    # free-text description (not present on every page)
    description_box = soup.find('p', id='trail_description')
    if description_box:
        record('description', description_box.text)

    # geo info from the embedded JSON-LD block.
    # `.string` is preferred because some Beautiful Soup releases return an
    # empty string from `.text` on <script> tags; `.text` stays as fallback.
    ld_script = soup.find('script', type='application/ld+json')
    parsed_json = json.loads(ld_script.string or ld_script.text)
    record('latitude', parsed_json['geo']['latitude'])
    record('longitude', parsed_json['geo']['longitude'])
    record('city', parsed_json['address']['addressLocality'])
    record('state', parsed_json['address']['addressRegion'])
    return labels, values

# Comment Info

In [8]:
def get_comments(soup):
    """Collect the comments shown on a trail page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed trail page, e.g. the result of ``get_soup``.

    Returns
    -------
    tuple of (list, list, list)
        ``(com_votes, com_user, com_text)``, one entry per ``div.ppcont``
        element and index-aligned with each other. All three lists are empty
        when the page has no ``div.comcount`` element.
    """
    # Pages without a comment counter are treated as having no comments.
    if not soup.find('div', class_='comcount'):
        return [], [], []

    com_votes = []
    com_user = []
    com_text = []
    for comment in soup.find_all('div', class_='ppcont'):
        # The first nested <div> of a comment holds both the first <div> and
        # the first <span> that are read here (presumably vote count and
        # author name; verify against the live markup).
        header = comment.div
        com_votes.append(header.div.text)
        com_user.append(header.span.text)
        com_text.append(comment.find('div', class_="comtext translate").get_text(strip=True))
    return com_votes, com_user, com_text


# Simple Test

In [9]:
# Smoke test against a single live trail page: fetch it, run both parsers,
# and print everything they return so the output can be checked by eye.
# NOTE: this performs a real HTTP request each time the cell is run.
soup= get_soup('https://www.trailforks.com/trails/half-nelson/')
detail_label,detail_value = get_stats(soup)
comment_votes,user,comment_text = get_comments(soup)
print(detail_label,detail_value,comment_votes,user,comment_text)

['distance', 'climb', 'descent', 'avg_time', 'rating', 'votes', 'Riding area', 'Difficulty rating', 'Trail type', 'Bike type', 'Physical rating', 'Season', 'TTFs on trail', 'Ride in rain', 'Global Ranking', 'Altitude change', 'Altitude min', 'Altitude max', 'Altitude start', 'Altitude end', 'Grade', 'Grade max', 'Grade min', 'Distance climb', 'Distance down', 'Distance flat', 'Avg time', 'Avg reverse time', 'description', 'latitude', 'longitude', 'city', 'state'] ['1 miles', '56 ft', '-823 ft', '00:12:06', 'Avg: 4.6', '173 votes', 'Diamond HeadSquamish, British Columbia', 'Bluerate', 'Singletrack', 'DH, AM', 'Moderate', 'May - October', 'Bridge, Jump', 'Yes', '#2', '-766 ft', '1,116 ft', '1,883 ft', '1,883 ft', '1,116 ft', '-11.414%', '-47.666%', '20.067%', '913 ft', '1 miles', '188 ft', '00:12:06', '01:35:43', "One of Squamish's most popular trails is basically a 3km downhill pumptrack. A beginner/intermediate/advanced trail built by Big Red Ted Tempany and a cadre of local volunteers

# Load list of trails and urls

In [10]:
# Load the trail list from a local CSV and preview the first rows.
# NOTE(review): absolute, user-specific path; this cell only runs on a
# machine where this file exists at this location.
df = pd.read_csv('/Users/briangraham/insight/trailrec/data/BC_trail_names')
df.head()

Unnamed: 0.1,Unnamed: 0,trail_id,trail_url,trail_name,region_url,region_name
0,0,1-87-dh,https://www.trailforks.com/trails/1-87-dh/,$1.87 DH,https://www.trailforks.com/region/mount-prevost/,Mount Prevost
1,1,palomino,https://www.trailforks.com/trails/palomino/,* Palomino,https://www.trailforks.com/region/three-blind-...,Three Blind Mice
2,2,rock-star-66875,https://www.trailforks.com/trails/rock-star-66...,*ROCK STAR*,https://www.trailforks.com/region/west-sechelt/,West Sechelt
3,3,10-dollar,https://www.trailforks.com/trails/10-dollar/,10 Dollar,https://www.trailforks.com/region/pidherny-rec...,Pidherny Recreation Site
4,4,10-km,https://www.trailforks.com/trails/10-km/,10 km,https://www.trailforks.com/region/mount-7/,Mount 7


In [11]:
# Preview the trail_url column of the loaded trail list.
df['trail_url'].head()

0           https://www.trailforks.com/trails/1-87-dh/
1          https://www.trailforks.com/trails/palomino/
2    https://www.trailforks.com/trails/rock-star-66...
3         https://www.trailforks.com/trails/10-dollar/
4             https://www.trailforks.com/trails/10-km/
Name: trail_url, dtype: object

## Randomize trail list to make scraping less obvious

In [12]:
# One-off shuffle of the trail list, kept commented out on purpose:
# df.sample(frac=1) is unseeded here, so re-running it would overwrite
# 'random_scrap_list' (the file read by the following cell) with a different order.
#df2 = df.sample(frac=1)
#df2.head()
#df2.to_csv('/Users/briangraham/insight/trailrec/data/random_scrap_list')

In [14]:
# Load the previously saved, shuffled trail list; the scrape loop iterates over it.
# NOTE(review): absolute, user-specific path.
df_random = pd.read_csv('/Users/briangraham/insight/trailrec/data/random_scrap_list')

In [15]:
# Preview the first rows of the shuffled list.
df_random.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,trail_id,trail_url,trail_name,region_url,region_name
0,784,784,brat-22073,https://www.trailforks.com/trails/brat-22073/,BRAT,https://www.trailforks.com/region/sechelt/,Sechelt
1,3817,3817,meadow-of-the-grizzly-spelhx-en-tl-a-stl-lhale...,https://www.trailforks.com/trails/meadow-of-th...,Meadow of the Grizzly (Spélhx̱en tl'a Stl'lhalem),https://www.trailforks.com/region/diamond-head/,Diamond Head
2,6262,6262,the-dungeon,https://www.trailforks.com/trails/the-dungeon/,The Dungeon,https://www.trailforks.com/region/iron-mountain/,Iron Mountain
3,6538,6538,treacherous-cretins,https://www.trailforks.com/trails/treacherous-...,Treacherous Cretins,https://www.trailforks.com/region/lost-lake/,Lost Lake
4,979,979,capilano-pacific-trail-exit,https://www.trailforks.com/trails/capilano-pac...,Capilano Pacific Trail Exit,https://www.trailforks.com/region/west-vancouver/,West Vancouver


# SCRAPE LOOP

In [31]:
# Output folders are the same for every trail, so they are set once up front.
# NOTE(review): absolute, user-specific paths; both folders must already exist.
trail_data_save_loc = '/Users/briangraham/insight/trailrec/data/trail_info/'
comment_data_save_loc = '/Users/briangraham/insight/trailrec/data/comments/'

# URLs whose download, parse or save step failed; exported by a later cell.
errorlog = []
for t_id, url in zip(df_random['trail_id'],df_random['trail_url']):

    try:
        #scrape
        soup = get_soup(url)
        detail_label,detail_value = get_stats(soup)
        comment_votes,user,comment_text = get_comments(soup)

        #save: one pickle per trail for the stats, one for the comments
        df_trail_data = pd.DataFrame({'label':detail_label,'value':detail_value})
        df_trail_data.to_pickle(trail_data_save_loc+ t_id + '.pickle')

        df_comment_data = pd.DataFrame({'comment_votes':comment_votes,'user':user,'comment_text':comment_text})
        df_comment_data.to_pickle(comment_data_save_loc + t_id + '.pickle')
    except Exception:
        # Best effort: remember the failing URL and move on to the next trail.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit
        # propagate, so the long-running loop can still be stopped by hand.
        errorlog.append(url)
    # Random pause between requests to avoid hitting the site at a fixed rate.
    time.sleep(random.uniform(.5,1))


In [40]:
# Persist the URLs that could not be scraped so they can be retried later.
# NOTE(review): absolute, user-specific path.
df_error = pd.DataFrame({'missed_trail_url':errorlog})
df_error.to_csv('/Users/briangraham/insight/trailrec/data/missed_trail_urls')
# Only the last expression of a cell is displayed, so the preview goes last.
df_error.head()