
<h3>Scraping content from US National Forests webpages</h3>

<img src="images/Gimli.jpg" width=30% style="display: inline-block">
<br>
image source: lotr.wikia.com
<br>
It may not be pretty or elegant, but it gets the job done
<br>
<br>
[Example: Cultus Creek Campground FS website](http://www.fs.usda.gov/recarea/giffordpinchot/recreation/camping-cabins/recarea/?recid=31736&actid=29)
<img src="images/cultus_creek_screenshot.png" width=80% style="display: inline-block">
<br>
<br>
Everyone's favorite: view source
<img src="images/cultus_creek_viewsource.png" width=80% style="display: inline-block">

In [1]:
import requests	
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
from sqlalchemy import create_engine
import config
from unidecode import unidecode

In [4]:
campgrounds = pd.read_csv('or_nf_campgrounds.csv')
campgrounds = campgrounds.head()

In [10]:
req = requests.get('http://172.17.0.2/Gifford Pinchot National Forest - Campground_ Big Creek.html')
print(req.text)






  
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">   

 
 
 
 
 
  
 

 
 
 
 
 
  
  
  
  
 
 
  


           
                                                                                                 
                                                            
                               
                              
<html xmlns="http://www.w3.org/1999/xhtml"  lang="en">  

<head>
 


<script type="text/javascript">
        var bidiSupport = new Object();
        bidiSupport.bidiAlignRight = "right";
        bidiSupport.bidiAlignLeft = "left"; 
        bidiSupport.bidiDirAttr = ""; 
        bidiSupport.bidiImageRTL = null;
        bidiSupport.isRTL = false;
</script>




















 






    
    
    
    
        
    


<title> 
	    	   
			
			Gifford Pinchot National Forest - Campground: Big 

In [8]:
def extract_cg_info(campgrounds, area, outfile) :
    for index, campground in campgrounds.iterrows():
        latitude = longitude = elevation = site_id = numsites = ""
        fees = openseason = usage = water = restroom = status = ""

        print(campground['site_name'] + '\t' + campground['site_url'])
        try :
            cg_req = requests.get(campground['site_url'])
            cg_soup = BeautifulSoup(cg_req.text, 'lxml')
            site_id = campground['site_url'].split('?')[1].split('&')[0].split('=')[1]
        except Exception as ex :
            print('couldnt get site_url ' + campground['site_url'])
            print(ex)
            continue

        # get area status if available
        try :
            for strong_tag in cg_soup.find_all('strong'):
                if ('Area Status' in unidecode(strong_tag.text)):
                    status = unidecode(strong_tag.next_sibling).strip()
        except Exception:
            print('couldnt get area status')


        print("getting location")
        # get lat, long, altitude
        try :
            lat = cg_soup.find_all('div', text=re.compile('Latitude'))
            div = [row.next_sibling.next_sibling for row in lat]
            latitude  = div[0].text.strip()

            lng = cg_soup.find_all('div', text=re.compile('Longitude'))
            div = [row.next_sibling.next_sibling for row in lng]
            longitude  = div[0].text.strip()

            el = cg_soup.find_all('div', text=re.compile('Elevation'))
            div = [row.next_sibling.next_sibling for row in el]
            elevation  = div[0].text.strip()

            # get site usage, type, num sites, site info

        except Exception:
            print('couldnt get location info')

        # table[0] is the basic info table
        print("getting basic info")

        try :
            tables = cg_soup.find_all('div', {'class': 'tablecolor'})
        except Exception:
            print('couldnt get tables')

        try :
            rows = tables[0].find_all('tr')

            for row in rows:
                if row.th.text == 'Reservations:':
                    reservations = unidecode(row.td.text).strip()
                if row.th.text == 'Open Season:':
                    openseason = unidecode(row.td.text).strip()
                if row.th.text == 'Current Conditions:':
                    conditions = unidecode(row.td.text).strip()
                if row.th.text == 'Water:':
                    water = unidecode(row.td.text).strip()
                if row.th.text == 'Restroom:':
                    restroom = unidecode(row.td.text).strip()
        except Exception :
            print('couldnt get basic campground info')

        # table 1 is the campground info
        print("getting reservation info")
        try:
            rows = tables[1].find_all('tr')

            for row in rows:
                if row.td.text == 'Reservation Info':
                    reserveinfo = unidecode(row.td.next_sibling.text).strip()
                if row.td.text == 'No. of Sites':
                    numsites = unidecode(row.td.next_sibling.text).strip()
        except Exception:
            print('couldnt get campsite availability info')
        
        # assemble into DataFrame
        print('appending data')
        df_cg = pd.DataFrame ({
            'latitude': [latitude],
            'longitude': [longitude],
            'elevation': [elevation],
            'facilityname' : [campground['site_name']],
            'facilityurl' : [campground['site_url']],
            'facilityid' : [site_id],
            'status' : [status],
            'water' : [water],
            'restroom' : [restroom],
            'reserveinfo': [reserveinfo],
            'reservations': [reservations],
            'conditions': [conditions],
            'numsites': [numsites]
            })

        #print df_cg
        print('writing to file')

        if not os.path.isfile(outfile):
           df_cg.to_csv(outfile,header ='column_names', index=False)
        else: # else it exists so append without writing the header
            df_cg.to_csv(outfile,mode = 'a',header=False, index=False)

In [9]:
test = extract_cg_info(campgrounds, "Mt Hood NF", "out.csv")

Badger Lake Campground	http://www.fs.usda.gov/recarea/mthood/recreation/camping-cabins/recarea/?recid=52784&actid=29
getting location
getting basic info
getting reservation info
appending data
writing to file
Bear Springs Group Campground	http://www.fs.usda.gov/recarea/mthood/recreation/camping-cabins/recarea/?recid=52786&actid=29
getting location
getting basic info
getting reservation info
appending data
writing to file
Bonney Crossing Campground	http://www.fs.usda.gov/recarea/mthood/recreation/camping-cabins/recarea/?recid=52790&actid=29
getting location
getting basic info
getting reservation info
appending data
writing to file
Bonney Meadow Campground	http://www.fs.usda.gov/recarea/mthood/recreation/camping-cabins/recarea/?recid=52792&actid=29
getting location
getting basic info
getting reservation info
appending data
writing to file
Bonney Meadows Trail  #471	http://www.fs.usda.gov/recarea/mthood/recreation/hiking/recarea/?recid=80374&actid=50
getting location
couldnt get location 

In [12]:
test = pd.read_csv('out.csv')


In [13]:
test.shape

(13, 13)

In [14]:
test

Unnamed: 0,conditions,elevation,facilityid,facilityname,facilityurl,latitude,longitude,numsites,reservations,reserveinfo,restroom,status,water
0,Campground not accessible due to snow,4400.0,52784,Badger Lake Campground,http://www.fs.usda.gov/recarea/mthood/recreati...,45.30496,-121.55537,4 single site(s),No reservations,Not reservable,Vault Toilet (1),Open,No
1,Officially closed but still accessible. Pack ...,3200.0,52786,Bear Springs Group Campground,http://www.fs.usda.gov/recarea/mthood/recreati...,45.11665,-121.53091,"4 Group Camping Sites, 1 Group Shelter Day Use",This site can be reserved by calling Toll Free...,http://www.recreation.gov or call 877-444-6777,Vault Toilet (1),Closed,Potable Water
2,Officially closed (no services provided) but s...,2200.0,52790,Bonney Crossing Campground,http://www.fs.usda.gov/recarea/mthood/recreati...,45.257,-121.39205,"6 single site(s), 2 double site(s)",No reservations,Not reservable,Vault Toilet (1),Closed,No
3,Campground is not accessible due to snow,4800.0,52792,Bonney Meadow Campground,http://www.fs.usda.gov/recarea/mthood/recreati...,45.26548,-121.58286,,No reservations,Not reservable,Vault Toilet (1),Unreachable,No
4,Campground not accessible due to snow,4400.0,52784,Badger Lake Campground,http://www.fs.usda.gov/recarea/mthood/recreati...,45.30496,-121.55537,4 single site(s),No reservations,Not reservable,Vault Toilet (1),Open,No
5,Officially closed but still accessible. Pack ...,3200.0,52786,Bear Springs Group Campground,http://www.fs.usda.gov/recarea/mthood/recreati...,45.11665,-121.53091,"4 Group Camping Sites, 1 Group Shelter Day Use",This site can be reserved by calling Toll Free...,http://www.recreation.gov or call 877-444-6777,Vault Toilet (1),Closed,Potable Water
6,Officially closed (no services provided) but s...,2200.0,52790,Bonney Crossing Campground,http://www.fs.usda.gov/recarea/mthood/recreati...,45.257,-121.39205,"6 single site(s), 2 double site(s)",No reservations,Not reservable,Vault Toilet (1),Closed,No
7,Campground is not accessible due to snow,4800.0,52792,Bonney Meadow Campground,http://www.fs.usda.gov/recarea/mthood/recreati...,45.26548,-121.58286,,No reservations,Not reservable,Vault Toilet (1),Unreachable,No
8,Campground not accessible due to snow,4400.0,52784,Badger Lake Campground,http://www.fs.usda.gov/recarea/mthood/recreati...,45.30496,-121.55537,4 single site(s),No reservations,Not reservable,Vault Toilet (1),Open,No
9,Officially closed but still accessible. Pack ...,3200.0,52786,Bear Springs Group Campground,http://www.fs.usda.gov/recarea/mthood/recreati...,45.11665,-121.53091,"4 Group Camping Sites, 1 Group Shelter Day Use",This site can be reserved by calling Toll Free...,http://www.recreation.gov or call 877-444-6777,Vault Toilet (1),Closed,Potable Water
