Zach Tretter

June 2020

--------

In [1]:
import requests 
from bs4 import BeautifulSoup

import os
import pandas as pd
import numpy as np
import time

import folium

________

# Step 03D - Campground Characteristics

Attributes for each campground, scraped from the [campground status page](https://www.nps.gov/applications/glac/cgstatus/cgstatus.cfm).

#### Contents
* [Setup Scrape](#Set-up-Scrape-for-Campground-Characteristics)
* [Execute Scrape](#Execute-the-Webscrape)
* [Clean Dataframe](#Clean-the-Dataframe)
* [Identify Nearest Weather Station](#Identify-Nearest-Weather-Station)



## Set up Scrape for Campground Characteristics

#### Read in campground names

In [3]:
# Cleaned fill-time data; only its campground names are needed here
df_names = pd.read_csv('../data/02_filltimes_clean.csv')

# Distinct campground names, in order of first appearance in the file
cg_names = pd.unique(df_names['cg_name'])

# cg_names

#### Make list of Campground URLs

In [4]:
# Detail-page address; the campground name is appended as the `cg` query value
base_url = 'https://www.nps.gov/applications/glac/cgstatus/camping_detail.cfm?cg='

# One detail-page URL per campground name
fulllink_campgroundstatus_byname = [
    base_url + campground_name
    for campground_name in cg_names
]

# fulllink_campgroundstatus_byname

#### Function to Strip Escape Characters

In [5]:
def strip_escape_chars(text):
    r'''
    Clean the whitespace escape characters out of text returned by getText().

    Leading and trailing whitespace is stripped first. Every newline and tab
    that remains is removed, and every remaining carriage return is turned
    into a single space.

    Example:
    '\n3\n\r\n\t\t\t\t\t\t\t11:00am\r\n\t\t\t\t\t\t\t\n\n' becomes '3 11:00am'
    '''
    # One translation table covers all three characters in a single pass
    escape_table = str.maketrans({'\n': None, '\t': None, '\r': ' '})
    trimmed = text.strip()
    return trimmed.translate(escape_table)

## Execute the Webscrape

In [6]:
campground_attributes = []

# Row position of each attribute within the nested detail table of a campground page
attribute_rows = {
    'fee': 5,
    'sites': 6,
    'flush_toilets': 7,
    'showers': 8,
    'disposal_station': 9,
    'reservations': 10
}

for input_url in fulllink_campgroundstatus_byname:

    # Wait half a second between requests
    time.sleep(0.5)

    # A timeout keeps a stalled connection from hanging the notebook forever, and
    # raise_for_status() reports an HTTP error directly instead of letting it
    # surface later as an IndexError while parsing an error page.
    req = requests.get(input_url, timeout=30)
    req.raise_for_status()
    soup = BeautifulSoup(req.content,'html.parser')

    # The campground name is the second " - "-separated piece of the page title
    title = soup.find('title').getText()
    name = title.split(" - ")[1].replace(" Campground Information","")

    # Rows of the first table nested inside the fifth row of the page's first table
    location = soup.find_all('table')[0].find_all('tr')[4].find_all('table')[0].find_all('tr')

    # Get the text, strip escape characters, and remove the row title
    # (the value is whatever follows the first non-breaking space or hyphen)
    attributes = {}
    for attribute_name, row_number in attribute_rows.items():
        row_text = strip_escape_chars(location[row_number].getText())
        attributes[attribute_name] = row_text.replace('\xa0',"-").split("-")[1]

    # RV sites? Flagged by a trailing "*" on the site count. endswith() also copes
    # with an empty value, where indexing [-1] would raise IndexError.
    if attributes['sites'].endswith("*"):
        attributes['rv'] = 'Yes'
        attributes['sites'] = attributes['sites'][:-1]
    else:
        attributes['rv'] = 'No'

    # Add the campground name
    attributes['cg_name'] = name

    # Clean up fee: keep the text before " / ", drop the "$" and anything after the "."
    attributes['fee'] = attributes['fee'].split(" / ")[0].replace("$","").split(".")[0]

    campground_attributes.append(attributes)

df_cg_attributes = pd.DataFrame(campground_attributes)

# Put the campground name first (it was the last key added to each record)
cols = df_cg_attributes.columns.tolist()
cols = cols[-1:] + cols[:-1]
df_cg_attributes = df_cg_attributes[cols]

## Clean the Dataframe

#### Set the Campground Name as the Index

In [7]:
# Guarded so the cell can be re-run: once 'cg_name' is the index it is no longer
# a column, and an unguarded set_index('cg_name') would raise KeyError.
if df_cg_attributes.index.name != 'cg_name':
    df_cg_attributes = df_cg_attributes.set_index('cg_name')

#### Binarize Yes/No Features

In [8]:
# Columns scraped as "Yes"/"No" text (plus the derived 'rv' flag)
cols_to_binarize = ['flush_toilets',
                    'showers',
                    'disposal_station',
                    'reservations',
                    'rv']

for i in cols_to_binarize:
    # Skip columns that are already numeric. Without this guard, re-running the
    # cell compares 0/1 against "Yes" and silently resets every value to 0.
    if not pd.api.types.is_numeric_dtype(df_cg_attributes[i]):
        # Exactly "Yes" becomes 1; any other value becomes 0
        df_cg_attributes[i] = np.where(df_cg_attributes[i] == "Yes", 1, 0)

#### Convert Fee and Sites to Integer

In [9]:
# Both columns were scraped as digit strings; cast each one to int in turn
for numeric_col in ('sites', 'fee'):
    df_cg_attributes[numeric_col] = df_cg_attributes[numeric_col].astype(int)

#### Designate 'Primitive'

In [10]:
# Campgrounds flagged as primitive; every other campground keeps the default 0
primitive_campgrounds = ['Cut Bank', 'Logging Creek', 'Quartz Creek']

df_cg_attributes['primitive'] = 0
df_cg_attributes.loc[primitive_campgrounds, 'primitive'] = 1

#### Designate 'Isolated'



In [11]:
# Campgrounds flagged as isolated; every other campground keeps the default 0
isolated_campgrounds = [
    'Cut Bank',
    'Many Glacier',
    'Two Medicine',
    'Bowman Lake',
    'Kintla Lake',
]

df_cg_attributes['isolated'] = 0
df_cg_attributes.loc[isolated_campgrounds, 'isolated'] = 1

## Identify Nearest Weather Station

#### Identify Campground Lat Longs

In [12]:
# Coordinates looked up by hand via Google (only 13 campgrounds).
# Each row is [latitude, longitude]; rows are listed alphabetically by campground.
lat_long_list = [
    [48.5277, -113.9932],  # Apgar
    [48.6780, -113.8187],  # Avalanche
    [48.8288, -114.2027],  # Bowman Lake
    [48.6019, -113.3828],  # Cut Bank
    [48.5478, -113.9853],  # Fish Creek
    [48.9358, -114.3456],  # Kintla Lake
    [48.6985, -114.1917],  # Logging Creek
    [48.7967, -113.6773],  # Many Glacier
    [48.7214, -114.2245],  # Quartz Creek
    [48.6954, -113.5214],  # Rising Sun
    [48.6064, -113.8855],  # Sprague Creek
    [48.7532, -113.4414],  # St. Mary
    [48.4915, -113.3646],  # Two Medicine
]

# Split the pairs into parallel latitude and longitude lists
lats, longs = (list(column) for column in zip(*lat_long_list))

#### Create a Tuple

In [13]:
# Campground names in the same (alphabetical) order as the hand-entered coordinates
lat_long_cg_names = ['Apgar',
                     'Avalanche',
                     'Bowman Lake',
                     'Cut Bank',
                     'Fish Creek',
                     'Kintla Lake',
                     'Logging Creek',
                     'Many Glacier',
                     'Quartz Creek',
                     'Rising Sun',
                     'Sprague Creek',
                     'St. Mary',
                     'Two Medicine']

# Key each (lat, long) tuple by campground name. Attaching the tuples by position
# would silently mislabel campgrounds whenever the attribute table's row order
# differs from the order the coordinates were typed in.
geocodes_by_name = pd.Series(list(zip(lats,longs)), index = lat_long_cg_names)

df_cg_lat_long = pd.DataFrame(index = df_cg_attributes.index)
df_cg_lat_long['geocode_tuple'] = geocodes_by_name.reindex(df_cg_lat_long.index)

# Every scraped campground must have received a coordinate pair
assert df_cg_lat_long['geocode_tuple'].notna().all(), "campground without coordinates"

#### Read in Weather Locations

In [14]:
# Cleaned weather observations; LATITUDE and LONGITUDE locate each row's station
df_wx = pd.read_csv('../data/03a_weather_clean.csv')

# Pair the two coordinate columns into one (lat, long) tuple per row
df_wx['geocode_lat_long'] = [
    (station_lat, station_long)
    for station_lat, station_long in zip(df_wx['LATITUDE'], df_wx['LONGITUDE'])
]

In [15]:
# Distinct weather station names present in the weather data
pd.unique(df_wx['wx_station'])

array(['east_glac', 'many_glac', 'st_mary', 'west_glac'], dtype=object)

#### Visualize Nearest Weather Station

In [16]:
# Base map centred on the coordinates below
# NOTE(review): newer folium releases may no longer bundle the 'Stamen Terrain'
# tiles; confirm this still renders in the target environment.
m = folium.Map(location=[48.6966, -113.7182], tiles='Stamen Terrain', zoom_start=9)

# (points, radius, colour) per layer: small red circles for campgrounds,
# large blue circles for the distinct weather station locations
marker_layers = [
    (df_cg_lat_long['geocode_tuple'], 5, 'red'),
    (df_wx['geocode_lat_long'].unique(), 20, 'blue'),
]

for points, marker_radius, marker_color in marker_layers:
    for point in points:
        marker = folium.CircleMarker(location=point, radius=marker_radius, color=marker_color)
        marker.add_to(m)

# Display the map as the cell output
m

#### Manually Map to Nearest Station

In [17]:
# Hand-assigned weather station for each campground
campgrounds_by_station = {
    'west_glac': ['Apgar',
                  'Avalanche',
                  'Bowman Lake',
                  'Fish Creek',
                  'Kintla Lake',
                  'Logging Creek',
                  'Quartz Creek',
                  'Sprague Creek'],
    'many_glac': ['Many Glacier'],
    'st_mary': ['St. Mary',
                'Rising Sun'],
    'east_glac': ['Cut Bank',
                  'Two Medicine'],
}

# Write each station name onto the rows of its campgrounds
for station, campgrounds in campgrounds_by_station.items():
    df_cg_attributes.loc[campgrounds, 'nearest_wx_station'] = station

#### Some Serious Correlation Issues but we'll address these when we conduct EDA

In [18]:
# Correlate the numeric columns only. 'nearest_wx_station' holds strings, and
# corr() on a frame with a non-numeric column raises on pandas >= 2.0, where such
# columns are no longer dropped silently. select_dtypes works on old and new pandas.
df_cg_attributes.select_dtypes(include='number').corr()

Unnamed: 0,fee,sites,flush_toilets,showers,disposal_station,reservations,rv,primitive,isolated
fee,1.0,0.790601,0.919422,0.497379,0.744852,0.47552,0.706289,-0.863869,-0.166252
sites,0.790601,1.0,0.754463,0.508649,0.842721,0.573261,0.627272,-0.592464,-0.267682
flush_toilets,0.919422,0.754463,1.0,0.433013,0.731925,0.3371,0.843274,-0.69282,-0.35
showers,0.497379,0.508649,0.433013,1.0,0.591608,0.778499,0.365148,-0.3,-0.433013
disposal_station,0.744852,0.842721,0.731925,0.591608,1.0,0.460566,0.617213,-0.507093,-0.09759
reservations,0.47552,0.573261,0.3371,0.778499,0.460566,1.0,0.284268,-0.23355,-0.3371
rv,0.706289,0.627272,0.843274,0.365148,0.617213,0.284268,1.0,-0.426006,-0.158114
primitive,-0.863869,-0.592464,-0.69282,-0.3,-0.507093,-0.23355,-0.426006,1.0,-0.057735
isolated,-0.166252,-0.267682,-0.35,-0.433013,-0.09759,-0.3371,-0.158114,-0.057735,1.0


### Export to CSV


In [19]:
# Write the attribute table (index = campground name) for the later steps
attributes_csv_path = '../data/03d_campground_attributes_clean.csv'
df_cg_attributes.to_csv(attributes_csv_path)