In [None]:
#imports 
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import psycopg2
import time
import random

## Data Base Setup

In [5]:
DB_NAME = "camping"
DB_USER = "arrowboy"

# example connect and query
try:
    # Connect to the PostgreSQL database
    conn = psycopg2.connect(
        dbname=DB_NAME,
        user=DB_USER
    )

    # Create a cursor to interact with the DB
    cur = conn.cursor()

    # Example: create a simple query
    cur.execute("SELECT name FROM campsites;")
    camps = cur.fetchall()

    for camp in camps:
        print(camp)

    # Clean up
    cur.close()
    conn.close()

except Exception as e:
    print("Error connecting to the database:", e)

## List of Nation Park URLs

In [6]:
base_url = "https://www.fs.usda.gov/" # pacific southwest
rest_of_it = "/recreation/camping-cabins"

# not usable for the URLs 
r05_urls = [
    "r05/angeles",
    "r05/klamath", # butte valley, 
    "r05/cleveland",
    "r05/eldorado",
    "r05/sequoia",
    "r05/inyo",
    "r05/klamath",
    "r05/laketahoebasin",
    "r05/lassen",
    "r05/lospadres",
    "r05/mendocino",
    "r05/modoc",
    "r05/plumas",
    "r05/sanbernardino",
    "r05/shasta-trinity",
    "r05/sierra",
    "r05/sixrivers",
    "r05/stanislaus",
    "r05/tahoe"
]

r04_urls = [
    "r04/humboldt-toiyabe"
]

r06_urls = [
    "r06/rogue-siskiyou"
]

# full list of URLs
all_urls = r05_urls + r04_urls + r06_urls

for url in all_urls:
    all = base_url + url + rest_of_it 
    print(all)

https://www.fs.usda.gov/r05/angeles/recreation/camping-cabins
https://www.fs.usda.gov/r05/klamath/recreation/camping-cabins
https://www.fs.usda.gov/r05/cleveland/recreation/camping-cabins
https://www.fs.usda.gov/r05/eldorado/recreation/camping-cabins
https://www.fs.usda.gov/r05/sequoia/recreation/camping-cabins
https://www.fs.usda.gov/r05/inyo/recreation/camping-cabins
https://www.fs.usda.gov/r05/klamath/recreation/camping-cabins
https://www.fs.usda.gov/r05/laketahoebasin/recreation/camping-cabins
https://www.fs.usda.gov/r05/lassen/recreation/camping-cabins
https://www.fs.usda.gov/r05/lospadres/recreation/camping-cabins
https://www.fs.usda.gov/r05/mendocino/recreation/camping-cabins
https://www.fs.usda.gov/r05/modoc/recreation/camping-cabins
https://www.fs.usda.gov/r05/plumas/recreation/camping-cabins
https://www.fs.usda.gov/r05/sanbernardino/recreation/camping-cabins
https://www.fs.usda.gov/r05/shasta-trinity/recreation/camping-cabins
https://www.fs.usda.gov/r05/sierra/recreation/camp

## First Populations of DB
- will need to loop through all the park urls for now just did the first one
    - mostly just need to change the references that say {list}[0] to an iterator in a for loop
- was able to get site name, site url, forest name, forest url
- put in a pandas dataframe for the forest: camp_initial_df

In [7]:
end_of_url = '?items_per_page=50'
full_urls = []
park_urls = []
for url in all_urls:
    t = base_url + url + rest_of_it + end_of_url
    p = base_url + url
    full_urls.append(t)
    park_urls.append(p)

r = requests.get(full_urls[0])

soup = BeautifulSoup(r.text, 'html.parser')

# Find the container of all campsites
container = soup.find("div", class_="rows__container")

# Each campsite is a "main-view-item"
campsite_cards = container.find_all("div", class_="main-view-item")

In [8]:
site_name = []
site_url = []
for site in campsite_cards:
    link_tag = site.find("a")
    if link_tag:
        site_url.append(base_url  + link_tag.get("href"))   
        site_name.append(link_tag.get_text(strip=True))
    else:
        continue
    
# grabbing the park name
pr = requests.get(park_urls[0]) 
soup = BeautifulSoup(pr.text, "html.parser")

breadcrumb = soup.find("nav", class_="usa-breadcrumb")
breadcrumb_items = breadcrumb.find_all("li", class_="usa-breadcrumb__list-item")
# Get the last one (forest name)
forest_name = breadcrumb_items[-1].get_text(strip=True)

park_name = [forest_name]*len(site_name)
# gabbing the park url
park_url = [park_urls[0]]*len(site_name)

camp_initial_df = pd.DataFrame({"site_name":site_name, "site_url":site_url, "park_url":park_url, "park_name":park_name})

## Getting more static data for the camp site

In [None]:
def get_accordion_text_by_label(soup, label):
    """Finds text content inside an accordion by its button label"""
    button = soup.find("button", string=lambda s: s and label in s)
    if button:
        content_div = button.find_parent("h3").find_next_sibling("div")
        return content_div.get_text(strip=True, separator="\n") if content_div else None
    return None

season_of_use_camps = []*len(camp_initial_df)
fee_info_camps = []*len(camp_initial_df)
contact_info_camps = []*len(camp_initial_df)
info_center_camps = []*len(camp_initial_df)
latitude_camps = []*len(camp_initial_df)
longitude_camps = []*len(camp_initial_df)
directions_camps = []*len(camp_initial_df)
restrooms_camps = []*len(camp_initial_df)
water_camps = []*len(camp_initial_df)
overview_camps = []*len(camp_initial_df)
amenities_camps = []*len(camp_initial_df)

for i, camp in enumerate(camp_initial_df['site_url']):
    delay = random.uniform(1, i)
    time.sleep(delay)
    # go camp by camp url
    # idea: camp_test = camp_initial_df['site_url'].loc[i]
    camp_test = camp
    print(camp_test)

    r = requests.get(camp_test)

    soup = BeautifulSoup(r.text, 'html.parser')
    accordion_div = soup.find("div", class_="usa-accordion general__accordion-container margin-top-3")
    local_div = soup.find("div", class_="margin-top-5")

    # --- Extract fields ---
    season_of_use = get_accordion_text_by_label(soup, "Seasons of Use")
    fee_info = get_accordion_text_by_label(soup, "Fee Site and Info")
    contact_info_raw = get_accordion_text_by_label(soup, "Contact Information")
    info_center_raw = get_accordion_text_by_label(soup, "Information Center")

    # directions and lat/long and amenities/facilities
    mt5_blocks = soup.find_all("div", class_="margin-top-5")

    latitude = None
    longitude = None
    directions = None
    restrooms = None
    water = None

    for block in mt5_blocks:
        h2 = block.find("h2")
        if h2 and "Getting There" in h2.get_text(strip=True):
            # Parse this block for coordinates and directions
            for p in block.find_all("p"):
                text = p.get_text(strip=True)
                if "Latitude:" in text:
                    latitude = text.replace("Latitude:", "").strip()
                elif "Longitude:" in text:
                    longitude = text.replace("Longitude:", "").strip()
                elif "From" in text or "Take" in text:
                    directions = text.strip()
            break


    for block in mt5_blocks:
        h2 = block.find("h2")
        if h2 and "Facility and Amenity Information" in h2.get_text(strip=True):
            for p in block.find_all("p"):
                text = p.get_text(strip=True)
                if "Restrooms" in text:
                    restrooms = text
                elif "water" in text:
                    water = text
            break
        
    # overview and amenities
    overview = soup.find("div", class_="rec-intro")
    overview = overview.find("div", class_="field field--name-field-rec-description field--type-text-with-summary field--label-hidden field__item")

    overview_text = None
    amenities_text = None
    for p in overview.find_all("p"):
        text = p.get_text(strip=True)
        if "Overview" in text: # its the next p tag after the overview
            next_p = p.find_next_sibling("p")
            if next_p:
                overview_text = next_p.get_text(strip=True)

        if "Amenities:" in text:
            amenities_text = text.replace("Amenities:", "").strip()
            
        if overview_text and amenities_text:
            break

    # incase aminities_text are not found
    if not amenities_text: # go to "General Information" section
        amenities_text = ""
        general_info = soup.find("div", class_="field field--name-field-rec-general-info field--type-text-long field--label-hidden field__item")
        if general_info:
            for p in general_info.find_all("p"):
                if "Amenities" in p.text:
                        # Remove label from the paragraph
                        stripped_text = p.get_text(strip=True).replace("Amenities:", "").strip()

                        if stripped_text:
                            # Text exists after "Amenities:", use it
                            amenities_text = stripped_text
                        else:
                            amenities = p.find_next_sibling("ul")
                            for li in amenities.find_all("li"):
                                amenities_text += li.text.strip() + "\n"
        else:
            amenities_text = None
    
    # into list for df later
    season_of_use_camps.append(season_of_use)
    fee_info_camps.append(fee_info)
    contact_info_camps.append(contact_info_raw)
    info_center_camps.append(info_center_raw)
    latitude_camps.append(latitude)
    longitude_camps.append(longitude)
    directions_camps.append(directions)
    restrooms_camps.append(restrooms)
    water_camps.append(water)
    overview_camps.append(overview_text)
    amenities_camps.append(amenities_text)
    

https://www.fs.usda.gov/r05/angeles/recreation/appletree-campground
https://www.fs.usda.gov/r05/angeles/recreation/bandido-group-campground
https://www.fs.usda.gov/r05/angeles/recreation/bear-campground
https://www.fs.usda.gov/r05/angeles/recreation/big-rock-campground
https://www.fs.usda.gov/r05/angeles/recreation/blue-ridge-campground
https://www.fs.usda.gov/r05/angeles/recreation/buckhorn-campground
https://www.fs.usda.gov/r05/angeles/recreation/chilao-campground
https://www.fs.usda.gov/r05/angeles/recreation/coldbrook-campground
https://www.fs.usda.gov/r05/angeles/recreation/crystal-lake-recreation-area-campground
https://www.fs.usda.gov/r05/angeles/recreation/deer-flat-group-campground
https://www.fs.usda.gov/r05/angeles/recreation/glenn-camp-campground-hike-or-bike-7-miles
https://www.fs.usda.gov/r05/angeles/recreation/guffy-campground
https://www.fs.usda.gov/r05/angeles/recreation/horse-flat-campground
https://www.fs.usda.gov/r05/angeles/recreation/jackson-flat-group-campground


In [77]:
# append back to original df
camp_initial_df['season_of_use'] = season_of_use_camps
camp_initial_df['fee_info'] = fee_info_camps
camp_initial_df['contact_info'] = contact_info_camps
camp_initial_df['info_center'] = info_center_camps
camp_initial_df['latitude'] = latitude_camps
camp_initial_df['longitude'] = longitude_camps
camp_initial_df['directions'] = directions_camps
camp_initial_df['restrooms'] = restrooms_camps
camp_initial_df['water'] = water_camps  
camp_initial_df['overview'] = overview_camps
camp_initial_df['amenities'] = amenities_camps

camp_initial_df['site_url'] = camp_initial_df['site_url'].apply(lambda x: x.replace('www.fs.usda.gov//', 'www.fs.usda.gov/'))

In [None]:
# and save, so dont have to run again
camp_initial_df.to_csv('camp_initial_df.csv', index=False)

## More Data: 
- lat and long, facilities and amenities (water bathroom), overview, and amenities

In [64]:
# first url of campsite
camp_1_url = camp_initial_df['site_url'].iloc[5]
print(camp_1_url)

# getting 
r = requests.get(camp_1_url)
soup = BeautifulSoup(r.text, "html.parser")

https://www.fs.usda.gov/r05/angeles/recreation/buckhorn-campground


In [65]:
# directions and lat/long and amenities/facilities
mt5_blocks = soup.find_all("div", class_="margin-top-5")

latitude = None
longitude = None
directions = None
restrooms = None
water = None

for block in mt5_blocks:
    h2 = block.find("h2")
    if h2 and "Getting There" in h2.get_text(strip=True):
        # Parse this block for coordinates and directions
        for p in block.find_all("p"):
            text = p.get_text(strip=True)
            if "Latitude:" in text:
                latitude = text.replace("Latitude:", "").strip()
            elif "Longitude:" in text:
                longitude = text.replace("Longitude:", "").strip()
            elif "From" in text or "Take" in text:
                directions = text.strip()
        break


for block in mt5_blocks:
    h2 = block.find("h2")
    if h2 and "Facility and Amenity Information" in h2.get_text(strip=True):
        for p in block.find_all("p"):
            text = p.get_text(strip=True)
            if "Restrooms" in text:
                restrooms = text
            elif "water" in text:
                water = text
        break
    
# overview and amenities
overview = soup.find("div", class_="rec-intro")
overview = overview.find("div", class_="field field--name-field-rec-description field--type-text-with-summary field--label-hidden field__item")

overview_text = None
amenities_text = None
for p in overview.find_all("p"):
    text = p.get_text(strip=True)
    if "Overview" in text: # its the next p tag after the overview
        next_p = p.find_next_sibling("p")
        if next_p:
            overview_text = next_p.get_text(strip=True)

    if "Amenities:" in text:
        amenities_text = text.replace("Amenities:", "").strip()
        
    if overview_text and amenities_text:
        break

# incase aminities_text are not found
if not amenities_text: # go to "General Information" section
    amenities_text = ""
    general_info = soup.find("div", class_="field field--name-field-rec-general-info field--type-text-long field--label-hidden field__item")
    if general_info:
        for p in general_info.find_all("p"):
            if "Amenities" in p.text:
                    # Remove label from the paragraph
                    stripped_text = p.get_text(strip=True).replace("Amenities:", "").strip()

                    if stripped_text:
                        # Text exists after "Amenities:", use it
                        amenities_text = stripped_text
                    else:
                        amenities = p.find_next_sibling("ul")
                        for li in amenities.find_all("li"):
                            amenities_text += li.text.strip() + "\n"
    else:
        amenities_text = None



print("Latitude:", latitude)
print("Longitude:", longitude)
print("Directions:", directions)
print("Restrooms:", restrooms)
print("Water:", water)
print("Overview:", overview_text)
print("Amenities:", amenities_text)

Latitude: 34.34611111
Longitude: -117.9127778
Directions: From Interstate 210 in the city of La Cañada Flintridge, exit at Angeles Crest Highway (CA-2) and drive northeast for 36 miles.
Restrooms: Restrooms are available at this site.
Water: Potable water is available at this site.
Overview: Buckhorn Campground can be the quick escape you need from LA. The campground is drenched in fresh air with the smell of incense cedars surrounding visitors. Overnight, campers may spot seasonal constellations, planets, and the occasional shooting star. During the day, visitors can venture to nearby waterfalls or trails including the Burkhart Trail, Pacific Crest Trail, and Silver Moccasin. Buckhorn Campground has much to offer.
Amenities: 38 total campsites with 3 accessible campsites. 3-single SST toilets, 2-double SST toilets, 2-double vault toilets. Picnic tables. Bear-proof boxes available for storage of food and scented items.
