In [1]:
%matplotlib inline
import pandas as pd
import regex as re
import json
import altair as alt
from vega_datasets import data
import matplotlib.pyplot as plt

Datasets
- https://www.kaggle.com/code/regionalbird/national-parks/input
- https://www.kaggle.com/datasets/thedevastator/the-united-states-national-parks/?select=df_2.csv 

In [2]:
# Master list of national parks (full park name + location).
# https://github.com/joshuakemmerling/AwesomeData/blob/master/national-parks.csv
parks = pd.read_csv('data/national-parks.csv').rename(
    columns={"name": "park", "location": "state"}
)
# 'park' keeps the full name from the CSV; 'name' is the short form with the
# words 'National' and 'Park' removed, used as the join key for the other tables.
parks['name'] = [
    full_name.replace('National', '').replace('Park', '').strip()
    for full_name in parks['park']
]
display(parks.head())

# Park coordinates (longitude / latitude) keyed by a free-text label.
# https://github.com/sughodke/D3-US-Graph/blob/master/nationalparks.csv
parks_long = pd.read_csv('data/nationalparks.csv')
# Remove stray double quotes and surrounding whitespace from the label.
parks_long['details'] = [
    label.replace('"', "").strip() for label in parks_long['details']
]
# The short park name is everything after the first two space-separated tokens,
# e.g. 'USA-National Park Katmai' -> 'Katmai'.
parks_long['name'] = [
    ' '.join(label.split(" ")[2:]) for label in parks_long['details']
]
display(parks_long.head())

# Copy longitude/latitude from parks_long onto parks, matched on the short park name.
# -1.0 stays as the "no coordinates found" sentinel; it is a float so that writing
# real coordinates into the columns does not force an int -> float dtype change.
parks['longitude'] = -1.0
parks['latitude'] = -1.0
for i, park in parks.iterrows():
    park_name = park['name']
    # Prefer an exact name match. Treating the name as a regex prefix (as
    # str.match does) lets a short name such as 'Glacier' pick up a longer one
    # such as 'Glacier Bay' when that row happens to come first.
    candidates = parks_long[parks_long['name'] == park_name]
    if candidates.empty:
        # Fall back to a literal (non-regex) prefix match so that names which
        # only matched as a prefix still resolve.
        candidates = parks_long[parks_long['name'].str.startswith(park_name, na=False)]
    if candidates.empty:
        # No coordinates available for this park: keep the sentinel instead of
        # failing on an empty index lookup.
        continue
    row = candidates.iloc[0]
    parks.loc[i, ['longitude', 'latitude']] = [row.longitude, row.latitude]
        
# Manual [name, longitude, latitude] overrides for parks whose coordinates were
# not filled in by the lookup above, plus a correction for Glacier
# (the looked-up value for it was wrong).
coordinates = [
    ['Glacier', -113.7870, 48.7596],
    ['American Samoa', -170.68333, -14.25833],
    ['Gateway Arch', -90.184776, 38.624691],
    ['Indiana Dunes', -87.053762, 41.633349],
    ['New River Gorge', -81.042973, 37.896731],
    ['Pinnacles', -121.197243, 36.491508],
    ['Virgin Islands', -64.7685, 18.3515],
    ['White Sands', -106.171669, 32.779720],
]

for park_name, lon, lat in coordinates:
    is_target = parks['name'] == park_name
    parks.loc[is_target, ['longitude', 'latitude']] = [lon, lat]

display(parks.head())

# Visitor, area and description data.
# https://www.kaggle.com/datasets/thedevastator/the-united-states-national-parks/?select=df_2.csv
def _chunk_words(text, words_per_line=7):
    """Split `text` into a list of strings of at most `words_per_line` whitespace-separated words."""
    words = text.split()
    return [' '.join(words[i:i + words_per_line]) for i in range(0, len(words), words_per_line)]


# Drop the first column and the 'Image' column, then give the rest short names.
parks_3 = pd.read_csv('data/df_2.csv').iloc[:, 1:].drop(columns=['Image'])
parks_3.columns = ['name', 'location', 'date_established', 'area', 'visitors_2021', 'description']
parks_3['name'] = parks_3['name'].apply(lambda s: s.replace(' *', "").strip())
# Keep only the leading number of the area string (acres, per the original note)
# and store it as a whole number.
parks_3['area'] = parks_3['area'].apply(lambda s: int(float(s.replace(',', '').split(' ')[0])))
# Strip the bracketed numeric citation markers and the '(WHS)' / '(BR)' tags.
# The pattern is a raw string so '\[' and '\d' are not parsed as (invalid)
# string escapes.
parks_3['description'] = parks_3['description'].apply(
    lambda s: re.sub(r"\[\d+\]*", '', s).replace('(WHS)', '').replace('(BR)', '').strip()
)
# Break the description into 7-word chunks so it renders as several short lines.
parks_3['description2'] = parks_3['description'].apply(_chunk_words)
# Make sure spelling is the same as in the other tables.
# NOTE(review): row labels 29, 30 and 59 are tied to the current row order of
# df_2.csv; re-check them if the source file changes.
parks_3.loc[[29, 30, 59], 'name'] = ['Haleakala', 'Hawaii Volcanoes', 'Wrangell-St. Elias']
display(parks_3.head())

# Attach the visitor/area/description columns to every park. This is a left
# join on the short name, so parks without a match in parks_3 are kept.
parks = pd.merge(parks, parks_3, how='left', on='name')
display(parks.head())
parks.to_csv('data/cleaned/parks.csv')

Unnamed: 0,park,state,name
0,Acadia National Park,Maine,Acadia
1,American Samoa National Park,American Samoa Territory,American Samoa
2,Arches National Park,Utah,Arches
3,Badlands National Park,South Dakota,Badlands
4,Big Bend National Park,Texas,Big Bend


Unnamed: 0,longitude,latitude,details,name
0,-154.88689,58.58305,USA-National Park Katmai,Katmai
1,-116.0089,33.9529,USA-National Park Joshua Tree,Joshua Tree
2,-88.8917,47.9624,USA-National Park Isle Royale,Isle Royale
3,-93.0221,34.3796,USA-National Park Hot Springs,Hot Springs
4,-155.3,19.4,USA-National Park Hawaii Volcanoes,Hawaii Volcanoes


Unnamed: 0,park,state,name,longitude,latitude
0,Acadia National Park,Maine,Acadia,-68.0493,44.454
1,American Samoa National Park,American Samoa Territory,American Samoa,-170.68333,-14.25833
2,Arches National Park,Utah,Arches,-109.565,38.77
3,Badlands National Park,South Dakota,Badlands,-102.4343,43.6504
4,Big Bend National Park,Texas,Big Bend,-103.2432,29.3816


Unnamed: 0,name,location,date_established,area,visitors_2021,description,description2
0,Acadia,"Maine.mw-parser-output .geo-default,.mw-parser...","February 26, 1919",49071,4069098,Covering most of Mount Desert Island and other...,"[Covering most of Mount Desert Island and, oth..."
1,American Samoa,American Samoa14°15′S 170°41′W﻿ / ﻿14.25°S 170...,"October 31, 1988",8256,8495,The southernmost national park is on three Sam...,"[The southernmost national park is on three, S..."
2,Arches,Utah38°41′N 109°34′W﻿ / ﻿38.68°N 109.57°W,"November 12, 1971",76678,1806865,"This site features more than 2,000 natural san...","[This site features more than 2,000 natural, s..."
3,Badlands,South Dakota43°45′N 102°30′W﻿ / ﻿43.75°N 102.50°W,"November 10, 1978",242755,1224226,"The Badlands are a collection of buttes, pinna...","[The Badlands are a collection of buttes,, pin..."
4,Big Bend,Texas29°15′N 103°15′W﻿ / ﻿29.25°N 103.25°W,"June 12, 1944",801163,581220,Named for the prominent bend in the Rio Grande...,"[Named for the prominent bend in the, Rio Gran..."


Unnamed: 0,park,state,name,longitude,latitude,location,date_established,area,visitors_2021,description,description2
0,Acadia National Park,Maine,Acadia,-68.0493,44.454,"Maine.mw-parser-output .geo-default,.mw-parser...","February 26, 1919",49071,4069098,Covering most of Mount Desert Island and other...,"[Covering most of Mount Desert Island and, oth..."
1,American Samoa National Park,American Samoa Territory,American Samoa,-170.68333,-14.25833,American Samoa14°15′S 170°41′W﻿ / ﻿14.25°S 170...,"October 31, 1988",8256,8495,The southernmost national park is on three Sam...,"[The southernmost national park is on three, S..."
2,Arches National Park,Utah,Arches,-109.565,38.77,Utah38°41′N 109°34′W﻿ / ﻿38.68°N 109.57°W,"November 12, 1971",76678,1806865,"This site features more than 2,000 natural san...","[This site features more than 2,000 natural, s..."
3,Badlands National Park,South Dakota,Badlands,-102.4343,43.6504,South Dakota43°45′N 102°30′W﻿ / ﻿43.75°N 102.50°W,"November 10, 1978",242755,1224226,"The Badlands are a collection of buttes, pinna...","[The Badlands are a collection of buttes,, pin..."
4,Big Bend National Park,Texas,Big Bend,-103.2432,29.3816,Texas29°15′N 103°15′W﻿ / ﻿29.25°N 103.25°W,"June 12, 1944",801163,581220,Named for the prominent bend in the Rio Grande...,"[Named for the prominent bend in the, Rio Gran..."


### working exploratory viz

In [3]:
# Base layer: state outlines from the us_10m topology, drawn in light gray.
states = alt.topo_feature(data.us_10m.url, feature='states')

background = (
    alt.Chart(states)
    .mark_geoshape(fill='lightgray', stroke='white')
    .project(type='albersUsa')
    .properties(
        title=alt.Title("U.S. National Parks", fontSize=18),
        width=800,
        height=500,
    )
)

# One fixed-size circle per park, with name/state/coordinates on hover.
points = (
    alt.Chart(parks)
    .mark_circle()
    .encode(
        longitude='longitude:Q',
        latitude='latitude:Q',
        size=alt.value(40),
        tooltip=[
            alt.Tooltip('name', title='Name'),
            alt.Tooltip('state', title='State'),
            alt.Tooltip('longitude:Q', title='Longitude'),
            alt.Tooltip('latitude:Q', title='Latitude'),
        ],
    )
)

alt.layer(background, points).interactive()

### viz with some linked scatter plot interaction (area vs. 2021 visitors)

In [4]:
# Base layer: state outlines from the us_10m topology, drawn in light gray.
states = alt.topo_feature(data.us_10m.url, feature='states')

background = (
    alt.Chart(states)
    .mark_geoshape(fill='lightgray', stroke='white')
    .project(type='albersUsa')
    .properties(
        title=alt.Title("U.S. National Parks", fontSize=18),
        width=600,
        height=400,
    )
)

# Interval selection projected onto the park name; it drives both the
# point colors on the map and the filter on the right-hand chart.
highlight = alt.selection_interval(fields=['name'], empty=True)

points = (
    alt.Chart(parks)
    .mark_circle()
    .encode(
        longitude='longitude:Q',
        latitude='latitude:Q',
        size=alt.value(60),
        color=alt.condition(highlight, alt.value('pink'), alt.value('steelblue')),
        tooltip=[
            alt.Tooltip('name', title='Name'),
            alt.Tooltip('state', title='State'),
            alt.Tooltip('longitude:Q', title='Longitude'),
            alt.Tooltip('latitude:Q', title='Latitude'),
        ],
    )
    .add_params(highlight)
)

# Area vs. 2021 visitors for the parks inside the current selection.
bars = (
    alt.Chart(parks)
    .mark_point()
    .encode(
        x='area',
        y='visitors_2021',
        tooltip=[
            alt.Tooltip('name', title='Name'),
            alt.Tooltip('state', title='State'),
        ],
    )
    .transform_filter(highlight)
)

alt.hconcat(alt.layer(background, points), bars)

#### interaction that groups by state

In [5]:
# Base map: state outlines in light gray.
states = alt.topo_feature(data.us_10m.url, feature='states')
background = alt.Chart(
    states,
    title=alt.TitleParams("U.S. National Parks", fontSize=16,
          subtitle="Click on a park to learn more about parks in the state!")
).mark_geoshape(
    fill='lightgray',
    stroke='white'
).project('albersUsa').properties(
    width=800,
    height=400
)

# Parks: clicking a park selects every park that shares its 'state' value.
# California is selected when the chart first renders.
highlight = alt.selection_point(
    fields=["state"], value=[{"state": "California"}]
)
points = alt.Chart(parks).mark_circle().encode(
    color=alt.condition(highlight, alt.value('pink'), alt.value('steelblue')),
    longitude='longitude:Q',
    latitude='latitude:Q',
    size=alt.value(60),
    tooltip=[
        alt.Tooltip('name', title='Name'),
        alt.Tooltip('state', title='State'),
        alt.Tooltip('longitude:Q', title='Longitude'),
        alt.Tooltip('latitude:Q', title='Latitude'),
        alt.Tooltip('description2', title='description')
    ]
).add_params(highlight)

# Info table: one text row per selected park, capped at the first 19 rows.
ranked_text = alt.Chart(parks).mark_text(align='left').encode(
    y=alt.Y('row_number:O', axis=None)
).transform_filter(
    highlight
).transform_window(
    row_number='row_number()'
).transform_window(
    rank='rank(row_number)'
).transform_filter(
    alt.datum.rank < 20
)

# Table columns: each one is the same ranked text chart showing a different field.
park = ranked_text.encode(text='park').properties(
    title=alt.Title(text='Park', align='left')
)
area = ranked_text.encode(text='area:N').properties(
    title=alt.Title(text='Area (acres)', align='left')
)
date = ranked_text.encode(text='date_established').properties(
    title=alt.Title(text='Date Established', align='left')
)
# The title previously read 'Date Established' (copied from the column above).
description = ranked_text.encode(text='description').properties(
    title=alt.Title(text='Description', align='left')
)

# Only the park name and description columns are shown below the map.
text = alt.hconcat(park, description)

# Build chart: map on top, info table underneath.
alt.vconcat(
    background + points,
    text
).resolve_legend(
    color="independent"
).configure_view(
    stroke=None
)

In [6]:
# Show the 7-word chunks generated for the park at row label 0.
parks['description2'][0]

['Covering most of Mount Desert Island and',
 'other coastal islands, Acadia features the tallest',
 'mountain on the Atlantic coast of the',
 'United States, granite peaks, ocean shoreline, woodlands,',
 'and lakes. There are freshwater, estuary, forest,',
 'and intertidal habitats.']

In [11]:
# Base map: state outlines in light gray.
states = alt.topo_feature(data.us_10m.url, feature='states')
background = alt.Chart(
    states,
    title=alt.TitleParams("U.S. National Parks", fontSize=16,
          subtitle="Click on a park to learn more about it!")
).mark_geoshape(
    fill='lightgray',
    stroke='white'
).project('albersUsa').properties(
    width=700,
    height=400
)

# Parks: clicking a circle selects that park by its full name.
# Mount Rainier is selected when the chart first renders.
highlight = alt.selection_point(on='click', fields=["park"], value=[{"park": "Mount Rainier National Park"}])
points = alt.Chart(parks).mark_circle().encode(
    color=alt.condition(highlight, alt.value('hotpink'), alt.value('cornflowerblue')),
    longitude='longitude:Q',
    latitude='latitude:Q',
    # Circle size encodes the 'area' column, so the legend is labelled as area.
    # It previously read "Number of visitors (2021)", which did not match the field.
    # NOTE(review): if visitor counts were the intent, encode 'visitors_2021'
    # here instead and restore the old title.
    size=alt.Size('area', scale=alt.Scale(range=[100, 700]),
                  legend=alt.Legend(
                      orient='none',
                      title=["Park area", "(acres)"],
                      legendX=750, legendY=50)),
    tooltip=[
        alt.Tooltip('name', title='Name'),
        alt.Tooltip('state', title='State'),
        alt.Tooltip('longitude:Q', title='Longitude'),
        alt.Tooltip('latitude:Q', title='Latitude')
    ]
).add_params(highlight)

# Info table: rows are numbered after the selection filter, so the selected
# park always gets a low row number and survives the row cap below.
# (An extra row_number window before the filter was removed; the one after
# the filter overwrote it.)
ranked_text = alt.Chart(parks).mark_text(align='left').encode(
    y=alt.Y('row_number:O', axis=None)
).transform_filter(
    highlight
).transform_window(
    row_number='row_number()'
).transform_filter(
    'datum.row_number<20'
)

# Table columns: each one is the same ranked text chart showing a different field.
park = ranked_text.encode(text='park').properties(title=alt.Title(text='Park', align='left'))
area = ranked_text.encode(text=alt.Text('area:N', format=",.0f")).properties(title=alt.Title(text='Area', align='left'))
state = ranked_text.encode(text='state').properties(title=alt.Title(text='State', align='left'))
established = ranked_text.encode(text='date_established').properties(title=alt.Title(text='Establishment', align='left'))
# 'description2' holds a list of short strings per park (the 7-word chunks).
description = ranked_text.encode(text='description2').properties(title=alt.Title(text='Description', align='left'))

text = alt.hconcat(park, state, established, area, description)

# Build chart: map on top, info table underneath.
alt.vconcat(background + points, text, spacing=10
           ).resolve_legend(color="independent").configure_view(stroke=None)

In [8]:
# Base map: state outlines in light gray. No explicit width/height is set,
# so the chart uses the default view size.
states = alt.topo_feature(data.us_10m.url, feature='states')
background = alt.Chart(
    states,
    title=alt.Title("U.S. National Parks", fontSize=18)
).mark_geoshape(
    fill='lightgray',
    stroke='white'
).project('albersUsa')

# Parks: point selection with no field projection.
highlight = alt.selection_point()
points = alt.Chart(parks).mark_circle().encode(
    color=alt.condition(highlight, alt.value('pink'), alt.value('steelblue')),
    longitude='longitude:Q',
    latitude='latitude:Q',
    size=alt.value(60),
    tooltip=[
        alt.Tooltip('name', title='Name'),
        alt.Tooltip('state', title='State'),
        alt.Tooltip('longitude:Q', title='Longitude'),
        alt.Tooltip('latitude:Q', title='Latitude'),
        alt.Tooltip('description', title='Description')
    ]
).add_params(highlight)

# Info table. Rows are numbered AFTER the selection filter: numbering them
# first meant any selected park whose original row number was 20 or more was
# removed by the row cap and the table came up empty.
ranked_text = alt.Chart(parks).mark_text(align='left').encode(
    y=alt.Y('row_number:O', axis=None)
).transform_filter(
    highlight
).transform_window(
    row_number='row_number()'
).transform_filter(
    alt.datum.row_number < 20
).properties(width=300)

# Table columns: each one is the same ranked text chart showing a different field.
park = ranked_text.encode(text='park').properties(title=alt.Title(text='Park', align='left'))
area = ranked_text.encode(text='area:N').properties(title=alt.Title(text='Area', align='left'))

text = alt.vconcat(park, area)

# Build chart: map on the left, info table on the right.
alt.hconcat(background + points, text
           ).resolve_legend(color="independent").configure_view(stroke=None)

### viz with some hover over state interaction

In [9]:
# Defined here so the cell does not depend on an earlier cell having run.
states = alt.topo_feature(data.us_10m.url, feature='states')

# Hovering a state selects it by its 'id'; nothing is selected initially.
# selection_point / empty=False / add_params are the current spellings of the
# deprecated selection_single / empty='none' / add_selection.
highlight = alt.selection_point(on='mouseover', fields=['id'], empty=False)

background = alt.Chart(
    states,
    title=alt.Title("U.S. National Parks", fontSize=18)
).mark_geoshape().encode(
    # The hovered state turns red; the others are colored by their numeric id.
    color=alt.condition(highlight, alt.value('red'), 'id:Q'),
    tooltip=['id:Q']
).add_params(highlight).project(
    type='albersUsa'
).properties(
    width=900,
    height=600
)

# NOTE(review): the selection is projected on 'id', and the displayed parks
# columns do not include an 'id' field, so this condition presumably never
# turns a park red - confirm whether a state-to-park link was intended.
points = alt.Chart(parks).mark_circle().encode(
    color=alt.condition(highlight, alt.value('red'), alt.value('blue')),
    longitude='longitude:Q',
    latitude='latitude:Q',
    size=alt.value(40),
    tooltip=[
        alt.Tooltip('name', title='Name'),
        # 'state' is the clean state name; 'location' is the raw scraped string
        # with coordinates appended, which is what this tooltip used to show.
        alt.Tooltip('state', title='State'),
        alt.Tooltip('longitude', title='Longitude'),
        alt.Tooltip('latitude', title='Latitude')
    ]
).add_params(highlight)


(background + points)



Plotting real time data: https://stackoverflow.com/questions/71252170/altair-plotting-realtime-data-python

In [10]:
# Base map: state outlines in light gray.
states = alt.topo_feature(data.us_10m.url, feature='states')

background = alt.Chart(
    states,
    title=alt.Title("U.S. National Parks", fontSize=18)
).mark_geoshape(
    fill='lightgray',
    stroke='white'
).project('albersUsa').properties(
    width=600,
    height=400
)

# Clicking a park selects it by short name; nothing is selected initially.
# selection_point / empty=False / add_params are the current spellings of the
# deprecated selection_single / empty='none' / add_selection.
highlight = alt.selection_point(on='click', fields=['name'], empty=False)

points = alt.Chart(parks).mark_circle().encode(
    color=alt.condition(highlight, alt.value('pink'), alt.value('steelblue')),
    longitude='longitude:Q',
    latitude='latitude:Q',
    size=alt.value(60),
    tooltip=[
        alt.Tooltip('name', title='Name'),
        # 'state' is the clean state name; 'location' is the raw scraped string
        # with coordinates appended, which is what this tooltip used to show.
        alt.Tooltip('state', title='State'),
        alt.Tooltip('longitude:Q', title='Longitude'),
        alt.Tooltip('latitude:Q', title='Latitude')
    ]
).add_params(
    highlight
)

# Bar of longitude vs. latitude for the currently selected park.
bars = alt.Chart(parks).mark_bar().encode(
    x='longitude',
    y='latitude'
).transform_filter(
    highlight
)

(background + points) | bars