# Purpose
* Create the base tracking tables for National Parks Passport Stamps
* Pulled on February 3, 2025.

# Setup

## Packages

In [1]:
###############################################################################
# GENERAL #
import os
import pathlib as pl
import zipfile


###############################################################################
# ANALYSIS #
import geopandas as gpd
import numpy as np
import pandas as pd

###############################################################################
# LOCAL #

ROOT_DIR = "/Users/evancanfield/Documents/Projects/national_park_passport_stamps/"


## Data

### Google Map Export

#### Unzip KMZ File

In [2]:
def kmz_to_kml(kmz_file, output_dir):
    kmz_file_name = f'{kmz_file.stem}.kml'
    with zipfile.ZipFile(kmz_file, 'r') as kmz:
        for file in kmz.namelist():
            if file.endswith('.kml'):
                kml_content = kmz.read(file)
                kml_path = pl.Path(output_dir, kmz_file_name)
                with open(kml_path, 'wb') as kml_file:
                    kml_file.write(kml_content)
                print(f"KML file extracted to: {kml_path}")

# Define path to kml file
date_of_conversion = "2025-02-03"
map_stem = f"national_park_passport_stamp_series_export_{date_of_conversion}"
map_dir = pl.Path(ROOT_DIR, "data", "raw")
map_kml_file = pl.Path(map_dir, f'{map_stem}.kml')
map_kmz_file = pl.Path(map_dir, f'{map_stem}.kmz')

if not map_kml_file.exists():

    kmz_to_kml(map_kmz_file, map_kmz_file.parent)

else:
    print('KML File Aready Exists.')

KML file extracted to: /Users/evancanfield/Documents/Projects/national_park_passport_stamps/data/raw/national_park_passport_stamp_series_export_2025-02-03.kml


#### Read KML File

In [3]:
# Identify all kml layers
layers = gpd.list_layers(map_kml_file)

# Init storage
dct_map = {}


for layer in layers['name'].unique():
    dct_map[layer] = gpd.read_file(
        map_kml_file,
        layer = layer
    )

### Visited Parks

In [4]:
visited_path = pl.Path(
    ROOT_DIR,
    'data',
    'raw',
    f'national_park_passport_stamp_series_visited_{date_of_conversion}.csv'
)

df_visited = pd.read_csv(visited_path)

# Processing

## Clean Up Google Map Data

In [5]:
# Initialize varaiables
frames = []
drop_cols = ['description', 'geometry']


for region, gdf in dct_map.items():
    print(80 * '-')
    print(region)
    print()

    # Set columns to lower case
    cols = gdf.columns
    gdf.columns = [col.lower() for col in cols]

    # Extract year
    gdf['year'] = gdf['name'].apply(
        lambda x: int(x.split('-')[0])
    )

    # Extract name
    gdf['name'] = gdf['name'].apply(
        lambda x: x.split(' - ')[-1]
    )

    # Remove (1/5) (1988 National Stickers)
    gdf['name'] = gdf['name'].apply(
        lambda x: x.replace("(1/5)", "")
    )

    # Latitude
    gdf['latitude'] = gdf['geometry'].apply(
        lambda x: x.y
    )

    # Longitude
    gdf['longitude'] = gdf['geometry'].apply(
        lambda x: x.x
    )

    # Convert to dataframe and drop select columns
    df = pd.DataFrame(gdf).drop(columns = drop_cols)

    # Assin region to a column
    df['region'] = region

    frames.append(df)

df_stamps = pd.concat(frames)

# Sort by Year and Name
df_stamps = df_stamps.sort_values(
    ['region', 'year', 'name'], 
    ascending=[True, True, True]
)

--------------------------------------------------------------------------------
National

--------------------------------------------------------------------------------
North Atlantic

--------------------------------------------------------------------------------
Mid-Atlantic

--------------------------------------------------------------------------------
Southeast

--------------------------------------------------------------------------------
Midwest

--------------------------------------------------------------------------------
Southwest

--------------------------------------------------------------------------------
Rocky Mountain

--------------------------------------------------------------------------------
Western

--------------------------------------------------------------------------------
Pacific Northwest & Alaska



In [6]:
df_stamps

Unnamed: 0,name,year,latitude,longitude,region
0,Colonial National Historical Park,1986,37.229134,-76.503903,Mid-Atlantic
1,Hopewell Furnace National Historic Site,1987,40.209727,-75.769126,Mid-Atlantic
34,Gettysburg National Military Park,1988,39.810644,-77.227106,Mid-Atlantic
2,Assateague Island National Seashore,1989,38.060797,-75.236238,Mid-Atlantic
35,Appomattox Court House National Historical Park,1990,37.377520,-78.796007,Mid-Atlantic
...,...,...,...,...,...
30,Pu'ukohola Heiau National Historic Site,2021,20.025629,-155.821777,Western
31,Great Basin National Park,2022,38.929980,-114.263379,Western
32,Pipe Spring National Monument,2023,36.862533,-112.737463,Western
33,Wupatki National Monument,2024,35.559984,-111.393527,Western


## Combine Region Maps into Single Table

In [7]:
# Determine which parks were visited
visited_parks = df_visited['name'].to_list()
visited_parks = [
    park \
        .split(' - ')[-1] \
        .replace('\xa0(1/5)', '') \
        
    for park in visited_parks
    ]

visited_parks = sorted(visited_parks)

# Create boolean column for determining visited parks
df_stamps['visited'] = df_stamps['name'].apply(
    lambda x: 'Yes' if x in visited_parks else 'No'
)

# Count number of parks visited
n_parks = int(df_stamps['visited'].value_counts().get("Yes"))

# Check visited matching worked
print('Number of Visited Parks:') 
print(f'-- Visited List:\t{len(visited_parks):,}')
print(f'-- Stamp Table:\t\t{n_parks:,}')

Number of Visited Parks:
-- Visited List:	52
-- Stamp Table:		52


In [8]:
df_stamps['region'].unique()

array(['Mid-Atlantic', 'Midwest', 'National', 'North Atlantic',
       'Pacific Northwest & Alaska', 'Rocky Mountain', 'Southeast',
       'Southwest', 'Western'], dtype=object)

# Export

In [9]:
processed_data_dir = pl.Path(ROOT_DIR, "data", "processed")
stamp_table_name = f"national_park_passport_stamp_series_export_{date_of_conversion}.csv"
stamp_table_path = pl.Path(processed_data_dir, stamp_table_name)

df_stamps.to_csv(stamp_table_path, index = False)