## Import Data

Data from [https://github.com/rfordatascience/tidytuesday/blob/master/data/2024/2024-10-08/readme.md](https://github.com/rfordatascience/tidytuesday/blob/master/data/2024/2024-10-08/readme.md)

In [1]:
#load libraries
import polars as pl
from lets_plot import *
LetsPlot.setup_html()

#load in data 
url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-10-08/most_visited_nps_species_data.csv"
species_data = pl.read_csv(url)
species_data = species_data.with_columns(
    pl.col("ParkName").str.replace(" National Park$", "", literal=False).alias("ParkName")
)
species_data

ParkCode,ParkName,CategoryName,Order,Family,TaxonRecordStatus,SciName,CommonNames,Synonyms,ParkAccepted,Sensitive,RecordStatus,Occurrence,OccurrenceTags,Nativeness,NativenessTags,Abundance,NPSTags,ParkTags,References,Observations,Vouchers,ExternalLinks,TEStatus,StateStatus,OzoneSensitiveStatus,GRank,SRank
str,str,str,str,str,str,str,str,str,bool,bool,str,str,str,str,str,str,str,str,i64,i64,i64,str,str,str,str,str,str
"""ACAD""","""Acadia""","""Mammal""","""Artiodactyla""","""Cervidae""","""Active""","""Alces alces""","""Moose""",,true,false,"""Approved""","""Present""","""NA""","""Native""","""NA""","""Rare""","""Resident""","""NA""",11,1,0,"""NA""","""50""","""NA""","""NA""","""G5""","""ME: S5"""
"""ACAD""","""Acadia""","""Mammal""","""Artiodactyla""","""Cervidae""","""Active""","""Odocoileus virginianus""","""Northern White-tailed Deer, Vi…",,true,false,"""Approved""","""Present""","""NA""","""Native""","""NA""","""Abundant""","""NA""","""NA""",20,0,0,"""NA""","""50""","""NA""","""NA""","""G5""","""ME: S5"""
"""ACAD""","""Acadia""","""Mammal""","""Carnivora""","""Canidae""","""Active""","""Canis latrans""","""Coyote, Eastern Coyote""",,true,false,"""Approved""","""Present""","""NA""","""Non-native""","""NA""","""Common""","""NA""","""NA""",8,1,0,"""NA""","""SC""","""NA""","""NA""","""G5""","""ME: S5"""
"""ACAD""","""Acadia""","""Mammal""","""Carnivora""","""Canidae""","""Active""","""Canis lupus""","""Eastern Timber Wolf, Gray Wolf…",,true,false,"""Approved""","""Unconfirmed""","""NA""","""Native""","""NA""","""NA""","""NA""","""NA""",2,0,0,"""NA""","""E""","""NA""","""NA""","""G5""","""ME: SH"""
"""ACAD""","""Acadia""","""Mammal""","""Carnivora""","""Canidae""","""Active""","""Vulpes vulpes""","""Black Fox, Cross Fox, Eastern …",,true,false,"""Approved""","""Present""","""NA""","""Unknown""","""NA""","""Common""","""Breeder""","""NA""",16,0,0,"""NA""","""NA""","""NA""","""NA""","""G5""","""ME: S5"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ZION""","""Zion""","""Vascular Plant""","""Solanales""","""Solanaceae""","""Active""","""Solanum triflorum""","""cut-leaf nightshade""",,true,false,"""Approved""","""Present""","""NA""","""Native""","""NA""","""Uncommon""","""NA""","""NA""",2,0,2,"""NA""","""NA""","""NA""","""NA""","""G5""","""UT: SNR"""
"""ZION""","""Zion""","""Vascular Plant""","""Vitales""","""Vitaceae""","""Active""","""Vitis arizonica""","""canyon grape""",,true,false,"""Approved""","""Present""","""NA""","""Native""","""NA""","""Uncommon""","""NA""","""NA""",3,0,8,"""NA""","""NA""","""NA""","""NA""","""G4G5""","""UT: S1"""
"""ZION""","""Zion""","""Vascular Plant""","""Vitales""","""Vitaceae""","""Active""","""Vitis vinifera""","""wine grape""",,true,false,"""Approved""","""Present""","""NA""","""Non-native""","""NA""","""Uncommon""","""NA""","""NA""",2,0,0,"""NA""","""NA""","""NA""","""NA""","""G4""","""UT: SNA"""
"""ZION""","""Zion""","""Vascular Plant""","""Zygophyllales""","""Zygophyllaceae""","""Active""","""Larrea tridentata""","""creosote bush""",,true,false,"""Approved""","""Present""","""NA""","""Native""","""NA""","""Rare""","""NA""","""NA""",3,0,1,"""NA""","""NA""","""NA""","""NA""","""G5""","""UT: SNR"""


## Species Diversity by Park

In [2]:
species_per_park = species_data.select(["ParkName", "SciName"]).unique().group_by("ParkName").count().rename({"count":"number_of_species"}).sort("number_of_species", descending = True).head(10)
species_per_park

(
    ggplot(species_per_park, aes(x = "number_of_species", y = as_discrete("ParkName", order_by = "number_of_species", order=1))) + 
    geom_bar(stat = "identity") +
    labs(
        title = "Species Diversity by US National Park \n",
        x = "Number of Species",
        y = "Park Name"
    ) + 
    theme_minimal()
)



`GroupBy.count` is deprecated. It has been renamed to `len`.



Why is Great Smoky Mountains National Park so biodiverse? 

*"Great Smoky Mountains National Park is known to be the most biologically diverse in the entire National Park system. This biodiversity is in part a result of its past as a refuge for animals and plants migrating south away from glaciers. It can also be attributed to its mild, rainy climate. Scientists have identified 19,000 different species of plants and animals in the park and think that as many as 100,000 other species may have yet to be identified."*
 -- [USGS](https://www.usgs.gov/geology-and-ecology-of-national-parks/ecology-great-smoky-mountains-national-park-0#:~:text=Great%20Smoky%20Mountains%20National%20Park%20is%20known%20to%20be%20the,to%20its%20mild%2C%20rainy%20climate.)

In [3]:
# grsm_taxonomy = (
# species_data
# .filter(pl.col("ParkCode") == "GRSM")
# .unique()
# .with_columns(
#     pl.col("SciName")
#     .str.split_exact(" ", 1)
#     .struct.rename_fields(["Genus", "Species"])
#     .alias("fields")
# ).unnest("fields")
# .select(["Order", "Family", "Genus"])
# .with_columns(pl.col("Species").str.to_titlecase())
# )
# grsm_taxonomy

# grsm_taxonomy.group_by("Genus").count().sort("count", descending = True)

# import plotly.express as px

# grsm_taxonomy_df = grsm_taxonomy_breakdown.to_pandas()

# grsm_taxonomy_df["Species"] = grsm_taxonomy_df["Species"].fillna("Unknown")

# fig = px.sunburst(
#    grsm_taxonomy_df,
#    path = ["Order", "Family", "Genus", "Species"],
#    values=None,
#    title = "Sunburst Chart of GRSM National Park"
# )

# fig.show()


## National Parks Ranked by Biodiversity

In [4]:
import polars as pl

animals = (
    species_data
    .filter(
        pl.col("CategoryName").is_in(["Insect", "Spider/Scorpion", "Bird", "Other Non-vertebrates",
                                      "Mammal", "Fish", "Slug/Snail", "Reptile",
                                      "Crab/Lobster/Shrimp", "Amphibian"])
    )
    .with_columns(
        pl.when(pl.col("CategoryName") == "Slug/Snail").then(pl.lit("Invertebrate"))
        .when(pl.col("CategoryName") == "Other Non-vertebrates").then(pl.lit("Invertebrate"))
        .when(pl.col("CategoryName") == "Insect").then(pl.lit("Invertebrate"))
        .when(pl.col("CategoryName") == "Spider/Scorpion").then(pl.lit("Invertebrate"))
        .when(pl.col("CategoryName") == "Crab/Lobster/Shrimp").then(pl.lit("Invertebrate"))
        .otherwise(pl.col("CategoryName"))
        .alias("Category")
    )
    .group_by(["ParkName", "Category"])
    .count()
    .sort("count", descending=True)
)

animals_vertebrates = animals.filter(pl.col("Category") != "Invertebrate")

values = {
    "Bird":"#8EA163", 
    "Fish":"#1A8FA5", 
    "Mammal":"#EFCF72", 
    "Reptile":"#EF8C64", 
    "Amphibian":"#9DC9B7"
}

(
    ggplot(animals_vertebrates, aes(x = "count", y = as_discrete("ParkName", order_by="count", order = 1), fill = "Category")) +
           geom_bar(stat = "identity") +
           scale_fill_manual(values = values) +
           labs( # type: ignore
               title = "National Parks Ranked by Animal Biodiversity \n (excluding invertebrates) \n",
               x = "Number of Species",
               y = "Park Name"
           )
)


`GroupBy.count` is deprecated. It has been renamed to `len`.



In [5]:
# (
#     ggplot(animals, aes(x = "count", y = as_discrete("ParkName", order_by="count", order = 1), fill = "Category")) +
#            geom_bar(stat = "identity") +
#            scale_fill_manual(values=["#8EA163", "#1A8FA5", "#EFCF72", "#EF8C64", "#9DC9B7"]) +
#            labs(
#                title = "National Parks Ranked by Animal Biodiversity \n (including invertebrates) \n",
#                x = "Number of Species",
#                y = "Park Name"
#            )
# )

In [6]:
plants = (
    species_data
    .filter(
        pl.col("CategoryName").is_in(["Vascular Plant", "Non-vascular Plant"])
    )
    .group_by(["ParkName", "CategoryName"])
    .count()
    .sort("count", descending=True)
)

(
    ggplot(plants, aes(x = "count", y = as_discrete("ParkName", order_by="count", order = 1), fill = "CategoryName")) + # type: ignore
           geom_bar(stat = "identity") + # type: ignore
           scale_fill_manual(values=["#8EA163", "#1A8FA5"]) +
           labs( # type: ignore
               title = "National Parks Ranked by Plant Biodiversity \n",
               x = "Number of Species",
               y = "Park Name"
           )
)


`GroupBy.count` is deprecated. It has been renamed to `len`.



In [7]:
misc = (
    species_data
    .filter(
        pl.col("CategoryName").is_in(["Fungi", "Chromista", "Bacteria", "Protozoa"])
    )
    .group_by(["ParkName", "CategoryName"])
    .count()
    .sort("count", descending=True)
)

(
    ggplot(misc, aes(x = "count", y = as_discrete("ParkName", order_by="count", order = 1), fill = "CategoryName")) +
           geom_bar(stat = "identity") +
           scale_fill_manual(values=["#8EA163", "#1A8FA5", "#EFCF72", "#EF8C64"]) +
           labs( # type: ignore
               title = "National Parks Ranked by Other Biodiversity \n",
               x = "Number of Species",
               y = "Park Name"
           )
)


`GroupBy.count` is deprecated. It has been renamed to `len`.



In [8]:
animal_list = ("Mammal", "Bird", "Fish", "Reptile", "Amphibian", "Insect")
selected_park = "Zion"
specific_park_data = species_data.filter(pl.col("ParkName") == selected_park)

animals_in_park = (species_data
                   .filter((pl.col("ParkName") == selected_park) & (pl.col("CategoryName").is_in(animal_list)))
                   .group_by("CategoryName").agg(pl.col("CategoryName").count().alias("Count"))
                   )
animals_in_park


CategoryName,Count
str,u32
"""Fish""",15
"""Amphibian""",7
"""Bird""",301
"""Reptile""",30
"""Mammal""",80


In [9]:
from lets_plot import *

values = {
    "Bird":"#8EA163", 
    "Fish":"#1A8FA5", 
    "Mammal":"#EFCF72", 
    "Reptile":"#EF8C64", 
    "Amphibian":"#9DC9B7"
}

(
    ggplot(animals_in_park) +
    geom_pie(aes(slice = "Count", fill = "CategoryName"), stat = "identity") +
    scale_fill_manual(values = values) +
    theme_void()
)