In [None]:
import numpy as np
import pandas as pd

In [None]:
# See database_helpers.py
run database_helpers.py

In [None]:
# Load the taxonomy table that a separate script gathered and saved as CSV
taxa = pd.read_csv(filepath_or_buffer="../Data/taxonomy.csv")

In [None]:
# We want to drop the Kingdom column, since it is all Animalia and not useful information.
# Rebinding (rather than inplace=True) together with errors="ignore" keeps this cell
# re-runnable: a second execution no longer raises KeyError because "kingdom" is
# already gone.
taxa = taxa.drop(columns=["kingdom"], errors="ignore")

In [None]:
# The taxonomic data does not provide the last column (common name), so pull it
# from the Fact data source, rename both columns to the names used on the taxa
# side, and keep only the distinct rows
common_names = (
    pd.read_csv("../Data/all_data.csv", usecols=["scientific_name", "common_name"])
    .rename(columns={"scientific_name": "species", "common_name": "commonname"})
    .drop_duplicates()
)

# Fold the common names into the master taxa df. No keys are passed, so pandas
# performs its default merge: an inner join on the columns both frames share.
taxa = pd.merge(taxa, common_names)
taxa.head()

In [None]:
# Build one placeholder record for "unidentified" taxa, i.e. taxa that have no
# matching taxonomy in this table when the fact data is loaded. Every column
# starts out as None; only the species and common name get descriptive labels.
unidentified = dict.fromkeys(taxa.columns)
unidentified.update(species="Unclassified Taxon", commonname="No Classification")

# Stack the placeholder on top of the real taxa and renumber the rows, so the
# placeholder becomes row 0 of the combined frame
taxa = pd.concat([pd.DataFrame([unidentified]), taxa], ignore_index=True)

In [None]:
# This is effectively an historic load, so we need to provide an Id value.
# Any "id" column left over from a previous execution of this cell is dropped
# first, so re-running the cell cannot produce a second "id" column.
taxa = taxa.drop(columns=["id"], errors="ignore").reset_index(drop=True)

# Instead of importing strings reading NaN into our database, import NULL values.
# astype(object).where(...) is used rather than replace(np.nan, None) because
# older pandas releases ignored an explicit None value in replace() and
# pad-filled the gaps instead of writing None.
taxa = taxa.astype(object).where(taxa.notna(), None)

# Sequential ids 0..n-1 as the first column, matching the row position
taxa.insert(0, "id", range(len(taxa)))

In [None]:
# Bulk load taxonomy data using bulk insert function.
# bulk_insert is not defined in this notebook; presumably it comes from
# database_helpers.py (run in an earlier cell) - confirm there how the single
# %s placeholder below is expanded into the VALUES rows.
# NOTE(review): the INSERT names eight target columns, so taxa apparently has to
# hold exactly those eight fields in this same order - verify against bulk_insert.
dimTaxaInsertQuery = """
INSERT INTO "Dim_Taxon"
("Id", "Phylum", "Class", "Order", "Family", "Genus", "Species", "CommonName")
VALUES
%s
"""
bulk_insert(dimTaxaInsertQuery, taxa)