At the end of DataPrep we download two .csv files: "F23_DataPrep.csv" and "S23_DataPrep.csv".
Before bringing them back into Python in this notebook, I did some manual work in Excel, deleting and moving columns so they align.

In [3]:
import geopandas as gpd
import pandas as pd

In [None]:
# Input files: the two hand-aligned 2023 survey exports and the baseline
# planting table.
fall_2023_csv = "Merge_Ready/F23_modified.csv"
spring_2023_csv = "Merge_Ready/S23_modified.csv"
baseline_csv = "Input_Data/DCRreplanted141516gisfieldmap.csv"

# Read each file with geopandas, in the same order as before (F23, S23, baseline)
F23 = gpd.read_file(fall_2023_csv)
S23 = gpd.read_file(spring_2023_csv)
baseline = gpd.read_file(baseline_csv)

In [None]:
# Set display options to show all columns
pd.set_option('display.max_columns', None)

# Combine the spring and fall 2023 surveys into one table.
# Both frames must expose exactly the same column names (order does not
# matter; pd.concat aligns columns by name). Stop on a mismatch instead of
# only printing: every later cell needs `combined`, and in a notebook a stale
# `combined` from an earlier run would otherwise be picked up silently.
s23_only = sorted(set(S23.columns) - set(F23.columns), key=str)
f23_only = sorted(set(F23.columns) - set(S23.columns), key=str)
if s23_only or f23_only:
    raise ValueError(
        "Columns do not match. Cannot combine dataframes. "
        f"Only in S23: {s23_only}; only in F23: {f23_only}"
    )

# ignore_index=True gives the stacked rows a fresh 0..n-1 index
combined = pd.concat([S23, F23], ignore_index=True)

# Check the first few rows of the combined DataFrame
#print(combined.head())


In [None]:
# Standardize the formatting of 'EditDate'.
# Show the raw values first so the before/after can be compared.
print(combined['EditDate'].unique())


def _standardize_date(value):
    """Return *value* formatted as MM/DD/YYYY, or unchanged if it cannot be parsed.

    Each value is parsed on its own with pd.to_datetime, so entries written in
    different styles are each interpreted independently. Every parseable value
    is written with a four-digit year whether or not it carried a time of day,
    so the column ends up in one single format.
    """
    try:
        return pd.to_datetime(value).strftime('%m/%d/%Y')
    except ValueError:
        # Unparseable text, or a missing value (NaT cannot be formatted):
        # keep the original entry untouched.
        return value


# Series.map applies the helper row by row and keeps the existing index, so
# the result lines up with `combined` regardless of how its rows are labelled.
combined['EditDate'] = combined['EditDate'].map(_standardize_date)

In [None]:
# 'EditDate' is carried forward under the name 'ObservationDate_2023'
combined = combined.rename(columns={'EditDate': 'ObservationDate_2023'})

# Sanity check: every distinct date value, in sorted order
dates = combined['ObservationDate_2023'].unique().tolist()
dates.sort()
dates

In [None]:
# Standardize the formatting of 'ObservationDate_baseline'.
# Show the raw values first so the before/after can be compared.
print(combined['ObservationDate_baseline'].unique())


def _standardize_date(value):
    """Return *value* formatted as MM/DD/YYYY, or unchanged if it cannot be parsed.

    Each value is parsed on its own with pd.to_datetime, so entries written in
    different styles are each interpreted independently. Every parseable value
    is written with a four-digit year whether or not it carried a time of day,
    so the column ends up in one single format.
    """
    try:
        return pd.to_datetime(value).strftime('%m/%d/%Y')
    except ValueError:
        # Unparseable text, or a missing value (NaT cannot be formatted):
        # keep the original entry untouched.
        return value


# Series.map applies the helper row by row and keeps the existing index, so
# the result lines up with `combined` regardless of how its rows are labelled.
combined['ObservationDate_baseline'] = combined['ObservationDate_baseline'].map(
    _standardize_date
)

In [None]:
# Sanity check: every distinct baseline observation date, in sorted order
baseline_dates = combined['ObservationDate_baseline'].unique().tolist()
baseline_dates.sort()
baseline_dates

In [None]:
# Number of distinct SPECIES values in each survey and in the combined table
for survey in (S23, F23, combined):
    print(len(survey['SPECIES'].unique()))

# Full sorted list of the distinct species names in the combined table
species = combined['SPECIES'].unique().tolist()
species.sort()
species

In [None]:
# Some common names are misspelled, inconsistently capitalized, or incomplete.
# Map each problem spelling to the name used for it from here on; none of the
# replacement names is itself a key, so one pass gives the same result as
# applying the corrections one after another.
common_name_fixes = {
    'White Fur': 'White Fir',
    'Japanese tree lilac': 'Japanese Tree Lilac',
    'Linden': 'American Linden',
    'Magnolia': 'Sweet Bay Magnolia',
}
combined['SPECIES'] = combined['SPECIES'].replace(common_name_fixes)

In [None]:
# Derive a SPECIES -> Scientific.Name lookup from the data itself.
# Common names are visited in sorted order; for each one, the scientific name
# is taken from the first row that carries that common name.
common_names = sorted(combined['SPECIES'].unique())
species_to_scientific = {
    common: combined.loc[combined['SPECIES'] == common, 'Scientific.Name'].iloc[0]
    for common in common_names
}

# Show the derived pairs, one per line, in the order they were inserted
for common, latin in species_to_scientific.items():
    print(f"{common}: {latin}")

In [None]:
# Curated SPECIES -> Scientific.Name lookup.
# F23 did not have scientific names, so the dictionary printed by the previous
# step was used as a starting point and then edited by hand to correct errors
# and fill in incomplete entries.

# Dictionary mapping species names to scientific names, sorted alphabetically by common name
species_to_scientific = {
    'American Arborvitae': 'Thuja occidentalis',
    'American Linden': 'Tilia americana',
    'Austrian Pine': 'Pinus nigra',
    'Bald Cypress': 'Taxodium distichum',
    'Balsam Fir': 'Abies balsamea',
    'Beech': 'Fagus sylvatica',
    'Blackgum': 'Nyssa sylvatica',
    'Bradford Pear': 'Pyrus calleryana',
    'Carolina Silverbell': 'Halesia carolina',
    'Cherry': 'Prunus spp.',
    'Colorado Spruce': 'Picea pungens',
    'Crabapple': 'Malus sylvestris',
    'Cucumber Magnolia': 'Magnolia acuminata',
    'Dawn Redwood': 'Metasequoia glyptostroboides',
    'Dogwood': 'Cornus florida',
    'Fraser Fir': 'Abies fraseri',
    'Fringetree': 'Chionanthus virginicus',
    'Ginkgo': 'Ginkgo biloba',
    'Golden Raintree': 'Koelreuteria paniculata',
    'Hawthorn': 'Crataegus spp.',
    'Honeylocust': 'Gleditsia triacanthos',
    'Hophornbeam': 'Ostrya virginiana',
    'Hornbeam': 'Carpinus betulus',
    'Japanese Pagoda': 'Styphnolobium japonicum',
    'Japanese Snowbell': 'Styrax japonicus',
    'Japanese Stewartia': 'Stewartia pseudocamellia',
    'Japanese Tree Lilac': 'Syringa reticulata',
    'Juniper': 'Juniperus virginiana',
    'Kousa Dogwood': 'Cornus kousa',
    'Larch': 'Larix laricina',
    'Littleleaf Linden': 'Tilia cordata',
    'Norway Spruce': 'Picea abies',
    'Pin Oak': 'Quercus palustris',
    'Red Oak': 'Quercus rubra',
    'Sargent Cherry': 'Prunus sargentii',
    'Scarlet Oak': 'Quercus coccinea',
    'Serbian Spruce': 'Picea omorika',
    'Serviceberry': 'Amelanchier spp.',
    # Genus capitalized to match every other entry
    'Snow Goose Cherry': 'Prunus serrulata',
    'Sourwood': 'Oxydendrum arboreum',
    'Swamp White Oak': 'Quercus bicolor',
    'Sweet Bay Magnolia': 'Magnolia virginiana',
    'Sweetgum': 'Liquidambar styraciflua',
    'Tulip': 'Liriodendron tulipifera',
    'White Fir': 'Abies concolor',
    'White Oak': 'Quercus alba',
    'White Pine': 'Pinus strobus',
    'Yellowwood': 'Cladrastis kentukea',
    'Zelkova': 'Zelkova serrata',
}

# Look up the scientific name for every row's common name. Series.map yields
# NaN for any common name that is not a key above, so report those names
# rather than letting the gap go unnoticed.
scientific_names = combined['SPECIES'].map(species_to_scientific)
unmapped_species = combined.loc[scientific_names.isna(), 'SPECIES'].unique().tolist()
if unmapped_species:
    print(f"No scientific name in species_to_scientific for: {unmapped_species}")

# Fill in the 'Scientific.Name' column based on the mapping
combined['Scientific.Name'] = scientific_names

In [None]:
# Rebuild the SPECIES -> Scientific.Name lookup from the table as it stands
# now, to confirm the scientific names look right.
species_to_scientific = {
    common_name: combined.loc[
        combined['SPECIES'] == common_name, 'Scientific.Name'
    ].iloc[0]  # scientific name on the first row with this common name
    for common_name in sorted(combined['SPECIES'].unique())
}

# One "common: scientific" line per species, in sorted common-name order
for common_name, latin_name in species_to_scientific.items():
    print(f"{common_name}: {latin_name}")


In [None]:
# Column names of the baseline table, as a plain list for inspection
baselineColumns = list(baseline.columns)
baselineColumns

In [None]:
# Bring the FORESTER column over from the baseline table, matching on ID.
# A left merge keeps every row of `combined`, including rows with no match.
# NOTE(review): if baseline contains repeated IDs, the matching rows of
# `combined` will be repeated too - confirm IDs are unique in baseline.
forester_by_id = baseline[['ID', 'FORESTER']]
combined = pd.merge(left=combined, right=forester_by_id, how='left', on='ID')
print(combined.head())

In [None]:
# Column names of the combined table, as a plain list for inspection
columns = list(combined.columns)
columns

At this point I ran a script to get the unique values of every column in the dataframe, excluding comments, notes, and address information.
The output is long, so I left it at the bottom of the script.

In [None]:
# Disabled on purpose: the loop below is wrapped in a triple-quoted string, so
# it is not executed here. It prints the unique values of every column whose
# name contains "_baseline" or "_2023" and does not contain "Comments".
'''
# Iterate over each column in the DataFrame
for column in combined.columns:
    # Check if the column contains either "_baseline" or "_2023" in the name
    # and does not contain "Comments"
    if ("_baseline" in column or "_2023" in column) and "Comments" not in column:
        # Print the name of the column
        print(f"Unique values in column '{column}':")
        # Print the unique values of the column
        print(combined[column].unique())
        print()  # Add a newline for readability
'''

In [None]:
# 'LandUse_baseline' and 'SiteType_baseline' are messy. We could replace with values from 2023. But it doesn't really matter. 
# I did not run this. 

# Replace values in 'LandUse_baseline' and 'SiteType_baseline' with values from 2023
# combined['LandUse_baseline'] = combined['LandUse_2023']
# combined['SiteType_baseline'] = combined['SiteType_2023']

In [None]:
# Normalize Mortality_baseline to a small, consistent set of labels.
# Unique values in column 'Mortality_baseline' before cleaning:
# ['NA' 'Alive' 'Removed/Missing' '' 'Standing Dead' 'Removed/missing'
# 'Unknown' 'Removed' 'Remove/Missing' 'N/A' 'Removed / missing' 'Unkown'
# 'Stump' 'Removed/Missng' 'Removed ']

# Define the mapping dictionary (raw value -> cleaned label)
mapping = {
    'NA': 'NA',
    'Alive': 'Alive',
    'Removed/Missing': 'Removed',
    '': 'NA',
    'Standing Dead': 'Standing Dead',
    'Removed/missing': 'Removed',
    'Unknown': 'Unknown',
    'Remove/Missing': 'Removed',
    'N/A': 'NA',
    'Removed / missing': 'Removed',
    'Unkown': 'Unknown',
    'Stump': 'Stump',
    'Removed/Missng': 'Removed',
    'Removed ': 'Removed'
}

# Series.map turns every value that is not a key of `mapping` into NaN, which
# is then filled with 'NA'. Report such values first so nothing is lost
# without a trace.
mortality_raw = combined['Mortality_baseline']
unmapped_values = [
    value for value in mortality_raw.dropna().unique() if value not in mapping
]
if unmapped_values:
    print(f"Mortality_baseline values not in mapping (set to 'NA'): {unmapped_values}")

# Map the values and fill the gaps with 'NA', assigning the result back to the
# frame. Calling fillna(..., inplace=True) on the selected column is a chained
# in-place update, which does not modify `combined` when pandas Copy-on-Write
# is in effect.
combined['Mortality_baseline'] = mortality_raw.map(mapping).fillna('NA')

# Print the unique values again to verify
print("Unique values in column 'Mortality_baseline':")
print(combined['Mortality_baseline'].unique())


In [None]:
# Tidy Vigor_baseline: blank out the '0' and single-space entries, and turn
# the two percentage-range labels into '1' and '2'.
vigor_baseline_fixes = {'0': '', ' ': '', '1-25 %': '1', '26-50 %': '2'}
cleaned_vigor_baseline = combined['Vigor_baseline'].replace(vigor_baseline_fixes)
combined['Vigor_baseline'] = cleaned_vigor_baseline

# Show what the column holds now
print("Unique values in column 'Vigor_baseline':", combined['Vigor_baseline'].unique(), sep="\n")


In [None]:
# Tidy Condition_baseline. Values seen before cleaning:
# ['NA' 'Good' '0' '' 'Fair' 'Poor' 'Removed/Missing' 'Critical' ' ' '18']
# '0', ' ' and 'Removed/Missing' are blanked; '18' is recorded as 'Good' and
# 'Critical' as 'Poor'.
condition_baseline_fixes = dict([
    ('0', ''),
    (' ', ''),
    ('18', 'Good'),
    ('Removed/Missing', ''),
    ('Critical', 'Poor'),
])
condition_baseline = combined['Condition_baseline']
combined['Condition_baseline'] = condition_baseline.replace(condition_baseline_fixes)

# Show what the column holds now
print("Unique values in column 'Condition_baseline':", combined['Condition_baseline'].unique(), sep="\n")


In [None]:
# The baseline notes column is carried forward under the name 'LocationNotes'
notes_rename = {'NOTES_baseline': 'LocationNotes'}
combined = combined.rename(columns=notes_rename)

In [None]:
# Recode BasalSprouts_2023 so it uses the letters 'N' / 'Y' in place of the
# strings '0' / '1'. Any other value is left as it is.
sprout_codes_2023 = {'0': 'N', '1': 'Y'}
recoded_sprouts_2023 = combined['BasalSprouts_2023'].replace(sprout_codes_2023)
combined['BasalSprouts_2023'] = recoded_sprouts_2023

# Show what the column holds now
print("Unique values in column 'BasalSprouts_2023':", combined['BasalSprouts_2023'].unique(), sep="\n")

In [None]:
# Tidy Vigor_2023. Values seen before cleaning:
# ['1' '2' '0' '4' '3' '5' 'NA']
# The '0' and '5' entries are blanked; everything else is kept.
vigor_2023_fixes = dict.fromkeys(['0', '5'], '')
combined['Vigor_2023'] = combined['Vigor_2023'].replace(vigor_2023_fixes)

# Show what the column holds now
print("Unique values in column 'Vigor_2023':", combined['Vigor_2023'].unique(), sep="\n")


In [None]:
# Tidy Condition_2023. Values seen before cleaning:
# ['Good' 'Poor' '' 'Fair' 'Dead' 'NA']
# 'Dead' is blanked; everything else is kept.
condition_2023 = combined['Condition_2023']
combined['Condition_2023'] = condition_2023.replace({'Dead': ''})

# Show what the column holds now
print("Unique values in column 'Condition_2023':", combined['Condition_2023'].unique(), sep="\n")


In [None]:
# Normalize BasalSprouts_baseline to 'N' / 'Y' / 'NA'.
# Unique values in column 'BasalSprouts_baseline' before cleaning:
# ['NA' 'N' '' 'No' 'Y' 'Yes' 'Y(lots!)']

# Define replacement mapping. The empty string is handled here as well, so
# the whole cleanup is one replace whose result is assigned back to the frame.
# Calling replace(..., inplace=True) on the selected column is a chained
# in-place update, which does not modify `combined` when pandas Copy-on-Write
# is in effect.
replacement_mapping = {
    'No': 'N',
    'Yes': 'Y',
    'Y(lots!)': 'Y',
    '': 'NA',
}

# Replace values in the column
combined['BasalSprouts_baseline'] = combined['BasalSprouts_baseline'].replace(replacement_mapping)

# Print unique values to verify
print("Unique values in column 'BasalSprouts_baseline':")
print(combined['BasalSprouts_baseline'].unique())

In [None]:
# Write the cleaned, combined table to disk, leaving out the index column
output_csv = 'HERO2023_Final.csv'
combined.to_csv(output_csv, index=False)

In [None]:
# Final sanity check: for every column whose name contains "_baseline" or
# "_2023" but not "Comments", print the distinct values it now holds.
columns_to_review = [
    name
    for name in combined.columns
    if "Comments" not in name and ("_baseline" in name or "_2023" in name)
]
for column in columns_to_review:
    print(f"Unique values in column '{column}':")
    print(combined[column].unique())
    print()  # blank line between columns