# Threatened Plant Data Wrangling and Database Import

Name: Zihan Yin

## Step 1 - Load and Clean Raw Data

In [1]:
import pandas as pd
import re

def strip_prefix(value):
    """Remove a leading numeric score code such as ``"3 - "`` or ``"2* = "``.

    The prefix that is stripped consists of: optional whitespace, one or more
    digits, an optional ``*``, optional whitespace, a single ``-`` or ``=``,
    and any whitespace that follows. A value without such a prefix is only
    trimmed of surrounding whitespace.

    Parameters
    ----------
    value : object
        A scalar cell value. Non-string values are converted with ``str()``.

    Returns
    -------
    object
        ``value`` itself when ``pd.isna(value)`` is true, otherwise the
        cleaned string.
    """
    if not pd.isna(value):
        text = str(value)
        # The pattern is anchored at the start of the text, so dashes that
        # appear later inside the description are left untouched.
        without_code = re.sub(r'^\s*\d+\*?\s*[-=]\s*', '', text)
        return without_code.strip()
    return value

# Raw CSV header -> snake_case column name. Several raw headers carry a
# trailing space, and one contains an embedded CR/LF line break; the keys
# below reproduce those headers exactly.
rename_map = dict([
    ("Species Name", "species_name"),
    ("Common Name", "common_name"),
    ("Habit", "habit"),
    ("Soil", "soil"),
    ("Sun", "sun"),
    ("Cultivation Protocols ", "cultivation_protocols"),
    ("Propagation Protocols ", "propagation_protocols"),
    ("Propagation Methods", "propagation_methods"),
    ("Cultivation Requirements Subtotal", "cultivation_requirements_subtotal"),
    ("Germplasm Source", "germplasm_source"),
    ("Conservation Status", "conservation_status"),
    ("Germplasm Origin", "germplasm_origin"),
    ("Provenance", "provenance"),
    ("Additional Conservation\r\nbenefit", "additional_conservation_benefit"),
    ("Conservation Subtotal", "conservation_subtotal"),
    ("Additional Local Benefits Description (i.e., to City of Melbourne)", "additional_local_benefits_description_i_e_to_city_of_melbourne"),
    ("Additional Local Benefits Subtotal", "additional_local_benefits_subtotal"),
    ("Total Score", "total_score"),
    ("Horticultural Potential ", "horticultural_potential"),
    ("Total Score Including Hort Potential", "total_score_including_hort_potential"),
    ("Weed Rating", "weed_rating"),
    ("Priority species", "priority_species"),
])

# Read the raw file and apply the header mapping in one step. Headers that
# are not listed in rename_map keep their original names.
df = pd.read_csv(
    "01_raw_data/01_threatened-plant-living-collection-plan.csv"
).rename(columns=rename_map)

## Step 2 - Create Table06 and Table07 DataFrames

In [2]:
import numpy as np

# Add threatened_plant_id: sequential keys starting at 3001, one per source row.
df['threatened_plant_id'] = range(3001, 3001 + len(df))

# The LOAD DATA statements in Steps 3 and 5 declare LINES TERMINATED BY '\r\n'.
# pandas otherwise writes the platform's line separator, so on Linux/macOS the
# CSVs would end lines with '\n' only; MySQL would then find no line
# terminator and IGNORE 1 LINES would skip the whole file. Pin it explicitly.
# (`lineterminator` is the keyword in pandas >= 1.5; older releases spell it
# `line_terminator`.)
CSV_LINE_TERMINATOR = "\r\n"

# Table06: descriptive attributes of each threatened plant.
table06 = df[['threatened_plant_id',
              'conservation_status',
              'provenance',
              'weed_rating',
              'habit',
              'germplasm_source',
              'additional_conservation_benefit',
              'additional_local_benefits_description_i_e_to_city_of_melbourne',
              'horticultural_potential']].rename(columns={
    'additional_conservation_benefit': 'conservation_benefit',
    'additional_local_benefits_description_i_e_to_city_of_melbourne': 'local_benefits_description'
})

# A literal '0' in the raw benefit column is treated as a missing value.
table06['conservation_benefit'] = table06['conservation_benefit'].replace('0', np.nan)

# Remove prefix numbers
for col in ['provenance', 'conservation_status', 'conservation_benefit', 'germplasm_source', 'horticultural_potential']:
    table06[col] = table06[col].apply(strip_prefix)

table06.to_csv("02_wrangled_data/Table06_ThreatenedPlantDescriptionTable.csv",
               index=False, lineterminator=CSV_LINE_TERMINATOR)

# Table07: growing and propagation guidance for each threatened plant.
table07 = df[['threatened_plant_id',
              'soil',
              'sun',
              'propagation_methods',
              'propagation_protocols',
              'cultivation_protocols']].rename(columns={
    'propagation_protocols': 'propagation_level',
    'cultivation_protocols': 'cultivation_note'
})

# Remove prefix numbers
for col in ['propagation_level', 'cultivation_note']:
    table07[col] = table07[col].apply(strip_prefix)

table07.to_csv("02_wrangled_data/Table07_ThreatenedPlantCareGuideTable.csv",
               index=False, lineterminator=CSV_LINE_TERMINATOR)

## Step 3 - Import Table06 into MySQL Database

In [3]:
import mysql.connector
from mysql.connector import Error

# Database connection config
#
# The password is never stored in the notebook: it is read from the
# PLANTX_DB_PASSWORD environment variable, falling back to an interactive
# prompt. Host, user and database name can be overridden the same way and
# default to the project's shared instance.
# NOTE(review): a plaintext password was previously committed in this cell;
# treat it as compromised and rotate it on the server.
import os
from getpass import getpass

db_config = {
    'host': os.environ.get('PLANTX_DB_HOST', 'database-plantx.cqz06uycysiz.us-east-1.rds.amazonaws.com'),
    'user': os.environ.get('PLANTX_DB_USER', 'zihan'),
    'password': os.environ.get('PLANTX_DB_PASSWORD') or getpass('MySQL password: '),
    'database': os.environ.get('PLANTX_DB_NAME', 'FIT5120_PlantX_Database'),
    'allow_local_infile': True,  # required for LOAD DATA LOCAL INFILE
    'use_pure': True  # Use pure Python implementation
}

# Bulk-load the wrangled Table06 CSV into MySQL with LOAD DATA LOCAL INFILE.
# Both handles start as None so the cleanup in `finally` is safe even when
# connect() itself raises; otherwise `connection` would be unbound there (or
# would refer to a stale handle left over from an earlier cell).
connection = None
cursor = None

try:
    # Establish connection
    connection = mysql.connector.connect(**db_config)
    
    if connection.is_connected():
        print("Successfully connected to MySQL server")
        
        # Create cursor
        cursor = connection.cursor()
        
        # Construct LOAD DATA LOCAL INFILE query.
        # The CSV is expected to use CRLF line endings and a single header row.
        load_data_query = """
        LOAD DATA LOCAL INFILE '02_wrangled_data/Table06_ThreatenedPlantDescriptionTable.csv'
        INTO TABLE Table06_ThreatenedPlantDescriptionTable
        CHARACTER SET utf8mb4
        FIELDS TERMINATED BY ',' 
        OPTIONALLY ENCLOSED BY '"'
        LINES TERMINATED BY '\\r\\n'
        IGNORE 1 LINES
        (
            threatened_plant_id, conservation_status, provenance, weed_rating, habit, germplasm_source,
            conservation_benefit, local_benefits_description, horticultural_potential
        );
        """
        
        # Execute query
        cursor.execute(load_data_query)
        connection.commit()
        
        # NOTE(review): a count of 0 is expected when the rows already exist;
        # per the MySQL manual, LOCAL loads skip duplicate-key rows instead of
        # failing - confirm against the table's key definition.
        print(f"Data import successful! {cursor.rowcount} rows affected.")
        
except Error as e:
    print(f"Error occurred during execution: {e}")
    
finally:
    # Release only what was actually acquired.
    if cursor is not None:
        cursor.close()
    if connection is not None and connection.is_connected():
        connection.close()
        print("MySQL connection closed.")

Successfully connected to MySQL server
Data import successful! 0 rows affected.
MySQL connection closed.


## Step 4 - Verify Data in Table06

In [4]:
# Sanity-check the Table06 import: print the row count and the first five rows.
# Both handles start as None so `finally` cannot hit an unbound name (or a
# stale handle from an earlier cell) when connect() raises.
connection = None
cursor = None

try:
    connection = mysql.connector.connect(**db_config)
    cursor = connection.cursor()
    
    cursor.execute("SELECT COUNT(*) FROM Table06_ThreatenedPlantDescriptionTable")
    row_count = cursor.fetchone()[0]
    print(f"The table currently contains {row_count} rows")
    
    cursor.execute("SELECT * FROM Table06_ThreatenedPlantDescriptionTable LIMIT 5")
    rows = cursor.fetchall()
    for row in rows:
        print(row)
        
except Error as e:
    print(f"Error occurred during query: {e}")
finally:
    # Release only what was actually acquired.
    if cursor is not None:
        cursor.close()
    if connection is not None and connection.is_connected():
        connection.close()

The table currently contains 200 rows
(3001, 'Critically Endangered', 'Indigenous to City of Melbourne', 'Low Risk', 'Annual', 'Currently held limited propagation material', 'Bushfire Recovery', 'Culturally Important to Traditional owners / and or local community', 'Plant with some ornamental attributes (form/flowers/fruit etc.) at various times of the year and could be reasonably incorporated into mixed plantings')
(3002, 'Critically Endangered', 'Indigenous to City of Melbourne', 'Low Risk', 'Annual', 'Currently held in collection', 'Bushfire Recovery', 'Culturally Important to Traditional owners / and or local community', 'Plant with some ornamental attributes (form/flowers/fruit etc.) at various times of the year and could be reasonably incorporated into mixed plantings')
(3003, 'Endangered', 'Indigenous to City of Melbourne', 'Medium Risk', 'Herbaceous Perennial', 'Not held in collection but easily obtained', '', 'Culturally Important to Traditional owners / and or local community

## Step 5 - Import Table07 into MySQL Database

In [5]:
# Bulk-load the wrangled Table07 CSV into MySQL with LOAD DATA LOCAL INFILE.
# Both handles start as None so the cleanup in `finally` is safe even when
# connect() itself raises; otherwise `connection` would be unbound there (or
# would refer to a stale handle left over from an earlier cell).
connection = None
cursor = None

try:
    connection = mysql.connector.connect(**db_config)
    
    if connection.is_connected():
        print("Successfully connected to MySQL server")
        
        cursor = connection.cursor()
        
        # The CSV is expected to use CRLF line endings and a single header row.
        load_data_query = """
        LOAD DATA LOCAL INFILE '02_wrangled_data/Table07_ThreatenedPlantCareGuideTable.csv'
        INTO TABLE Table07_ThreatenedPlantCareGuideTable
        CHARACTER SET utf8mb4
        FIELDS TERMINATED BY ',' 
        OPTIONALLY ENCLOSED BY '"'
        LINES TERMINATED BY '\\r\\n'
        IGNORE 1 LINES
        (
            threatened_plant_id, soil, sun, propagation_methods, propagation_level, cultivation_note
        );
        """
        
        cursor.execute(load_data_query)
        connection.commit()
        
        # NOTE(review): a count of 0 is expected when the rows already exist;
        # per the MySQL manual, LOCAL loads skip duplicate-key rows instead of
        # failing - confirm against the table's key definition.
        print(f"Data import successful! {cursor.rowcount} rows affected.")
        
except Error as e:
    print(f"Error occurred during execution: {e}")
    
finally:
    # Release only what was actually acquired.
    if cursor is not None:
        cursor.close()
    if connection is not None and connection.is_connected():
        connection.close()
        print("MySQL connection closed.")

Successfully connected to MySQL server
Data import successful! 0 rows affected.
MySQL connection closed.


## Step 6 - Verify Data in Table07

In [6]:
# Sanity-check the Table07 import: print the row count and the first five rows.
# Both handles start as None so `finally` cannot hit an unbound name (or a
# stale handle from an earlier cell) when connect() raises.
connection = None
cursor = None

try:
    connection = mysql.connector.connect(**db_config)
    cursor = connection.cursor()
    
    cursor.execute("SELECT COUNT(*) FROM Table07_ThreatenedPlantCareGuideTable")
    row_count = cursor.fetchone()[0]
    print(f"The table currently contains {row_count} rows")
    
    cursor.execute("SELECT * FROM Table07_ThreatenedPlantCareGuideTable LIMIT 5")
    rows = cursor.fetchall()
    for row in rows:
        print(row)
        
except Error as e:
    print(f"Error occurred during query: {e}")
finally:
    # Release only what was actually acquired.
    if cursor is not None:
        cursor.close()
    if connection is not None and connection.is_connected():
        connection.close()

The table currently contains 200 rows
(3001, 'Free Draining, Moderately Draining', 'Full Sun', 'Seed/Cuttings', 'Can be propagated - but requires significant time/effort', 'Difficult (but not impossible) to cultivate - short lived or has specific requirements for cultivation such as misting or mycorrhizal associates or pot culture')
(3002, 'Free Draining, Moderately Draining', 'Full Sun', 'Seed', 'Difficult to propagate', 'Difficult (but not impossible) to cultivate - short lived or has specific requirements for cultivation such as misting or mycorrhizal associates or pot culture')
(3003, 'Free Draining', 'Full Sun', 'Seed/Cuttings', 'Easily propagated', 'Can be cultivated with specific growing conditions eg moist well drained soils  - would need to be replaced within 5 years to maintain high quality plant')
(3004, 'Free Draining', 'Full Sun, Part Shade', 'Seed', 'Easily propagated', 'Can be cultivated and is reasonably  tolerant of a range of garden situations/conditions - Longer live