# Plant Main Table Assembly and Database Import

Authors: Zihan, Klarissa

## Step 1 - Set Up Paths and Imports

In [1]:
import os
import json
import pandas as pd
from glob import glob

# Input and output locations (relative paths, resolved against the working directory)
_raw_root = "01_raw_data"
_wrangled_root = "02_wrangled_data"

json_dir = f"{_raw_root}/01_species_details"
csv_path = f"{_raw_root}/01_threatened-plant-living-collection-plan.csv"
output_path = f"{_wrangled_root}/Table01_PlantMainTable.csv"

## Step 2 - Helpers and Flatten Functions

In [2]:
# Helpers and lookup tables used to flatten raw records into main-table rows
def first_or_none(lst):
    """Return the first element of a non-empty list, otherwise None.

    Anything that is not a ``list`` instance (including None, tuples and
    strings) yields None.
    """
    if not isinstance(lst, list):
        return None
    if not lst:
        return None
    return lst[0]


# Lookup from a raw cycle label to the text stored in the plant_cycle column.
# Labels that are not listed here are passed through unchanged by the callers.
cycle_map = dict(
    Perennial="Every year",
    Annual="Once a year",
    Biennial="Every 2 years",
)

def flatten_general(data):
    """Flatten one general-species JSON record into a main-table row.

    Args:
        data: dict parsed from a species-details JSON file.

    Returns:
        dict keyed by the main-table column names. All ``if_*`` flags are
        real ``bool`` values, and ``sun_expose`` is always a JSON-encoded
        string (``"[]"`` when the record has no sunlight information).
    """
    plant_id = data.get("id")
    cycle = data.get("cycle")

    def any_flag(*keys):
        # Missing keys, JSON null and 0 all count as False. Coercing here keeps
        # None / 0 / 1 from leaking into the boolean columns of the table.
        return any(bool(data.get(key)) for key in keys)

    return {
        "plant_id": plant_id,
        "general_plant_id": plant_id,
        "threatened_plant_id": None,
        "common_name": data.get("common_name"),
        "scientific_name": first_or_none(data.get("scientific_name")),
        "other_name": first_or_none(data.get("other_name")),
        "if_threatened": False,
        "if_edible": any_flag("edible_fruit", "edible_leaf", "cuisine"),
        "if_indoors": any_flag("indoor"),
        "if_medicinal": any_flag("medicinal"),
        "if_poisonous": any_flag("poisonous_to_humans", "poisonous_to_pets"),
        "if_fruits": any_flag("fruits"),
        "if_flowers": any_flag("flowers"),
        # Store as JSON string to fit MySQL Workbench; `or []` turns an
        # explicit null into an empty list instead of the string "null".
        "sun_expose": json.dumps(data.get("sunlight") or [], ensure_ascii=False),
        "watering": data.get("watering"),
        # Unknown cycle labels are passed through unchanged
        "plant_cycle": cycle_map.get(cycle, cycle),
        "growth_rate": data.get("growth_rate")
    }

def flatten_threatened(row, index_offset):
    """Build a main-table row from one record of the threatened-plant CSV.

    Args:
        row: mapping-like record exposing the "Common Name", "Species Name",
            "Sun" and "Habit" fields.
        index_offset: integer added to 3001 to form the row's plant id.

    Returns:
        dict keyed by the main-table column names, with ``if_threatened``
        set to True and every other ``if_*`` flag set to False.
    """
    threatened_id = 3001 + index_offset
    common_name = row["Common Name"]
    species_name = row["Species Name"]

    # A missing "Sun" value becomes an empty list rather than [NaN]
    sun_value = row["Sun"]
    sun_values = [sun_value] if pd.notna(sun_value) else []
    # Store as JSON string to fit MySQL Workbench
    sun_json = json.dumps(sun_values, ensure_ascii=False)

    habit = row["Habit"]

    record = {
        "plant_id": threatened_id,
        "general_plant_id": None,
        "threatened_plant_id": threatened_id,
        "common_name": common_name,
        "scientific_name": species_name,
        "other_name": None,
        "if_threatened": True,
    }
    # Every remaining flag is hard-coded to False for threatened records
    for flag_name in ("if_edible", "if_indoors", "if_medicinal",
                      "if_poisonous", "if_fruits", "if_flowers"):
        record[flag_name] = False
    record["sun_expose"] = sun_json
    record["watering"] = None
    # Habit labels without a mapping are stored unchanged
    record["plant_cycle"] = cycle_map.get(habit, habit)
    record["growth_rate"] = None
    return record

## Step 3 - Load Raw JSON and CSV

In [3]:
# Read every species-details JSON file and flatten it into a main-table row
json_pattern = os.path.join(json_dir, "plant_species_details_*.json")
json_files = glob(json_pattern)

general_data = []
for json_file in json_files:
    with open(json_file, "r", encoding="utf-8") as handle:
        general_data.append(flatten_general(json.load(handle)))

# Read the threatened-plant CSV; the DataFrame index of each row is passed
# to flatten_threatened as the id offset
threatened_df = pd.read_csv(csv_path)
threatened_data = []
for row_index, threatened_row in threatened_df.iterrows():
    threatened_data.append(flatten_threatened(threatened_row, row_index))

## Step 4 - Combine, Order Columns, and Save CSV

In [4]:
# Combine general and threatened rows into one frame, ordered by plant_id
combined_df = pd.DataFrame(general_data + threatened_data)
combined_df = combined_df.sort_values(by="plant_id").reset_index(drop=True)

# Fix the column order so it matches the column list used by the MySQL loader
ordered_cols = [
    "plant_id", "general_plant_id", "threatened_plant_id",
    "common_name", "scientific_name", "other_name",
    "if_threatened", "if_edible", "if_indoors", "if_medicinal",
    "if_poisonous", "if_fruits", "if_flowers",
    "sun_expose", "watering", "plant_cycle", "growth_rate"
]
combined_df = combined_df[ordered_cols]

# Cast ID fields to nullable integers: missing IDs stay <NA> instead of
# forcing the whole column to float (which would write "1.0" into the CSV)
for id_col in ("plant_id", "general_plant_id", "threatened_plant_id"):
    combined_df[id_col] = pd.to_numeric(combined_df[id_col], errors="coerce").astype("Int64")

# Save to CSV.
# The line terminator is pinned to CRLF because the Step 5 loader declares
# LINES TERMINATED BY '\r\n'. pandas otherwise defaults to os.linesep, so a
# file written on Linux/macOS (LF only) would not be split into rows on import.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
combined_df.to_csv(output_path, index=False, lineterminator="\r\n")

## Step 5 - Import Plant Main Table into MySQL

In [5]:
import getpass

import mysql.connector
from mysql.connector import Error

# Database connection configuration.
# SECURITY: the password must never be stored in the notebook. It is read from
# the PLANTX_DB_PASSWORD environment variable, with an interactive prompt as a
# fallback. The password that was previously committed here should be rotated.
# Host, user and database keep their previous values as defaults and can be
# overridden through the environment.
db_config = {
    'host': os.environ.get('PLANTX_DB_HOST', 'database-plantx.cqz06uycysiz.us-east-1.rds.amazonaws.com'),
    'user': os.environ.get('PLANTX_DB_USER', 'zihan'),
    'password': os.environ.get('PLANTX_DB_PASSWORD') or getpass.getpass('MySQL password: '),
    'database': os.environ.get('PLANTX_DB_NAME', 'FIT5120_PlantX_Database'),
    'allow_local_infile': True,
    'use_pure': True  # Use pure Python implementation
}

# Pre-bind both handles so the cleanup in `finally` is safe even when
# connect() itself fails (otherwise the names are undefined, or refer to
# stale objects from an earlier run of this cell).
connection = None
cursor = None

try:
    # Establish connection
    connection = mysql.connector.connect(**db_config)

    if connection.is_connected():
        print("Successfully connected to MySQL server")

        # Create cursor
        cursor = connection.cursor()

        # Construct LOAD DATA LOCAL INFILE command.
        # NOTE: the file path below duplicates `output_path` from Step 1; keep
        # the two in sync.
        # NOTE(review): this statement appends to the table. The recorded run
        # loaded 891 rows while the table afterwards held 1684, so re-running
        # the cell appears to add rows again - confirm whether the table
        # should be emptied first.
        load_data_query = """
        LOAD DATA LOCAL INFILE '02_wrangled_data/Table01_PlantMainTable.csv'
        INTO TABLE Table01_PlantMainTable
        CHARACTER SET utf8mb4
        FIELDS TERMINATED BY ','
        OPTIONALLY ENCLOSED BY '"'
        LINES TERMINATED BY '\\r\\n'
        IGNORE 1 LINES
        (
            plant_id, general_plant_id, threatened_plant_id, common_name,
            scientific_name, other_name, if_threatened, if_edible,
            if_indoors, if_medicinal, if_poisonous, if_fruits,
            if_flowers, sun_expose, watering, plant_cycle, growth_rate
        );
        """

        # Execute command
        cursor.execute(load_data_query)
        connection.commit()  # Commit transaction

        print(f"Data import successful! {cursor.rowcount} rows affected.")

except Error as e:
    # Report database errors without aborting the rest of the notebook
    print(f"Error occurred during execution: {e}")

finally:
    # Close connection (only the handles that were actually created)
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()
        print("MySQL connection closed.")

Successfully connected to MySQL server
Data import successful! 891 rows affected.
MySQL connection closed.


## Step 6 - Verify Imported Rows and Preview

In [6]:
# Open a fresh connection to check the row count and preview a few rows.
# Both handles are pre-bound so the cleanup in `finally` cannot hit an
# undefined name (or a stale object from an earlier cell) when connect() fails.
connection = None
cursor = None

try:
    connection = mysql.connector.connect(**db_config)
    cursor = connection.cursor()

    cursor.execute("SELECT COUNT(*) FROM Table01_PlantMainTable")
    row_count = cursor.fetchone()[0]
    print(f"The table currently contains {row_count} rows")

    # Preview first few rows
    cursor.execute("SELECT * FROM Table01_PlantMainTable LIMIT 5")
    rows = cursor.fetchall()
    for row in rows:
        print(row)

except Error as e:
    # Report database errors without aborting the rest of the notebook
    print(f"Error occurred during query: {e}")
finally:
    # Close only the handles that were actually created
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()

The table currently contains 1684 rows
(1, 1, 0, 'European Silver Fir', 'Abies alba', 'Common Silver Fir', 'False', 'False', 'False', 'True', 'False', 'False', 'False', '["full sun"]', 'Frequent', 'Every year', 'High')
(2, 2, 0, 'Pyramidalis Silver Fir', "Abies alba 'Pyramidalis'", '', 'False', 'False', 'False', 'False', 'False', 'False', 'False', '["full sun"]', 'Average', 'Every year', 'Low')
(3, 3, 0, 'White Fir', 'Abies concolor', 'Silver Fir', 'False', 'False', 'False', 'True', 'False', 'False', 'True', '["Full sun", "part shade"]', 'Average', 'Every year', 'Low')
(4, 4, 0, 'Candicans White Fir', "Abies concolor 'Candicans'", 'Silver Fir', 'False', 'False', 'False', 'False', 'False', 'False', 'False', '["full sun"]', 'Average', 'Every year', 'Low')
(5, 5, 0, 'Fraser Fir', 'Abies fraseri', 'Southern Fir', 'False', 'False', 'False', 'False', 'False', 'False', 'True', '["full sun", "part shade", "filtered shade"]', 'Frequent', 'Every year', 'Moderate')
