# Plant Disease Tables (09 & 10) Assembly and Database Import

Name: Zihan

## Step 1 - Set Up Paths and Imports

In [79]:
import os
import json
import pandas as pd
from glob import glob
import mysql.connector
from mysql.connector import Error

# Define input and output paths for raw data and wrangled data
json_dir = "01_raw_data/04_plant_diseases"
output_path_09 = "02_wrangled_data/Table09_PlantDiseaseTable.csv"
output_path_10 = "02_wrangled_data/Table10_PlantDiseaseImageTable.csv"

## Step 2 - Helpers and Flatten Functions

In [80]:
# Helper function to safely get the first element of a list or return None
def first_or_none(lst):
    """Return the leading element of a non-empty list.

    A plain string is passed through unchanged, because the API may supply a
    single string where a list of strings is expected. Any other input (an
    empty list, None, or another type) yields None.
    """
    if isinstance(lst, str):
        return lst
    if not isinstance(lst, list) or not lst:
        return None
    return lst[0]

# Helper function to safely extract image URLs
def get_image_url(images_list, key):
    """Look up `key` in the first image dictionary of `images_list`.

    Returns None when `images_list` is not a non-empty list, when its first
    element is not a dictionary, or when the dictionary lacks `key`.
    """
    if not isinstance(images_list, list) or not images_list:
        return None

    head = images_list[0]
    # A non-dict first entry is treated like an image with no fields at all.
    record = head if isinstance(head, dict) else {}
    return record.get(key)

def to_mysql_json(value):
    """
    Serialize a Python object as text for a MySQL JSON column.

    - None, the empty string, and the string "null" all become the empty
      JSON array '[]'.
    - Everything else is dumped with `json.dumps` (non-ASCII characters kept
      as-is), after which backslashes, newlines, and tabs are escaped so the
      text survives `LOAD DATA INFILE`.
    """
    if value in (None, "", "null"):
        return "[]"

    encoded = json.dumps(value, ensure_ascii=False)

    # Order matters: the backslash pair must come first, otherwise the
    # backslashes introduced by the later replacements would be doubled too.
    for raw, escaped in (("\\", "\\\\"), ("\n", "\\n"), ("\t", "\\t")):
        encoded = encoded.replace(raw, escaped)

    return encoded

# NOTE: The following functions process the raw JSON data into structured records for each table.

# Flatten each JSON record for Table09_PlantDiseaseTable
def flatten_disease_data(payload):
    """
    Build the Table09 row for one plant disease payload.

    Only the first entry under the 'data' key is used. Returns None when
    'data' is missing or empty, so the caller can skip the record.
    """
    entries = payload.get('data')
    if not entries:
        return None
    first = entries[0]

    row = {
        "plant_disease_id": first.get("id"),
        "common_name": first.get("common_name"),
        "scientific_name": first_or_none(first.get("scientific_name")),
    }
    # The remaining columns are stored as JSON text; a missing key is
    # treated as an empty list.
    for column in ("other_name", "host", "description", "solution"):
        row[column] = to_mysql_json(first.get(column, []))
    return row


# Flatten each JSON record for Table10_PlantDiseaseImageTable
def flatten_image_data(payload):
    """
    Build the Table10 row for one plant disease payload.

    Only the first entry under the 'data' key is used, and only the first
    image of that entry, to match the existing table structure. Returns None
    when 'data' is missing or empty.
    """
    entries = payload.get('data')
    if not entries:
        return None
    first = entries[0]

    row = {"plant_disease_id": first.get("id")}
    # Map each output column to the key it is read from in the image dictionary.
    for column, url_key in (
        ("regular_url_image", "regular_url"),
        ("thumbnail_image", "thumbnail"),
    ):
        row[column] = get_image_url(first.get("images"), url_key)
    return row

## Step 3 - Load Raw JSON Data

In [81]:
# Find all plant disease JSON files in the specified directory
json_files = glob(os.path.join(json_dir, "plant_disease_*.json"))

# Accumulators for the flattened rows of each table, plus a counter for
# files whose 'data' entry is empty.
disease_records = []
image_records = []
skipped = 0

for file in json_files:
    with open(file, "r", encoding="utf-8") as f:
        payload = json.load(f)

    # Each flatten function returns None for an empty payload instead of
    # indexing into an empty list.
    rec_d = flatten_disease_data(payload)
    rec_i = flatten_image_data(payload)

    if rec_d is not None:
        disease_records.append(rec_d)
    if rec_i is not None:
        image_records.append(rec_i)

    # A file counts as skipped only when it produced no row for either table.
    if rec_d is None and rec_i is None:
        skipped += 1

processed = len(json_files) - skipped
print(f"Successfully processed {processed} JSON files (skipped {skipped} empty).")

Successfully processed 96 JSON files (skipped 4 empty).


## Step 4 - Create DataFrames, Order Columns, and Save to CSV

In [82]:
def _records_to_frame(records, columns):
    """Turn a list of row dictionaries into a DataFrame ordered by numeric ID.

    Args:
        records: list of dictionaries, one per row.
        columns: column names, in the order the CSV should have them.

    Returns:
        A DataFrame with exactly `columns`, a nullable-integer
        `plant_disease_id`, sorted by that ID with a fresh index.
    """
    # Passing `columns` to the constructor fixes the column order and also
    # yields a correctly shaped empty frame when `records` is empty, where
    # `frame[columns]` would raise KeyError.
    frame = pd.DataFrame(records, columns=columns)

    # Convert before sorting so IDs that arrive as strings are ordered
    # numerically (1, 2, 10) rather than lexicographically ("1", "10", "2").
    frame["plant_disease_id"] = pd.to_numeric(frame["plant_disease_id"]).astype("Int64")
    return frame.sort_values(by="plant_disease_id").reset_index(drop=True)


# --- Process Table 09 (Plant Diseases) ---
ordered_cols_09 = [
    "plant_disease_id", "common_name", "scientific_name", "other_name",
    "host", "description", "solution"
]
disease_df = _records_to_frame(disease_records, ordered_cols_09)

# --- Process Table 10 (Plant Disease Images) ---
ordered_cols_10 = ["plant_disease_id", "regular_url_image", "thumbnail_image"]
image_df = _records_to_frame(image_records, ordered_cols_10)

# --- Save DataFrames to CSV ---
# Ensure the directory of each output file exists. A bare filename has an
# empty dirname, which os.makedirs would reject, so that case is skipped.
for out_path in (output_path_09, output_path_10):
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

# Save both tables with CRLF line endings and utf-8-sig encoding (which
# writes a byte-order mark at the start of the file).
disease_df.to_csv(output_path_09, index=False, lineterminator='\r\n', encoding='utf-8-sig')
image_df.to_csv(output_path_10, index=False, lineterminator='\r\n', encoding='utf-8-sig')

print(f"Table 09 saved to: {output_path_09}")
print(f"Table 10 saved to: {output_path_10}")

Table 09 saved to: 02_wrangled_data/Table09_PlantDiseaseTable.csv
Table 10 saved to: 02_wrangled_data/Table10_PlantDiseaseImageTable.csv


## Step 5 - Create Table Schemas (Table09 & Table10) and Import Plant Disease Table (Table09) into MySQL

In [83]:
# Database connection configuration, specifying utf8mb4 for full Unicode support.
# The password is read from the PLANTX_DB_PASSWORD environment variable so that
# it is never stored in the notebook. Host, user, and database name can be
# overridden the same way and otherwise keep their usual values.
# NOTE(review): a password was previously committed in this cell; rotate it.
db_password = os.environ.get('PLANTX_DB_PASSWORD')
if not db_password:
    raise RuntimeError(
        "Set the PLANTX_DB_PASSWORD environment variable before running this cell."
    )

db_config = {
    'host': os.environ.get('PLANTX_DB_HOST', 'database-plantx.cqz06uycysiz.us-east-1.rds.amazonaws.com'),
    'user': os.environ.get('PLANTX_DB_USER', 'zihan'),
    'password': db_password,
    'database': os.environ.get('PLANTX_DB_NAME', 'FIT5120_PlantX_Database'),
    'allow_local_infile': True,
    'use_pure': True,
    'charset': 'utf8mb4'
}

# Start from None so the cleanup below only ever touches objects created in
# this cell, never a connection or cursor left over from an earlier one.
connection = None
cursor = None

# Attempt to connect and create the necessary tables
try:
    connection = mysql.connector.connect(**db_config)
    if connection.is_connected():
        print("Successfully connected to MySQL server")
        cursor = connection.cursor()

        # SQL statement to create the main plant disease table
        create_table_09 = """
        CREATE TABLE IF NOT EXISTS Table09_PlantDiseaseTable (
            plant_disease_id INT PRIMARY KEY,
            common_name TEXT,
            scientific_name TEXT,
            other_name JSON,
            host JSON,
            description JSON,
            solution JSON
        ) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
        """

        # SQL statement to create the plant disease image table with a foreign key
        create_table_10 = """
        CREATE TABLE IF NOT EXISTS Table10_PlantDiseaseImageTable (
            plant_disease_id INT PRIMARY KEY,
            regular_url_image TEXT,
            thumbnail_image TEXT,
            CONSTRAINT fk_disease_image FOREIGN KEY (plant_disease_id)
                REFERENCES Table09_PlantDiseaseTable (plant_disease_id)
                ON DELETE CASCADE
        ) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
        """

        # Table09 must be created first because Table10 references it.
        cursor.execute(create_table_09)
        cursor.execute(create_table_10)

        connection.commit()
        print("Table09 & Table10 schema are created.")

except Error as e:
    print(f"Error occurred during creating Table09 & Table10 schema: {e}")

finally:
    # Ensure the connection is closed
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()
        print("MySQL connection for creating Table09 & Table10 schema closed.")

Successfully connected to MySQL server
Table09 & Table10 schema are created.
MySQL connection for creating Table09 & Table10 schema closed.


In [84]:
# Start from None so the cleanup below only ever touches objects created in
# this cell, never a connection or cursor left over from an earlier one.
connection = None
cursor = None

try:
    # Re-establish the connection for the data import operation
    connection = mysql.connector.connect(**db_config)
    if connection.is_connected():
        print("Successfully connected to MySQL server for Table09 import.")
        cursor = connection.cursor()

        # Define the query to load data from the CSV file into the database.
        # This uses LOAD DATA LOCAL INFILE for efficient bulk insertion.
        # The path comes from output_path_09 so the import always reads the
        # file the export step wrote. The doubled backslashes send the
        # two-character sequences \r and \n to MySQL, which interprets them
        # as the CRLF line terminator used when the CSV was saved.
        load_data_query_09 = f"""
        LOAD DATA LOCAL INFILE '{output_path_09}'
        INTO TABLE Table09_PlantDiseaseTable
        CHARACTER SET utf8mb4
        FIELDS TERMINATED BY ','
        OPTIONALLY ENCLOSED BY '"'
        LINES TERMINATED BY '\\r\\n'
        IGNORE 1 LINES
        (
            plant_disease_id, common_name, scientific_name, other_name,
            host, description, solution
        );
        """

        cursor.execute(load_data_query_09)
        connection.commit()
        print(f"Table09 data import successful! {cursor.rowcount} rows affected.")

except Error as e:
    print(f"Error occurred during Table09 import: {e}")

finally:
    # Ensure the connection is closed
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()
        print("MySQL connection for Table09 closed.")

Successfully connected to MySQL server for Table09 import.
Table09 data import successful! 96 rows affected.
MySQL connection for Table09 closed.


## Step 6 - Verify Imported Rows and Preview (Table09)

In [85]:
# --- Verify Table 09 Data ---
# This cell checks the row count and previews the first few entries to confirm successful import.

# Start from None so the cleanup below only ever touches objects created in
# this cell, never a connection or cursor left over from an earlier one.
connection = None
cursor = None

try:
    connection = mysql.connector.connect(**db_config)
    if connection.is_connected():
        cursor = connection.cursor()

        # Get the total number of rows in the table
        cursor.execute("SELECT COUNT(*) FROM Table09_PlantDiseaseTable")
        row_count = cursor.fetchone()[0]
        print(f"Table09_PlantDiseaseTable currently contains {row_count} rows.")

        # Fetch and print the first 5 rows for a quick preview
        print("\n--- Preview of first 5 rows from Table09 ---")
        cursor.execute("SELECT * FROM Table09_PlantDiseaseTable LIMIT 5")
        rows = cursor.fetchall()
        for row in rows:
            print(row)

except Error as e:
    print(f"Error occurred during Table09 verification: {e}")

finally:
    # Ensure the connection is closed
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()

Table09_PlantDiseaseTable currently contains 96 rows.

--- Preview of first 5 rows from Table09 ---
(1, 'Fairy ring', 'Agrocybe', '[]', '["all lawn grasses"]', '[{"subtitle": "What is Fairy Ring (Agrocybe)?", "description": "A fairy ring is a lawn issue where circular bands of dark green or dying grass appear. Different fungi like Agrocybe, Marasmius oreades, and Lepiota species cause it. Fairy rings can be found on lawns worldwide. The size and shape of the rings depend on the fungus and environment. Common symptoms of the fairy ring include forming a ring pattern with vibrant or dead grass. Some fungi produce mushrooms without rings, while others create rings without mushrooms. This issue affects actively growing turf, including cool-season and Bermudagrass."}, {"subtitle": "How Does Fairy Ring Occur?", "description": "Fairy rings are formed when underground fungal mycelia expand in a circular pattern. They release spores from mushrooms at the ring\'s edge, which can spread through a

## Step 7 - Import Plant Disease Image Table (Table10) into MySQL

In [86]:
# --- Import Plant Disease Image Table (Table 10) into MySQL ---

# Start from None so the cleanup below only ever touches objects created in
# this cell, never a connection or cursor left over from an earlier one.
connection = None
cursor = None

try:
    connection = mysql.connector.connect(**db_config)
    if connection.is_connected():
        print("\nSuccessfully connected to MySQL server for Table10 import.")
        cursor = connection.cursor()

        # Define the query to load data from the image CSV into its table.
        # The path comes from output_path_10 so the import always reads the
        # file the export step wrote. The doubled backslashes send the
        # two-character sequences \r and \n to MySQL, which interprets them
        # as the CRLF line terminator used when the CSV was saved.
        load_data_query_10 = f"""
        LOAD DATA LOCAL INFILE '{output_path_10}'
        INTO TABLE Table10_PlantDiseaseImageTable
        CHARACTER SET utf8mb4
        FIELDS TERMINATED BY ','
        OPTIONALLY ENCLOSED BY '"'
        LINES TERMINATED BY '\\r\\n'
        IGNORE 1 LINES
        (
            plant_disease_id, regular_url_image, thumbnail_image
        );
        """

        cursor.execute(load_data_query_10)
        connection.commit()
        print(f"Table10 data import successful! {cursor.rowcount} rows affected.")

except Error as e:
    print(f"Error occurred during Table10 import: {e}")

finally:
    # Ensure the connection is closed
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()
        print("MySQL connection for Table10 closed.")


Successfully connected to MySQL server for Table10 import.
Table10 data import successful! 96 rows affected.
MySQL connection for Table10 closed.


## Step 8 - Verify Imported Rows and Preview (Table10)

In [87]:
# --- Verify Table 10 Data ---
# This cell checks the row count and previews the first few entries to confirm successful import.

# Start from None so the cleanup below only ever touches objects created in
# this cell, never a connection or cursor left over from an earlier one.
connection = None
cursor = None

try:
    connection = mysql.connector.connect(**db_config)
    if connection.is_connected():
        cursor = connection.cursor()

        # Get the total number of rows in the table
        cursor.execute("SELECT COUNT(*) FROM Table10_PlantDiseaseImageTable")
        row_count = cursor.fetchone()[0]
        print(f"Table10_PlantDiseaseImageTable currently contains {row_count} rows.")

        # Fetch and print the first 5 rows for a quick preview
        print("\n--- Preview of first 5 rows from Table10 ---")
        cursor.execute("SELECT * FROM Table10_PlantDiseaseImageTable LIMIT 5")
        rows = cursor.fetchall()
        for row in rows:
            print(row)

except Error as e:
    print(f"Error occurred during Table10 verification: {e}")

finally:
    # Ensure the connection is closed
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()

Table10_PlantDiseaseImageTable currently contains 96 rows.

--- Preview of first 5 rows from Table10 ---
(1, 'https://perenual.com/storage/species_disease/1_a/regular/Fairy_ring_-_geograph.org.uk_-_1057031.jpg', 'https://perenual.com/storage/species_disease/1_a/thumbnail/Fairy_ring_-_geograph.org.uk_-_1057031.jpg')
(2, 'https://perenual.com/storage/species_disease/2__panaeolus_foenisecii_/regular/Panaeolus_foenisecii_124316833.jpg', 'https://perenual.com/storage/species_disease/2__panaeolus_foenisecii_/thumbnail/Panaeolus_foenisecii_124316833.jpg')
(3, 'https://perenual.com/storage/species_disease/3_p/regular/Monographella_nivalis-2013-Brno-4.jpg', 'https://perenual.com/storage/species_disease/3_p/thumbnail/Monographella_nivalis-2013-Brno-4.jpg')
(4, 'https://perenual.com/storage/species_disease/4_c/regular/Coprinus_comatus-2_hg.jpg', 'https://perenual.com/storage/species_disease/4_c/thumbnail/Coprinus_comatus-2_hg.jpg')
(5, 'https://perenual.com/storage/species_disease/5_e/regular/Blu