# General Plant Distribution Map Assembly and Database Import

Name: Zihan

## Step 1 - Imports and Path Setup

In [9]:
import os
import json
import pandas as pd
from glob import glob

# Paths
html_dir = "01_raw_data/03_hardiness_map"
output_path = "02_wrangled_data/Table04_GeneralPlantDistributionMapTable.csv"

## Step 2 - Extract Distribution Map HTML from Files

In [10]:
# Extract distribution map info
html_files = glob(os.path.join(html_dir, "plant_species_hardiness_map_*.html"))

records = []
for file in html_files:
    basename = os.path.basename(file)
    try:
        # Extract plant ID from the filename
        general_plant_id = int(basename.split("_")[-1].split(".")[0])
        
        # Read HTML file content
        with open(file, 'r', encoding='utf-8') as f:
            html_content = f.read()
        
        # Store plant ID and the HTML content (not the file path)
        records.append({
            "general_plant_id": general_plant_id,
            "distribution_map_html": html_content
        })
    except (ValueError, IndexError) as e:
        print(f"Skipping malformed file: {basename}, error: {e}")
        continue

## Step 3 - Build DataFrame and Save CSV (CRLF)

In [11]:
# Create DataFrame and sort
df = pd.DataFrame(records)
df = df.sort_values(by="general_plant_id").reset_index(drop=True)
df["general_plant_id"] = pd.to_numeric(df["general_plant_id"], errors="coerce").astype("Int64")

# Save to CSV
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False, encoding='utf-8', lineterminator='\r\n')

print(f"Successfully processed {len(records)} HTML files and saved to {output_path}")

Successfully processed 593 HTML files and saved to 02_wrangled_data/Table04_GeneralPlantDistributionMapTable.csv


## Step 4 - 1st Spot-Check a Saved HTML Snippet

In [12]:
import pandas as pd
import tempfile
import os
import webbrowser

# Read the CSV file
csv_path = "02_wrangled_data/Table04_GeneralPlantDistributionMapTable.csv"
df = pd.read_csv(csv_path)

# Choose a plant ID to verify (e.g., the 78th row, index 77)
plant_id = df.iloc[77]['general_plant_id']
html_content = df.iloc[77]['distribution_map_html']

# Create a temporary HTML file
with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False, encoding='utf-8') as f:
    f.write(html_content)
    temp_file = f.name

print(f"Temporary file created: {temp_file}")

# Open in the default browser
webbrowser.open('file://' + os.path.realpath(temp_file))

# Optionally wait for user input before deleting the temporary file
input("Press Enter to delete the temporary file...")
os.unlink(temp_file)

Temporary file created: C:\Users\zyyin1\AppData\Local\Temp\tmp8b9moe48.html


## Step 5 - Verify Line Endings

In [13]:
# Verify line terminator
with open(output_path, 'rb') as f:
    content = f.read()
    
if b'\r\n' in content:
    print("CSV uses CRLF (\\r\\n) line endings")
elif b'\n' in content:
    print("CSV uses LF (\\n) line endings")
else:
    print("Unable to determine line ending type")

CSV uses CRLF (\r\n) line endings


## Step 6 - Inspect First Bytes for Line Ending Check

In [14]:
# Read the first few hundred bytes to inspect line endings
with open(output_path, 'rb') as f:
    first_chunk = f.read(500)  # first 500 bytes
    
print("File header (hex):")
print(first_chunk.hex())

print("\nFile header (ASCII; non-printable characters shown as escape sequences):")
# Convert bytes to a readable form, showing special chars as escapes
readable = first_chunk.replace(b'\r', br'\r').replace(b'\n', br'\n')
print(readable.decode('utf-8', errors='ignore'))

File header (hex):
67656e6572616c5f706c616e745f69642c646973747269627574696f6e5f6d61705f68746d6c0d0a312c223c21444f43545950452068746d6c3e0a3c68746d6c206c616e673d2222656e22223e0a202020203c686561643e0a0a20202020202020203c212d2d203630202d2d3e0a20202020202020200a20202020202020203c6d657461206e616d653d2222726f626f7473222220636f6e74656e743d22226e6f696e6465782c206e6f666f6c6c6f7722223e0a0a20202020202020203c212d2d204c69766577697265205374796c6573202d2d3e3c7374796c65203e5b776972655c3a6c6f6164696e675d5b776972655c3a6c6f6164696e675d2c205b776972655c3a6c6f6164696e675c2e64656c61795d5b776972655c3a6c6f6164696e675c2e64656c61795d2c205b776972655c3a6c6f6164696e675c2e696e6c696e652d626c6f636b5d5b776972655c3a6c6f6164696e675c2e696e6c696e652d626c6f636b5d2c205b776972655c3a6c6f6164696e675c2e696e6c696e655d5b776972655c3a6c6f6164696e675c2e696e6c696e655d2c205b776972655c3a6c6f6164696e675c2e626c6f636b5d5b776972655c3a6c6f6164696e675c2e626c6f636b5d2c205b776972655c3a6c6f6164696e675c2e666c65785d5b776972655c3a6c6f6164696e675c2e6

## Step 7 - Import Distribution Map Table into MySQL

In [15]:
import mysql.connector
from mysql.connector import Error

# Database connection configuration
db_config = {
    'host': 'database-plantx.cqz06uycysiz.us-east-1.rds.amazonaws.com',
    'user': 'zihan',
    'password': '2002317Yzh12138.',
    'database': 'FIT5120_PlantX_Database',
    'allow_local_infile': True,
    'use_pure': True  # Use pure Python implementation
}

try:
    # Establish connection
    connection = mysql.connector.connect(**db_config)
    
    if connection.is_connected():
        print("Successfully connected to MySQL server")
        
        # Create cursor
        cursor = connection.cursor()
        
        # truncate_query = "TRUNCATE TABLE Table04_GeneralPlantDistributionMapTable;"
        # cursor.execute(truncate_query)
        # print("Existing rows truncated.")
        
        # Construct LOAD DATA LOCAL INFILE command
        load_data_query = """
        LOAD DATA LOCAL INFILE '02_wrangled_data/Table04_GeneralPlantDistributionMapTable.csv'
        INTO TABLE Table04_GeneralPlantDistributionMapTable
        CHARACTER SET utf8mb4
        FIELDS TERMINATED BY ',' 
        OPTIONALLY ENCLOSED BY '"'
        LINES TERMINATED BY '\\r\\n'
        IGNORE 1 LINES
        (   
            general_plant_id, distribution_map_html
        );
        """
        
        # Execute command
        cursor.execute(load_data_query)
        connection.commit()  # Commit transaction
        
        print(f"Data import successful! {cursor.rowcount} rows affected.")
        
except Error as e:
    print(f"Error occurred during execution: {e}")
    
finally:
    # Close connection
    if connection.is_connected():
        cursor.close()
        connection.close()
        print("MySQL connection closed.")

Successfully connected to MySQL server
Data import successful! 0 rows affected.
MySQL connection closed.


## Step 8 - Verify Imported Rows and Preview

In [16]:
# In the same connection session, or in a new one
try:
    connection = mysql.connector.connect(**db_config)
    cursor = connection.cursor()
    
    cursor.execute("SELECT COUNT(*) FROM Table04_GeneralPlantDistributionMapTable")
    row_count = cursor.fetchone()[0]
    print(f"The table currently contains {row_count} rows")
    
    # Preview first few rows
    cursor.execute("SELECT * FROM Table04_GeneralPlantDistributionMapTable LIMIT 5")
    rows = cursor.fetchall()
    for row in rows:
        print(row)
        
except Error as e:
    print(f"Error occurred during query: {e}")
finally:
    if connection.is_connected():
        cursor.close()
        connection.close()

The table currently contains 593 rows
(1, '<!DOCTYPE html>\n<html lang="en">\n    <head>\n\n        <!-- 60 -->\n        \n        <meta name="robots" content="noindex, nofollow">\n\n        <!-- Livewire Styles --><style >[wire:loading][wire:loading], [wire:loading.delay][wire:loading.delay], [wire:loading.inline-block][wire:loading.inline-block], [wire:loading.inline][wire:loading.inline], [wire:loading.block][wire:loading.block], [wire:loading.flex][wire:loading.flex], [wire:loading.table][wire:loading.table], [wire:loading.grid][wire:loading.grid], [wire:loading.inline-flex][wire:loading.inline-flex] {display: none;}[wire:loading.delay.none][wire:loading.delay.none], [wire:loading.delay.shortest][wire:loading.delay.shortest], [wire:loading.delay.shorter][wire:loading.delay.shorter], [wire:loading.delay.short][wire:loading.delay.short], [wire:loading.delay.default][wire:loading.delay.default], [wire:loading.delay.long][wire:loading.delay.long], [wire:loading.delay.longer][wire:loadi

## Step 9 - 2nd Spot-Check: Read HTML from DB and Open in Browser
Extract HTML from the database and test by saving to a file.

In [17]:
import mysql.connector
from mysql.connector import Error
import tempfile
import os
import webbrowser

# Database extraction and browser test
def test_html_from_database(plant_id):
    """Fetch HTML for a given plant_id from the database and test in the browser."""
    try:
        connection = mysql.connector.connect(**db_config)
        cursor = connection.cursor()
        
        # Query the HTML content for the specified plant
        query = """
            SELECT 
                distribution_map_html 
            FROM 
                Table04_GeneralPlantDistributionMapTable 
            WHERE 
                general_plant_id = %s
        """
        cursor.execute(query, (plant_id,))
        result = cursor.fetchone()
        
        if result:
            html_content = result[0]
            
            # Create a temporary HTML file
            with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False, encoding='utf-8') as f:
                f.write(html_content)
                temp_file = f.name
            
            print(f"Temporary file created: {temp_file}")
            
            # Open in the default browser
            webbrowser.open('file://' + os.path.realpath(temp_file))
            
            # Wait for user confirmation
            input(f"Please check whether the distribution map for plant ID {plant_id} renders correctly. Press Enter to continue...")
            
            # Delete the temporary file
            os.unlink(temp_file)
            print(f"Temporary file deleted: {temp_file}")
        else:
            print(f"No distribution map found for plant ID {plant_id}")
            
    except Error as e:
        print(f"Error occurred during query: {e}")
    finally:
        if connection.is_connected():
            cursor.close()
            connection.close()

# Test a few different plant IDs (example)
test_html_from_database(77)

Temporary file created: C:\Users\zyyin1\AppData\Local\Temp\tmpkwdye7yp.html
Temporary file deleted: C:\Users\zyyin1\AppData\Local\Temp\tmpkwdye7yp.html
