# General Plant Image Table Assembly and Database Import

Name: Zihan

## Step 1 - Imports and Path Setup

In [10]:
import os
import json
import pandas as pd
from glob import glob

# Paths (relative; resolved against the notebook's working directory)
# Folder that is searched for plant_species_details_*.json files in Step 2.
details_dir = "01_raw_data/01_species_details"
# CSV file for the general plant image table; its line endings are checked in Step 5.
output_path = "02_wrangled_data/Table05_GeneralPlantImageTable.csv"

## Step 2 - Load Species Details and Extract Image URLs

In [11]:
# Build one image record per general (non-threatened) plant species.
#
# Every plant_species_details_*.json file under `details_dir` is parsed and its
# default image URLs are appended to `records` as a dict with the keys
# general_plant_id / regular_url_image / thumbnail_image.

# glob() makes no ordering promise, so sort to keep `records` reproducible
# across runs and machines.
detail_files = sorted(glob(os.path.join(details_dir, "plant_species_details_*.json")))

records = []
for file in detail_files:
    with open(file, "r", encoding="utf-8") as f:
        details = json.load(f)

    plant_id = details.get("id")

    # A file without an ID cannot be keyed in the image table. Report and skip
    # it instead of letting `None > 3000` abort the whole load with a TypeError.
    if plant_id is None:
        print(f"Skipping {file}: no 'id' field found")
        continue

    # Skip threatened plants (ID > 3000)
    if plant_id > 3000:
        continue

    # "default_image" may be absent or null; treat both as "no image" so the
    # URL lookups below fall back to empty strings.
    default_image = details.get("default_image") or {}
    regular_url = default_image.get("regular_url", "")
    thumbnail_url = default_image.get("thumbnail", "")

    records.append({
        "general_plant_id": plant_id,
        "regular_url_image": regular_url,
        "thumbnail_image": thumbnail_url
    })

## Step 5 - Verify Line Endings

In [12]:
# Inspect the first 200 bytes of the CSV in binary mode and report whether a
# CRLF sequence appears in them.
with open(output_path, 'rb') as csv_file:
    head_bytes = csv_file.read(200)

uses_crlf = b'\r\n' in head_bytes
print(
    "CSV uses CRLF (\\r\\n) line endings"
    if uses_crlf
    else "CSV does not use CRLF line endings"
)

CSV uses CRLF (\r\n) line endings


## Step 6 - Import Image Table into MySQL

In [13]:
import os
from getpass import getpass

import mysql.connector
from mysql.connector import Error

# Database connection configuration.
#
# The password is deliberately NOT stored in the notebook: it is read from the
# PLANTX_DB_PASSWORD environment variable, falling back to an interactive
# prompt. Host, user and database name can be overridden the same way and
# otherwise keep their previous values.
db_config = {
    'host': os.environ.get(
        'PLANTX_DB_HOST',
        'database-plantx.cqz06uycysiz.us-east-1.rds.amazonaws.com',
    ),
    'user': os.environ.get('PLANTX_DB_USER', 'zihan'),
    'password': os.environ.get('PLANTX_DB_PASSWORD') or getpass("MySQL password: "),
    'database': os.environ.get('PLANTX_DB_NAME', 'FIT5120_PlantX_Database'),
    'allow_local_infile': True,  # client-side switch for LOAD DATA LOCAL INFILE
    'use_pure': True  # Use pure Python implementation
}

# Pre-bind both handles so the `finally` block is safe even when connect()
# itself raises (otherwise a NameError would mask the real error).
connection = None
cursor = None

try:
    # Establish connection
    connection = mysql.connector.connect(**db_config)

    if connection.is_connected():
        print("Successfully connected to MySQL server")

        # Create cursor
        cursor = connection.cursor()

        # Construct LOAD DATA LOCAL INFILE command.
        # The doubled backslashes make Python emit the two-character escapes
        # \r and \n, which are then part of the SQL text sent to the server.
        load_data_query = """
        LOAD DATA LOCAL INFILE '02_wrangled_data/Table05_GeneralPlantImageTable.csv'
        INTO TABLE Table05_GeneralPlantImageTable
        CHARACTER SET utf8mb4
        FIELDS TERMINATED BY ',' 
        OPTIONALLY ENCLOSED BY '"'
        LINES TERMINATED BY '\\r\\n'
        IGNORE 1 LINES
        (   
            general_plant_id, regular_url_image, thumbnail_image
        );
        """

        # Execute command
        cursor.execute(load_data_query)
        connection.commit()  # Commit transaction

        print(f"Data import successful! {cursor.rowcount} rows affected.")

except Error as e:
    print(f"Error occurred during execution: {e}")

finally:
    # Close connection (only if one was actually opened and is still alive)
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()
        print("MySQL connection closed.")

Successfully connected to MySQL server
Data import successful! 0 rows affected.
MySQL connection closed.


## Step 7 - Verify Imported Rows and Preview

In [14]:
# Open a connection with `db_config`, report the table's row count and print
# the first five rows as a quick sanity check of the import.

# Pre-bind both handles so the `finally` block never hits an unbound name
# when connect() or cursor() raises.
connection = None
cursor = None

try:
    connection = mysql.connector.connect(**db_config)
    cursor = connection.cursor()

    cursor.execute("SELECT COUNT(*) FROM Table05_GeneralPlantImageTable")
    row_count = cursor.fetchone()[0]
    print(f"The table currently contains {row_count} rows")

    # Preview first few rows
    cursor.execute("SELECT * FROM Table05_GeneralPlantImageTable LIMIT 5")
    rows = cursor.fetchall()
    for row in rows:
        print(row)

except Error as e:
    print(f"Error occurred during query: {e}")
finally:
    # Release the cursor and connection only if they were actually created.
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()

The table currently contains 593 rows
(1, 'https://perenual.com/storage/species_image/1_abies_alba/regular/1536px-Abies_alba_SkalitC3A9.jpg', 'https://perenual.com/storage/species_image/1_abies_alba/thumbnail/1536px-Abies_alba_SkalitC3A9.jpg')
(2, 'https://perenual.com/storage/species_image/2_abies_alba_pyramidalis/regular/49255769768_df55596553_b.jpg', 'https://perenual.com/storage/species_image/2_abies_alba_pyramidalis/thumbnail/49255769768_df55596553_b.jpg')
(3, 'https://perenual.com/storage/species_image/3_abies_concolor/regular/52292935430_f4f3b22614_b.jpg', 'https://perenual.com/storage/species_image/3_abies_concolor/thumbnail/52292935430_f4f3b22614_b.jpg')
(4, 'https://perenual.com/storage/species_image/4_abies_concolor_candicans/regular/49283844888_332c9e46f2_b.jpg', 'https://perenual.com/storage/species_image/4_abies_concolor_candicans/thumbnail/49283844888_332c9e46f2_b.jpg')
(5, 'https://perenual.com/storage/species_image/5_abies_fraseri/regular/36843539702_e80fc436e0_b.jpg',

## Step 8 - Download Thumbnails

In [15]:
import os
import pandas as pd
import requests
import time
from pathlib import Path

# Download every thumbnail listed in the image-table CSV into `output_dir`.
#
# Each image is saved as plant_species_thumbnail_image_<id>.jpg. Files that are
# already present are skipped, so the cell can be re-run to resume an
# interrupted download.

# Paths
csv_path = "02_wrangled_data/Table05_GeneralPlantImageTable.csv"
output_dir = "01_raw_data/05_thumbnail_image"

# Create output directory if not exists
os.makedirs(output_dir, exist_ok=True)

# Read CSV
df = pd.read_csv(csv_path)

# Counters
success_count = 0
fail_count = 0

print(f"Starting download of {len(df)} thumbnails...")

# One session for the whole run so the underlying HTTP connection can be
# reused instead of being re-established for every image.
with requests.Session() as session:
    for index, row in df.iterrows():
        plant_id = row['general_plant_id']
        thumbnail_url = row['thumbnail_image']

        # Skip empty URLs
        if pd.isna(thumbnail_url) or not thumbnail_url:
            print(f"Skipping plant ID {plant_id}: empty thumbnail URL")
            fail_count += 1
            continue

        # Build filename
        filename = f"plant_species_thumbnail_image_{plant_id}.jpg"
        filepath = os.path.join(output_dir, filename)

        # Skip if file already exists
        if os.path.exists(filepath):
            print(f"Skipping plant ID {plant_id}: file already exists")
            success_count += 1
            continue

        # Write to a temporary name first: an interrupted run must never leave
        # a truncated .jpg that the "already exists" check above would later
        # count as a finished download.
        partial_path = filepath + ".part"

        try:
            # Directly request the image
            response = session.get(thumbnail_url, timeout=30)
            response.raise_for_status()

            # Save image, then move it into place in a single step
            with open(partial_path, 'wb') as f:
                f.write(response.content)
            os.replace(partial_path, filepath)

            print(f"Successfully downloaded thumbnail for plant ID {plant_id}")
            success_count += 1

            time.sleep(0.1)  # Short delay to avoid too many rapid requests

        except requests.exceptions.RequestException as e:
            print(f"Failed to download thumbnail for plant ID {plant_id}: {e}")
            fail_count += 1
        except Exception as e:
            print(f"Error while processing plant ID {plant_id}: {e}")
            fail_count += 1
            # Do not leave a half-written temporary file behind.
            if os.path.exists(partial_path):
                os.remove(partial_path)

print(f"\nDownload complete! Success: {success_count}, Failures: {fail_count}")

# Verify file count (the *.jpg pattern does not match leftover .part files)
downloaded_files = list(Path(output_dir).glob("plant_species_thumbnail_image_*.jpg"))
print(f"There are {len(downloaded_files)} files in the thumbnail folder")

Starting download of 593 thumbnails...
Skipping plant ID 1: file already exists
Skipping plant ID 2: file already exists
Skipping plant ID 3: file already exists
Skipping plant ID 4: file already exists
Skipping plant ID 5: file already exists
Skipping plant ID 6: file already exists
Skipping plant ID 7: file already exists
Skipping plant ID 8: file already exists
Skipping plant ID 9: file already exists
Skipping plant ID 10: file already exists
Skipping plant ID 11: file already exists
Skipping plant ID 12: file already exists
Skipping plant ID 13: file already exists
Skipping plant ID 14: file already exists
Skipping plant ID 15: file already exists
Skipping plant ID 16: file already exists
Skipping plant ID 17: file already exists
Skipping plant ID 18: file already exists
Skipping plant ID 19: file already exists
Skipping plant ID 20: file already exists
Skipping plant ID 21: file already exists
Skipping plant ID 22: file already exists
Skipping plant ID 23: file already exists
Skip