# Melbourne Urban Forest: Data Wrangling and MySQL Ingestion

Name: Zihan

## Step 0 - Data Wrangling for Raw Dataset

In [None]:
import pandas as pd

# 1. Define the file path and the column mapping

file_path = r'01_raw_data/04_urban-forest.csv'

# Maps every raw CSV header we keep to its snake_case column name.
# 'CoordinateLocation' is intentionally not listed; the separate
# 'Latitude' and 'Longitude' columns are read instead.
# NOTE: the 'Expectency' spelling is part of the mapped header names and is kept as-is.
column_mapping = {
    'CoM ID': 'com_id',
    'Common Name': 'common_name',
    'Scientific Name': 'scientific_name',
    'Genus': 'genus',
    'Family': 'family',
    'Diameter Breast Height': 'diameter_breast_height',
    'Year Planted': 'year_planted',
    'Date Planted': 'date_planted',
    'Age Description': 'age_description',
    'Useful Life Expectency': 'useful_life_expectency',
    'Useful Life Expectency Value': 'useful_life_expectency_value',
    'Precinct': 'precinct',
    'Located in': 'located_in',
    'UploadDate': 'uploaddate',
    'Latitude': 'latitude',
    'Longitude': 'longitude'
}

# 2. Read, select and rename data

try:
    # Only the mapped columns are read. rename() returns a new frame, so
    # re-running this cell always rebuilds df from the file.
    df = pd.read_csv(file_path, usecols=list(column_mapping)).rename(columns=column_mapping)
except FileNotFoundError:
    print(f"Error: File not found, please check if path '{file_path}' is correct.")
    # Re-raise so the following cells do not run without (or with a stale) df.
    raise
except ValueError as e:
    # read_csv raises ValueError (not KeyError) when a usecols entry is absent from the CSV header.
    print(f"Error: could not read the expected columns: {e}. Please check if the keys in column_mapping exactly match the CSV column names.")
    raise

print("Data loaded and renamed successfully!")
print("\n" + "="*50 + "\n")

# 3. Check and display final results

print("Final processed data information:")
df.info()

print("\nFinal data preview (first 5 rows):")
print(df.head())

数据加载并重命名成功！


最终处理完成的数据信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76928 entries, 0 to 76927
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   com_id                        76928 non-null  int64  
 1   common_name                   76903 non-null  object 
 2   scientific_name               76927 non-null  object 
 3   genus                         76927 non-null  object 
 4   family                        76927 non-null  object 
 5   diameter_breast_height        24986 non-null  float64
 6   year_planted                  76928 non-null  int64  
 7   date_planted                  76928 non-null  object 
 8   age_description               24969 non-null  object 
 9   useful_life_expectency        24969 non-null  object 
 10  useful_life_expectency_value  24969 non-null  float64
 11  precinct                      0 non-null      float64
 12  located_in                    769

In [3]:
# Display the loaded DataFrame; the rendered output below is truncated to the first and last rows.
df

Unnamed: 0,com_id,common_name,scientific_name,genus,family,diameter_breast_height,year_planted,date_planted,age_description,useful_life_expectency,useful_life_expectency_value,precinct,located_in,uploaddate,latitude,longitude
0,1049657,Unknown,Melaleuca parvistaminea,Melaleuca,Myrtaceae,,1998,1998-12-17,,,,,Park,2021-01-10,-37.790705,144.944666
1,1782373,Coastal Banksia,Banksia integrifolia,Banksia,Proteaceae,,2020,2020-03-04,,,,,Park,2021-01-10,-37.802899,144.926193
2,1604511,Red Box,Eucalyptus polyanthemos,Eucalyptus,Myrtaceae,,2015,2015-05-08,,,,,Park,2021-01-10,-37.795723,144.969386
3,1070399,Ironbark,Eucalyptus sideroxylon,Eucalyptus,Myrtaceae,12.0,2006,2006-12-19,Semi-Mature,31-60 years,60.0,,Street,2021-01-10,-37.827934,144.901980
4,1734680,Drooping sheoak,Allocasuarina verticillata,Allocasuarina,Casuarinaceae,,2018,2018-09-05,,,,,Park,2021-01-10,-37.792724,144.948192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76923,1787150,Drooping sheoak,Allocasuarina verticillata,Allocasuarina,Casuarinaceae,,2020,2020-04-08,,,,,Park,2021-01-10,-37.778453,144.957660
76924,1518924,Drooping sheoak,Allocasuarina verticillata,Allocasuarina,Casuarinaceae,,2012,2012-11-18,,,,,Park,2021-01-10,-37.780520,144.955620
76925,1509645,Hoop Pine,Araucaria cunninghamii,Araucaria,Araucariaceae,,2012,2012-07-06,,,,,Park,2021-01-10,-37.815750,144.937147
76926,1782310,Silver Banksia,Banksia marginata,Banksia,Proteaceae,,2020,2020-01-04,,,,,Street,2021-01-10,-37.802856,144.925837


In [None]:
# Remove the 'precinct' column (df.info() above reports 0 non-null values for it).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second run is a no-op instead of a KeyError.
df = df.drop(columns=['precinct'], errors='ignore')

# Verify that the column has been removed
print("'precinct' column has been removed, remaining columns are:")
print(df.columns)

'precinct' 列已移除，剩余列如下：
Index(['com_id', 'common_name', 'scientific_name', 'genus', 'family',
       'diameter_breast_height', 'year_planted', 'date_planted',
       'age_description', 'useful_life_expectency',
       'useful_life_expectency_value', 'located_in', 'uploaddate', 'latitude',
       'longitude'],
      dtype='object')


## Step 1 - Save Your Cleaned DataFrame to CSV


In [6]:
from pathlib import Path

# Define the path for the wrangled data file
wrangled_file_path = '02_wrangled_data/Table12_UrbanForestTable.csv'

# Create the output folder if it does not exist yet, so the cell works on a fresh checkout.
Path(wrangled_file_path).parent.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a CSV file, ready for import.
# Line endings are written as CRLF explicitly: to_csv otherwise uses the OS default,
# while the LOAD DATA statement in Step 3 expects LINES TERMINATED BY '\r\n'.
df.to_csv(wrangled_file_path, index=False, encoding='utf-8', lineterminator='\r\n')

print(f"Cleaned data saved to {wrangled_file_path}")

Cleaned data saved to 02_wrangled_data/Table12_UrbanForestTable.csv


## Step 2 - Create Schema for `Table12_UrbanForestTable`

In [None]:
import mysql.connector
from mysql.connector import Error

import os
from getpass import getpass

# Database connection configuration.
# The password is no longer stored in the notebook: it is read from the
# PLANTX_DB_PASSWORD environment variable, falling back to an interactive prompt.
# Host, user and database can be overridden the same way and keep their previous
# values as defaults.
# SECURITY: a password was previously committed in this cell and should be rotated.
db_config = {
    'host': os.environ.get('PLANTX_DB_HOST', 'database-plantx.cqz06uycysiz.us-east-1.rds.amazonaws.com'),
    'user': os.environ.get('PLANTX_DB_USER', 'zihan'),
    'password': os.environ.get('PLANTX_DB_PASSWORD') or getpass('MySQL password: '),
    'database': os.environ.get('PLANTX_DB_NAME', 'FIT5120_PlantX_Database'),
    'allow_local_infile': True,  # required for the LOAD DATA LOCAL INFILE import in Step 3
    'use_pure': True,
    'charset': 'utf8mb4'
}

# --- Task 1: Create table structure ---
print("--- [Task 1/4] Creating table structure... ---")

# Initialise the handles so the cleanup below never touches a stale
# connection/cursor left behind by an earlier cell run.
connection = None
cursor = None
try:
    connection = mysql.connector.connect(**db_config)
    if connection.is_connected():
        print("✅ Successfully connected to MySQL server")
        cursor = connection.cursor()

        # WARNING: destructive. Any existing table (and its data) is dropped
        # so that the schema is always created from a clean state.
        print("Deleting old table (if exists)...")
        cursor.execute("DROP TABLE IF EXISTS Table12_UrbanForestTable")
        print("Old table deleted.")

        # coords is NOT NULL (the SPATIAL INDEX is declared on it) and gets a placeholder
        # default of POINT(0,0) so that LOAD DATA can insert rows before Step 4
        # fills in the real coordinates.
        create_table_12 = """
        CREATE TABLE Table12_UrbanForestTable (
            com_id INT PRIMARY KEY,
            common_name TEXT,
            scientific_name TEXT,
            genus TEXT,
            family TEXT,
            diameter_breast_height DOUBLE,
            year_planted INT,
            date_planted DATE,
            age_description TEXT,
            useful_life_expectency TEXT,
            useful_life_expectency_value INT,
            located_in TEXT,
            uploaddate DATE,
            latitude DOUBLE,
            longitude DOUBLE,
            -- Add a temporary default value for coords field to resolve NOT NULL conflict during LOAD DATA
            coords POINT NOT NULL DEFAULT (POINT(0,0)),
            SPATIAL INDEX(coords)
        ) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
        """
        cursor.execute(create_table_12)
        connection.commit()
        print("✅ Table 'Table12_UrbanForestTable' structure created successfully.")

except Error as e:
    print(f"❌ Error occurred while creating table structure: {e}")

finally:
    # Close only what this cell actually opened.
    if cursor is not None:
        cursor.close()
    if connection is not None and connection.is_connected():
        connection.close()
        print("🔌 MySQL connection for table structure creation closed.\n")

--- [任务 1/4] 正在创建表结构... ---
✅ 成功连接到MySQL服务器
正在删除旧表（如果存在）...
旧表已删除。
✅ 表'Table12_UrbanForestTable'的结构创建成功。
🔌 已关闭用于创建表结构的MySQL连接。



## Step 3 - Import Data into `Table12_UrbanForestTable`

In [11]:
# Bulk-load the wrangled CSV into Table12_UrbanForestTable.
#
# pandas writes missing values as empty fields. LOAD DATA stores an empty field as
# 0 / '' / a zero date rather than NULL, so every non-key column is first read into a
# user variable and converted with NULLIF(@var, '') in the SET clause.
# The file is expected to use CRLF line endings (LINES TERMINATED BY '\r\n').
load_data_query_12 = """
LOAD DATA LOCAL INFILE '02_wrangled_data/Table12_UrbanForestTable.csv'
INTO TABLE Table12_UrbanForestTable
CHARACTER SET utf8mb4
FIELDS TERMINATED BY ','
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\\r\\n'
IGNORE 1 LINES
(
    com_id, @common_name, @scientific_name, @genus, @family,
    @diameter_breast_height, @year_planted, @date_planted,
    @age_description, @useful_life_expectency,
    @useful_life_expectency_value, @located_in, @uploaddate,
    @latitude, @longitude
)
SET
    common_name = NULLIF(@common_name, ''),
    scientific_name = NULLIF(@scientific_name, ''),
    genus = NULLIF(@genus, ''),
    family = NULLIF(@family, ''),
    diameter_breast_height = NULLIF(@diameter_breast_height, ''),
    year_planted = NULLIF(@year_planted, ''),
    date_planted = NULLIF(@date_planted, ''),
    age_description = NULLIF(@age_description, ''),
    useful_life_expectency = NULLIF(@useful_life_expectency, ''),
    useful_life_expectency_value = NULLIF(@useful_life_expectency_value, ''),
    located_in = NULLIF(@located_in, ''),
    uploaddate = NULLIF(@uploaddate, ''),
    latitude = NULLIF(@latitude, ''),
    longitude = NULLIF(@longitude, '');
"""

# Initialise the handles so the cleanup below never touches a stale
# connection/cursor left behind by an earlier cell run.
connection = None
cursor = None
try:
    # Re-establish the connection for the data import
    connection = mysql.connector.connect(**db_config)
    if connection.is_connected():
        print("\nSuccessfully connected to MySQL server for Table12 import.")
        cursor = connection.cursor()

        cursor.execute(load_data_query_12)
        connection.commit()
        print(f"Table12 data import successful! {cursor.rowcount} rows affected.")

except Error as e:
    print(f"Error occurred during Table12 import: {e}")

finally:
    # Close only what this cell actually opened.
    if cursor is not None:
        cursor.close()
    if connection is not None and connection.is_connected():
        connection.close()
        print("MySQL connection for Table12 import closed.")


Successfully connected to MySQL server for Table12 import.
Table12 data import successful! 76928 rows affected.
MySQL connection for Table12 import closed.


## Step 4 - Populate the Spatial Column (coords)

In [12]:
# Statement that rebuilds 'coords' from the stored coordinates.
# POINT() is called as POINT(longitude, latitude); rows where either value
# is NULL are left untouched. (The leading whitespace inside the string is
# kept exactly as before.)
update_spatial_column = """
        UPDATE Table12_UrbanForestTable
        SET coords = POINT(longitude, latitude)
        WHERE longitude IS NOT NULL AND latitude IS NOT NULL;
        """

try:
    # Open a dedicated connection for this step
    connection = mysql.connector.connect(**db_config)
    if connection.is_connected():
        print("\nSuccessfully connected to MySQL server for spatial column update.")
        cursor = connection.cursor()

        # Run the update and make it permanent
        cursor.execute(update_spatial_column)
        connection.commit()

        updated_rows = cursor.rowcount
        print(f"Spatial column 'coords' populated successfully! {updated_rows} rows updated.")

except Error as e:
    print(f"Error occurred during spatial column update: {e}")

finally:
    # Release the cursor and connection only when a live connection exists
    connection_is_open = 'connection' in locals() and connection.is_connected()
    if connection_is_open:
        cursor.close()
        connection.close()
        print("MySQL connection for spatial update closed.")


Successfully connected to MySQL server for spatial column update.
Spatial column 'coords' populated successfully! 76928 rows updated.
MySQL connection for spatial update closed.


## Step 5 - Verify Imported Data and Preview

In [13]:
# Queries used for the sanity check: total row count, then a small preview.
# ST_AsText() renders the POINT value in 'coords' as readable text.
count_query = "SELECT COUNT(*) FROM Table12_UrbanForestTable"
preview_query = "SELECT com_id, common_name, latitude, longitude, ST_AsText(coords) FROM Table12_UrbanForestTable LIMIT 5"

try:
    connection = mysql.connector.connect(**db_config)
    if connection.is_connected():
        print("\n✅ Successfully connected to MySQL server for Table12 verification.")
        cursor = connection.cursor()

        # Total number of rows currently in the table
        cursor.execute(count_query)
        row_count = cursor.fetchone()[0]
        print(f"📊 Table12_UrbanForestTable currently contains {row_count} rows.")

        # Quick look at five rows, one tuple per line
        print("\n--- Preview of first 5 rows from Table12 ---")
        cursor.execute(preview_query)
        rows = cursor.fetchall()
        for row in rows:
            print(row)

except Error as e:
    print(f"❌ Error occurred during Table12 verification: {e}")

finally:
    # Release the cursor and connection only when a live connection exists
    connection_is_open = 'connection' in locals() and connection.is_connected()
    if connection_is_open:
        cursor.close()
        connection.close()
        print("🔌 MySQL connection for Table12 verification closed.")


✅ Successfully connected to MySQL server for Table12 verification.
📊 Table12_UrbanForestTable currently contains 76928 rows.

--- Preview of first 5 rows from Table12 ---
(1013381, 'Dutch Elm', -37.81223297971192, 144.9506598126038, 'POINT(144.9506598126038 -37.81223297971192)')
(1013382, 'Huntingdon Elm', -37.81226850504794, 144.95051593219205, 'POINT(144.95051593219205 -37.81226850504794)')
(1013383, 'English Elm', -37.8125145693672, 144.94975749030814, 'POINT(144.94975749030814 -37.8125145693672)')
(1013384, 'Dutch Elm', -37.81235693844764, 144.9497772995458, 'POINT(144.9497772995458 -37.81235693844764)')
(1013385, 'Dutch Elm', -37.81242493692164, 144.94954210109805, 'POINT(144.94954210109805 -37.81242493692164)')
🔌 MySQL connection for Table12 verification closed.


## Sample for Klarissa

### Chinese Version

```
SELECT
    com_id,
    common_name,
    -- Use ST_Distance_Sphere to calculate precise spherical distance (in meters)
    ST_Distance_Sphere(coords, POINT(144.9671, -37.8183)) AS distance_in_meters
FROM
    Table12_UrbanForestTable
WHERE
    -- Keep only trees within 500 meters. The function is evaluated for every row, so this is a full scan (the spatial index is not used).
    ST_Distance_Sphere(coords, POINT(144.9671, -37.8183)) <= 500;
```

### English Version

```
SELECT
    -- Select the unique identifier and the common name for each tree.
    com_id,
    common_name,

    -- Calculate the distance from each tree's location to a specified central point.
    -- ST_Distance_Sphere calculates the great-circle distance on a sphere, returning the result in meters.
    -- The target point is created using POINT(longitude, latitude).
    ST_Distance_Sphere(coords, POINT(144.9671, -37.8183)) AS distance_in_meters
FROM
    -- Specify the table containing the urban forest data.
    Table12_UrbanForestTable
WHERE
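    -- Filter the results to include only records within a 500-meter radius of the central point.
    -- Note: because 'coords' is wrapped in ST_Distance_Sphere, this predicate is evaluated for every row;
    -- the SPATIAL INDEX on 'coords' is not used for it. An index-assisted query would first narrow the
    -- candidates with a bounding-box test such as MBRContains (which in MySQL 8.0 also requires an
    -- SRID-restricted column) and then apply this exact distance check.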
    ST_Distance_Sphere(coords, POINT(144.9671, -37.8183)) <= 500;
```