In [15]:
import pandas as pd
import glob
import os
import shapefile
from dbfread import DBF
import json

PREPARE PLANET DATA

In [16]:
# Planet directories
# Metadata files are named *_metadata.json
#
# The dataset root is written once and the working folders are derived from it,
# so relocating the dataset only requires editing PLANET_DIR.
PLANET_DIR = r'C:\Users\bpere\Documents\EMPIRE\BARBARA\Akademisch\Remote_Sensing\Thesis\Py6S\Fields\official_dataset\extra_birds\extra_extra\Planet'

ndvi_files = os.path.join(PLANET_DIR, 'ndvi')            # folder searched for the .dbf tables
metadata_files = os.path.join(PLANET_DIR, 'metadata')    # folder searched for *_metadata.json
csv_path = os.path.join(PLANET_DIR, 'csv', 'fragments')  # folder receiving the per-scene CSV files

Turn .dbf into .csv files

In [17]:
def process_dbf_files(ndvi_path, metadata_path, output_path):
    """Write one CSV file per scene from its .dbf table and its metadata JSON.

    For every ``*_metadata.json`` file in ``metadata_path`` the scene key is the
    file name without its trailing ``_metadata`` part. The first ``.dbf`` file
    in ``ndvi_path`` (in sorted name order) whose name contains that key is
    loaded into a DataFrame, tagged with the acquisition date and cloud cover
    read from the metadata, reduced to the columns ``date``, ``NDVI``,
    ``cloud_cover`` and ``field_id``, and saved as
    ``<metadata base name>.csv`` in ``output_path``.

    Parameters
    ----------
    ndvi_path : str
        Directory searched for ``.dbf`` files.
    metadata_path : str
        Directory searched for ``*_metadata.json`` files.
    output_path : str
        Directory receiving the CSV files; created on demand if missing.

    Returns
    -------
    None
        Results are written to disk; skipped scenes are reported with ``print``.

    Raises
    ------
    KeyError
        If the metadata lacks ``properties.cloud_cover`` / ``properties.acquired``
        or the table lacks the ``_median`` / ``new_id`` columns.
    """
    # Step 1: Locate metadata files (sorted so runs are reproducible)
    metadata_files = sorted(glob.glob(os.path.join(metadata_path, "*_metadata.json")))
    if not metadata_files:
        return

    # List the .dbf candidates once instead of once per metadata file. Sorting
    # makes "first match wins" deterministic, since os.listdir order is arbitrary.
    dbf_names = sorted(name for name in os.listdir(ndvi_path) if name.endswith(".dbf"))

    for metadata_file in metadata_files:
        # "<key>_metadata.json" -> base name "<key>_metadata" -> key "<key>"
        base_name = os.path.splitext(os.path.basename(metadata_file))[0]
        matching_key = base_name.rsplit("_", 1)[0]

        # Step 2: Find the matching .dbf file (substring match on the scene key)
        matching_dbf_file = next(
            (os.path.join(ndvi_path, name) for name in dbf_names if matching_key in name),
            None,
        )
        if matching_dbf_file is None:
            print(f"No matching .dbf file found for {metadata_file}")
            continue

        # Step 3: Convert the .dbf file to a pandas DataFrame
        table = DBF(matching_dbf_file, encoding='utf-8')
        df = pd.DataFrame(iter(table))

        # Step 4: Read the metadata once and pull both values from "properties".
        # JSON is opened explicitly as UTF-8 rather than the platform default.
        with open(metadata_file, "r", encoding="utf-8") as f:
            properties = json.load(f)["properties"]
        cloud_cover = properties["cloud_cover"]
        acquired_date = properties["acquired"].split("T")[0]  # keep the part before "T"

        # Step 5: Tag every row, keep the needed columns and rename them
        df = (
            df.assign(date=acquired_date, cloud_cover=cloud_cover)
            .loc[:, ["date", "_median", "cloud_cover", "new_id"]]
            .rename(columns={"_median": "NDVI", "new_id": "field_id"})
        )

        # Step 6: Filter out rows where NDVI is null or missing
        df = df[df["NDVI"].notnull()]
        if df.empty:
            print(f"No valid rows remaining for {base_name}, skipping CSV generation")
            continue

        # Step 7: Save as CSV; make sure the target folder exists first
        os.makedirs(output_path, exist_ok=True)
        df.to_csv(os.path.join(output_path, base_name + ".csv"), index=False)

In [18]:
# Convert the Planet .dbf tables into CSV files stored in csv_path.
process_dbf_files(
    ndvi_path=ndvi_files,
    metadata_path=metadata_files,
    output_path=csv_path,
)

Merge Planet files

In [19]:
def merge_csv_files(csv_dir):
    """Read every ``*.csv`` file in ``csv_dir`` and stack them into one DataFrame.

    Files are read in sorted path order so the row order of the result is
    reproducible, and they are concatenated in a single call instead of growing
    a frame inside the loop.

    Parameters
    ----------
    csv_dir : str
        Directory searched (non-recursively) for ``*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        All rows of all files with a fresh 0..n-1 index; an empty DataFrame
        when the directory contains no CSV files.
    """
    csv_files = sorted(glob.glob(os.path.join(csv_dir, "*.csv")))

    if not csv_files:
        # pd.concat raises ValueError on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    frames = [pd.read_csv(csv_file) for csv_file in csv_files]
    return pd.concat(frames, ignore_index=True)

In [20]:
# Stack all CSV fragments found in csv_path into one raw table and show it.
planet_raw = merge_csv_files(csv_dir=csv_path)

print(planet_raw)

           date      NDVI  cloud_cover field_id
0    2018-05-01  0.828534          0.0      RT1
1    2018-05-01  0.840214          0.0      RT2
2    2018-05-01  0.840210          0.0      RT3
3    2018-05-01  0.854524          0.0      RT4
4    2018-05-01  0.855745          0.0      RT5
..          ...       ...          ...      ...
655  2023-06-29  0.763382          0.0     DT53
656  2023-06-29  0.444728          0.0     DT54
657  2023-06-29  0.537545          0.0     DT55
658  2023-06-29  0.876192          0.0     DT56
659  2023-06-29  0.606636          0.0     DT58

[660 rows x 4 columns]


In [21]:
# Rows without an NDVI value do not take part in the aggregation.
planet_with_ndvi = planet_raw.dropna(subset=["NDVI"])

# One row per (date, field_id): duplicates collapse to the median of each value column.
planet_medians = (
    planet_with_ndvi
    .groupby(["date", "field_id"], as_index=False)[["NDVI", "cloud_cover"]]
    .median()
)

# Put field_id after the value columns, then tag every row with the satellite name.
unique_planet_raw = (
    planet_medians[["date", "NDVI", "cloud_cover", "field_id"]]
    .assign(sat_name="Planet")
)

print(unique_planet_raw)

           date      NDVI  cloud_cover field_id sat_name
0    2018-05-01  0.828534          0.0      RT1   Planet
1    2018-05-01  0.840104          0.0     RT10   Planet
2    2018-05-01  0.708980          0.0     RT11   Planet
3    2018-05-01  0.840214          0.0      RT2   Planet
4    2018-05-01  0.840210          0.0      RT3   Planet
..          ...       ...          ...      ...      ...
655  2023-06-29  0.606636          0.0     DT58   Planet
656  2023-06-29  0.766971          0.0      DT6   Planet
657  2023-06-29  0.898831          0.0      DT7   Planet
658  2023-06-29  0.601714          0.0      DT8   Planet
659  2023-06-29  0.718538          0.0      DT9   Planet

[660 rows x 5 columns]


In [23]:
# Save the de-duplicated Planet table (without the index column) in the csv
# folder, one level above the fragments directory.
pl_complete_csv = r'C:\Users\bpere\Documents\EMPIRE\BARBARA\Akademisch\Remote_Sensing\Thesis\Py6S\Fields\official_dataset\extra_birds\extra_extra\Planet\csv\Pl_complete.csv'
unique_planet_raw.to_csv(pl_complete_csv, index=False)