In [8]:
import pandas as pd
import os
import glob

def process_csvs_from_folder(folder_path, images_folder):
    """Merge the CSVs in a folder, keeping only rows whose image file exists.

    Every ``*.csv`` file directly inside ``folder_path`` is read. A row is
    kept only when its ``large_tile`` value is a string naming a regular file
    inside ``images_folder``; rows with a blank/non-string ``large_tile`` are
    dropped. The kept rows are concatenated, ``large_tile`` is exposed as a
    leading ``filename`` column, and the ``large_tile`` / ``small_tile``
    columns are removed (``small_tile`` is optional).

    Side effect: prints the names of files that exist in ``images_folder``
    but are not referenced by any CSV.

    Parameters
    ----------
    folder_path : str
        Folder that is searched (non-recursively) for ``*.csv`` files.
    images_folder : str
        Folder against which ``large_tile`` names are checked.

    Returns
    -------
    pandas.DataFrame
        Merged rows with ``filename`` as the first column.

    Raises
    ------
    ValueError
        If ``folder_path`` contains no CSV files.
    KeyError
        If a CSV has no ``large_tile`` column.
    """
    # Step 1: Find all CSV files in the folder.
    # glob returns paths in arbitrary order; sort so the merged row order is
    # the same on every run and every filesystem.
    csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    if not csv_files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No CSV files found in folder: {folder_path!r}")

    def tile_exists(tile):
        # Blank cells are read as NaN (a float), which os.path.join rejects.
        return isinstance(tile, str) and os.path.isfile(os.path.join(images_folder, tile))

    all_dfs = []  # Filtered DataFrame of each CSV
    all_large_tiles = set()  # Every 'large_tile' name mentioned in any CSV

    for csv_file in csv_files:
        # Step 2: Read each CSV file into a DataFrame
        df = pd.read_csv(csv_file)
        tiles = df['large_tile']

        all_large_tiles.update(tile for tile in tiles if isinstance(tile, str))

        # Step 3: Keep rows whose 'large_tile' file exists in the images folder.
        # The mask has an explicit bool dtype so that a CSV without data rows
        # is still filtered by row (and keeps its columns).
        exists_mask = pd.Series(
            [tile_exists(tile) for tile in tiles], index=df.index, dtype=bool
        )
        all_dfs.append(df[exists_mask])

    # Step 4: Concatenate all DataFrames from different CSVs
    merged_df = pd.concat(all_dfs, ignore_index=True)

    # Step 5: Expose 'large_tile' as the 'filename' column
    # ('small_tile' is not used; it is only dropped below).
    merged_df = merged_df.assign(filename=merged_df['large_tile'])

    # Step 6: Drop the original tile columns; 'small_tile' may be absent.
    merged_df = merged_df.drop(columns=['large_tile', 'small_tile'], errors='ignore')

    # Step 7: Reorder columns to make 'filename' the first column
    cols = ['filename'] + [col for col in merged_df.columns if col != 'filename']
    merged_df = merged_df[cols]

    # Step 8: Find files in images_folder that no CSV refers to
    all_image_files = {
        f for f in os.listdir(images_folder)
        if os.path.isfile(os.path.join(images_folder, f))
    }
    missing_from_csvs = all_image_files - all_large_tiles

    # Step 9: Print them in sorted order so the report is stable between runs
    if missing_from_csvs:
        print("Missing from CSVs (files found in images folder but not in any CSV):")
        for missing_file in sorted(missing_from_csvs):
            print(missing_file)

    return merged_df
    
# Example usage: point the function at the CSV folder and the images folder.
folder_path = 'merge_pixel_coordinates_csvs'  # where the CSVs are read from
images_folder = 'seamounts_seg'  # where the image files are looked up

result_df = process_csvs_from_folder(
    folder_path=folder_path,
    images_folder=images_folder,
)

# Write the merged table to disk (no index column), then echo it.
result_df.to_csv(path_or_buf='merged_pixel_coordinates.csv', index=False)
print(result_df)

          filename  top_left_x  top_left_y  bottom_right_x  bottom_right_y
0    2388515.0.png         136          89             263             268
1    2474327.0.png         153         118             245             238
2    2556370.0.png         137          99             262             258
3    2660201.0.png         166         142             232             214
4    2785016.0.png         134         104             265             252
..             ...         ...         ...             ...             ...
495  3693466.0.png         131         120             267             236
496  3704975.0.png         131         120             267             237
497  3709055.0.png         135         124             264             233
498  3709895.0.png         170         159             228             197
499  3712994.0.png         158         147             241             210

[500 rows x 5 columns]


In [1]:
import pandas as pd

# Load the merged CSV and rename its columns in a single chain:
# filename -> image_name, and the corner columns -> x_min / y_min / x_max / y_max.
df = pd.read_csv('merged_pixel_coordinates.csv').rename(
    columns=dict(
        filename='image_name',
        top_left_x='x_min',
        top_left_y='y_min',
        bottom_right_x='x_max',
        bottom_right_y='y_max',
    )
)

# The renamed table replaces the file it was read from.
df.to_csv('merged_pixel_coordinates.csv', index=False)
