load metadata.csv from amazing_logos_v4_cleanup folder

load meta_after.csv from presentation4thesis folder

keep only the rows of metadata whose id also appears in meta_after (match on the id column)
and save them as meta_before.csv in the presentation4thesis folder

In [2]:
import pandas as pd
from pathlib import Path

# All locations are relative to the parent of the current working directory.
base_path = Path('..')
presentation_folder = base_path.joinpath('presentation4thesis')
cleanup_folder = base_path.joinpath(
    'output', 'amazing_logos_v4', 'data', 'amazing_logos_v4_cleanup'
)

# Inputs: the full metadata table and the table whose ids select rows from it.
metadata_path = cleanup_folder / 'metadata.csv'
meta_after_path = presentation_folder / 'meta_after.csv'
# Output: written into the same folder that holds meta_after.csv.
meta_before_path = presentation_folder / 'meta_before.csv'

# Echo the resolved paths so a wrong working directory is easy to spot.
print(f"Source metadata: {metadata_path}")
print(f"Metadata to filter with: {meta_after_path}")
print(f"Output file: {meta_before_path}")

def _read_csv_or_none(csv_path):
    """Read *csv_path* into a DataFrame, or return None if the file is missing.

    A one-line status message is printed in either case. Only
    FileNotFoundError is handled here; any other read error propagates.
    """
    try:
        frame = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"ERROR: {csv_path.name} not found at {csv_path}")
        return None
    print(f"Successfully loaded {csv_path.name} with {len(frame)} rows.")
    return frame


# Load both tables; a missing file yields None instead of raising.
df_metadata = _read_csv_or_none(metadata_path)
df_meta_after = _read_csv_or_none(meta_after_path)

# Filtering needs both tables; a None here means the corresponding load failed.
if df_metadata is None or df_meta_after is None:
    print("\nCould not perform filtering due to missing files.")
else:
    # Distinct ids found in meta_after decide which metadata rows are kept.
    ids_to_include = df_meta_after['id'].unique()

    # Boolean mask: True for every metadata row whose id is in that set.
    keep_mask = df_metadata['id'].isin(ids_to_include)
    df_meta_before = df_metadata[keep_mask]

    # Report the sizes involved so a mismatch between the two files is visible.
    print(f"\nOriginal metadata rows: {len(df_metadata)}")
    print(f"Rows to include: {len(ids_to_include)}")
    print(f"Rows in meta_before: {len(df_meta_before)}")

    # Write the selected rows without the DataFrame index column.
    df_meta_before.to_csv(meta_before_path, index=False)
    print(f"\nSuccessfully saved meta_before.csv to {meta_before_path}")


Source metadata: ..\output\amazing_logos_v4\data\amazing_logos_v4_cleanup\metadata.csv
Metadata to filter with: ..\presentation4thesis\meta_after.csv
Output file: ..\presentation4thesis\meta_before.csv
Successfully loaded metadata.csv with 397251 rows.
Successfully loaded meta_after.csv with 1810 rows.

Original metadata rows: 397251
Rows to include: 1810
Rows in meta_before: 1810

Successfully saved meta_before.csv to ..\presentation4thesis\meta_before.csv
Successfully loaded metadata.csv with 397251 rows.
Successfully loaded meta_after.csv with 1810 rows.

Original metadata rows: 397251
Rows to include: 1810
Rows in meta_before: 1810

Successfully saved meta_before.csv to ..\presentation4thesis\meta_before.csv
