In [None]:
import os
import hashlib
import pandas as pd
from collections import defaultdict
from PIL import Image
from data_loading import scan_folders, create_dataframe

In [None]:
import os
import hashlib
import pandas as pd
from collections import defaultdict
from PIL import Image

def update_file_paths_in_csv(csv_file, base_folder, composition_categories, frame_size_categories,
                             output_file="shotdeck_update.csv"):
    """Refresh an image-metadata CSV against the current contents of ``base_folder``.

    The CSV is loaded, every row whose ``Image Hash`` is found by ``scan_folders``
    gets its ``File Paths``, ``Composition`` and ``Frame Size`` columns rewritten,
    and every scanned image whose hash is not yet in the CSV is appended as a new row.

    Args:
        csv_file: Path of the existing CSV to read. It must contain an
            ``Image Hash`` column. The input file itself is not modified.
        base_folder: Root folder passed through to ``scan_folders``.
        composition_categories: Composition labels passed through to ``scan_folders``.
        frame_size_categories: Frame-size labels passed through to ``scan_folders``.
        output_file: Path the updated table is written to. Defaults to
            ``"shotdeck_update.csv"``, the location this function has always used.

    Returns:
        None. The result is written to ``output_file`` and a confirmation is printed.

    Notes:
        ``scan_folders`` is used as a mapping ``hash -> metadata`` where each
        metadata dict provides ``"file_paths"``, ``"composition"``,
        ``"frame_size"``, ``"width"`` and ``"height"``.
    """
    def _join_or_unknown(labels):
        # An empty label collection is recorded as the literal "Unknown".
        return ", ".join(labels) if labels else "Unknown"

    df = pd.read_csv(csv_file)
    image_data = scan_folders(base_folder, composition_categories, frame_size_categories)

    # Refresh rows that are already present in the CSV.
    for idx, img_hash in df["Image Hash"].items():
        metadata = image_data.get(img_hash)
        if metadata is None:
            # Image was not found by the scan; leave its row untouched.
            continue

        # Only keep file paths that still exist on disk.
        valid_paths = [path for path in metadata["file_paths"] if os.path.exists(path)]
        df.at[idx, "File Paths"] = ", ".join(valid_paths)
        df.at[idx, "Composition"] = _join_or_unknown(metadata["composition"])
        df.at[idx, "Frame Size"] = _join_or_unknown(metadata["frame_size"])

    # Collect rows for images that are not in the CSV yet. They are gathered in a
    # list and concatenated once: DataFrame.append was removed in pandas 2.0, and
    # growing a frame row by row is quadratic.
    existing_hashes = set(df["Image Hash"])
    new_rows = []
    for img_hash, metadata in image_data.items():
        if img_hash in existing_hashes:
            continue
        width, height = metadata["width"], metadata["height"]
        new_rows.append({
            "Image Hash": img_hash,
            "Composition": _join_or_unknown(metadata["composition"]),
            "Frame Size": _join_or_unknown(metadata["frame_size"]),
            "File Paths": ", ".join(metadata["file_paths"]),
            "Width": width,
            "Height": height,
            # Guard against missing/zero dimensions before dividing.
            "Aspect Ratio": round(width / height, 2) if width and height else None,
        })

    if new_rows:
        df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

    # Save the updated DataFrame.
    df.to_csv(output_file, index=False)
    print(f"DataFrame updated and saved to {output_file}")

In [None]:
# Label vocabularies handed to the updater.
composition_categories = ["balanced", "center", "left", "right", "symmetrical"]
frame_size_categories = ["ECU", "CU", "MCU", "MS", "MWS", "WS", "EWS"]

# Input locations: the root folder holding the composition / frame-size
# subfolders, and the existing CSV whose rows are to be refreshed.
base_folder = "shotdeck_data"
csv_file = "shotdeck_v2.csv"

# Refresh the CSV's file paths and labels against what is currently on disk.
update_file_paths_in_csv(
    csv_file=csv_file,
    base_folder=base_folder,
    composition_categories=composition_categories,
    frame_size_categories=frame_size_categories,
)

In [None]:
import h5py
import numpy as np
import pandas as pd

def load_attention_maps(hdf5_path, layer_key=None):
    """Read per-image attention maps from an HDF5 file into a DataFrame.

    The file is walked as one top-level group per image; each entry inside a
    group is treated as one layer and read fully into memory with ``[:]``
    (h5py returns NumPy arrays here, not PyTorch tensors).

    Args:
        hdf5_path: Path of the HDF5 file to open read-only.
        layer_key: Optional name of a single layer to load. When ``None``
            (the default) every layer stored for an image is loaded. When a
            name is given, only that layer is loaded, and images that do not
            contain it contribute a row with just ``image_hash``.

    Returns:
        pandas.DataFrame with one row per top-level group: an ``image_hash``
        column plus one column per loaded layer holding that layer's array.
    """
    records = []

    with h5py.File(hdf5_path, "r") as f:
        # Loop through each image group.
        for image_hash in f.keys():
            image_group = f[image_hash]

            # Decide which layers to read; a distinct loop name is used below so
            # the ``layer_key`` argument is no longer overwritten.
            if layer_key is None:
                layer_names = list(image_group.keys())
            elif layer_key in image_group:
                layer_names = [layer_key]
            else:
                layer_names = []

            record = {"image_hash": image_hash}
            for layer_name in layer_names:
                # Must be materialised while the file is still open.
                record[layer_name] = image_group[layer_name][:]

            records.append(record)

    return pd.DataFrame(records)

In [4]:
# Load the attention maps into a DataFrame.
# layer_key is passed explicitly (None = no specific layer requested) so the
# call supplies both arguments named in load_attention_maps' signature.
hdf5_path = "shotdeck_attention_maps.h5"
df_attn_maps = load_attention_maps(hdf5_path, layer_key=None)

: 

In [None]:
# Display the loaded attention maps.
# ace_tools is not importable in every environment, so the import is guarded
# and the cell falls back to the notebook's own rich display of a preview.
try:
    import ace_tools as tools
except ImportError:
    tools = None

if tools is not None:
    tools.display_dataframe_to_user(name="Loaded Attention Maps", dataframe=df_attn_maps)
else:
    # `display` is provided by the IPython/Jupyter kernel.
    display(df_attn_maps.head())

In [30]:
# Load the label table from shotdeck_data.csv into `df`; the cells below read
# its "Composition" and "Frame Size" columns.
df = pd.read_csv("shotdeck_data.csv")

In [None]:
# Rows whose "Frame Size" cell lists exactly three ", "-separated labels.
frame_size_label_counts = df["Frame Size"].str.split(", ").str.len()
df_filtered = df[frame_size_label_counts == 3]

# Rows where either label column still holds the "Unknown" placeholder.
composition_unknown = df["Composition"].eq("Unknown")
frame_size_unknown = df["Frame Size"].eq("Unknown")
df_unknown = df[composition_unknown | frame_size_unknown]

# Print results
print(df_unknown)



                            Image Hash Composition Frame Size  \
632   5da5f45bfd6296ac4ad77ae64e309d17     Unknown         MS   
653   5c1c8139d61889d2eafc1e6718b720ce     Unknown         MS   
667   e156acee9fef60f186ce592c890c37df     Unknown         MS   
754   7d8ae5d26f0bafe090b493433cd2cc07     Unknown         MS   
775   e5c2ca9229afd82e4f3d3c529a4cb54d     Unknown    MS, MWS   
...                                ...         ...        ...   
4470  e1b2ee1de174a120cab016aca4454b07     Unknown         WS   
4471  1f42441fd95bc2ab2160a64cd2fa642e     Unknown         WS   
4472  33a3b677429be2e5b34fd58edc1a49f6     Unknown         WS   
4473  ce0fe54186b23b6c005e0c777f9aadd4     Unknown         WS   
4474  7309d3cbdadd5b9ce5b38acc9e5bb4d6     Unknown         WS   

                                             File Paths  Width  Height  \
632   ['shotdeck_data/MS/63 - The Royal Tenenbaums.j...   1920     800   
653                ['shotdeck_data/MS/55 - Minari.jpg']   1920     802 

In [32]:
# Export the rows with an "Unknown" label to missing_labels.csv.
# index=False is not passed, so the original row index is written as the first column.
df_unknown.to_csv("missing_labels.csv")