In [74]:
import os
import pandas as pd
import shutil
import re

# Input CSV, the two image folders being reconciled, and the CSV to write
csv_file = r"D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\data_class_trad_aug_consolidated.csv"
source_folder = r"D:\Personal_Project\kgot_training_script\kgot_remote_server_training\data\trad_aug\images"
destination_folder = r"D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\images"
new_csv_file = r"D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\data_class_updated.csv"


def _list_files(folder):
    """Return the names of the regular files directly inside `folder` (sub-directories are skipped)."""
    names = []
    for entry in os.listdir(folder):
        if os.path.isfile(os.path.join(folder, entry)):
            names.append(entry)
    return names


# Create the destination folder if needed; a no-op when it is already present
os.makedirs(destination_folder, exist_ok=True)

# Load the records to reconcile
df = pd.read_csv(csv_file)

# Non-null image names from the CSV (as strings), plus a snapshot of the files
# currently present in the source and destination folders
image_column = df['image_path'].dropna()
image_names_csv = image_column.astype(str).tolist()
image_names_source = _list_files(source_folder)
image_names_destination = _list_files(destination_folder)

# Two file names "match" when their ASCII letters, read in order, are identical
def alphabet_match(target, candidate):
    """Return True when `target` and `candidate` contain the same sequence of ASCII letters.

    Every non-letter character (digits, underscores, dots, ...) is ignored,
    so the file extension's letters take part in the comparison. The
    comparison is case-sensitive.
    """
    def letters_only(name):
        # Keep only a-z / A-Z, preserving their original order
        return ''.join(re.findall(r'[a-zA-Z]', name))

    return letters_only(target) == letters_only(candidate)

# Names that could not be resolved anywhere, and the index labels of their rows
missing_images = []
rows_to_drop = []

# Reconcile every CSV record against the destination and source folders.
#
# The loop walks (index label, name) pairs taken from the DataFrame itself.
# Counting positions with enumerate() over a dropna()'d list is not equivalent:
# as soon as one 'image_path' is NaN the positions drift away from the index
# labels, and df.at / df.drop below would then modify or delete the wrong rows.
for idx, image_name in df['image_path'].dropna().astype(str).items():
    destination_image_path = os.path.join(destination_folder, image_name)
    source_image_path = os.path.join(source_folder, image_name)

    # 1) Already present in the destination folder: nothing to copy
    if os.path.exists(destination_image_path):
        print(f"Exists in destination: {image_name}")
        continue

    # 2) Present in the source folder under the exact same name: copy it over
    if os.path.exists(source_image_path):
        shutil.copy(source_image_path, destination_image_path)
        print(f"Copied from source to destination: {image_name}")
        continue

    # 3) Exact name not found: accept a file whose letters match, but only if
    #    the match is unique. The destination listing is checked first.
    #    (The listing is the snapshot taken before this loop started.)
    similar_images_in_destination = [img for img in image_names_destination if alphabet_match(image_name, img)]
    if len(similar_images_in_destination) == 1:
        match = similar_images_in_destination[0]
        df.at[idx, 'image_path'] = match
        print(f"Replaced with destination match: {image_name} -> {match}")
        continue

    # 4) No unique match in the destination: try the source folder, and copy
    #    the replacement file so the destination stays self-contained
    similar_images_in_source = [img for img in image_names_source if alphabet_match(image_name, img)]
    if len(similar_images_in_source) == 1:
        match = similar_images_in_source[0]
        df.at[idx, 'image_path'] = match
        replacement_image_path = os.path.join(source_folder, match)
        shutil.copy(replacement_image_path, os.path.join(destination_folder, match))
        print(f"Replaced and copied from source: {image_name} -> {match}")
        continue

    # 5) Zero or ambiguous matches everywhere: log it and mark the row for removal
    missing_images.append(image_name)
    rows_to_drop.append(idx)
    print(f"Image not found: {image_name}")

# Drop rows corresponding to missing images (rows_to_drop holds index labels)
df.drop(rows_to_drop, inplace=True)

# Persist the reconciled records; the DataFrame index is not written out
df.to_csv(new_csv_file, index=False)
print(f"\nUpdated CSV file saved to: {new_csv_file}")

# Write the unresolved image names, one per line.
# NOTE(review): this file is written under data\orig although the rest of this
# cell works on data\trad_aug — confirm that location is intended.
missing_images_file = r"D:\KGOT_github\KGOT-image-augmentation\data\orig\missing_images.txt"
with open(missing_images_file, "w") as out_file:
    out_file.writelines(name + "\n" for name in missing_images)

print(f"\nList of missing images saved to: {missing_images_file}")

Exists in destination: honey_628.jpg
Exists in destination: suzanne_562.jpg
Exists in destination: suzanne_174.jpg
Exists in destination: smoke_666.jpg
Exists in destination: smoke_079.jpg
Exists in destination: sansa_2021_10_615.jpg
Exists in destination: sansa_2021_14_558.jpg
Exists in destination: sansa_236.jpg
Exists in destination: sansa_2021_16_386.jpg
Exists in destination: suzanne_178.jpg
Exists in destination: sansa_129.jpg
Exists in destination: sansa_354.jpg
Exists in destination: sansa_337.jpg
Exists in destination: sansa_2021_08_659.jpg
Exists in destination: sansa_194.jpg
Exists in destination: sansa_2021_05_634.jpg
Exists in destination: sansa_2021_09_711.jpg
Exists in destination: sansa_2021_01_633.jpg
Exists in destination: sansa_055.jpg
Exists in destination: sansa_042.jpg
Exists in destination: suzanne_656.jpg
Exists in destination: suzanne_351.jpg
Exists in destination: suzanne_646.jpg
Exists in destination: boa_constrictor_27021503.jpg
Exists in destination: boa_co

In [75]:
import os
import pandas as pd

# Define file paths
trad_aug_csv = r"D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\data_class.csv"
kgot_csv = r"D:\KGOT_github\KGOT-image-augmentation\data\kgot\data_class.csv"
images_folder = r"D:\Personal_Project\kgot_training_script\kgot_remote_server_training\data\trad_aug\images"
output_csv_file = r"D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\data_class_trad_aug_consolidated.csv"

# Load the CSV files
trad_aug_df = pd.read_csv(trad_aug_csv)
kgot_df = pd.read_csv(kgot_csv)

# Both CSVs must provide these columns for the comparison below
required_columns = ['image_path', 'class', 'subset']

# Fail early with a KeyError that names the file actually loaded. Both inputs
# are called data_class.csv, so the full path is reported to tell them apart.
for col in required_columns:
    for csv_path, frame in ((trad_aug_csv, trad_aug_df), (kgot_csv, kgot_df)):
        if col not in frame.columns:
            raise KeyError(f"Column '{col}' is missing in {csv_path}")

# Number of rows per (subset, class) in each CSV
trad_aug_counts = trad_aug_df.groupby(['subset', 'class']).size().reset_index(name='count_trad_aug')
kgot_counts = kgot_df.groupby(['subset', 'class']).size().reset_index(name='count_kgot')

# Left-join on the kgot counts: a (subset, class) pair absent from trad_aug
# gets count 0, while pairs that exist only in trad_aug are not compared
comparison_df = pd.merge(kgot_counts, trad_aug_counts, on=['subset', 'class'], how='left').fillna(0)

# A positive difference means trad_aug has fewer rows than kgot for that pair
comparison_df['count_diff'] = comparison_df['count_kgot'] - comparison_df['count_trad_aug']
mismatched_classes = comparison_df[comparison_df['count_diff'] > 0]

# Records to append to the consolidated CSV
new_records = []

# The folder listing and the set of already-listed images do not depend on the
# (subset, class) pair being processed, so they are built once instead of once
# per pair. The folder is only read when there is something to fill.
if mismatched_classes.empty:
    unique_images = []
else:
    existing_images = set(trad_aug_df['image_path'])
    all_images_in_folder = [f for f in os.listdir(images_folder) if os.path.isfile(os.path.join(images_folder, f))]
    unique_images = [img for img in all_images_in_folder if img not in existing_images]

# Images already handed out in this run. Without this, the same file could be
# assigned to several (subset, class) pairs and end up in more than one subset.
assigned_images = set()

# Process mismatched classes
for _, row in mismatched_classes.iterrows():
    subset = row['subset']
    class_label = row['class']
    needed_images = int(row['count_diff'])

    added_count = 0
    for img in unique_images:
        if added_count >= needed_images:
            break
        if img in assigned_images:
            continue
        # The class is inferred from the file name by substring match.
        # NOTE(review): a label that is a substring of another label would
        # also match the longer one's files — confirm the naming scheme.
        if class_label in img:
            new_records.append({'image_path': img, 'subset': subset, 'class': class_label})
            assigned_images.add(img)
            added_count += 1

# Add new records to the current trad_aug_df. Concatenation is skipped when
# there is nothing to add, so an empty frame never takes part in pd.concat.
new_records_df = pd.DataFrame(new_records)
if new_records:
    consolidated_df = pd.concat([trad_aug_df, new_records_df], ignore_index=True)
else:
    consolidated_df = trad_aug_df.reset_index(drop=True)

# Save the consolidated CSV
consolidated_df.to_csv(output_csv_file, index=False)

# Print summary
print(f"Number of mismatched classes: {len(mismatched_classes)}")
print(f"Number of new images added: {len(new_records)}")
print(f"Consolidated CSV file saved to: {output_csv_file}")

Number of mismatched classes: 0
Number of new images added: 0
Consolidated CSV file saved to: D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\data_class_trad_aug_consolidated.csv


In [71]:
import pandas as pd

# Define file paths
csv_file = r"D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\data_class_trad.csv"
output_csv_file = r"D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\data_class_trad_cleaned.csv"

# Load the CSV file
df = pd.read_csv(csv_file)

# Remove duplicates from 'image_path', prioritizing rows where subset != 'train'.
# The sort key is False for non-train rows and True for train rows, so non-train
# rows come first. A stable algorithm is requested explicitly: the default
# quicksort gives no ordering guarantee among rows with equal keys, so which of
# several same-priority duplicates survives would not be tied to the file order.
df_sorted = df.sort_values(by='subset', key=lambda col: col == 'train', kind='mergesort')

# Keep the first occurrence of each image_path after sorting, i.e. a non-train
# row when one exists, otherwise the earliest train row
df_cleaned = df_sorted.drop_duplicates(subset='image_path', keep='first')

# Save the cleaned DataFrame to a new CSV file
df_cleaned.to_csv(output_csv_file, index=False)

# Print results
print(f"Cleaned CSV file saved to: {output_csv_file}")

Cleaned CSV file saved to: D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\data_class_trad_cleaned.csv


In [64]:
import os
import pandas as pd

# CSV to check, the folder its images should live in, and the report to write
csv_file = r"D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\data_class_trad.csv"
images_folder = r"D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\images"
missing_images_file = r"D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\missing_from_folder.txt"

# Load the records and take the non-null image names as strings
df = pd.read_csv(csv_file)
image_names_csv = df['image_path'].dropna().astype(str).tolist()

# Every entry name in the folder, as a set for fast lookups
image_files_in_folder = set(os.listdir(images_folder))

# Collect, in CSV order, each listed name that has no entry in the folder
missing_images = []
for image_name in image_names_csv:
    if image_name not in image_files_in_folder:
        missing_images.append(image_name)

# Write the report, one name per line (the file is created even when empty)
with open(missing_images_file, "w") as report:
    report.writelines(name + "\n" for name in missing_images)

# Summarize the outcome
if not missing_images:
    print("No missing images. All images in the CSV exist in the folder.")
else:
    print(f"Missing images found: {len(missing_images)}")
    print(f"List of missing images saved to: {missing_images_file}")

No missing images. All images in the CSV exist in the folder.


In [56]:
import os
import pandas as pd

# Define file paths
csv_file = r"D:\KGOT_github\KGOT-image-augmentation\data\orig\data_class_updated.csv"
images_folder = r"D:\KGOT_github\KGOT-image-augmentation\data\orig\images"
extra_images_csv_file = r"D:\KGOT_github\KGOT-image-augmentation\data\orig\extra_images.csv"

# Read the updated CSV file
df = pd.read_csv(csv_file)

# Get the list of image names from the CSV (non-null, as strings)
image_names_csv = df['image_path'].dropna().astype(str).tolist()

# Set view of the same names: membership tests against a list make the
# comparison below scale with (files in folder) x (rows in CSV)
listed_image_names = set(image_names_csv)

# Get the list of regular files in the folder (sub-directories are skipped)
image_files_in_folder = [f for f in os.listdir(images_folder) if os.path.isfile(os.path.join(images_folder, f))]

# Find images in the folder that are not in the CSV, in folder-listing order
extra_images = [image for image in image_files_in_folder if image not in listed_image_names]

# Save the list of extra images to a CSV file (written even when the list is empty)
extra_images_df = pd.DataFrame(extra_images, columns=['extra_image_path'])
extra_images_df.to_csv(extra_images_csv_file, index=False)

# Print results
if extra_images:
    print(f"Extra images found: {len(extra_images)}")
    print(f"List of extra images saved to: {extra_images_csv_file}")
else:
    print("No extra images found. All images in the folder are listed in the CSV.")

Extra images found: 76
List of extra images saved to: D:\KGOT_github\KGOT-image-augmentation\data\orig\extra_images.csv


In [61]:
import os
import pandas as pd
import shutil

# Define file paths.
# NOTE: new_csv_file is the same path as csv_file, so the input CSV is
# overwritten in place by this cell.
csv_file = r"D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\data_class_updated.csv"
images_folder = r"D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\images"
consolidated_folder = r"D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\images_consolidate"
new_csv_file = r"D:\KGOT_github\KGOT-image-augmentation\data\trad_aug\data_class_updated.csv"

# Ensure the consolidated folder exists
os.makedirs(consolidated_folder, exist_ok=True)

# Read the updated CSV file
df = pd.read_csv(csv_file)

# Get the list of image names from the CSV (non-null, as strings), plus a set
# view of the same names for the membership test in the clean-up loop
image_names_csv = df['image_path'].dropna().astype(str).tolist()
listed_image_names = set(image_names_csv)

# Snapshot of the regular files in the folder, taken BEFORE anything is moved
image_files_in_folder = [f for f in os.listdir(images_folder) if os.path.isfile(os.path.join(images_folder, f))]

# Keep track of valid images and invalid records
valid_images = []
invalid_records = []

# Consolidate images
for image_name in image_names_csv:
    source_image_path = os.path.join(images_folder, image_name)
    dest_image_path = os.path.join(consolidated_folder, image_name)

    if os.path.exists(source_image_path):
        # If the image exists in the folder, move it to the consolidated folder
        shutil.move(source_image_path, dest_image_path)
        valid_images.append(image_name)
        print(f"Moved: {image_name}")
    elif os.path.exists(dest_image_path):
        # Already in the consolidated folder: either a duplicate CSV row whose
        # file was moved a moment ago, or this cell is being re-run. Marking it
        # invalid here would make the isin() filter below delete every row
        # with this name, including the one whose image is present.
        valid_images.append(image_name)
        print(f"Already consolidated: {image_name}")
    else:
        # The image is in neither folder: mark the record as invalid
        invalid_records.append(image_name)
        print(f"Image missing: {image_name}")

# Remove invalid records from the DataFrame
df = df[~df['image_path'].isin(invalid_records)]

# Save the cleaned-up CSV
df.to_csv(new_csv_file, index=False)

# Remove leftover images not listed in the CSV. Listed files were moved above,
# so only unlisted names from the initial snapshot are still in the folder.
for image_name in image_files_in_folder:
    if image_name not in listed_image_names:
        image_path = os.path.join(images_folder, image_name)
        os.remove(image_path)
        print(f"Removed extra image: {image_name}")

# Print results
print(f"\nConsolidation complete. Consolidated images are saved in: {consolidated_folder}")
print(f"Updated CSV saved to: {new_csv_file}")

Moved: 2022_neonate_dump_034_919_aug.jpg
Moved: 2022_neonate_dump_030_1240_aug.jpg
Moved: 2022_neonate_dump_251_682_aug.jpg
Moved: 2022_neonate_dump_251_474_aug.jpg
Moved: 2022_neonate_dump_030_1912_aug.jpg
Moved: 2022_neonate_dump_030_2432_aug.jpg
Moved: 2022_neonate_dump_192_321_aug.jpg
Moved: 2022_neonate_dump_162_1701_aug.jpg
Moved: 2022_neonate_dump_192_1449_aug.jpg
Moved: 2022_neonate_dump_034_367_aug.jpg
Moved: 2022_neonate_dump_251_314_aug.jpg
Moved: 2022_neonate_dump_162_973_aug.jpg
Moved: 2022_neonate_dump_251_2106_aug.jpg
Moved: 2022_neonate_dump_251_2186_aug.jpg
Moved: 2022_neonate_dump_192_1721_aug.jpg
Moved: 2022_neonate_dump_034_103_aug.jpg
Moved: 2022_neonate_dump_034_2199_aug.jpg
Moved: 2022_neonate_dump_162_397_aug.jpg
Moved: 2022_neonate_dump_192_2313_aug.jpg
Moved: 2022_neonate_dump_180_2395_aug.jpg
Moved: 2022_neonate_dump_035_2254_aug.jpg
Moved: 2022_neonate_dump_162_757_aug.jpg
Moved: 2022_neonate_dump_251_714_aug.jpg
Moved: 2022_neonate_dump_192_2137_aug.jpg
Mov