In [None]:
import os

import pandas as pd
import tqdm
import shutil
import src.data.Dataset as dt

In [None]:
# Absolute locations used by the rest of the notebook: the source image tree,
# the label CSV stored inside it, and the destination tree for formatted copies.
FROM_PATH = os.path.abspath("./dataset/emptyNonEmptyDataset")
RAW_DATASET = os.path.join(FROM_PATH, "10000Images.csv")
TO_PATH = os.path.abspath("./data/raw/datasetFormatted")

# Echo the resolved absolute paths, one per line.
print(
    f"FROM_PATH:   {FROM_PATH}",
    f"TO_PATH:     {TO_PATH}",
    f"RAW_DATASET: {RAW_DATASET}",
    sep="\n",
)

In [None]:
# Load the raw label table from RAW_DATASET through the project helper.
# NOTE(review): presumably returns a pandas DataFrame (later cells index it by
# column and call drop_duplicates on it) — confirm against src.data.Dataset.
dataset_original = dt.load_from_csv(RAW_DATASET)
# Bare name as the last expression so the notebook renders the table.
dataset_original

In [None]:
# One entry of the "file_name" column per row, so its length is the image count.
file_names = dataset_original["file_name"]
num_images = len(file_names)
print(f"Number of images in dataset: {num_images}")

In [None]:
# Mark every "file_name" that already appeared in an earlier row, then count the marks.
repeated_mask = dataset_original["file_name"].duplicated()
duplicate_routes = repeated_mask.sum()
print(f"Number of duplicate routes: {duplicate_routes}")

In [None]:
# Keep only the first row seen for each distinct "file_name".
dataset_without_duplicates = dataset_original.drop_duplicates(
    subset=["file_name"], keep="first"
)
dataset_without_duplicates

In [None]:
# Number of rows per original label value after de-duplication.
label_column = dataset_without_duplicates["label"]
class_counts = label_column.value_counts()
print(class_counts)

In [None]:
# Discard the rows labelled "dudosa"; rows with any other label are kept.
keep_mask = dataset_without_duplicates["label"].ne("dudosa")
dataset_cleaned = dataset_without_duplicates[keep_mask]
dataset_cleaned

In [None]:
# Add a string-valued binary target: "0" when the label is "vacia", "1" for
# every other label. assign() builds a new frame, so the filtered frame this
# cell started from is not modified in place.
is_vacia = dataset_cleaned["label"].eq("vacia")
dataset_cleaned = dataset_cleaned.assign(
    binary_label=is_vacia.map({True: "0", False: "1"})
)

dataset_cleaned

In [None]:
# Render the cleaned dataset (now including "binary_label") again for inspection.
dataset_cleaned

In [None]:
# 18.605
# NOTE(review): the figure above was left without explanation — confirm what it
# refers to, or remove it.
# Number of rows per binary label ("0" / "1").
binary_column = dataset_cleaned["binary_label"]
class_counts = binary_column.value_counts()
print(class_counts)

In [None]:
# Character rewrites applied to destination paths after lower-casing: spaces are
# removed, parentheses become "_", and accented vowels / "ñ" lose their
# diacritics. No replacement produces a character that is itself rewritten, so a
# single translate() pass matches applying the replacements one after another.
_PATH_CHAR_TABLE = str.maketrans(
    {
        " ": None,
        "(": "_",
        ")": "_",
        "á": "a",
        "é": "e",
        "í": "i",
        "ó": "o",
        "ú": "u",
        "ñ": "n",
    }
)


def _split_source_and_target(file_name):
    """Derive the source and destination relative paths for one dataset entry.

    Args:
        file_name: Path string as stored in the "file_name" column; backslashes
            are accepted and converted to "/".

    Returns:
        A ``(source_relative, target_relative)`` tuple. ``source_relative`` is
        the normalised path with its first character removed;
        ``target_relative`` is the lower-cased, character-rewritten path with
        its first character removed.

    NOTE(review): dropping the first character assumes every stored path starts
    with a separator — confirm against the CSV contents.
    """
    normalized = file_name.replace("\\", "/")
    sanitized = normalized.lower().translate(_PATH_CHAR_TABLE)
    return normalized[1:], sanitized[1:]


total_images_processed = 0
total_images_copied = 0

# Accumulate plain records and build the DataFrame once after the loop;
# concatenating a one-row frame per iteration re-copies the whole frame each time.
filtered_records = []

rows_to_copy = dataset_cleaned[["file_name", "label", "binary_label"]]

# `total` lets tqdm draw a real progress bar; a bare generator has no length.
for file_name, label, binary_label in tqdm.tqdm(
    rows_to_copy.itertuples(index=False, name=None),
    total=len(rows_to_copy),
):
    source_relative, target_relative = _split_source_and_target(file_name)
    original_file = os.path.join(FROM_PATH, source_relative)
    filtered_file = os.path.join(TO_PATH, target_relative)

    # NOTE(review): the row is recorded even when the copy below fails, so the
    # exported CSV can reference files that were never copied. Distinct source
    # names that sanitise to the same target also overwrite each other silently.
    filtered_records.append(
        {
            "file_name": target_relative,
            "label": label,
            "binary_label": binary_label,
        }
    )

    os.makedirs(os.path.dirname(filtered_file), exist_ok=True)

    try:
        shutil.copyfile(original_file, filtered_file)
        total_images_copied += 1
    except FileNotFoundError:
        print(f"File not found: {original_file}")

    total_images_processed += 1

# Same column layout as dataset_cleaned; columns not filled above stay empty.
filtered_dataset = pd.DataFrame(filtered_records, columns=dataset_cleaned.columns)

print(f"Total images processed: {total_images_processed}")
print(f"Total images copied: {total_images_copied}")

number_samples = len(filtered_dataset)

# Ensure the output directory exists even if the loop created nothing.
os.makedirs(TO_PATH, exist_ok=True)
dt.dataset_to_csv(
    filtered_dataset,
    os.path.join(TO_PATH, f"{number_samples}ImagesFormatted.csv"),
)

In [None]:
import os

from PIL import Image

# Root directory that the relative paths in filtered_dataset["file_name"] hang from.
base_path = TO_PATH

# Counters for images that pass / fail verification.
count_openable = 0
count_not_openable = 0

# Absolute paths of the images that could not be opened or verified.
not_openable_files = []

# Check every image path listed in the formatted dataset.
for file_path in filtered_dataset["file_name"]:
    # Joined the same way as in the copy step, so the verified path is the
    # path that was written.
    absolute_path = os.path.join(base_path, file_path)
    try:
        # Image.open keeps the underlying file open; the context manager
        # releases the handle after verification instead of leaking one per image.
        with Image.open(absolute_path) as img:
            img.verify()  # Raises if the file is broken or not a valid image.
        count_openable += 1
    except (IOError, SyntaxError):
        # Missing, unreadable or corrupt file: count it and remember its path.
        count_not_openable += 1
        not_openable_files.append(absolute_path)

# Report the results.
print(f"Number of openable images: {count_openable}")
print(f"Number of not openable images: {count_not_openable}")
print(f"Not openable files: {not_openable_files}")