In [6]:
import os
import shutil

import pandas as pd
import tqdm
import src.data.Dataset as dt

In [7]:
# --- Image trees -----------------------------------------------------------
# FROM_PATH holds the images as referenced by the original CSV; TO_PATH is
# where the copies with normalised file names are written.
FROM_PATH = os.path.abspath("./dataset/emptyNonEmptyDataset")
TO_PATH = os.path.abspath("./dataset/datasetFiltered")

# --- Dataset CSVs ----------------------------------------------------------
ORIGINAL_DATASET = os.path.abspath("./data/raw/28570Images_binary.csv")
FILTERED_PATH = os.path.abspath("./data/interim")

# The filtered CSV reuses whatever precedes the first "_" in the original
# file name and swaps the rest for "_filtered.csv".
dataset_file_name = os.path.basename(ORIGINAL_DATASET).partition("_")[0] + "_filtered.csv"
FILTERED_DATASET = os.path.join(FILTERED_PATH, dataset_file_name)

# Echo the resolved absolute paths so the run records where it read/wrote.
print(
    "\n".join(
        [
            f"FROM_PATH:         {FROM_PATH}",
            f"TO_PATH:           {TO_PATH}",
            f"ORIGINAL_DATASET:  {ORIGINAL_DATASET}",
            f"FILTERED_PATH:     {FILTERED_PATH}",
            f"FILTERED_DATASET:  {FILTERED_DATASET}",
        ]
    )
)

FROM_PATH:         /Users/carlos/WORKSPACE/MegaClassifier/dataset/emptyNonEmptyDataset
TO_PATH:           /Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered
ORIGINAL_DATASET:  /Users/carlos/WORKSPACE/MegaClassifier/data/raw/28570Images_binary.csv
FILTERED_PATH:     /Users/carlos/WORKSPACE/MegaClassifier/data/interim
FILTERED_DATASET:  /Users/carlos/WORKSPACE/MegaClassifier/data/interim/28570Images_filtered.csv


In [8]:
# Load the original dataset CSV through the project's Dataset helper.
# NOTE(review): presumably returns a pandas DataFrame with "file_name" and
# "label" columns — later cells call .columns / .iterrows() on it and read
# those two keys; confirm against src.data.Dataset.
dataset = dt.load_from_csv(ORIGINAL_DATASET)
# Bare last expression so the notebook renders the loaded object as the cell output.
dataset

The file /Users/carlos/WORKSPACE/MegaClassifier/data/raw/28570Images_binary.csv has been successfully opened.


Unnamed: 0,file_name,label
0,\vacia\NOANIMAL_ZOO_1_4\37_20210115 (85).JPG,0
1,\vacia\vacia_WellingtonCameraTraps_SS\25091507...,0
2,\vacia\vacias_2_de_2_PNC_2012_ISAAC\a15323IM00...,0
3,\cervidae\CERVIDREDORFALLOWDEER_ZOO_5_6_7_9\18...,1
4,\humanovehiculo\car_CaltechCameraTrap_ME\vehic...,1
...,...,...
28565,\humanovehiculo\car_CaltechCameraTrap_ME\vehic...,1
28566,\vacia\NOANIMAL_ZOO_5_6_7_9\2_20210417 (16892)...,0
28567,\vacia\empty_SnapshotMountainZebra\MTZ_S1_C04_...,0
28568,\mapache\raccoon_CaltechCameraTrap_ME\58b9d6a2...,1


In [9]:
# Copy every image listed in `dataset` from FROM_PATH to TO_PATH under a
# normalised relative path, and write a CSV that lists the normalised paths
# together with their labels.

# Single-character rewrites applied to the (already lower-cased) relative
# path: spaces are dropped, parentheses become underscores, and the accented
# Spanish vowels / "ñ" are replaced by their plain ASCII letters.
_FILE_NAME_TRANSLATION = str.maketrans(
    {
        " ": None,
        "(": "_",
        ")": "_",
        "á": "a",
        "é": "e",
        "í": "i",
        "ó": "o",
        "ú": "u",
        "ñ": "n",
    }
)


def _normalize_relative_path(relative_path):
    """Return `relative_path` lower-cased with the rewrites above applied."""
    return relative_path.lower().translate(_FILE_NAME_TRANSLATION)


total_images_processed = 0
total_images_copied = 0

# Rows are collected in a plain list and turned into a DataFrame once after
# the loop; concatenating one row at a time re-copies the whole frame on
# every iteration.
filtered_records = []

# `total` lets tqdm show a real progress bar, since iterrows() is a generator
# and has no length of its own.
for _, row in tqdm.tqdm(dataset.iterrows(), total=len(dataset)):
    # The CSV stores Windows-style paths with a leading separator. Strip the
    # separator(s) so os.path.join does not treat the path as absolute.
    relative_source = row["file_name"].replace("\\", "/").lstrip("/")
    relative_target = _normalize_relative_path(relative_source)

    # The row is recorded even if the copy below fails, so the CSV always has
    # one entry per input row.
    filtered_records.append({"file_name": relative_target, "label": row["label"]})

    original_file = os.path.join(FROM_PATH, relative_source)
    filtered_file = os.path.join(TO_PATH, relative_target)

    os.makedirs(os.path.dirname(filtered_file), exist_ok=True)

    try:
        shutil.copyfile(original_file, filtered_file)
        total_images_copied += 1
    except FileNotFoundError:
        # Best effort: report the missing source image and keep going.
        print(f"File not found: {original_file}")

    total_images_processed += 1

# Same column layout as the input dataset; columns not filled here stay empty.
filtered_dataset = pd.DataFrame(filtered_records, columns=dataset.columns)

print(f"Total images processed: {total_images_processed}")
print(f"Total images copied: {total_images_copied}")

dt.dataset_to_csv(filtered_dataset, FILTERED_DATASET)

28570it [00:20, 1397.42it/s]

Total images processed: 28570
Total images copied: 28570
The dataset has been successfully saved to /Users/carlos/WORKSPACE/MegaClassifier/data/interim/28570Images_filtered.csv





In [10]:
from PIL import Image

# Check that every image listed in `filtered_dataset` can be opened and
# passes Pillow's integrity check. `os` is already imported in the
# notebook's first cell.

# Base directory the relative file names are resolved against.
base_path = TO_PATH

# Counters for images that can / cannot be opened.
count_openable = 0
count_not_openable = 0

# Absolute paths of the images that could not be opened or verified.
not_openable_files = []

# Walk the relative image paths stored in the filtered dataset.
for file_path in filtered_dataset["file_name"]:
    absolute_path = os.path.join(base_path, file_path)
    try:
        # The context manager closes the underlying file as soon as the
        # check is done, instead of leaving one open handle per image.
        with Image.open(absolute_path) as img:
            img.verify()  # Raises if the file is broken.
    except (IOError, SyntaxError):
        # Missing, unreadable or corrupt file: record it and keep going.
        count_not_openable += 1
        not_openable_files.append(absolute_path)
    else:
        count_openable += 1

# Report the results.
print(f"Number of openable images: {count_openable}")
print(f"Number of not openable images: {count_not_openable}")
print(f"Not openable files: {not_openable_files}")

Number of openable images: 28570
Number of not openable images: 0
Not openable files: []
