In [29]:
import os

import pandas as pd
import tqdm
import shutil
import src.data.Dataset as dt

In [30]:
# Locations used throughout the notebook, resolved against the current
# working directory:
#   FROM_PATH   - directory holding the original images
#   RAW_DATASET - CSV manifest stored inside FROM_PATH
#   TO_PATH     - directory that receives the renamed copies
FROM_PATH = os.path.abspath("./data/raw/emptyNonEmptyDataset")
RAW_DATASET = os.path.abspath("./data/raw/emptyNonEmptyDataset/10000Images.csv")
TO_PATH = os.path.abspath("./data/raw/emptyNonEmptyDataset_ETL")

# Echo the resolved locations, one per line, so a wrong working directory
# is visible before any file is touched.
print(
    f"FROM_PATH:   {FROM_PATH}",
    f"TO_PATH:     {TO_PATH}",
    f"RAW_DATASET: {RAW_DATASET}",
    sep="\n",
)

FROM_PATH:   /Users/carlos/WORKSPACE/MegaClassifier/data/raw/emptyNonEmptyDataset
TO_PATH:     /Users/carlos/WORKSPACE/MegaClassifier/data/raw/emptyNonEmptyDataset_ETL
RAW_DATASET: /Users/carlos/WORKSPACE/MegaClassifier/data/raw/emptyNonEmptyDataset/10000Images.csv


In [31]:
# Load the raw manifest CSV through the project's dataset helper.
# NOTE(review): the cells below treat the result as a pandas DataFrame with
# "file_name" and "label" columns - confirm against src.data.Dataset.
dataset_original = dt.load_from_csv(RAW_DATASET)
# Bare last expression so the notebook renders the loaded table.
dataset_original

The file /Users/carlos/WORKSPACE/MegaClassifier/data/raw/emptyNonEmptyDataset/10000Images.csv has been successfully opened.


Unnamed: 0,file_name,label
0,\cervidae\CERVIDREDORFALLOWDEER_ZOO_1_4\20_202...,cervidae
1,\leporido\conejo_WellingtonCameraTraps_SS\1908...,leporido
2,\vacia\empty_SnapshotMountainZebra\MTZ_S1_D07_...,vacia
3,\vacia\NOANIMAL_ZOO_5_6_7_9\37_20210319 (943).JPG,vacia
4,\vacia\empty_IslandConservationCameraTraps\dom...,vacia
...,...,...
31665,\vacia\vacia_ss\27_20201023 (17).JPG,vacia
31666,\zorro\REDFOX_ZOO_1_4\32_20201218 (348).JPG,zorro
31667,\mapache\raccoon_CaltechCameraTrap_ME\58af767e...,mapache
31668,\cervidae\CERVIDREDORFALLOWDEER_ZOO_5_6_7_9\5_...,cervidae


In [32]:
# Number of manifest rows, measured on the image-path column.
num_images = dataset_original["file_name"].size
print(f"Number of images in dataset: {num_images}")

Number of images in dataset: 31670


In [33]:
# Flag every row whose path already appeared on an earlier row, then count
# the flagged rows (the first occurrence of each path is not flagged).
repeated_route_flags = dataset_original["file_name"].duplicated()
duplicate_routes = repeated_route_flags.sum()
print(f"Number of duplicate routes: {duplicate_routes}")

Number of duplicate routes: 1131


In [34]:
# Keep only the first row seen for each image path; later repeats of the same
# path are discarded. The original index labels are preserved.
repeated_path_mask = dataset_original["file_name"].duplicated(keep="first")
dataset_without_duplicates = dataset_original.loc[~repeated_path_mask]
dataset_without_duplicates

Unnamed: 0,file_name,label
0,\cervidae\CERVIDREDORFALLOWDEER_ZOO_1_4\20_202...,cervidae
1,\leporido\conejo_WellingtonCameraTraps_SS\1908...,leporido
2,\vacia\empty_SnapshotMountainZebra\MTZ_S1_D07_...,vacia
3,\vacia\NOANIMAL_ZOO_5_6_7_9\37_20210319 (943).JPG,vacia
4,\vacia\empty_IslandConservationCameraTraps\dom...,vacia
...,...,...
31665,\vacia\vacia_ss\27_20201023 (17).JPG,vacia
31666,\zorro\REDFOX_ZOO_1_4\32_20201218 (348).JPG,zorro
31667,\mapache\raccoon_CaltechCameraTrap_ME\58af767e...,mapache
31668,\cervidae\CERVIDREDORFALLOWDEER_ZOO_5_6_7_9\5_...,cervidae


In [35]:
# Count how many de-duplicated rows carry each label, most frequent first
# (the default ordering of value_counts).
class_counts = dataset_without_duplicates["label"].value_counts()
print(class_counts)

label
vacia             9665
cervidae          5253
jabali            1979
dudosa            1969
ave               1688
zorro             1673
leporido          1569
humanovehiculo    1501
vaca              1106
lince              958
caballo            871
gato_domestico     777
micromamiferos     466
tejon              255
mapache            207
meloncillo         195
garduna            174
gineta             117
perro              116
Name: count, dtype: int64


In [36]:
# Exclude every row whose label is exactly "dudosa"; all other rows are kept
# unchanged and keep their original index labels.
is_not_dudosa = dataset_without_duplicates["label"].ne("dudosa")
dataset_cleaned = dataset_without_duplicates.loc[is_not_dudosa]
dataset_cleaned

Unnamed: 0,file_name,label
0,\cervidae\CERVIDREDORFALLOWDEER_ZOO_1_4\20_202...,cervidae
1,\leporido\conejo_WellingtonCameraTraps_SS\1908...,leporido
2,\vacia\empty_SnapshotMountainZebra\MTZ_S1_D07_...,vacia
3,\vacia\NOANIMAL_ZOO_5_6_7_9\37_20210319 (943).JPG,vacia
4,\vacia\empty_IslandConservationCameraTraps\dom...,vacia
...,...,...
31665,\vacia\vacia_ss\27_20201023 (17).JPG,vacia
31666,\zorro\REDFOX_ZOO_1_4\32_20201218 (348).JPG,zorro
31667,\mapache\raccoon_CaltechCameraTrap_ME\58af767e...,mapache
31668,\cervidae\CERVIDREDORFALLOWDEER_ZOO_5_6_7_9\5_...,cervidae


In [37]:
# Derive a two-class target next to the original label: the string "0" for
# rows labelled "vacia" and the string "1" for every other label.
# `assign` returns a new frame, so the filtered frame it was built from is
# not modified in place.
dataset_cleaned = dataset_cleaned.assign(
    binary_label=lambda frame: frame["label"].map(
        lambda name: "0" if name == "vacia" else "1"
    )
)

dataset_cleaned

Unnamed: 0,file_name,label,binary_label
0,\cervidae\CERVIDREDORFALLOWDEER_ZOO_1_4\20_202...,cervidae,1
1,\leporido\conejo_WellingtonCameraTraps_SS\1908...,leporido,1
2,\vacia\empty_SnapshotMountainZebra\MTZ_S1_D07_...,vacia,0
3,\vacia\NOANIMAL_ZOO_5_6_7_9\37_20210319 (943).JPG,vacia,0
4,\vacia\empty_IslandConservationCameraTraps\dom...,vacia,0
...,...,...,...
31665,\vacia\vacia_ss\27_20201023 (17).JPG,vacia,0
31666,\zorro\REDFOX_ZOO_1_4\32_20201218 (348).JPG,zorro,1
31667,\mapache\raccoon_CaltechCameraTrap_ME\58af767e...,mapache,1
31668,\cervidae\CERVIDREDORFALLOWDEER_ZOO_5_6_7_9\5_...,cervidae,1


In [38]:
# Count the rows in each binary class ("0" / "1") to see the class balance.
# Note: this rebinds `class_counts`, replacing the per-label counts computed
# in an earlier cell.
class_counts = dataset_cleaned["binary_label"].value_counts()
print(class_counts)

binary_label
1    18905
0     9665
Name: count, dtype: int64


In [39]:
# Copy every image listed in `dataset_cleaned` from FROM_PATH to TO_PATH under
# a normalised relative path, and build a manifest (`filtered_dataset`) of the
# files that were actually written.
#
# Normalisation of the destination path: backslashes become "/", the path is
# lower-cased, spaces are removed, "(" and ")" become "_", and the accented
# vowels / "ñ" are replaced by their plain ASCII letters.
#
# Differences from the previous version of this cell:
#   * rows are collected in a list and turned into a DataFrame once, instead
#     of concatenating and re-deduplicating the whole frame on every row
#     (which is quadratic in the number of images);
#   * when two source paths normalise to the same destination, only the first
#     one is copied, so the file on disk matches the row kept in the manifest
#     (previously the later file overwrote the earlier one);
#   * a source file that cannot be found is reported and left out of the
#     manifest, so the saved CSV only lists files that exist in TO_PATH;
#   * leading separators are stripped explicitly rather than by dropping the
#     first character, which would eat a real character if none was present.

# Single-pass character clean-up applied to the lower-cased relative path.
_FILENAME_CLEANUP = str.maketrans(
    {
        " ": None,
        "(": "_",
        ")": "_",
        "á": "a",
        "é": "e",
        "í": "i",
        "ó": "o",
        "ú": "u",
        "ñ": "n",
    }
)

total_images_processed = 0
total_images_copied = 0

manifest_records = []
seen_targets = set()  # normalised relative paths already written

# Make sure the destination root exists even if nothing ends up being copied,
# so the manifest can still be saved below.
os.makedirs(TO_PATH, exist_ok=True)

for raw_name, label, binary_label in tqdm.tqdm(
    zip(
        dataset_cleaned["file_name"],
        dataset_cleaned["label"],
        dataset_cleaned["binary_label"],
    ),
    total=len(dataset_cleaned),
):
    total_images_processed += 1

    # Relative path as stored on disk (case preserved) and its normalised
    # counterpart used for the copy.
    source_relative = raw_name.replace("\\", "/").lstrip("/")
    target_relative = source_relative.lower().translate(_FILENAME_CLEANUP)

    # A different source already produced this destination: keep the first.
    if target_relative in seen_targets:
        continue

    original_file = os.path.join(FROM_PATH, source_relative)
    filtered_file = os.path.join(TO_PATH, target_relative)
    os.makedirs(os.path.dirname(filtered_file), exist_ok=True)

    try:
        shutil.copyfile(original_file, filtered_file)
    except FileNotFoundError:
        print(f"File not found: {original_file}")
        continue

    total_images_copied += 1
    seen_targets.add(target_relative)
    manifest_records.append(
        {
            "file_name": target_relative,
            "label": label,
            "binary_label": binary_label,
        }
    )

# Same column layout as the input frame; the index is a fresh 0..n-1 range.
filtered_dataset = pd.DataFrame(manifest_records, columns=dataset_cleaned.columns)

number_samples = len(filtered_dataset)
print(f"Number of samples: {number_samples}")

dt.dataset_to_csv(
    filtered_dataset, os.path.join(TO_PATH, f"{number_samples}Images_binary.csv")
)

28570it [00:34, 816.34it/s] 


Number of samples: 28560
The dataset has been successfully saved to /Users/carlos/WORKSPACE/MegaClassifier/data/raw/emptyNonEmptyDataset_ETL/28560Images_binary.csv


In [40]:
# Render the manifest built by the copy step (normalised paths and labels).
filtered_dataset

Unnamed: 0,file_name,label,binary_label
0,cervidae/cervidredorfallowdeer_zoo_1_4/20_2020...,cervidae,1
1,leporido/conejo_wellingtoncameratraps_ss/19081...,leporido,1
2,vacia/empty_snapshotmountainzebra/mtz_s1_d07_r...,vacia,0
3,vacia/noanimal_zoo_5_6_7_9/37_20210319_943_.jpg,vacia,0
4,vacia/empty_islandconservationcameratraps/domi...,vacia,0
...,...,...,...
28555,vacia/vacia_ss/27_20201023_17_.jpg,vacia,0
28556,zorro/redfox_zoo_1_4/32_20201218_348_.jpg,zorro,1
28557,mapache/raccoon_caltechcameratrap_me/58af767e-...,mapache,1
28558,cervidae/cervidredorfallowdeer_zoo_5_6_7_9/5_2...,cervidae,1


In [48]:
from PIL import Image
import os

# Sanity check: try to open and verify every image listed in the manifest,
# counting the ones that load and collecting the paths of the ones that fail.

# Root directory the manifest paths are relative to.
base_path = TO_PATH

# Counters for images that can / cannot be opened.
count_openable = 0
count_not_openable = 0

# Absolute paths of the images that failed to open or verify.
not_openable_files = []

# Walk over the relative image paths stored in the manifest.
for file_path in filtered_dataset["file_name"]:
    absolute_path = os.path.join(base_path, file_path)
    try:
        # The context manager closes the underlying file as soon as the check
        # is done, so no file handle is left open per image.
        with Image.open(absolute_path) as img:
            img.verify()
        count_openable += 1
    except (OSError, SyntaxError):
        # A missing or unreadable file (OSError) or a verification failure
        # counts as "not openable"; remember its path for the report.
        count_not_openable += 1
        not_openable_files.append(absolute_path)

# Report the results.
print(f"Number of openable images: {count_openable}")
print(f"Number of not openable images: {count_not_openable}")
print(f"Not openable files: {not_openable_files}")

Number of openable images: 28560
Number of not openable images: 0
Not openable files: []
