In [6]:
import os
import shutil

import pandas as pd
import tqdm

import src.data.Dataset as dt


In [None]:
# itables supplies interactive table rendering for notebooks.
from itables import init_notebook_mode

# all_interactive=True asks itables to render every DataFrame/Series shown
# by later cells as an interactive table instead of the static pandas repr.
init_notebook_mode(all_interactive=True)

In [None]:
# Absolute locations are resolved against the current working directory.
# Raw dataset directory and the CSV index stored inside it.
FROM_PATH = os.path.abspath("./data/raw/emptyNonEmptyDataset")
RAW_DATASET = os.path.abspath("./data/raw/emptyNonEmptyDataset/10000Images.csv")

# The processed copy goes under data/interim, in a directory with the same
# name as the raw dataset directory.
INTERIM_ROOT = os.path.abspath("./data/interim/")
TO_PATH = os.path.join(INTERIM_ROOT, os.path.basename(FROM_PATH))


# Echo the resolved paths so a wrong working directory is noticed early.
print(
    f"FROM_PATH:   {FROM_PATH}",
    f"TO_PATH:     {TO_PATH}",
    f"RAW_DATASET: {RAW_DATASET}",
    sep="\n",
)

In [None]:
# Read the raw dataset CSV through the project's Dataset helper module.
# NOTE(review): presumably returns a pandas DataFrame (later cells call
# DataFrame methods on it) — confirm against src.data.Dataset.
dataset_original = dt.load_from_csv(RAW_DATASET)
# Bare last expression: display the loaded table.
dataset_original

In [None]:
# Number of entries in the file_name column, i.e. rows listed in the raw CSV
# (duplicates included at this point).
num_images = len(dataset_original["file_name"])
print(f"Number of images in dataset: {num_images}")

In [None]:
# Count rows whose file_name repeats an earlier row: duplicated() flags every
# occurrence after the first, and summing the booleans counts them.
duplicate_routes = dataset_original["file_name"].duplicated().sum()
print(f"Number of duplicate routes: {duplicate_routes}")

In [None]:
# Keep only the first row for each distinct file_name; the original frame is
# left untouched because drop_duplicates returns a new object.
dataset_without_duplicates = dataset_original.drop_duplicates(subset="file_name")
# Bare last expression: display the de-duplicated table.
dataset_without_duplicates

In [None]:
# Number of rows per distinct value of the label column, after de-duplication.
class_counts = dataset_without_duplicates["label"].value_counts()
print(class_counts)

In [None]:
# Discard every row labelled "dudosa"; rows with any other label are kept.
dataset_cleaned = dataset_without_duplicates.loc[
    dataset_without_duplicates["label"].ne("dudosa")
]
# Bare last expression: display the remaining rows.
dataset_cleaned

In [None]:
def _to_binary_label(label):
    """Map the "vacia" label to "0" and any other label to "1"."""
    if label == "vacia":
        return "0"
    return "1"


# assign() returns a new frame, so the filtered frame from the previous step
# is not modified in place.
dataset_cleaned = dataset_cleaned.assign(
    binary_label=dataset_cleaned["label"].map(_to_binary_label)
)

# Bare last expression: display the table with the added column.
dataset_cleaned

In [None]:
# Number of rows per binary_label value ("0" / "1").
# Note: this rebinds class_counts, replacing the per-label tally computed from
# dataset_without_duplicates.
class_counts = dataset_cleaned["binary_label"].value_counts()
print(class_counts)

In [None]:
# Copy every image listed in dataset_cleaned from FROM_PATH to TO_PATH under a
# sanitized relative path, then write a CSV index of the sanitized paths.

# One-pass character mapping applied to the destination path only:
# parentheses become underscores; accented vowels and ñ/Ñ lose their diacritic.
_PATH_CHAR_TABLE = str.maketrans("()áÁéÉíÍóÓúÚñÑ", "__aAeEiIoOuUnN")


def _relative_posix_path(file_name):
    """Return ``file_name`` with forward slashes and no leading separator.

    The leading separator must go because ``os.path.join`` discards everything
    before an absolute component. Stripping only separators (rather than
    blindly dropping the first character) keeps names that have no leading
    slash intact.
    """
    return file_name.replace("\\", "/").lstrip("/")


total_images_processed = 0
total_images_copied = 0

# Rows are collected in a list and turned into a DataFrame once at the end;
# concatenating inside the loop re-copies the whole frame on every iteration.
records = []

rows = zip(
    dataset_cleaned["file_name"],
    dataset_cleaned["label"],
    dataset_cleaned["binary_label"],
)
# total= lets tqdm show a percentage and ETA (a bare iterator has no length).
for raw_name, label, binary_label in tqdm.tqdm(rows, total=len(dataset_cleaned)):
    total_images_processed += 1

    relative_source = _relative_posix_path(raw_name)
    relative_target = relative_source.translate(_PATH_CHAR_TABLE)

    # NOTE(review): the row is recorded even if the copy below fails, so the
    # CSV can reference files that were not found — confirm this is intended.
    records.append(
        {
            "file_name": relative_target,
            "label": label,
            "binary_label": binary_label,
        }
    )

    original_file = os.path.join(FROM_PATH, relative_source)
    filtered_file = os.path.join(TO_PATH, relative_target)
    os.makedirs(os.path.dirname(filtered_file), exist_ok=True)

    try:
        shutil.copyfile(original_file, filtered_file)
        total_images_copied += 1
    except FileNotFoundError:
        # Best effort: report the missing source file and keep going.
        print(f"File not found: {original_file}")


# Same column layout as dataset_cleaned; columns not filled above stay empty.
filtered_dataset = pd.DataFrame(records, columns=dataset_cleaned.columns)

number_samples = len(filtered_dataset)
print(f"Number of samples: {number_samples}")
print(f"Images copied: {total_images_copied} of {total_images_processed}")

# Make sure the output directory exists even when no image was processed.
os.makedirs(TO_PATH, exist_ok=True)
dt.dataset_to_csv(
    filtered_dataset, os.path.join(TO_PATH, f"{number_samples}Images_binary.csv")
)

In [None]:
# Display the table of sanitized file names and labels built by the copy step.
filtered_dataset