In [None]:
import tensorboard

In [20]:
import json
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
import tqdm
from keras.callbacks import TensorBoard
from PIL import Image
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import src.data.Dataset as dt

In [7]:
# External checkouts (expected under ./data/external) that must be reachable
# through PYTHONPATH.
required_paths = ["/ai4eutils", "/CameraTraps", "/yolov5"]
root_path = os.getcwd()

# Split on the platform separator and drop empty entries: starting from an
# unset PYTHONPATH and appending ":<path>" used to leave a leading empty entry.
path_entries = [
    entry for entry in os.environ.get("PYTHONPATH", "").split(os.pathsep) if entry
]

for path in required_paths:
    # Only add a checkout when no existing entry already ends with its name.
    if not any(p.endswith(path) for p in path_entries):
        path_entries.append(f"{root_path}/data/external{path}")

python_path = os.pathsep.join(path_entries)

# NOTE: this changes the environment inherited by child processes; the
# sys.path of the already-running kernel is not modified by this assignment.
os.environ["PYTHONPATH"] = python_path

print(f"PYTHONPATH: {python_path}")

PYTHONPATH: :/Users/carlos/WORKSPACE/MegaClassifier/data/external/ai4eutils:/Users/carlos/WORKSPACE/MegaClassifier/data/external/CameraTraps:/Users/carlos/WORKSPACE/MegaClassifier/data/external/yolov5


In [8]:
# Run identifier; interpolated into the model name, checkpoint file name and
# TensorBoard log directory further down.
version = "v1.0"

# Square input resolution in pixels.
# NOTE(review): `img_weight` presumably means "image width" - confirm before renaming.
img_weight = img_height = 224

# Mini-batch size, random seed and maximum number of epochs.
batch_size, seed, epochs = 32, 42, 50


In [9]:
# Generator that rescales pixel values by 1/255.
datagen_train = ImageDataGenerator(rescale=1.0 / 255)

# Frozen MobileNetV2 feature-vector layer loaded from TF Hub (the
# tensorflow.keras.applications.MobileNetV2 variant is built in a later cell).
url = "https://tfhub.dev/google/tf2-preview/mobilenet_v2/feature_vector/4"
mobilenetV2 = hub.KerasLayer(url, input_shape=(img_weight, img_height, 3))
mobilenetV2.trainable = False

# Feature extractor followed by a single sigmoid unit.
model = Sequential(name=f"MegaClassifier_{version}")
model.add(mobilenetV2)
model.add(Dense(1, activation="sigmoid", name="Output_Layer"))

model.summary()

Model: "MegaClassifier_v1.0"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 1280)              2257984   
                                                                 
 Output_Layer (Dense)        (None, 1)                 1281      
                                                                 
Total params: 2,259,265
Trainable params: 1,281
Non-trainable params: 2,257,984
_________________________________________________________________


In [10]:
# Binary cross-entropy with Adam; accuracy is tracked as the only metric.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Settings shared by the two callbacks that watch the validation loss.
_val_loss_watch = {"monitor": "val_loss", "mode": "min", "verbose": 1}

# Keep only the best weights seen so far.
# NOTE(review): the "MegaClassifier_//best_MegaClassifier__" path looks like a
# template with an empty placeholder - confirm the intended directory name.
checkpoint = ModelCheckpoint(
    f"./models/MegaClassifier_//best_MegaClassifier__{version}.h5",
    save_best_only=True,
    **_val_loss_watch,
)

# Stop after 4 epochs without improvement and roll back to the best weights.
early_stop = EarlyStopping(
    patience=4,
    restore_best_weights=True,
    **_val_loss_watch,
)

tensorBoard = TensorBoard(log_dir=f"./logs/MegaClassifier_/{version}")

In [16]:
# Dataset root and CSV locations, resolved against the current working directory.
DATASET_PATH = os.path.abspath("./dataset/datasetFiltered")

TRAIN_CSV = os.path.abspath("./data/processed/train/29618Train.csv")
VALIDATION_CSV = os.path.abspath("./data/processed/validation/29618Validation.csv")
TEST_CSV = os.path.abspath("./data/processed/test/29618Test.csv")
FILTERED_CSV = os.path.abspath("./data/interim/29618Images_filtered.csv")

# Echo the resolved locations, one aligned line per constant.
for _prefix, _location in (
    ("DATASET_PATH:   ", DATASET_PATH),
    ("TRAIN_CSV:      ", TRAIN_CSV),
    ("VALIDATION_CSV: ", VALIDATION_CSV),
    ("TEST_CSV:       ", TEST_CSV),
    ("FILTERED_CSV:   ", FILTERED_CSV),
):
    print(f"{_prefix}{_location}")


DATASET_PATH:   /Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered
TRAIN_CSV:      /Users/carlos/WORKSPACE/MegaClassifier/data/processed/train/29618Train.csv
VALIDATION_CSV: /Users/carlos/WORKSPACE/MegaClassifier/data/processed/validation/29618Validation.csv
TEST_CSV:       /Users/carlos/WORKSPACE/MegaClassifier/data/processed/test/29618Test.csv
FILTERED_CSV:   /Users/carlos/WORKSPACE/MegaClassifier/data/interim/29618Images_filtered.csv


In [32]:
# Load the three splits plus the filtered index, in this order, through the
# project's Dataset helper (the recorded output shows one confirmation line
# per file).
train_csv, vali_csv, test_csv, filtered_csv = [
    dt.load_from_csv(csv_path)
    for csv_path in (TRAIN_CSV, VALIDATION_CSV, TEST_CSV, FILTERED_CSV)
]

The file /Users/carlos/WORKSPACE/MegaClassifier/data/processed/train/29618Train.csv has been successfully opened.
The file /Users/carlos/WORKSPACE/MegaClassifier/data/processed/validation/29618Validation.csv has been successfully opened.
The file /Users/carlos/WORKSPACE/MegaClassifier/data/processed/test/29618Test.csv has been successfully opened.
The file /Users/carlos/WORKSPACE/MegaClassifier/data/interim/29618Images_filtered.csv has been successfully opened.


In [57]:
# Directory whose files are counted. Resolved against the working directory
# (the same location the notebook uses for DATASET_PATH) instead of a
# hard-coded absolute user path, so the cell also runs on other machines.
ruta = os.path.abspath("./dataset/datasetFiltered")

# os.walk yields (dirpath, dirnames, filenames); every file is counted,
# whatever its extension.
image_count = sum(len(files) for _, _, files in os.walk(ruta))
print(f"Total number of images: {image_count}")

Total number of images: 28560


In [None]:
# Number of rows that remain once repeated `file_name` values are collapsed
# to their first occurrence.
unique_images_count = len(filtered_csv.drop_duplicates(subset=["file_name"]))
print(f"Number of unique images: {unique_images_count}")

Number of unique images: 28560


In [None]:
# Reload the filtered index so this cell starts from the file on disk.
DATASET_PATH = os.path.abspath("./dataset/datasetFiltered")
FILTERED_CSV = os.path.abspath("./data/interim/29618Images_filtered.csv")
filtered_csv = dt.load_from_csv(FILTERED_CSV)

# Build the absolute paths on a separate frame. `filtered_csv` itself must keep
# the file names exactly as loaded: a later cell matches them against the
# `file` entries of the detector JSON, and overwriting the column here made
# that comparison depend on cell execution order.
filtered_abs_csv = filtered_csv.assign(
    file_name=filtered_csv["file_name"].map(
        lambda name: os.path.join(DATASET_PATH, name)
    )
)

# Every row whose `file_name` occurs more than once (all occurrences kept).
duplicated_rows = filtered_abs_csv[
    filtered_abs_csv.duplicated(subset="file_name", keep=False)
]

# Per duplicated file: True when all of its rows carry the same label.
duplicated_rows_same_label = (
    duplicated_rows.groupby("file_name")["label"].nunique().eq(1)
)

# Show the first ten duplicated image paths.
for file_name in duplicated_rows["file_name"].unique()[:10]:
    print(file_name)


The file /Users/carlos/WORKSPACE/MegaClassifier/data/interim/29618Images_filtered.csv has been successfully opened.
/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered/vacia/empty_caltechcameratraps/59ac6d47-23d2-11e8-a6a3-ec086b02610b.jpg
/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered/vacia/noanimal_zoo_1_4/34_20210115_2825_.jpg
/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered/leporido/rabbit_caltechcameratrap_me/58b8238c-23d2-11e8-a6a3-ec086b02610b.jpg
/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered/gineta/gineta_pnm_2012_isaac/d14639file0027.jpg
/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered/caballo/horse_zoo_1_4/15_20210117_13096_.jpg
/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered/caballo/caballos_javier_jc/2_20201117_3542_.jpg
/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered/zorro/zorro_pnc_2013-o_red_isaac/b19370imag0058.jpg
/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetF

In [None]:
# Detector results file (presumably MegaDetector v5a output, judging by the
# `md_v5a` directory name - confirm).
OUTPUT_FILE_PATH = os.path.abspath(
    "./resources/json/md_v5a/29618_images_0_003_threshold.json"
)

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(OUTPUT_FILE_PATH, "r", encoding="utf-8") as file:
    data = json.load(file)

In [None]:
# Column layout of the two frames built below.
predict_columns = [
    "file_name",
    "label",
    "detector_label",
]
duplicate_columns = ["file_name", "label"]

# Labels of every CSV row, grouped by file name and kept in CSV row order.
# Built once so each image is a dictionary lookup instead of a boolean scan of
# the whole frame.
labels_by_file = {
    name: group.tolist()
    for name, group in filtered_csv.groupby("file_name", sort=False)["label"]
}

# Rows are collected in plain lists and turned into DataFrames once at the
# end; growing a DataFrame with pd.concat inside the loop is quadratic.
dataset_rows = []
duplicate_rows = []

for image in tqdm.tqdm(data["images"]):
    image_file = image["file"]
    labels = labels_by_file.get(image_file, [])

    if len(labels) == 1:
        # Exactly one CSV row: pair its label with the detector verdict
        # (0 when the maximum detection confidence is 0.0, otherwise 1).
        dataset_rows.append(
            {
                "file_name": image_file,
                "label": int(labels[0]),
                "detector_label": 0 if image["max_detection_conf"] == 0.0 else 1,
            }
        )
    elif len(labels) > 1:
        # Several CSV rows for the same file: record each one as a duplicate
        # and leave the image out of `dataset`.
        duplicate_rows.extend(
            {"file_name": image_file, "label": label} for label in labels
        )

dataset = pd.DataFrame(dataset_rows, columns=predict_columns)
duplicates = pd.DataFrame(duplicate_rows, columns=duplicate_columns)

duplicates

100%|██████████| 29618/29618 [00:32<00:00, 899.04it/s]


Unnamed: 0,file_name,label
0,vacia/vacia_paco_carro_jc/img_0412.jpg,0
1,vacia/vacia_paco_carro_jc/img_0412.jpg,0
2,meloncillo/egyptianmongoose_zoo_1_4/36_2021011...,1
3,meloncillo/egyptianmongoose_zoo_1_4/36_2021011...,1
4,vacia/noanimal_zoo_5_6_7_9/49_20210418_4768_.jpg,0
...,...,...
4252,ave/ave_ave_wellingtoncameratraps_ss/011016170...,1
4253,ave/ave_soraya_padilla_alumno_1_th/stc_0223_fr...,1
4254,ave/ave_soraya_padilla_alumno_1_th/stc_0223_fr...,1
4255,vacia/noanimal_zoo_1_4/6_20201812_11645_.jpg,0


In [None]:
# Keep the images for which the detector reported something
# (detector_label == 1) although their label is 0 (in the recorded output the
# `vacia/...` rows carry label 0).
flagged_by_detector = dataset["detector_label"] == 1
labelled_zero = dataset["label"] == 0
filtered_images = dataset[flagged_by_detector & labelled_zero]

# Displayed as the cell result.
filtered_images["file_name"]


In [None]:
# Directory the `file_name` values of `dataset` are resolved against.
base_path = DATASET_PATH

# Paths of the images that could not be opened or verified.
not_openable_files = []
# Number of images that passed verification.
count_openable = 0

for file_path in dataset["file_name"]:
    absolute_path = os.path.join(base_path, file_path)
    try:
        # The context manager closes the underlying file handle; without it
        # one handle per image stayed open until garbage collection.
        with Image.open(absolute_path) as img:
            img.verify()  # Integrity check of the file contents.
    except (IOError, SyntaxError):
        not_openable_files.append(absolute_path)
    else:
        count_openable += 1

count_not_openable = len(not_openable_files)

# Summary of the check.
print(f"Number of openable images: {count_openable}")
print(f"Number of not openable images: {count_not_openable}")
print(f"Not openable files: {not_openable_files}")

Number of openable images: 27527
Number of not openable images: 0
Not openable files: []


In [None]:
columns = ["file_name", "label"]


def build_split_dataset(split_csv, detections, base_path):
    """Return the rows of a split whose image was flagged by the detector.

    Args:
        split_csv: DataFrame with ``file_name`` and ``label`` columns.
        detections: DataFrame with ``file_name`` and ``detector_label`` columns.
        base_path: Directory joined in front of every kept ``file_name``.

    Returns:
        A new DataFrame with the columns ``file_name`` (joined onto
        ``base_path``) and ``label`` (cast to ``str``). Rows whose file is
        absent from ``detections`` or whose ``detector_label`` is not 1 are
        dropped.
    """
    flagged = set(detections.loc[detections["detector_label"] == 1, "file_name"])
    kept = split_csv.loc[split_csv["file_name"].isin(flagged), ["file_name", "label"]]
    return pd.DataFrame(
        {
            "file_name": [os.path.join(base_path, name) for name in kept["file_name"]],
            # String labels, as in the earlier version of this cell; they are
            # consumed by flow_from_dataframe(class_mode="binary") below.
            "label": kept["label"].astype(str).tolist(),
        },
        columns=["file_name", "label"],
    )


# Diagnostic: training rows that have no entry in `dataset` (for example files
# that were set aside as duplicates). They are listed and counted, and are
# skipped when the split frames are built.
# NOTE(review): the printed paths are rooted at ./resources/ while the images
# are read from DATASET_PATH - confirm which base is intended.
missing_from_dataset = train_csv.loc[
    ~train_csv["file_name"].isin(set(dataset["file_name"])), "file_name"
]
for missing_file in missing_from_dataset:
    print(os.path.join(os.path.abspath("./resources/"), missing_file))
count = len(missing_from_dataset)
print(count)

# The split frames used by the generators in the following cells. The name
# `data` is deliberately not reassigned here: it still refers to the parsed
# detector JSON that a later cell reads.
train_dataset = build_split_dataset(train_csv, dataset, DATASET_PATH)
vali_dataset = build_split_dataset(vali_csv, dataset, DATASET_PATH)
test_dataset = build_split_dataset(test_csv, dataset, DATASET_PATH)

In [None]:
# Input resolution for this run: 1000 x 1000 pixels (224 was used before).
img_weight = img_height = 1000

# Mini-batch size, random seed and maximum number of epochs.
batch_size, seed, epochs = 32, 42, 50

In [None]:
# Training images are rescaled and augmented (horizontal flip, brightness
# between 0.8 and 1.2); validation and test images are only rescaled.
datagen_train = ImageDataGenerator(
    rescale=1.0 / 255, horizontal_flip=True, brightness_range=[0.8, 1.2]
)
datagen_val_test = ImageDataGenerator(rescale=1.0 / 255)

# Arguments common to the three generators.
_flow_kwargs = {
    "x_col": "file_name",
    "y_col": "label",
    "target_size": (img_weight, img_height),
    "batch_size": batch_size,
    "class_mode": "binary",
}

train_generator = datagen_train.flow_from_dataframe(
    dataframe=train_dataset, shuffle=True, seed=seed, **_flow_kwargs
)

val_generator = datagen_val_test.flow_from_dataframe(
    dataframe=vali_dataset, shuffle=True, seed=seed, **_flow_kwargs
)

# Not shuffled, so predictions keep the order of the generator's `classes`.
test_generator = datagen_val_test.flow_from_dataframe(
    dataframe=test_dataset, shuffle=False, **_flow_kwargs
)

In [None]:
# Images the detector flagged (detector_label == 1) although their label is 0.
filtered_images = dataset[(dataset["detector_label"] == 1) & (dataset["label"] == 0)]

# Index the detector output by file name once instead of scanning the whole
# image list for every path. `setdefault` keeps the first entry per file, which
# is what the previous per-path scan returned. `data` is expected to be the
# parsed detector JSON (a dict with an "images" list); entries without a
# "detections" key map to None.
detections_by_file = {}
for entry in data["images"]:
    detections_by_file.setdefault(entry["file"], entry.get("detections"))

# Print each image path together with its detections (None when not found).
image_paths = filtered_images["file_name"].tolist()
for image_path in image_paths:
    detections = detections_by_file.get(image_path)
    print(f"Image: {image_path}, Detections: {detections}")

In [None]:
# ImageNet-initialised MobileNetV2 without its classification head, frozen so
# only the layers added below are trained.
mobilenetV2 = MobileNetV2(
    weights="imagenet",
    include_top=False,
    input_shape=(img_weight, img_height, 3),
)
mobilenetV2.trainable = False

# Backbone -> global average pooling -> single sigmoid unit.
model = Sequential()
model.add(mobilenetV2)
model.add(GlobalAveragePooling2D())
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

model.summary()

In [None]:
# Settings shared by the two callbacks that watch the validation loss.
_val_loss_watch = {"monitor": "val_loss", "mode": "min", "verbose": 1}

# Save the model only when the validation loss improves.
checkpoint = ModelCheckpoint(
    "./models/MegaClassifier/md_v5a_MobileNetV2_V1.h5",
    save_best_only=True,
    **_val_loss_watch,
)

# Stop after 4 epochs without improvement and restore the best weights.
early_stop = EarlyStopping(
    patience=4,
    restore_best_weights=True,
    **_val_loss_watch,
)

tensorBoard = TensorBoard(log_dir="./logs/Megaclassifier/MobileNetV2/version_1")

# NOTE(review): steps_per_epoch is derived from the literal 2000, not from the
# size of the training set - confirm this cap is intentional.
fit_callbacks = [checkpoint, early_stop, tensorBoard]
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=epochs,
    steps_per_epoch=2000 // batch_size,
    callbacks=fit_callbacks,
)

In [None]:
import matplotlib.pyplot as plt


def plot_training_history(history):
    """Draw accuracy and loss curves for training and validation side by side.

    Args:
        history: Object whose ``history`` attribute maps "accuracy",
            "val_accuracy", "loss" and "val_loss" to one value per epoch.
    """
    metrics = history.history
    # All four series are read up front, before any figure is created.
    acc, val_acc, loss, val_loss = [
        metrics[key] for key in ("accuracy", "val_accuracy", "loss", "val_loss")
    ]
    epochs_range = range(len(acc))

    # One entry per panel: (training curve, validation curve, metric name, legend position).
    panels = (
        (acc, val_acc, "Accuracy", "lower right"),
        (loss, val_loss, "Loss", "upper right"),
    )

    plt.figure(figsize=(20, 5))
    for position, (train_curve, val_curve, metric, legend_loc) in enumerate(
        panels, start=1
    ):
        plt.subplot(1, 2, position)
        plt.plot(epochs_range, train_curve, label=metric)
        plt.plot(epochs_range, val_curve, label=f"Validation {metric}")
        plt.legend(loc=legend_loc)
        plt.title(f"Training and Validation {metric}")
    plt.show()


# Plot the curves recorded in `history` (the value returned by model.fit above).
plot_training_history(history)

In [None]:
from tensorflow.keras.models import load_model

# Reload the model from the path the ModelCheckpoint callback writes to and
# evaluate it on the test generator. `evaluate` is unpacked later as
# (loss, accuracy).
megaclassifier_v1 = load_model("./models/MegaClassifier/md_v5a_MobileNetV2_V1.h5")
evaluate = megaclassifier_v1.evaluate(test_generator)


In [None]:
# Labels as exposed by the test generator; it was created with shuffle=False,
# so they are presumably in the same order as the predictions below - confirm.
test_labels = test_generator.classes

# Model outputs for the test generator, flattened to one dimension.
predict = megaclassifier_v1.predict(test_generator)
predict_flatten = predict.flatten()

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve

# Baseline scores: all zeros, same shape as the labels.
random_flatten = np.zeros_like(test_labels)

# Area under the ROC curve for the baseline and for the model.
random_auc = roc_auc_score(test_labels, random_flatten)
megaclassifier_auc = roc_auc_score(test_labels, predict_flatten)

print(f"Random Classifier: ROC AUC={random_auc:.1f}")
print(f"Megaclassifier V1: ROC AUC={megaclassifier_auc:.4f}")

# roc_curve returns (fpr, tpr, thresholds); the thresholds are not needed.
random_false_positive_rate, random_true_positive_rate = roc_curve(
    test_labels, random_flatten
)[:2]
megaclassifier_false_positive_rate, megaclassifier_true_positive_rate = roc_curve(
    test_labels, predict_flatten
)[:2]

# Baseline as a dashed line, model as a solid line.
plt.plot(
    random_false_positive_rate,
    random_true_positive_rate,
    linestyle="--",
    label=f"Aleatorio: ROC AUC={random_auc:.1f}",
)
plt.plot(
    megaclassifier_false_positive_rate,
    megaclassifier_true_positive_rate,
    label=f"Megaclassifier V1: ROC AUC={megaclassifier_auc:.4f}",
)

plt.xlabel("Tasa de Falsos Positivos")
plt.ylabel("Tasa de Verdaderos Positivos")
plt.legend()
plt.show()

In [None]:
# `evaluate` holds the test loss followed by the test accuracy.
loss, accuracy = evaluate

# One row per ROC point; the scalar metrics are repeated on every row.
n_points = len(megaclassifier_false_positive_rate)
df_results = pd.DataFrame(
    {
        "False Positive Rate": megaclassifier_false_positive_rate,
        "True Positive Rate": megaclassifier_true_positive_rate,
        "AUC": [megaclassifier_auc] * n_points,
        "Loss": [loss] * n_points,
        "Accuracy": [accuracy] * n_points,
    }
)

# Create the output directory if needed and write the table without the index.
os.makedirs("./logs/Megaclassifier/MobileNetV2/version_1/test", exist_ok=True)
df_results.to_csv(
    "./logs/Megaclassifier/MobileNetV2/version_1/test/megaclassifier_v1_results.csv",
    index=False,
)