In [1]:
import json
import os

import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import tqdm
from keras.callbacks import TensorBoard
from tensorflow.keras.applications import MobileNetV2, ResNet50

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Flatten, GlobalAveragePooling2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import src.data.Dataset as dt

In [2]:
# Sub-directories of <cwd>/data/external that must be present on PYTHONPATH.
required_paths = ["/ai4eutils", "/CameraTraps", "/yolov5"]
root_path = os.getcwd()

# Split the current value with the platform separator and drop empty entries,
# so that starting from an unset PYTHONPATH no longer yields a leading
# separator (i.e. an empty path entry) in the final value.
_path_entries = [
    entry for entry in os.environ.get("PYTHONPATH", "").split(os.pathsep) if entry
]

for path in required_paths:
    # Only append a directory when no existing entry already ends with it.
    if not any(entry.endswith(path) for entry in _path_entries):
        _path_entries.append(f"{root_path}/data/external{path}")

python_path = os.pathsep.join(_path_entries)

# NOTE: this updates the environment inherited by child processes started
# afterwards; it does not modify sys.path of the already running kernel.
os.environ["PYTHONPATH"] = python_path

# Plain Python instead of a shell escape, so the cell does not depend on a shell.
print(f"PYTHONPATH: {python_path}")

PYTHONPATH: :/Users/carlos/WORKSPACE/MegaClassifier/data/external/ai4eutils:/Users/carlos/WORKSPACE/MegaClassifier/data/external/CameraTraps:/Users/carlos/WORKSPACE/MegaClassifier/data/external/yolov5


In [3]:
# Directory that image file names are joined onto in later cells.
DATASET_PATH = os.path.abspath("./resources/fit/datasetFiltered/md_v5a")

# Absolute paths of the three split CSVs and of the filtered image list,
# all resolved against the current working directory.
TRAIN_CSV, VALIDATION_CSV, TEST_CSV, FILTERED_CSV = map(
    os.path.abspath,
    (
        "./data/processed/train/10000Train.csv",
        "./data/processed/validation/10000Validation.csv",
        "./data/processed/test/10000Test.csv",
        "./data/interim/10000Images_filtered.csv",
    ),
)

# Echo the resolved locations; each prefix carries its own padding.
for _prefix, _resolved in (
    ("DATASET_PATH:   ", DATASET_PATH),
    ("TRAIN_CSV:      ", TRAIN_CSV),
    ("VALIDATION_CSV: ", VALIDATION_CSV),
    ("TEST_CSV:       ", TEST_CSV),
    ("FILTERED_CSV:    ", FILTERED_CSV),
):
    print(f"{_prefix}{_resolved}")


DATASET_PATH:   /Users/carlos/WORKSPACE/MegaClassifier/resources/fit/datasetFiltered/md_v5a
TRAIN_CSV:      /Users/carlos/WORKSPACE/MegaClassifier/data/processed/train/10000Train.csv
VALIDATION_CSV: /Users/carlos/WORKSPACE/MegaClassifier/data/processed/validation/10000Validation.csv
TEST_CSV:       /Users/carlos/WORKSPACE/MegaClassifier/data/processed/test/10000Test.csv
FILTERED_CSV:    /Users/carlos/WORKSPACE/MegaClassifier/data/interim/10000Images_filtered.csv


In [4]:
# Read the train / validation / test splits and the filtered image list, in
# that order, through the project helper `dt.load_from_csv`.
# NOTE(review): the helper presumably returns a pandas DataFrame (later cells
# call .head() and .iterrows() on the results) — confirm in src.data.Dataset.
train_csv, vali_csv, test_csv, filtered_csv = (
    dt.load_from_csv(csv_path)
    for csv_path in (TRAIN_CSV, VALIDATION_CSV, TEST_CSV, FILTERED_CSV)
)

The file /Users/carlos/WORKSPACE/MegaClassifier/data/processed/train/10000Train.csv has been successfully opened.
The file /Users/carlos/WORKSPACE/MegaClassifier/data/processed/validation/10000Validation.csv has been successfully opened.
The file /Users/carlos/WORKSPACE/MegaClassifier/data/processed/test/10000Test.csv has been successfully opened.
The file /Users/carlos/WORKSPACE/MegaClassifier/data/interim/10000Images_filtered.csv has been successfully opened.


In [5]:
# Preview the first rows of the filtered image list (file_name and label columns).
filtered_csv.head()

Unnamed: 0,file_name,label
0,cervidae/cervidredorfallowdeer_zoo_1_4/20_2020...,1
1,leporido/conejo_wellingtoncameratraps_ss/19081...,1
2,vacia/empty_snapshotmountainzebra/mtz_s1_d07_r...,0
3,vacia/noanimal_zoo_5_6_7_9/37_20210319_943_.jpg,0
4,vacia/empty_islandconservationcameratraps/domi...,0


In [None]:
# Location of the detector results JSON. Despite the name, this notebook only
# reads the file; it is never written here.
OUTPUT_FILE_PATH = os.path.abspath(
    "./resources/json/md_v5a/29618_images_0_003_threshold.json"
)

# Decode explicitly as UTF-8 instead of relying on the platform's default text
# encoding, which is not UTF-8 on every system.
with open(OUTPUT_FILE_PATH, "r", encoding="utf-8") as file:
    # Parsed document; later cells read its "images" list.
    data = json.load(file)

In [None]:
# Columns of the joined table: image path, CSV label and the binary detector verdict.
predict_columns = [
    "file_name",
    "label",
    "detector_label",
]

# file_name -> label lookup restricted to file names that occur exactly once in
# filtered_csv (keep=False drops every duplicated name). This mirrors the
# "exactly one matching row" rule while avoiding a full frame scan per image.
_label_by_file = (
    filtered_csv.drop_duplicates(subset="file_name", keep=False)
    .set_index("file_name")["label"]
    .to_dict()
)

# Collect plain records and build the frame once; concatenating a one-row
# DataFrame per iteration copies the whole table every time.
_records = []
for image in tqdm.tqdm(data["images"]):
    image_file = image["file"]
    if image_file not in _label_by_file:
        continue

    _records.append(
        {
            "file_name": image_file,
            "label": int(_label_by_file[image_file]),
            # 0 when the detector reported no confidence at all, 1 otherwise.
            "detector_label": 0 if image["max_detection_conf"] == 0.0 else 1,
        }
    )

# `columns=` keeps the expected schema even when no image matched.
dataset = pd.DataFrame(_records, columns=predict_columns)

dataset

100%|██████████| 29618/29618 [00:32<00:00, 903.21it/s]


Unnamed: 0,file_name,label,detector_label
0,cervidae/cervidredorfallowdeer_zoo_1_4/20_2020...,1,1
1,leporido/conejo_wellingtoncameratraps_ss/19081...,1,1
2,vacia/empty_snapshotmountainzebra/mtz_s1_d07_r...,0,1
3,vacia/noanimal_zoo_5_6_7_9/37_20210319_943_.jpg,0,1
4,vacia/empty_islandconservationcameratraps/domi...,0,1
...,...,...,...
27522,vacia/vacia_ss/27_20201023_17_.jpg,0,1
27523,zorro/redfox_zoo_1_4/32_20201218_348_.jpg,1,1
27524,mapache/raccoon_caltechcameratrap_me/58af767e-...,1,1
27525,cervidae/cervidredorfallowdeer_zoo_5_6_7_9/5_2...,1,1


In [None]:
# Keep the rows where the detector verdict is 1 while the CSV label is 0.
filtered_images = dataset[(dataset["detector_label"] == 1) & (dataset["label"] == 0)]

# Hash-based membership instead of scanning the column array for every image.
_flagged_files = set(filtered_images["file_name"])

# Report entries that are flagged (non-zero max_detection_conf) and yet carry
# an empty detections list.
for item in data["images"]:
    if item["file"] in _flagged_files and item["detections"] == []:
        print(item)
        print(os.path.join(DATASET_PATH, item["file"]))
        print(
            os.path.join(os.path.abspath("./dataset/datasetFiltered"), item["file"])
        )
        print(dataset[dataset["file_name"] == item["file"]])
        print()


In [None]:
columns = ["file_name", "label"]

# file_name -> detector_label lookup (first occurrence wins), built once so
# each split row costs a dictionary lookup instead of a full scan of `dataset`.
_detector_label_by_file = (
    dataset.drop_duplicates(subset="file_name")
    .set_index("file_name")["detector_label"]
    .to_dict()
)


def build_split_dataset(split_csv):
    """Build the generator input table for one split.

    Keeps the rows of ``split_csv`` whose file has ``detector_label == 1`` in
    ``dataset``. Rows whose file is absent from ``dataset`` are reported
    (their path is printed) and skipped instead of raising.

    Args:
        split_csv: Table with "file_name" and "label" columns, iterated with
            ``iterrows()``.

    Returns:
        A ``(split_dataset, missing_count)`` tuple. ``split_dataset`` has the
        columns "file_name" (joined onto ``DATASET_PATH``) and "label" (cast
        to ``str``). ``missing_count`` is the number of rows not found in
        ``dataset``.
    """
    rows = []
    missing_count = 0
    for _, row in split_csv.iterrows():
        detector_label = _detector_label_by_file.get(row["file_name"])
        if detector_label is None:
            # File never made it into `dataset`: report it and move on.
            print(os.path.join(os.path.abspath("./resources/"), row["file_name"]))
            missing_count += 1
        elif detector_label == 1:
            rows.append([os.path.join(DATASET_PATH, row["file_name"]), row["label"]])
    print(missing_count)

    split_dataset = pd.DataFrame(rows, columns=columns)
    # Labels are stored as strings for the class_mode="binary" generators.
    split_dataset["label"] = split_dataset["label"].astype(str)
    return split_dataset, missing_count


# `count` keeps its meaning: rows of the training split missing from `dataset`.
# The JSON document held in `data` is deliberately not reassigned here, because
# a later cell still reads data["images"].
train_dataset, count = build_split_dataset(train_csv)
vali_dataset, _vali_missing = build_split_dataset(vali_csv)
test_dataset, _test_missing = build_split_dataset(test_csv)

/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered/vacia/noanimal_zoo_5_6_7_9/19_20210319_2024_.jpg
/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered/lince/linces_fecha_linces2_leo_jc/2_lince_flecha_linces2_leo_jc/lince204_23_.jpg
/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered/ave/ave_ave_wellingtoncameratraps_ss/080816142308023a6241.jpg
/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered/leporido/rabbit_mammalweb/6403.jpg
/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered/gato_domestico/gato_wellingtoncameratraps_ss/110816184544011a5902.jpg
/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered/humanovehiculo/humanorvehicle_zoo_5_6_7_9/18_20210219_3016__2021_04_1506_16_22utc_.jpg
/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered/mapache/raccoon_caltechcameratrap_me/5a1b15ed-23d2-11e8-a6a3-ec086b02610b.jpg
/Users/carlos/WORKSPACE/MegaClassifier/dataset/datasetFiltered/lince/lince_b_linces2_leo_jc/lince

In [50]:
# Image size handed to the generators (target_size) and to the network
# (input_shape). A 224 x 224 setting is the disabled alternative.
# NOTE(review): "img_weight" presumably stands for image *width*; the name is
# kept because other cells reference it.
img_weight = img_height = 1000

# Batch size for the generators, shuffle seed, and the upper bound on epochs.
batch_size, seed, epochs = 32, 42, 50

In [None]:
# Training pipeline: scale pixel values by 1/255 and augment with horizontal
# flips and a brightness factor drawn from [0.8, 1.2].
datagen_train = ImageDataGenerator(
    rescale=1.0 / 255, horizontal_flip=True, brightness_range=[0.8, 1.2]
)
# Validation and test pipeline: scaling only, no augmentation.
datagen_val_test = ImageDataGenerator(rescale=1.0 / 255)

# Options shared by the three flow_from_dataframe calls.
_shared_flow_options = {
    "x_col": "file_name",
    "y_col": "label",
    "target_size": (img_weight, img_height),
    "batch_size": batch_size,
    "class_mode": "binary",
}

train_generator = datagen_train.flow_from_dataframe(
    dataframe=train_dataset, shuffle=True, seed=seed, **_shared_flow_options
)

val_generator = datagen_val_test.flow_from_dataframe(
    dataframe=vali_dataset, shuffle=True, seed=seed, **_shared_flow_options
)

# The test generator is not shuffled and takes no seed.
test_generator = datagen_val_test.flow_from_dataframe(
    dataframe=test_dataset, shuffle=False, **_shared_flow_options
)

In [None]:
import matplotlib.pyplot as plt

# Disabled helper, kept commented out: it would display five images with their
# labels from one batch of a generator, once per split.
# def show_random_images_from_generator(generator, title):
#     images, labels = next(generator)  # Take one batch of images and labels from the generator
#     plt.figure(figsize=(15, 5))
#     for i in range(5):  # Show 5 images from the batch
#         plt.subplot(1, 5, i + 1)
#         plt.imshow(images[i])
#         plt.title(f"Label: {int(labels[i])}")
#         plt.axis("off")
#     plt.suptitle(title)
#     plt.show()

# show_random_images_from_generator(train_generator, "Conjunto de Entrenamiento")
# show_random_images_from_generator(val_generator, "Conjunto de Validación")
# show_random_images_from_generator(test_generator, "Conjunto de Prueba")


In [None]:
# Keep the rows where the detector verdict is 1 while the CSV label is 0.
filtered_images = dataset[(dataset["detector_label"] == 1) & (dataset["label"] == 0)]

# Index the detections by file once (first occurrence wins) instead of
# rescanning data["images"] for every flagged path. `.get` is used because the
# index touches every entry, including ones that were never looked up before
# and may carry no "detections" key.
_detections_by_file = {}
for img in data["images"]:
    _detections_by_file.setdefault(img["file"], img.get("detections"))

# Print each flagged image together with its detections (None when the file
# does not appear in the detector output).
image_paths = filtered_images["file_name"].tolist()
for image_path in image_paths:
    detections = _detections_by_file.get(image_path)
    print(f"Image: {image_path}, Detections: {detections}")

In [None]:
# MobileNetV2 backbone with ImageNet weights and without its top layers.
_backbone_input_shape = (img_weight, img_height, 3)
mobilenetV2 = MobileNetV2(
    weights="imagenet",
    include_top=False,
    input_shape=_backbone_input_shape,
)

# Freeze the backbone: only the layers added below are trainable.
mobilenetV2.trainable = False

# Backbone -> global average pooling -> single sigmoid unit.
model = Sequential()
model.add(mobilenetV2)
model.add(GlobalAveragePooling2D())
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

model.summary()

In [None]:
# Save the model only when the monitored validation loss improves.
checkpoint = ModelCheckpoint(
    filepath="./models/MegaClassifier/md_v5a_MobileNetV2_V1.h5",
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)

# Stop after 4 epochs without a better validation loss and restore the best weights.
early_stop = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=4,
    restore_best_weights=True,
    verbose=1,
)

tensorBoard = TensorBoard(log_dir="./logs/Megaclassifier/MobileNetV2/version_1")

# Number of batches per epoch, derived from a fixed budget of 2000 samples.
# NOTE(review): 2000 is hard-coded, not taken from the training set size — confirm.
_steps_per_epoch = 2000 // batch_size
_training_callbacks = [checkpoint, early_stop, tensorBoard]

history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=epochs,
    steps_per_epoch=_steps_per_epoch,
    callbacks=_training_callbacks,
)

In [None]:
import matplotlib.pyplot as plt


def plot_training_history(history):
    """Draw training vs. validation accuracy and loss in two side-by-side panels.

    Args:
        history: Object whose ``history`` attribute maps "accuracy",
            "val_accuracy", "loss" and "val_loss" to per-epoch sequences.
    """
    recorded = history.history
    # Read all four series before drawing so a missing key fails up front.
    panels = [
        (
            recorded["accuracy"],
            recorded["val_accuracy"],
            "Accuracy",
            "Validation Accuracy",
            "lower right",
            "Training and Validation Accuracy",
        ),
        (
            recorded["loss"],
            recorded["val_loss"],
            "Loss",
            "Validation Loss",
            "upper right",
            "Training and Validation Loss",
        ),
    ]
    # The x axis is the epoch index, sized by the accuracy series.
    epoch_axis = range(len(panels[0][0]))

    plt.figure(figsize=(20, 5))
    for slot, panel in enumerate(panels, start=1):
        train_series, val_series, train_label, val_label, legend_loc, title = panel
        plt.subplot(1, 2, slot)
        plt.plot(epoch_axis, train_series, label=train_label)
        plt.plot(epoch_axis, val_series, label=val_label)
        plt.legend(loc=legend_loc)
        plt.title(title)
    plt.show()


# Render the curves of the training run from the previous cell.
plot_training_history(history)

In [None]:
from tensorflow.keras.models import load_model

# Reload the model from the path the ModelCheckpoint callback writes to, then
# evaluate it on the test generator. `evaluate` is unpacked as (loss, accuracy)
# in a later cell.
_best_model_path = "./models/MegaClassifier/md_v5a_MobileNetV2_V1.h5"
megaclassifier_v1 = load_model(_best_model_path)
evaluate = megaclassifier_v1.evaluate(test_generator)


In [18]:
# Class labels of the test samples as exposed by the generator's `classes` attribute.
test_labels = test_generator.classes

# Score the test generator with the reloaded model.
# NOTE(review): pairing these scores with test_labels by position presumably
# relies on test_generator having been created with shuffle=False — confirm.
predict = megaclassifier_v1.predict(test_generator)
# Flatten the prediction array to 1-D for the metric functions used later.
predict_flatten = predict.flatten()

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve

# Baseline "classifier": the same score (zero) for every test sample.
random_flatten = np.zeros_like(test_labels)

# Area under the ROC curve for the baseline and for the model.
random_auc = roc_auc_score(test_labels, random_flatten)
megaclassifier_auc = roc_auc_score(test_labels, predict_flatten)

print("Random Classifier: ROC AUC=%.1f" % (random_auc))
print("Megaclassifier V1: ROC AUC=%.4f" % (megaclassifier_auc))

# ROC curve points; the thresholds (third return value) are discarded.
random_false_positive_rate, random_true_positive_rate, _ = roc_curve(
    test_labels, random_flatten
)
megaclassifier_false_positive_rate, megaclassifier_true_positive_rate, _ = roc_curve(
    test_labels, predict_flatten
)

# One line per curve; only the baseline overrides the line style.
_curves = (
    (
        random_false_positive_rate,
        random_true_positive_rate,
        {"linestyle": "--", "label": "Aleatorio: ROC AUC=%.1f" % (random_auc)},
    ),
    (
        megaclassifier_false_positive_rate,
        megaclassifier_true_positive_rate,
        {"label": "Megaclassifier V1: ROC AUC=%.4f" % (megaclassifier_auc)},
    ),
)
for _fpr, _tpr, _plot_options in _curves:
    plt.plot(_fpr, _tpr, **_plot_options)

plt.xlabel("Tasa de Falsos Positivos")
plt.ylabel("Tasa de Verdaderos Positivos")
plt.legend()
plt.show()

In [27]:
# `evaluate` holds the two values returned by model.evaluate on the test set.
loss, accuracy = evaluate

# One row per ROC point; the scalar metrics are broadcast onto every row.
df_results = pd.DataFrame(
    {
        "False Positive Rate": megaclassifier_false_positive_rate,
        "True Positive Rate": megaclassifier_true_positive_rate,
    }
).assign(AUC=megaclassifier_auc, Loss=loss, Accuracy=accuracy)

# Create the output directory if needed, then write the table without the index.
_results_dir = "./logs/Megaclassifier/MobileNetV2/version_1/test"
os.makedirs(_results_dir, exist_ok=True)
df_results.to_csv(f"{_results_dir}/megaclassifier_v1_results.csv", index=False)